From 48ef8697d064c02cabe8c666c9d4ab7aa519fe45 Mon Sep 17 00:00:00 2001
From: gooding470
Date: Fri, 2 Mar 2018 18:57:11 -0800
Subject: [PATCH] AER-5819 - Fix ordered list search for lowest valued element
 position.

---
 .build.yml | 41 +
 .github/ISSUE_TEMPLATE.md | 22 +
 .gitignore | 20 +
 .gitmodules | 27 +
 CONTRIBUTING.md | 3 +
 LICENSE | 260 +
 LICENSE-AGPL | 661 ++
 LICENSE-APACHE | 202 +
 LICENSE.3rdParty | 235 +
 LICENSE.CE | 25 +
 Makefile | 163 +
 README.md | 224 +
 ai/Makefile | 9 +
 ai/include/ai_btree.h | 68 +
 ai/include/ai_obj.h | 42 +
 ai/include/ai_types.h | 101 +
 ai/include/bt.h | 90 +
 ai/include/bt_iterator.h | 98 +
 ai/include/bt_output.h | 34 +
 ai/include/btree.h | 87 +
 ai/include/btreepriv.h | 105 +
 ai/include/stream.h | 41 +
 ai/src/Makefile | 33 +
 ai/src/ai_btree.c | 1178 +++
 ai/src/ai_obj.c | 103 +
 ai/src/bt.c | 133 +
 ai/src/bt_code.c | 1016 ++
 ai/src/bt_iterator.c | 528 +
 ai/src/bt_output.c | 178 +
 ai/src/stream.c | 166 +
 apidocs/Makefile | 10 +
 apidocs/src/doxyfile | 1792 ++++
 apidocs/src/footer.html | 20 +
 apidocs/src/header.html | 52 +
 apidocs/src/layout.xml | 187 +
 apidocs/src/style.css | 1174 +++
 as/Makefile | 9 +
 as/etc/README.sample.conf.md | 15 +
 as/etc/aerospike-server.sysconfig | 4 +
 as/etc/aerospike-server.tmpfiles | 1 +
 as/etc/aerospike.conf | 67 +
 as/etc/aerospike.service.d/aerospike.conf | 0
 .../aerospike.conf.coldstart | 2 +
 .../aerospike.conf.default | 0
 as/etc/aerospike.service.head | 14 +
 as/etc/aerospike.service.tail | 3 +
 as/etc/aerospike.service.telemetry | 2 +
 as/etc/aerospike_dev.conf | 81 +
 as/etc/aerospike_mesh.conf | 70 +
 as/etc/aerospike_mesh_systemd.conf | 66 +
 as/etc/aerospike_ssd.conf | 65 +
 as/etc/aerospike_ssd_systemd.conf | 61 +
 as/etc/aerospike_systemd.conf | 63 +
 as/etc/aerospike_telemetry.service | 11 +
 as/etc/aerospike_telemetry.sysconfig | 1 +
 as/etc/asd-systemd-helper | 36 +
 as/etc/init-script | 193 +
 as/etc/init-script.deb | 162 +
 as/etc/init-telemetry-script | 49 +
 as/etc/init-telemetry-script.deb | 49 +
 as/etc/irqbalance-ban.sh | 5 +
 as/etc/logrotate_asd | 12 +
 as/etc/logrotate_telemetry | 12 +
 as/etc/telemetry.conf | 13 +
 as/etc/telemetry_dev.conf | 13 +
 as/etc/valgrind.supp | 190 +
 as/include/base/aggr.h | 54 +
 as/include/base/as_stap.h | 52 +
 as/include/base/batch.h | 40 +
 as/include/base/cdt.h | 492 +
 as/include/base/cfg.h | 284 +
 as/include/base/datamodel.h | 1207 +++
 as/include/base/features.h | 30 +
 as/include/base/index.h | 337 +
 as/include/base/job_manager.h | 171 +
 as/include/base/json_init.h | 34 +
 as/include/base/monitor.h | 103 +
 as/include/base/packet_compression.h | 81 +
 as/include/base/particle.h | 98 +
 as/include/base/particle_blob.h | 63 +
 as/include/base/particle_integer.h | 63 +
 as/include/base/predexp.h | 57 +
 as/include/base/proto.h | 693 ++
 as/include/base/rec_props.h | 79 +
 as/include/base/scan.h | 58 +
 as/include/base/secondary_index.h | 691 ++
 as/include/base/security.h | 106 +
 as/include/base/security_config.h | 78 +
 as/include/base/stats.h | 129 +
 as/include/base/system_metadata.h | 236 +
 as/include/base/thr_batch.h | 31 +
 as/include/base/thr_demarshal.h | 46 +
 as/include/base/thr_info.h | 88 +
 as/include/base/thr_info_port.h | 30 +
 as/include/base/thr_query.h | 42 +
 as/include/base/thr_sindex.h | 78 +
 as/include/base/thr_tsvc.h | 55 +
 as/include/base/ticker.h | 29 +
 as/include/base/transaction.h | 378 +
 as/include/base/transaction_policy.h | 114 +
 as/include/base/truncate.h | 94 +
 as/include/base/udf_aerospike.h | 27 +
 as/include/base/udf_arglist.h | 31 +
 as/include/base/udf_cask.h | 70 +
 as/include/base/udf_memtracker.h | 51 +
 as/include/base/udf_record.h | 110 +
 as/include/base/udf_timer.h | 47 +
 as/include/base/xdr_config.h | 128 +
 as/include/base/xdr_serverside.h | 87 +
 as/include/fabric/clustering.h | 296 +
 as/include/fabric/endpoint.h | 324 +
 as/include/fabric/exchange.h | 158 +
 as/include/fabric/fabric.h | 129 +
 as/include/fabric/hb.h | 473 +
 as/include/fabric/hlc.h | 160 +
 as/include/fabric/meta_batch.h | 42 +
 as/include/fabric/migrate.h | 215 +
 as/include/fabric/partition.h | 285 +
 as/include/fabric/partition_balance.h | 197 +
 as/include/fabric/roster.h | 52 +
 as/include/fabric/skew_monitor.h | 67 +
 as/include/geospatial/geojson.h | 56 +
 as/include/geospatial/geospatial.h | 61 +
 as/include/geospatial/scoped.h | 107 +
 as/include/geospatial/throwstream.h | 35 +
 as/include/storage/drv_ssd.h | 463 +
 as/include/storage/storage.h | 183 +
 as/include/transaction/delete.h | 56 +
 as/include/transaction/duplicate_resolve.h | 50 +
 as/include/transaction/proxy.h | 60 +
 as/include/transaction/re_replicate.h | 43 +
 as/include/transaction/read.h | 36 +
 as/include/transaction/replica_ping.h | 54 +
 as/include/transaction/replica_write.h | 51 +
 as/include/transaction/rw_request.h | 209 +
 as/include/transaction/rw_request_hash.h | 111 +
 as/include/transaction/rw_utils.h | 201 +
 as/include/transaction/udf.h | 98 +
 as/include/transaction/write.h | 43 +
 as/src/Makefile | 192 +
 as/src/base/aggr.c | 337 +
 as/src/base/as.c | 520 +
 as/src/base/batch.c | 1155 +++
 as/src/base/bin.c | 685 ++
 as/src/base/cdt.c | 2607 +++++
 as/src/base/cfg.c | 4671 +++++++++
 as/src/base/cfg_ce.c | 90 +
 as/src/base/features_ce.c | 38 +
 as/src/base/index.c | 1254 +++
 as/src/base/index_ce.c | 67 +
 as/src/base/job_manager.c | 806 ++
 as/src/base/json_init.c | 62 +
 as/src/base/monitor.c | 474 +
 as/src/base/namespace.c | 746 ++
 as/src/base/namespace_ce.c | 142 +
 as/src/base/packet_compression.c | 234 +
 as/src/base/particle.c | 1016 ++
 as/src/base/particle_blob.c | 432 +
 as/src/base/particle_float.c | 200 +
 as/src/base/particle_geojson.c | 600 ++
 as/src/base/particle_integer.c | 446 +
 as/src/base/particle_list.c | 4519 ++++++++
 as/src/base/particle_map.c | 6886 +++++++++++++
 as/src/base/particle_string.c | 173 +
 as/src/base/predexp.c | 2149 ++++
 as/src/base/probes.d | 25 +
 as/src/base/proto.c | 885 ++
 as/src/base/rec_props.c | 230 +
 as/src/base/record.c | 958 ++
 as/src/base/record_ce.c | 136 +
 as/src/base/scan.c | 1409 +++
 as/src/base/secondary_index.c | 4539 +++++++++
 as/src/base/security_ce.c | 163 +
 as/src/base/signal.c | 249 +
 as/src/base/system_metadata.c | 3471 +++++++
 as/src/base/thr_batch.c | 467 +
 as/src/base/thr_demarshal.c | 914 ++
 as/src/base/thr_info.c | 7024 +++++++++++++
 as/src/base/thr_info_port.c | 316 +
 as/src/base/thr_nsup.c | 1276 +++
 as/src/base/thr_query.c | 3383 ++++++
 as/src/base/thr_sindex.c | 841 ++
 as/src/base/thr_tsvc.c | 580 ++
 as/src/base/ticker.c | 919 ++
 as/src/base/transaction.c | 480 +
 as/src/base/truncate.c | 621 ++
 as/src/base/truncate_ce.c | 62 +
 as/src/base/udf_aerospike.c | 971 ++
 as/src/base/udf_arglist.c | 81 +
 as/src/base/udf_cask.c | 745 ++
 as/src/base/udf_memtracker.c | 105 +
 as/src/base/udf_record.c | 959 ++
 as/src/base/udf_timer.c | 96 +
 as/src/base/xdr_config.c | 73 +
 as/src/base/xdr_serverside_stubs.c | 130 +
 as/src/fabric/clustering.c | 8163 +++++++++++++++
 as/src/fabric/endpoint.c | 880 ++
 as/src/fabric/exchange.c | 3457 +++++++
 as/src/fabric/fabric.c | 2943 ++++++
 as/src/fabric/hb.c | 9055 +++++++++++++++++
 as/src/fabric/hlc.c | 557 +
 as/src/fabric/meta_batch_ce.c | 65 +
 as/src/fabric/migrate.c | 1758 ++++
 as/src/fabric/migrate_ce.c | 94 +
 as/src/fabric/partition.c | 809 ++
 as/src/fabric/partition_balance.c | 1456 +++
 as/src/fabric/partition_balance_ce.c | 126 +
 as/src/fabric/partition_ce.c | 67 +
 as/src/fabric/roster_ce.c | 50 +
 as/src/fabric/skew_monitor.c | 611 ++
 as/src/geospatial/geojson.cc | 344 +
 as/src/geospatial/geospatial.cc | 228 +
 as/src/storage/drv_memory.c | 78 +
 as/src/storage/drv_memory_ce.c | 44 +
 as/src/storage/drv_ssd.c | 4312 ++++++++
 as/src/storage/drv_ssd_ce.c | 181 +
 as/src/storage/storage.c | 688 +
 as/src/transaction/delete.c | 486 +
 as/src/transaction/delete_ce.c | 69 +
 as/src/transaction/duplicate_resolve.c | 578 ++
 as/src/transaction/proxy.c | 698 ++
 as/src/transaction/re_replicate_ce.c | 43 +
 as/src/transaction/read.c | 625 ++
 as/src/transaction/replica_ping_ce.c | 88 +
 as/src/transaction/replica_write.c | 520 +
 as/src/transaction/rw_request.c | 223 +
 as/src/transaction/rw_request_hash.c | 448 +
 as/src/transaction/rw_utils.c | 470 +
 as/src/transaction/rw_utils_ce.c | 259 +
 as/src/transaction/udf.c | 1094 ++
 as/src/transaction/write.c | 1958 ++++
 build/VersionCheck.py | 32 +
 build/gen_version | 18 +
 build/os_version | 135 +
 build/prep-ce | 19 +
 build/version | 10 +
 cf/.gitignore | 2 +
 cf/Makefile | 9 +
 cf/README.md | 13 +
 cf/include/arenax.h | 131 +
 cf/include/bits.h | 80 +
 cf/include/cf_mutex.h | 63 +
 cf/include/cf_str.h | 73 +
 cf/include/compare.h | 52 +
 cf/include/daemon.h | 30 +
 cf/include/dynbuf.h | 126 +
 cf/include/enhanced_alloc.h | 126 +
 cf/include/fault.h | 434 +
 cf/include/hardware.h | 56 +
 cf/include/hist.h | 67 +
 cf/include/hist_track.h | 86 +
 cf/include/linear_hist.h | 61 +
 cf/include/mem_count.h | 51 +
 cf/include/meminfo.h | 33 +
 cf/include/msg.h | 232 +
 cf/include/node.h | 71 +
 cf/include/olock.h | 49 +
 cf/include/shash.h | 110 +
 cf/include/socket.h | 340 +
 cf/include/tls.h | 75 +
 cf/include/vmapx.h | 100 +
 cf/include/warnings.h | 28 +
 cf/src/Makefile | 41 +
 cf/src/alloc.c | 1075 ++
 cf/src/arenax.c | 201 +
 cf/src/arenax_ce.c | 59 +
 cf/src/cf_mutex.c | 175 +
 cf/src/cf_str.c | 419 +
 cf/src/daemon.c | 167 +
 cf/src/dynbuf.c | 534 +
 cf/src/fault.c | 1138 +++
 cf/src/hardware.c | 1791 ++++
 cf/src/hist.c | 305 +
 cf/src/hist_track.c | 732 ++
 cf/src/linear_hist.c | 366 +
 cf/src/meminfo.c | 152 +
 cf/src/msg.c | 1205 +++
 cf/src/node.c | 67 +
 cf/src/olock.c | 114 +
 cf/src/shash.c | 712 ++
 cf/src/socket.c | 2551 +++++
 cf/src/socket_ce.c | 459 +
 cf/src/tls_ce.c | 159 +
 cf/src/vmapx.c | 398 +
 make_in/Makefile.in | 97 +
 make_in/Makefile.targets | 16 +
 make_in/Makefile.vars | 98 +
 modules/common | 1 +
 modules/jansson | 1 +
 modules/jemalloc | 1 +
 modules/lua-core | 1 +
 modules/luajit | 1 +
 modules/mod-lua | 1 +
 modules/s2-geometry-library | 1 +
 modules/telemetry | 1 +
 pkg/deb/Makefile | 148 +
 pkg/deb/asinstall | 66 +
 pkg/deb/conffiles | 1 +
 pkg/deb/conffiles.telemetry | 1 +
 pkg/deb/copyright | 9 +
 pkg/deb/postinst.server | 31 +
 pkg/deb/server-64 | 9 +
 pkg/dist/.gitignore | 1 +
 pkg/packages/.gitignore | 1 +
 pkg/rpm/Makefile | 158 +
 pkg/rpm/asinstall | 66 +
 pkg/rpm/server-spec-base | 20 +
 pkg/rpm/server-spec-config | 10 +
 pkg/rpm/server-spec-files | 5 +
 pkg/rpm/server-spec-logrotate | 1 +
 pkg/rpm/server-spec-scripts | 9 +
 pkg/rpm/server-spec-scripts-systemd | 4 +
 pkg/rpm/server-spec-systemd | 8 +
 pkg/rpm/server-spec-sysv | 4 +
 pkg/rpm/server-spec-telemetry | 1 +
 pkg/rpm/server-spec-telemetry-systemd | 2 +
 pkg/rpm/server-spec-telemetry-sysv | 2 +
 pkg/src/Makefile | 29 +
 pkg/src/git-cp-files.sh | 47 +
 pkg/tar/Makefile | 71 +
 pkg/tar/README | 101 +
 pkg/tar/bin/aerospike | 4 +
 pkg/tar/share/bin/aerospike | 224 +
 pkg/tar/share/etc/aerospike.conf | 54 +
 pkg/tar/share/lib/aerospike-render.py | 38 +
 pkg/tar/share/libexec/aerospike-destroy | 26 +
 pkg/tar/share/libexec/aerospike-init | 211 +
 pkg/tar/share/libexec/aerospike-restart | 25 +
 pkg/tar/share/libexec/aerospike-start | 105 +
 pkg/tar/share/libexec/aerospike-status | 24 +
 pkg/tar/share/libexec/aerospike-stop | 36 +
 pkg/tar/share/man/aerospike-destroy.man | 34 +
 pkg/tar/share/man/aerospike-init.man | 77 +
 pkg/tar/share/man/aerospike-restart.man | 34 +
 pkg/tar/share/man/aerospike-start.man | 34 +
 pkg/tar/share/man/aerospike-status.man | 34 +
 pkg/tar/share/man/aerospike-stop.man | 34 +
 tools/bin/asd-coldstart | 13 +
 tools/bin/iddecode | 76 +
 tools/citrus2aero/upgrade2to3 | 93 +
 tools/fixownership/fixownership.py | 282 +
 tools/jem/.gitignore | 1 +
 tools/jem/README.md | 118 +
 tools/jem/extract-jem-stats | 41 +
 tools/jem/extract-jem-stats.sh | 41 +
 tools/jem/get-jem-stats | 20 +
 tools/jem/get-jem-stats.sh | 20 +
 tools/jem/jemabs | 40 +
 tools/jem/jemdefs.py | 65 +
 tools/jem/jemdel | 45 +
 tools/jem/jemeff | 37 +
 tools/memacct/asparsemem | 121 +
 tools/systemtap/README.md | 40 +
 tools/systemtap/queries.stp | 145 +
 tools/systemtap/query_annotate | 277 +
 355 files changed, 146643 insertions(+)
 create mode 100644 .build.yml
 create mode 100644 .github/ISSUE_TEMPLATE.md
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 CONTRIBUTING.md
 create mode 100644 LICENSE
 create mode 100644 LICENSE-AGPL
 create mode 100644 LICENSE-APACHE
 create mode 100644 LICENSE.3rdParty
 create mode 100644 LICENSE.CE
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 ai/Makefile
 create mode 100644 ai/include/ai_btree.h
 create mode 100644 ai/include/ai_obj.h
 create mode 100644 ai/include/ai_types.h
 create mode 100644 ai/include/bt.h
 create mode 100644 ai/include/bt_iterator.h
 create mode 100644 ai/include/bt_output.h
 create mode 100644 ai/include/btree.h
 create mode 100644 ai/include/btreepriv.h
 create mode 100644 ai/include/stream.h
 create mode 100644 ai/src/Makefile
 create mode 100644 ai/src/ai_btree.c
 create mode 100644 ai/src/ai_obj.c
 create mode 100644 ai/src/bt.c
 create mode 100644 ai/src/bt_code.c
 create mode 100644 ai/src/bt_iterator.c
 create mode 100644 ai/src/bt_output.c
 create mode 100644 ai/src/stream.c
 create mode 100644 apidocs/Makefile
 create mode 100644 apidocs/src/doxyfile
 create mode 100644 apidocs/src/footer.html
 create mode 100644 apidocs/src/header.html
 create mode 100644 apidocs/src/layout.xml
 create mode 100644 apidocs/src/style.css
 create mode 100644 as/Makefile
 create mode 100644 as/etc/README.sample.conf.md
 create mode 100644 as/etc/aerospike-server.sysconfig
 create mode 100644 as/etc/aerospike-server.tmpfiles
 create mode 100644 as/etc/aerospike.conf
 create mode 100644 as/etc/aerospike.service.d/aerospike.conf
 create mode 100644 as/etc/aerospike.service.d/aerospike.conf.coldstart
 create mode 100644 as/etc/aerospike.service.d/aerospike.conf.default
 create mode 100644 as/etc/aerospike.service.head
 create mode 100644 as/etc/aerospike.service.tail
 create mode 100644 as/etc/aerospike.service.telemetry
 create mode 100644 as/etc/aerospike_dev.conf
 create mode 100644 as/etc/aerospike_mesh.conf
 create mode 100644 as/etc/aerospike_mesh_systemd.conf
 create mode 100644 as/etc/aerospike_ssd.conf
 create mode 100644 as/etc/aerospike_ssd_systemd.conf
 create mode 100644 as/etc/aerospike_systemd.conf
 create mode 100644 as/etc/aerospike_telemetry.service
 create mode 100644 as/etc/aerospike_telemetry.sysconfig
 create mode 100644 as/etc/asd-systemd-helper
 create mode 100644 as/etc/init-script
 create mode 100755 as/etc/init-script.deb
 create mode 100644 as/etc/init-telemetry-script
 create mode 100644 as/etc/init-telemetry-script.deb
 create mode 100755 as/etc/irqbalance-ban.sh
 create mode 100644 as/etc/logrotate_asd
 create mode 100644 as/etc/logrotate_telemetry
 create mode 100644 as/etc/telemetry.conf
 create mode 100644 as/etc/telemetry_dev.conf
 create mode 100644 as/etc/valgrind.supp
 create mode 100644 as/include/base/aggr.h
 create mode 100644 as/include/base/as_stap.h
 create mode 100644 as/include/base/batch.h
 create mode 100644 as/include/base/cdt.h
 create mode 100644 as/include/base/cfg.h
 create mode 100644 as/include/base/datamodel.h
 create mode 100644 as/include/base/features.h
 create mode 100644 as/include/base/index.h
 create mode 100644 as/include/base/job_manager.h
 create mode 100644 as/include/base/json_init.h
 create mode 100644 as/include/base/monitor.h
 create mode 100644 as/include/base/packet_compression.h
 create mode 100644 as/include/base/particle.h
 create mode 100644 as/include/base/particle_blob.h
 create mode 100644 as/include/base/particle_integer.h
 create mode 100644 as/include/base/predexp.h
 create mode 100644 as/include/base/proto.h
 create mode 100644 as/include/base/rec_props.h
 create mode 100644 as/include/base/scan.h
 create mode 100644 as/include/base/secondary_index.h
 create mode 100644 as/include/base/security.h
 create mode 100644 as/include/base/security_config.h
 create mode 100644 as/include/base/stats.h
 create mode 100644 as/include/base/system_metadata.h
 create mode 100644 as/include/base/thr_batch.h
 create mode 100644 as/include/base/thr_demarshal.h
 create mode 100644 as/include/base/thr_info.h
 create mode 100644 as/include/base/thr_info_port.h
 create mode 100644 as/include/base/thr_query.h
 create mode 100644 as/include/base/thr_sindex.h
 create mode 100644 as/include/base/thr_tsvc.h
 create mode 100644 as/include/base/ticker.h
 create mode 100644 as/include/base/transaction.h
 create mode 100644 as/include/base/transaction_policy.h
 create mode 100644 as/include/base/truncate.h
 create mode 100644 as/include/base/udf_aerospike.h
 create mode 100644 as/include/base/udf_arglist.h
 create mode 100644 as/include/base/udf_cask.h
 create mode 100644 as/include/base/udf_memtracker.h
 create mode 100644 as/include/base/udf_record.h
 create mode 100644 as/include/base/udf_timer.h
 create mode 100644 as/include/base/xdr_config.h
 create mode 100644 as/include/base/xdr_serverside.h
 create mode 100644 as/include/fabric/clustering.h
 create mode 100644 as/include/fabric/endpoint.h
 create mode 100644 as/include/fabric/exchange.h
 create mode 100644 as/include/fabric/fabric.h
 create mode 100644 as/include/fabric/hb.h
 create mode 100644 as/include/fabric/hlc.h
 create mode 100644 as/include/fabric/meta_batch.h
 create mode 100644 as/include/fabric/migrate.h
 create mode 100644 as/include/fabric/partition.h
 create mode 100644 as/include/fabric/partition_balance.h
 create mode 100644 as/include/fabric/roster.h
 create mode 100644 as/include/fabric/skew_monitor.h
 create mode 100644 as/include/geospatial/geojson.h
 create mode 100644 as/include/geospatial/geospatial.h
 create mode 100644 as/include/geospatial/scoped.h
 create mode 100644 as/include/geospatial/throwstream.h
 create mode 100644 as/include/storage/drv_ssd.h
 create mode 100644 as/include/storage/storage.h
 create mode 100644 as/include/transaction/delete.h
 create mode 100644 as/include/transaction/duplicate_resolve.h
 create mode 100644 as/include/transaction/proxy.h
 create mode 100644 as/include/transaction/re_replicate.h
 create mode 100644 as/include/transaction/read.h
 create mode 100644 as/include/transaction/replica_ping.h
 create mode 100644 as/include/transaction/replica_write.h
 create mode 100644 as/include/transaction/rw_request.h
 create mode 100644 as/include/transaction/rw_request_hash.h
 create mode 100644 as/include/transaction/rw_utils.h
 create mode 100644 as/include/transaction/udf.h
 create mode 100644 as/include/transaction/write.h
 create mode 100644 as/src/Makefile
 create mode 100644 as/src/base/aggr.c
 create mode 100644 as/src/base/as.c
 create mode 100644 as/src/base/batch.c
 create mode 100644 as/src/base/bin.c
 create mode 100644 as/src/base/cdt.c
 create mode 100644 as/src/base/cfg.c
 create mode 100644 as/src/base/cfg_ce.c
 create mode 100644 as/src/base/features_ce.c
 create mode 100644 as/src/base/index.c
 create mode 100644 as/src/base/index_ce.c
 create mode 100644 as/src/base/job_manager.c
 create mode 100644 as/src/base/json_init.c
 create mode 100644 as/src/base/monitor.c
 create mode 100644 as/src/base/namespace.c
 create mode 100644 as/src/base/namespace_ce.c
 create mode 100644 as/src/base/packet_compression.c
 create mode 100644 as/src/base/particle.c
 create mode 100644 as/src/base/particle_blob.c
 create mode 100644 as/src/base/particle_float.c
 create mode 100644 as/src/base/particle_geojson.c
 create mode 100644 as/src/base/particle_integer.c
 create mode 100644 as/src/base/particle_list.c
 create mode 100644 as/src/base/particle_map.c
 create mode 100644 as/src/base/particle_string.c
 create mode 100644 as/src/base/predexp.c
 create mode 100644 as/src/base/probes.d
 create mode 100644 as/src/base/proto.c
 create mode 100644 as/src/base/rec_props.c
 create mode 100644 as/src/base/record.c
 create mode 100644 as/src/base/record_ce.c
 create mode 100644 as/src/base/scan.c
 create mode 100644 as/src/base/secondary_index.c
 create mode 100644 as/src/base/security_ce.c
 create mode 100644 as/src/base/signal.c
 create mode 100644 as/src/base/system_metadata.c
 create mode 100644 as/src/base/thr_batch.c
 create mode 100644 as/src/base/thr_demarshal.c
 create mode 100644 as/src/base/thr_info.c
 create mode 100644 as/src/base/thr_info_port.c
 create mode 100644 as/src/base/thr_nsup.c
 create mode 100644 as/src/base/thr_query.c
 create mode 100644 as/src/base/thr_sindex.c
 create mode 100644 as/src/base/thr_tsvc.c
 create mode 100644 as/src/base/ticker.c
 create mode 100644 as/src/base/transaction.c
 create mode 100644 as/src/base/truncate.c
 create mode 100644 as/src/base/truncate_ce.c
 create mode 100644 as/src/base/udf_aerospike.c
 create mode 100644 as/src/base/udf_arglist.c
 create mode 100644 as/src/base/udf_cask.c
 create mode 100644 as/src/base/udf_memtracker.c
 create mode 100644 as/src/base/udf_record.c
 create mode 100644 as/src/base/udf_timer.c
 create mode 100644 as/src/base/xdr_config.c
 create mode 100644 as/src/base/xdr_serverside_stubs.c
 create mode 100644 as/src/fabric/clustering.c
 create mode 100644 as/src/fabric/endpoint.c
 create mode 100644 as/src/fabric/exchange.c
 create mode 100644 as/src/fabric/fabric.c
 create mode 100644 as/src/fabric/hb.c
 create mode 100644 as/src/fabric/hlc.c
 create mode 100644 as/src/fabric/meta_batch_ce.c
 create mode 100644 as/src/fabric/migrate.c
 create mode 100644 as/src/fabric/migrate_ce.c
 create mode 100644 as/src/fabric/partition.c
 create mode 100644 as/src/fabric/partition_balance.c
 create mode 100644 as/src/fabric/partition_balance_ce.c
 create mode 100644 as/src/fabric/partition_ce.c
 create mode 100644 as/src/fabric/roster_ce.c
 create mode 100644 as/src/fabric/skew_monitor.c
 create mode 100644 as/src/geospatial/geojson.cc
 create mode 100644 as/src/geospatial/geospatial.cc
 create mode 100644 as/src/storage/drv_memory.c
 create mode 100644 as/src/storage/drv_memory_ce.c
 create mode 100644 as/src/storage/drv_ssd.c
 create mode 100644 as/src/storage/drv_ssd_ce.c
 create mode 100644 as/src/storage/storage.c
 create mode 100644 as/src/transaction/delete.c
 create mode 100644 as/src/transaction/delete_ce.c
 create mode 100644 as/src/transaction/duplicate_resolve.c
 create mode 100644 as/src/transaction/proxy.c
 create mode 100644 as/src/transaction/re_replicate_ce.c
 create mode 100644 as/src/transaction/read.c
 create mode 100644 as/src/transaction/replica_ping_ce.c
 create mode 100644 as/src/transaction/replica_write.c
 create mode 100644 as/src/transaction/rw_request.c
 create mode 100644 as/src/transaction/rw_request_hash.c
 create mode 100644 as/src/transaction/rw_utils.c
 create mode 100644 as/src/transaction/rw_utils_ce.c
 create mode 100644 as/src/transaction/udf.c
 create mode 100644 as/src/transaction/write.c
 create mode 100755 build/VersionCheck.py
 create mode 100755 build/gen_version
 create mode 100755 build/os_version
 create mode 100755 build/prep-ce
 create mode 100755 build/version
 create mode 100644 cf/.gitignore
 create mode 100644 cf/Makefile
 create mode 100644 cf/README.md
 create mode 100644 cf/include/arenax.h
 create mode 100644 cf/include/bits.h
 create mode 100644 cf/include/cf_mutex.h
 create mode 100644 cf/include/cf_str.h
 create mode 100644 cf/include/compare.h
 create mode 100644 cf/include/daemon.h
 create mode 100644 cf/include/dynbuf.h
 create mode 100644 cf/include/enhanced_alloc.h
 create mode 100644 cf/include/fault.h
 create mode 100644 cf/include/hardware.h
 create mode 100644 cf/include/hist.h
 create mode 100644 cf/include/hist_track.h
 create mode 100644 cf/include/linear_hist.h
 create mode 100644 cf/include/mem_count.h
 create mode 100644 cf/include/meminfo.h
 create mode 100644 cf/include/msg.h
 create mode 100644 cf/include/node.h
 create mode 100644 cf/include/olock.h
 create mode 100644 cf/include/shash.h
 create mode 100644 cf/include/socket.h
 create mode 100644 cf/include/tls.h
 create mode 100644 cf/include/vmapx.h
 create mode 100644 cf/include/warnings.h
 create mode 100644 cf/src/Makefile
 create mode 100644 cf/src/alloc.c
 create mode 100644 cf/src/arenax.c
 create mode 100644 cf/src/arenax_ce.c
 create mode 100644 cf/src/cf_mutex.c
 create mode 100644 cf/src/cf_str.c
 create mode 100644 cf/src/daemon.c
 create mode 100644 cf/src/dynbuf.c
 create mode 100644 cf/src/fault.c
 create mode 100644 cf/src/hardware.c
 create mode 100644 cf/src/hist.c
 create mode 100644 cf/src/hist_track.c
 create mode 100644 cf/src/linear_hist.c
 create mode 100644 cf/src/meminfo.c
 create mode 100644 cf/src/msg.c
 create mode 100644 cf/src/node.c
 create mode 100644 cf/src/olock.c
 create mode 100644 cf/src/shash.c
 create mode 100644 cf/src/socket.c
 create mode 100644 cf/src/socket_ce.c
 create mode 100644 cf/src/tls_ce.c
 create mode 100644 cf/src/vmapx.c
 create mode 100644 make_in/Makefile.in
 create mode 100644 make_in/Makefile.targets
 create mode 100644 make_in/Makefile.vars
 create mode 160000 modules/common
 create mode 160000 modules/jansson
 create mode 160000 modules/jemalloc
 create mode 160000 modules/lua-core
 create mode 160000 modules/luajit
 create mode 160000 modules/mod-lua
 create mode 160000 modules/s2-geometry-library
 create mode 160000 modules/telemetry
 create mode 100644 pkg/deb/Makefile
 create mode 100755 pkg/deb/asinstall
 create mode 100644 pkg/deb/conffiles
 create mode 100644 pkg/deb/conffiles.telemetry
 create mode 100644 pkg/deb/copyright
 create mode 100755 pkg/deb/postinst.server
 create mode 100644 pkg/deb/server-64
 create mode 100644 pkg/dist/.gitignore
 create mode 100644 pkg/packages/.gitignore
 create mode 100644 pkg/rpm/Makefile
 create mode 100755 pkg/rpm/asinstall
 create mode 100644 pkg/rpm/server-spec-base
 create mode 100644 pkg/rpm/server-spec-config
 create mode 100644 pkg/rpm/server-spec-files
 create mode 100644 pkg/rpm/server-spec-logrotate
 create mode 100644 pkg/rpm/server-spec-scripts
 create mode 100644 pkg/rpm/server-spec-scripts-systemd
 create mode 100644 pkg/rpm/server-spec-systemd
 create mode 100644 pkg/rpm/server-spec-sysv
 create mode 100644 pkg/rpm/server-spec-telemetry
 create mode 100644 pkg/rpm/server-spec-telemetry-systemd
 create mode 100644 pkg/rpm/server-spec-telemetry-sysv
 create mode 100644 pkg/src/Makefile
 create mode 100755 pkg/src/git-cp-files.sh
 create mode 100644 pkg/tar/Makefile
 create mode 100644 pkg/tar/README
 create mode 100644 pkg/tar/bin/aerospike
 create mode 100755 pkg/tar/share/bin/aerospike
 create mode 100644 pkg/tar/share/etc/aerospike.conf
 create mode 100644 pkg/tar/share/lib/aerospike-render.py
 create mode 100644 pkg/tar/share/libexec/aerospike-destroy
 create mode 100644 pkg/tar/share/libexec/aerospike-init
 create mode 100644 pkg/tar/share/libexec/aerospike-restart
 create mode 100644 pkg/tar/share/libexec/aerospike-start
 create mode 100644 pkg/tar/share/libexec/aerospike-status
 create mode 100644 pkg/tar/share/libexec/aerospike-stop
 create mode 100644 pkg/tar/share/man/aerospike-destroy.man
 create mode 100644 pkg/tar/share/man/aerospike-init.man
 create mode 100644 pkg/tar/share/man/aerospike-restart.man
 create mode 100644 pkg/tar/share/man/aerospike-start.man
 create mode 100644 pkg/tar/share/man/aerospike-status.man
 create mode 100644 pkg/tar/share/man/aerospike-stop.man
 create mode 100755 tools/bin/asd-coldstart
 create mode 100755 tools/bin/iddecode
 create mode 100755 tools/citrus2aero/upgrade2to3
 create mode 100755 tools/fixownership/fixownership.py
 create mode 100644 tools/jem/.gitignore
 create mode 100644 tools/jem/README.md
 create mode 100755 tools/jem/extract-jem-stats
 create mode 100755 tools/jem/extract-jem-stats.sh
 create mode 100755 tools/jem/get-jem-stats
 create mode 100755 tools/jem/get-jem-stats.sh
 create mode 100755 tools/jem/jemabs
 create mode 100644 tools/jem/jemdefs.py
 create mode 100755 tools/jem/jemdel
 create mode 100755 tools/jem/jemeff
 create mode 100755 tools/memacct/asparsemem
 create mode 100644 tools/systemtap/README.md
 create mode 100755 tools/systemtap/queries.stp
 create mode 100755 tools/systemtap/query_annotate

diff --git a/.build.yml b/.build.yml
new file mode 100644
index 00000000..75e3101b
--- /dev/null
+++ b/.build.yml
@@ -0,0 +1,41 @@
+name: aerospike-server
+
+environment:
+  EEREPO: modules/ee
+
+dependency:
+  - url: git@github.com:citrusleaf/aerospike-server-enterprise
+    dir: $EEREPO
+
+container:
+  - base:
+    - docker.qe.aerospike.com/build/aerospike-server:centos-6
+    - docker.qe.aerospike.com/build/aerospike-server:centos-7
+    - docker.qe.aerospike.com/build/aerospike-server:debian-7
+    - docker.qe.aerospike.com/build/aerospike-server:debian-8
+    - docker.qe.aerospike.com/build/aerospike-server:ubuntu-12.04
+    - docker.qe.aerospike.com/build/aerospike-server:ubuntu-14.04
+    - docker.qe.aerospike.com/build/aerospike-server:ubuntu-16.04
+
+build:
+  - name: community
+    environment:
+      EEREPO:
+    script:
+      - make
+      - make $PKG
+      - make tar
+      - make source
+      - cp -p modules/telemetry/{README,TELEMETRY}.md
+    artifact:
+      - pkg/packages/*
+      - modules/telemetry/TELEMETRY.md
+  - name: enterprise
+    environment:
+      EEREPO: /work/source/$EEREPO
+    script:
+      - make +ee
+      - make $PKG+ee
+      - make source+ee
+    artifact:
+      - pkg/packages/*
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 00000000..76d5afff
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,22 @@
+Please direct the following to our [community forum](https://discuss.aerospike.com/):
+- general questions
+- help requests
+- feature requests
+- non Aerospike Server issues
+
+The issues submitted here should be Aerospike Server **code** related. Examples include:
+- crashes (please provide stack trace from logs)
+- bugs (not behaving as expected/documented)
+- code quality
+
+__________
+
+**OS:** *Put your operating system here. For example: "Ubuntu 16.10", "CentOS 7", "Debian 8" etc.*
+
+**Aerospike version:** *Put your Aerospike release version or `git describe --long --all` output here. For example: "3.15.0.1, heads/master-0-g450aee1"*
+
+**Client version:** *Put which client and the version of the client you are using (if applicable) here. For example: "Java 4.0.8, C 4.2.0"*
+
+__________
+
+*Explain your _Aerospike Server_ issue in detail here and (if applicable) provide logs snippets, configuration, and/or reproduction instructions.*
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..f40050ee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+.DS_Store
+.cproject
+.project
+.settings
+/vg.log
+TAGS
+run
+target
+
+# emacs backup / temp files
+*~
+\#*\#
+.\#*
+
+# TLS credentials
+key.pem
+cert.pem
+chain.pem
+cacert.pem
+cbl.txt
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..e42544ba
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,27 @@
+[submodule "modules/common"]
+    path = modules/common
+    url = https://github.com/aerospike/aerospike-common.git
+    ignore = dirty
+[submodule "modules/mod-lua"]
+    path = modules/mod-lua
+    url = https://github.com/aerospike/aerospike-mod-lua.git
+[submodule "modules/jansson"]
+    path = modules/jansson
+    url = https://github.com/aerospike/jansson.git
+    ignore = dirty
+[submodule "modules/lua-core"]
+    path = modules/lua-core
+    url = https://github.com/aerospike/aerospike-lua-core.git
+[submodule "modules/luajit"]
+    path = modules/luajit
+    url = https://github.com/aerospike/luajit.git
+[submodule "modules/s2-geometry-library"]
+    path = modules/s2-geometry-library
+    url = https://github.com/aerospike/s2-geometry-library.git
+[submodule "modules/telemetry"]
+    path = modules/telemetry
+    url = https://github.com/aerospike/aerospike-telemetry-agent.git
+[submodule "modules/jemalloc"]
+    path = modules/jemalloc
+    url = https://github.com/aerospike/jemalloc.git
+    ignore = dirty
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000..96473525
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,3 @@
+# Contributing
+
+For details on contributing to Aerospike, please read http://www.aerospike.com/community/contributor/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..efc2e191
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,260 @@
+================================================================================
+
+AEROSPIKE SERVER LICENSE
+
+The Aerospike Server Community Edition is made available under the terms of
+the GNU Affero General Public License version 3 (AGPLv3), as stated in the
+file `LICENSE-AGPL`.
+
+Individual files may be made available under their own specific license,
+all compatible with AGPLv3. Please see individual files for details.
+
+================================================================================
+
+AEROSPIKE MODULE LICENSE
+
+The following directories and their subdirectories thereof are made available
+under the terms of the Apache License, version 2.0, as stated in the file
+`LICENSE-APACHE`, or a compatible license stated in the file itself. Please
+see individual files for details.
+
+ - modules/common
+ - modules/lua-core
+ - modules/mod-lua
+ - modules/telemetry
+
+================================================================================
+
+THIRD PARTY LIBRARY LICENSES
+
+The following are the licenses for 3rd party libraries utilized by Aerospike
+Server.
+
+--------------------------------------------------------------------------------
+
+Lua
+---
+
+Copyright © 1994–2013 Lua.org, PUC-Rio.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge, publish, distribute,
+sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+LuaJIT
+------
+
+Copyright (C) 2005-2014 Mike Pall. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+[ MIT license: http://www.opensource.org/licenses/mit-license.php ]
+
+[ LuaJIT includes code from dlmalloc, which has this license statement: ]
+
+This is a version (aka dlmalloc) of malloc/free/realloc written by
+Doug Lea and released to the public domain, as explained at
+http://creativecommons.org/licenses/publicdomain
+
+--------------------------------------------------------------------------------
+
+Jansson
+-------
+
+Copyright (c) 2009-2012 Petri Lehtinen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge, publish, distribute,
+sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or
+substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+jemalloc
+--------
+
+Copyright (C) 2002-2014 Jason Evans.
+All rights reserved.
+Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved.
+Copyright (C) 2009-2014 Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice(s),
+   this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice(s),
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+BTree
+-----
+
+Copyright 1997-1999, 2001 John-Mark Gurney.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+--------------------------------------------------------------------------------
+
+BCrypt
+------
+
+Written by Solar Designer in 1998-2011.
+No copyright is claimed, and the software is hereby placed in the public
+domain. In case this attempt to disclaim copyright and place the software
+in the public domain is deemed null and void, then the software is
+Copyright (c) 1998-2011 Solar Designer and it is hereby released to the
+general public under the following terms:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted.
+
+There's ABSOLUTELY NO WARRANTY, express or implied.
+
+It is my intent that you should be able to use this on your system,
+as part of a software package, or anywhere else to improve security,
+ensure compatibility, or for any other purpose. I would appreciate
+it if you give credit where it is due and keep your modifications in
+the public domain as well, but I don't require that in order to let
+you place this code and any modifications you make under a license
+of your choice.
+
+--------------------------------------------------------------------------------
+
+Concurrency Kit
+---------------
+
+Copyright 2010-2013 Samy Al Bahra.
+Copyright 2011-2013 AppNexus, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+Hazard Pointers (src/ck_hp.c) also includes this license:
+
+(c) Copyright 2008, IBM Corporation.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+ck_pr_rtm leverages work from Andi Kleen:
+Copyright (c) 2012,2013 Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that: (1) source code distributions
+retain the above copyright notice and this paragraph in its entirety, (2)
+distributions including binary code include the above copyright notice and
+this paragraph in its entirety in the documentation or other materials
+provided with the distribution
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+--------------------------------------------------------------------------------
+
+S2
+--
+
+Copyright 2005 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
diff --git a/LICENSE-AGPL b/LICENSE-AGPL
new file mode 100644
index 00000000..2def0e88
--- /dev/null
+++ b/LICENSE-AGPL
@@ -0,0 +1,661 @@
+                    GNU AFFERO GENERAL PUBLIC LICENSE
+                       Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc.
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+  When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+  A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+  The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+  An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU Affero General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7. This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy. This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged. This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source. This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge. You need not require recipients to copy the
+    Corresponding Source along with the object code. If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source. Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<http://www.gnu.org/licenses/>.
\ No newline at end of file
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644
index 00000000..d6456956
--- /dev/null
+++ b/LICENSE-APACHE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/LICENSE.3rdParty b/LICENSE.3rdParty
new file mode 100644
index 00000000..44db61d9
--- /dev/null
+++ b/LICENSE.3rdParty
@@ -0,0 +1,235 @@
+================================================================================
+
+THIRD PARTY LIBRARY LICENSES
+
+The following are the licenses for 3rd party libraries utilized by Aerospike
+Server.
+
+--------------------------------------------------------------------------------
+
+Lua
+---
+
+Copyright © 1994–2013 Lua.org, PUC-Rio.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge, publish, distribute,
+sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+LuaJIT
+------
+
+Copyright (C) 2005-2014 Mike Pall. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+[ MIT license: http://www.opensource.org/licenses/mit-license.php ]
+
+[ LuaJIT includes code from dlmalloc, which has this license statement: ]
+
+This is a version (aka dlmalloc) of malloc/free/realloc written by
+Doug Lea and released to the public domain, as explained at
+http://creativecommons.org/licenses/publicdomain
+
+--------------------------------------------------------------------------------
+
+Jansson
+-------
+
+Copyright (c) 2009-2012 Petri Lehtinen <petri@digip.org>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge, publish, distribute,
+sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or
+substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+jemalloc
+--------
+
+Copyright (C) 2002-2014 Jason Evans <jasone@canonware.com>.
+All rights reserved.
+Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved.
+Copyright (C) 2009-2014 Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice(s),
+   this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice(s),
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+BTree
+-----
+
+Copyright 1997-1999, 2001 John-Mark Gurney.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+--------------------------------------------------------------------------------
+
+BCrypt
+------
+
+Written by Solar Designer <solar at openwall.com> in 1998-2011.
+No copyright is claimed, and the software is hereby placed in the public
+domain. In case this attempt to disclaim copyright and place the software
+in the public domain is deemed null and void, then the software is
+Copyright (c) 1998-2011 Solar Designer and it is hereby released to the
+general public under the following terms:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted.
+
+There's ABSOLUTELY NO WARRANTY, express or implied.
+
+It is my intent that you should be able to use this on your system,
+as part of a software package, or anywhere else to improve security,
+ensure compatibility, or for any other purpose. I would appreciate
+it if you give credit where it is due and keep your modifications in
+the public domain as well, but I don't require that in order to let
+you place this code and any modifications you make under a license
+of your choice.
+
+--------------------------------------------------------------------------------
+
+Concurrency Kit
+---------------
+
+Copyright 2010-2013 Samy Al Bahra.
+Copyright 2011-2013 AppNexus, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+Hazard Pointers (src/ck_hp.c) also includes this license:
+
+(c) Copyright 2008, IBM Corporation.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ +ck_pr_rtm leverages work from Andi Kleen: +Copyright (c) 2012,2013 Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that: (1) source code distributions +retain the above copyright notice and this paragraph in its entirety, (2) +distributions including binary code include the above copyright notice and +this paragraph in its entirety in the documentation or other materials +provided with the distribution + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + +-------------------------------------------------------------------------------- + +S2 +-- + +Copyright 2005 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + diff --git a/LICENSE.CE b/LICENSE.CE new file mode 100644 index 00000000..b45bda4e --- /dev/null +++ b/LICENSE.CE @@ -0,0 +1,25 @@ +================================================================================ + +AEROSPIKE SERVER LICENSE + +The Aerospike Server Community Edition is made available under the terms of +the GNU Affero General Public License version 3 (AGPLv3), as stated in the +file `LICENSE-AGPL`. + +Individual files may be made available under their own specific license, +all compatible with AGPLv3. Please see individual files for details. + +================================================================================ + +AEROSPIKE MODULE LICENSE + +The following directories and their subdirectories thereof are made available +under the terms of the Apache License, version 2.0, as stated in the file +`LICENSE-APACHE`, or a compatible license stated in the file itself. Please +see individual files for details. + + - modules/common + - modules/lua-core + - modules/mod-lua + - modules/telemetry + diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..65c3d595 --- /dev/null +++ b/Makefile @@ -0,0 +1,163 @@ +# Aerospike Server +# Makefile +# +# Main Build Targets: +# +# make {all|server} - Build the Aerospike Server. +# make clean - Remove build products, excluding built packages. +# make cleanpkg - Remove built packages. +# make cleanall - Remove all build products, including built packages. +# make cleangit - Remove all files untracked by Git. (Use with caution!) +# make strip - Build stripped versions of the server executables. +# +# Packaging Targets: +# +# make deb - Package server for Debian / Ubuntu platforms as a ".deb" file. +# make rpm - Package server for the Red Hat Package Manager (RPM.) +# make tar - Package server as a compressed tarball for every Linux platform. +# make source - Package the server source code as a compressed "tar" archive. +# +# Building a distribution release is a two step process: +# +# 1). The initial "make" builds the server itself. +# +# 2). 
The second step packages up the server using "make" with one of the following targets: +# +# rpm: Suitable for building and installing on Red Hat-derived systems. +# deb: Suitable for building and installing on Debian-derived systems. +# tar: Makes an "Every Linux" distribution, packaged as a compressed "tar" archive. +# +# Targets for running the Aerospike Server in the source tree: +# +# make init - Initialize the server run-time directories. +# make start - Start the server. +# make stop - Stop the server. +# + +# Common variable definitions: +include make_in/Makefile.vars + +.PHONY: all server +all server: targetdirs version $(JANSSON)/Makefile $(JEMALLOC)/Makefile $(LUAJIT)/src/luaconf.h +ifeq ($(USE_LUAJIT),1) + $(MAKE) -C $(LUAJIT) Q= TARGET_SONAME=libluajit.so CCDEBUG=-g +endif + $(MAKE) -C $(JEMALLOC) + $(MAKE) -C $(JANSSON) + $(MAKE) -C $(COMMON) CF=$(CF) EXT_CFLAGS="$(EXT_CFLAGS)" + $(MAKE) -C $(CF) + $(MAKE) -C $(MOD_LUA) CF=$(CF) COMMON=$(COMMON) LUA_CORE=$(LUA_CORE) EXT_CFLAGS="$(EXT_CFLAGS)" USE_LUAJIT=$(USE_LUAJIT) LUAJIT=$(LUAJIT) TARGET_SERVER=1 + $(MAKE) -C $(S2) + $(MAKE) -C ai + $(MAKE) -C as + +.PHONY: targetdirs +targetdirs: + mkdir -p $(GEN_DIR) $(LIBRARY_DIR) $(BIN_DIR) + mkdir -p $(OBJECT_DIR)/base $(OBJECT_DIR)/fabric $(OBJECT_DIR)/storage $(OBJECT_DIR)/geospatial $(OBJECT_DIR)/transaction + +strip: server + $(MAKE) -C xdr strip + $(MAKE) -C as strip + +.PHONY: init start stop +init: + @echo "Creating and initializing working directories..." + mkdir -p run/log run/work/smd run/work/sys/udf/lua run/work/usr/udf/lua + cp -pr modules/lua-core/src/* run/work/sys/udf/lua + +start: + @echo "Running the Aerospike Server locally..." + @PIDFILE=run/asd.pid ; if [ -f $$PIDFILE ]; then echo "Aerospike already running? Please do \"make stop\" first."; exit -1; fi + @nohup ./modules/telemetry/telemetry.py as/etc/telemetry_dev.conf > /dev/null 2>&1 & + $(BIN_DIR)/asd --config-file as/etc/aerospike_dev.conf + +stop: + @echo "Stopping the local Aerospike Server..." 
+	@PIDFILE=run/asd.pid ; if [ -f $$PIDFILE ]; then kill `cat $$PIDFILE`; rm $$PIDFILE; fi
+	@PID=`pgrep telemetry.py | grep -v grep`; if [ -n "$$PID" ]; then kill $$PID; fi
+
+.PHONY: clean
+clean: cleanmodules cleandist
+	$(RM) $(VERSION_SRC) $(VERSION_OBJ)
+	$(RM) -rf $(TARGET_DIR)
+
+.PHONY: cleanmodules
+cleanmodules:
+	$(MAKE) -C $(COMMON) clean
+	if [ -e "$(JANSSON)/Makefile" ]; then \
+		$(MAKE) -C $(JANSSON) clean; \
+		$(MAKE) -C $(JANSSON) distclean; \
+	fi
+	if [ -e "$(JEMALLOC)/Makefile" ]; then \
+		$(MAKE) -C $(JEMALLOC) clean; \
+		$(MAKE) -C $(JEMALLOC) distclean; \
+	fi
+	if [ -e "$(LUAJIT)/Makefile" ]; then \
+		$(MAKE) -C $(LUAJIT) clean; \
+	fi
+	$(MAKE) -C $(MOD_LUA) COMMON=$(COMMON) LUA_CORE=$(LUA_CORE) USE_LUAJIT=$(USE_LUAJIT) LUAJIT=$(LUAJIT) clean
+	$(MAKE) -C $(S2) clean
+
+.PHONY: cleandist
+cleandist:
+	$(RM) -r pkg/dist/*
+
+.PHONY: cleanall
+cleanall: clean cleanpkg
+
+.PHONY: cleanpkg
+cleanpkg:
+	$(RM) pkg/packages/*
+
+GIT_CLEAN = git clean -fdx
+
+.PHONY: cleangit
+cleangit:
+	cd $(COMMON); $(GIT_CLEAN)
+	cd $(JANSSON); $(GIT_CLEAN)
+	cd $(JEMALLOC); $(GIT_CLEAN)
+	cd $(LUA_CORE); $(GIT_CLEAN)
+	cd $(LUAJIT); $(GIT_CLEAN)
+	cd $(MOD_LUA); $(GIT_CLEAN)
+	cd $(S2); $(GIT_CLEAN)
+	$(GIT_CLEAN)
+
+.PHONY: rpm deb tar
+rpm deb tar src:
+	$(MAKE) -C pkg/$@ EDITION=$(EDITION)
+
+$(VERSION_SRC): targetdirs
+	build/gen_version $(EDITION) $(shell $(DEPTH)/build/os_version) > $(VERSION_SRC)
+
+$(VERSION_OBJ): $(VERSION_SRC)
+	$(CC) -o $@ -c $<
+
+.PHONY: version
+version: $(VERSION_OBJ)
+
+$(JANSSON)/configure:
+	cd $(JANSSON) && autoreconf -i
+
+$(JANSSON)/Makefile: $(JANSSON)/configure
+	cd $(JANSSON) && ./configure $(JANSSON_CONFIG_OPT)
+
+$(JEMALLOC)/configure:
+	cd $(JEMALLOC) && autoconf
+
+$(JEMALLOC)/Makefile: $(JEMALLOC)/configure
+	cd $(JEMALLOC) && ./configure $(JEM_CONFIG_OPT)
+
+$(LUAJIT)/src/luaconf.h: $(LUAJIT)/src/luaconf.h.orig
+	ln -s $(notdir $<) $@
+
+.PHONY: source
+source: src
+
+tags etags:
+	etags `find ai as cf modules xdr $(EEREPO) -name "*.[ch]" -o -name "*.cc" | egrep -v '(target/Linux|m4)'` `find /usr/include -name "*.h"`
+
+# Common target definitions:
+ifneq ($(EEREPO),)
+  include $(EEREPO)/make_in/Makefile.targets
+endif
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..cb7c5f0a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,224 @@
+# Aerospike Database Server
+
+Welcome to the Aerospike Database Server source code tree!
+
+Aerospike is a distributed, scalable NoSQL database. It is architected with three key objectives:
+
+- To create a high-performance, scalable platform that would meet the needs of today's web-scale applications.
+- To provide the robustness and reliability (i.e., ACID) expected from traditional databases.
+- To provide operational efficiency (minimal manual involvement).
+
+For more information on Aerospike, please visit: [`http://aerospike.com`](http://aerospike.com)
+
+## Telemetry Anonymized Data Collection
+
+The Aerospike Community Edition collects anonymized server performance statistics.
+Please see the
+[Aerospike Telemetry web page](http://aerospike.com/aerospike-telemetry) for more
+information. The full Telemetry data collection agent source code may be found in the
+["telemetry" submodule](https://github.com/aerospike/aerospike-telemetry-agent/blob/master/README.md).
+ +## Build Prerequisites + +The Aerospike Database Server can be built and deployed on various +current 64-bit GNU/Linux platform versions, such as the Red Hat family (e.g., +CentOS 6 or later), Debian 7 or later, and Ubuntu 10.04 or later. + +### Dependencies + +The majority of the Aerospike source code is written in the C +programming language, conforming to the ANSI C99 standard. + +In particular, the following tools and libraries are needed: + +#### C Compiler Toolchain + +Building Aerospike requires the GCC 4.1 or later C compiler toolchain, +with the standard GNU/Linux development tools and libraries installed in +the build environment, including: + +* `autoconf` + +* `automake` + +* `libtool` + +* `make` + +#### C++ + +The C++ compiler is required for the Aerospike geospatial indexing +feature and its dependency, Google's S2 Geometry Library (both written in C++.) + +* The required CentOS 6/7 package to install is: `gcc-c++`. + +* The required Debian 7/8 and Ubuntu 10/12/14/16 package to install is: `g++`. + +#### OpenSSL + +OpenSSL 0.9.8b or later is required for cryptographic hash functions +(RIPEMD-160 & SHA-1) and pseudo-random number generation. + +* The CentOS 6/7 OpenSSL packages to install are: `openssl`, +`openssl-devel`, `openssl-static`. + +* The Debian 7/8 and Ubuntu 10/12/14/16 OpenSSL packages to install are: +`openssl` and `libssl-dev`. + +#### Lua 5.1 + +The [Lua](http://www.lua.org) 5.1 language is required for User Defined +Function (UDF) support. + +* By default, Aerospike builds with Lua 5.1 support provided by the +[LuaJIT](http://luajit.org) submodule. + +* Alternatively, it is possible to build with standard Lua 5.1 provided +by the build environment. In that case: + + * The CentOS 6/7 Lua packages to install are: `lua`, +`lua-devel`, and `lua-static`. + + * The Debian 7/8 and Ubuntu 10/12/14/16 Lua packages to install are: +`lua5.1` and `liblua5.1-dev`. + + * Build by passing the `USE_LUAJIT=0` option to `make`. + +#### Python 2 + +Running the Telemetry Agent requires Python 2.6+, which is available by default on most +platforms, and can be installed on Ubuntu 16.04 as the package `python`. + +### Submodules + +The Aerospike Database Server build depends upon 8 submodules: + +| Submodule | Description | +|---------- | ----------- | +| common | The Aerospike Common Library | +| jansson | C library for encoding, decoding and manipulating JSON data | +| jemalloc | The JEMalloc Memory Allocator | +| lua-core | The Aerospike Core Lua Source Files | +| luajit | The LuaJIT (Just-In-Time Compiler for Lua) | +| mod-lua | The Aerospike Lua Interface | +| s2-geometry-library | The S2 Spherical Geometry Library | +| telemetry | The Aerospike Telemetry Agent (Community Edition only) | + +After the initial cloning of the `aerospike-server` repo., the +submodules must be fetched for the first time using the following +command: + + $ git submodule update --init + +*Note:* As this project uses submodules, the source archive downloadable +via GitHub's `Download ZIP` button will not build unless the correct +revision of each submodule is first manually installed in the appropriate +`modules` subdirectory. + +## Building Aerospike + +### Default Build + + $ make -- Perform the default build (no packaging.) + +*Note:* You can use the `-j` option with `make` to speed up the build +on multiple CPU cores. For example, to run four parallel jobs: + + $ make -j4 + +### Build Options + + $ make deb -- Build the Debian (Ubuntu) package. 
+
+    $ make rpm -- Build the Red Hat Package Manager (RPM) package.
+
+    $ make tar -- Build the "Every Linux" compressed "tar" archive (".tgz") package.
+
+    $ make source -- Package the source code as a compressed "tar" archive.
+
+    $ make clean -- Delete any existing build products, excluding built packages.
+
+    $ make cleanpkg -- Delete built packages.
+
+    $ make cleanall -- Delete all existing build products, including built packages.
+
+    $ make cleangit -- Delete all files untracked by Git. (Use with caution!)
+
+    $ make strip -- Build "strip(1)"ed versions of the server executables.
+
+### Overriding Default Build Options
+
+    $ make {<target>}* {<variable>=<value>}* -- Build <target>(s) with optional variable overrides.
+
+#### Example:
+
+    $ make USE_JEM=0 -- Default build *without* JEMalloc support.
+
+## Configuring Aerospike
+
+Sample Aerospike configuration files are provided in `as/etc`. The
+developer configuration file, `aerospike_dev.conf`, contains basic
+settings that should work out-of-the-box on most systems. The package
+example configuration files, `aerospike.conf`, and the Solid State Drive
+(SSD) version, `aerospike_ssd.conf`, are suitable for running Aerospike
+as a system daemon.
+
+These sample files may be modified for specific use cases (e.g., setting
+network addresses, defining namespaces, and setting storage engine
+properties) and tuned for maximum performance on a particular
+system. Also, system resource limits may need to be increased to allow,
+e.g., a greater number of concurrent connections to the database. See
+"man limits.conf" for how to change the system's limit on a process'
+number of open file descriptors ("nofile"); a sample setting is shown
+at the end of this README.
+
+## Running Aerospike
+
+There are several options for running the Aerospike database. Which
+option to use depends upon whether the primary purpose is production
+deployment or software development.
+
+The preferred method for running Aerospike in a production environment
+is to build and install the Aerospike package appropriate for the target
+Linux distribution (i.e., an `".rpm"`, `".deb"`, or `".tgz"` file), and
+then to control the state of the Aerospike daemon, either via the SysV
+daemon init script commands, e.g., `service aerospike start`, or else
+via `systemctl` on `systemd`-based systems, e.g., `systemctl start aerospike`.
+
+A convenient way to run Aerospike in a development environment is to use
+the following commands from within the top-level directory of the source
+code tree (`aerospike-server`):
+
+To create and initialize the `run` directory with the files needed for
+running Aerospike, use:
+
+    $ make init
+
+or, equivalently:
+
+    $ mkdir -p run/{log,work/{smd,{sys,usr}/udf/lua}}
+    $ cp -pr modules/lua-core/src/* run/work/sys/udf/lua
+
+To launch the server with `as/etc/aerospike_dev.conf` as the config:
+
+    $ make start
+
+or, equivalently:
+
+    $ nohup ./modules/telemetry/telemetry.py as/etc/telemetry_dev.conf > /dev/null 2>&1 &
+    $ target/Linux-x86_64/bin/asd --config-file as/etc/aerospike_dev.conf
+
+To halt the server:
+
+    $ make stop
+
+or, equivalently:
+
+    $ PID=`pgrep telemetry.py | grep -v grep`; if [ -n "$PID" ]; then kill $PID; fi
+    $ kill `cat run/asd.pid` ; rm run/asd.pid
+
+Please refer to the full documentation on the Aerospike web site,
+[`http://aerospike.com/docs/`](http://aerospike.com/docs/), for more
+detailed information about configuring and running the Aerospike
+Database Server, as well as about the Aerospike client API packages
+for popular programming languages.
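+
+As a sketch of the "nofile" tuning mentioned under "Configuring
+Aerospike" above (the user name `aerospike` and the limit value here are
+illustrative assumptions, not project defaults), the open file
+descriptor limit can be raised by adding a line like the following to
+`/etc/security/limits.conf` and starting a new login session:
+
+    # /etc/security/limits.conf entry: <domain> <type> <item> <value>
+    # The "-" type sets both the soft and the hard limit.
+    aerospike  -  nofile  100000
+
+The limit in effect for the current shell can be checked with `ulimit -n`.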
+
diff --git a/ai/Makefile b/ai/Makefile
new file mode 100644
index 00000000..a800e597
--- /dev/null
+++ b/ai/Makefile
@@ -0,0 +1,9 @@
+# Aerospike Server -- Aerospike Index
+# Makefile
+
+.PHONY: default
+default: all
+	@echo "done."
+
+%:
+	$(MAKE) -C src $@
diff --git a/ai/include/ai_btree.h b/ai/include/ai_btree.h
new file mode 100644
index 00000000..e431d074
--- /dev/null
+++ b/ai/include/ai_btree.h
@@ -0,0 +1,68 @@
+/*
+ * ai_btree.h
+ *
+ * Copyright (C) 2013-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include "base/secondary_index.h"
+
+#include "ai_obj.h"
+#include "btreepriv.h"
+
+#include <citrusleaf/cf_ll.h> // for the cf_ll parameters in the defrag-list prototypes below
+
+void ai_btree_create(as_sindex_metadata *imd);
+
+void ai_btree_destroy(as_sindex_metadata *imd);
+
+int ai_btree_put(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, void *key, cf_digest *value);
+
+int ai_btree_delete(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, void *key, cf_digest *val);
+
+int ai_btree_query(as_sindex_metadata *imd, as_sindex_range *range, as_sindex_qctx *qctx);
+
+uint64_t ai_btree_get_isize(as_sindex_metadata *imd);
+
+uint64_t ai_btree_get_nsize(as_sindex_metadata *imd);
+
+uint64_t ai_btree_get_pimd_nsize(as_sindex_pmetadata *pimd);
+
+uint64_t ai_btree_get_pimd_isize(as_sindex_pmetadata *pimd);
+
+int ai_btree_list(char *ns, char *set, as_sindex_metadata **imds, int *num_indexes);
+
+uint64_t ai_btree_get_numkeys(as_sindex_metadata *imd);
+
+void ai_btree_dump(as_sindex_metadata *imd, char *fname, bool verbose);
+
+int ai_btree_build_defrag_list(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, struct ai_obj *icol, ulong *nofst, ulong lim, uint64_t * tot_processed, uint64_t * tot_found, cf_ll *apk2d);
+
+bool ai_btree_defrag_list(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, cf_ll *apk2d, ulong n2del, ulong *deleted);
+
+int ai_btree_key_hash_from_sbin(as_sindex_metadata *imd, as_sindex_bin_data *sbin);
+
+int ai_btree_key_hash(as_sindex_metadata *imd, void *skey);
+
+void ai_btree_delete_ibtr(bt *ibtr);
+
+void ai_btree_reinit_pimd(as_sindex_pmetadata *pimd, col_type_t sktype);
+
+void ai_btree_reset_pimd(as_sindex_pmetadata * pimd);
diff --git a/ai/include/ai_obj.h b/ai/include/ai_obj.h
new file mode 100644
index 00000000..9107a949
--- /dev/null
+++ b/ai/include/ai_obj.h
@@ -0,0 +1,42 @@
+/*
+ * ai_obj.h
+ *
+ * Copyright (C) 2013-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+/*
+ * Aerospike Index Object Declarations.
+ */
+
+#pragma once
+
+#include <stdio.h>
+
+#include "ai_types.h"
+
+void init_ai_obj(ai_obj *a);
+
+void init_ai_objLong(ai_obj *a, ulong l);
+
+void init_ai_objU160(ai_obj *a, uint160 y);
+
+void ai_objClone(ai_obj *dest, ai_obj *src);
+
+bool ai_objEQ(ai_obj *a, ai_obj *b);
+
+void dump_ai_obj_as_digest(FILE *fp, ai_obj *a);
diff --git a/ai/include/ai_types.h b/ai/include/ai_types.h
new file mode 100644
index 00000000..ab8a1de2
--- /dev/null
+++ b/ai/include/ai_types.h
@@ -0,0 +1,101 @@
+/*
+ * ai_types.h
+ *
+ * Copyright (C) 2013-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+/*
+ * SYNOPSIS
+ * This file provides common declarations and definitions for
+ * the Aerospike Index module.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#define uchar unsigned char
+#define ushort16 unsigned short
+#define uint32 unsigned int
+#define ull unsigned long long
+#define uint128 __uint128_t
+
+#define AS_DIGEST_KEY_SZ 20
+typedef struct uint160 {
+	char digest[AS_DIGEST_KEY_SZ];
+} uint160;
+
+// Same as as_sindex_ktype
+typedef uint8_t col_type_t;
+#define COL_TYPE_INVALID 0
+#define COL_TYPE_LONG 1
+#define COL_TYPE_DIGEST 2
+#define COL_TYPE_GEOJSON 3
+#define COL_TYPE_MAX 4
+
+#define C_IS_L(ctype) (ctype == COL_TYPE_LONG)
+#define C_IS_DG(ctype) (ctype == COL_TYPE_DIGEST)
+#define C_IS_G(ctype) (ctype == COL_TYPE_GEOJSON)
+// TODO - should this have C_IS_G as well
+#define C_IS_NUM(ctype) (C_IS_L(ctype))
+
+#define VOIDINT (void *) (long)
+
+#define SPLICE_160(num) \
+	ull ubh, ubm; uint32 u; \
+	char *pbu = (char *) &num; \
+	memcpy(&ubh, pbu + 12, 8); \
+	memcpy(&ubm, pbu + 4, 8); \
+	memcpy(&u, pbu, 4);
+
+#define DEBUG_U160(fp, num) \
+	{ \
+		SPLICE_160(num); \
+		fprintf(fp, "DEBUG_U160: high: %llu mid: %llu low: %u", ubh, ubm, u); \
+	}
+
+/***************** Opaque Forward Type Declarations *****************/
+
+/*
+ * B-Tree Object [Implementation defined in "btreepriv.h".]
+ */
+typedef struct btree bt;
+
+
+/***************** Type Declarations *****************/
+typedef struct ai_obj {
+	ulong l;
+	uint160 y;
+	col_type_t type;
+} ai_obj;
+
+typedef struct filter {
+	ai_obj alow;
+	ai_obj ahigh;
+} f_t;
+
+typedef struct check_sql_where_clause {
+	f_t wf;
+} cswc_t;
diff --git a/ai/include/bt.h b/ai/include/bt.h
new file mode 100644
index 00000000..8d0f140d
--- /dev/null
+++ b/ai/include/bt.h
@@ -0,0 +1,90 @@
+/*
+ * bt.h
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+/*
+ * Creation of different btree types and
+ * Public Btree Operations w/ stream abstractions under the covers
+ */
+
+#pragma once
+
+#include "ai_obj.h"
+#include "btreepriv.h"
+
+bt *createIBT (col_type_t ktype, int imatch);
+bt *createNBT (col_type_t ktype);
+
+/* different Btree types */
+#define INDEX_BTREE 0
+#define NODE_BTREE 1
+
+// SPAN OUTS
+// These values are chosen to fit the node size into multiples
+// of a cache line (64 bytes).
+#define BTREE_LONG_TYPE_DEGREE 31 // node size becomes 504
+#define BTREE_STRING_TYPE_DEGREE 18 // node size becomes 512
+
+#define NBT_DG(btr) \
+	(btr->s.btype == NODE_BTREE && C_IS_DG(btr->s.ktype))
+
+#define NBT(btr) (NBT_DG(btr))
+
+typedef struct ulong_ulong_key {
+	ulong key;
+	ulong val;
+} __attribute__ ((packed)) llk;
+#define LL(btr) (btr->s.bflag & BTFLAG_ULONG_ULONG)
+#define LL_SIZE 16
+typedef struct u160_ulong_key {
+	uint160 key;
+	ulong val;
+} __attribute__ ((packed)) ylk;
+#define YL(btr) (btr->s.bflag & BTFLAG_U160_ULONG)
+#define YL_SIZE 28
+
+typedef struct btk_t {
+	llk LL;
+	ylk YL;
+} btk_t;
+
+#define DECLARE_BT_KEY(akey, ret) \
+	bool med; uint32 ksize; btk_t btk; \
+	char *btkey = createBTKey(akey, &med, &ksize, btr, &btk);/*FREE ME 026*/ \
+	if (!btkey) return ret;
+
+typedef struct crs_t {
+	llk LL_StreamPtr;
+	ylk YL_StreamPtr;
+} crs_t;
+
+#define OTHER_BT(btr) (btr->s.bflag >= BTFLAG_ULONG_ULONG)
+#define NONE_BT(btr) (btr->s.bflag == BTFLAG_U160)
+#define BIG_BT(btr) (btr->s.ksize > 8)
+
+#define IS_GHOST(btr, rrow) (NONE_BT(btr) && rrow && !(*(uchar *)rrow))
+
+void btIndAdd (bt *ibtr, ai_obj *ikey, bt *nbtr);
+bt *btIndFind (bt *ibtr, ai_obj *ikey);
+int btIndDelete(bt *ibtr, ai_obj *ikey);
+
+bool btIndNodeAdd (bt *nbtr, ai_obj *apk);
+bool btIndNodeExist (bt *nbtr, ai_obj *apk);
+int btIndNodeDelete (bt *nbtr, ai_obj *apk, ai_obj *ocol);
diff --git a/ai/include/bt_iterator.h b/ai/include/bt_iterator.h
new file mode 100644
index 00000000..468706c9
--- /dev/null
+++ b/ai/include/bt_iterator.h
@@ -0,0 +1,98 @@
+/*
+ * bt_iterator.h
+ *
+ * Copyright (C) 2013-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * This file implements Aerospike Index B-tree iterators. + */ + +#pragma once + +#include "ai_types.h" +#include "bt.h" + +typedef struct btEntry { + void *key; + void *val; + void *stream; // some iterators need the raw stream (INDEX CURSORS) + bt_n *x; // some iterators need the position in the bt_n + int i; // some iterators need the position in the bt_n + bool missed; + uint32 dr; // RANGE DELETEs simulate Keys using DR +} btEntry; + +typedef struct bTreeLinkedListNode { // 3ptr(24) 2int(8) -> 32 bytes + struct bTreeLinkedListNode *parent; + struct btreenode *self; + struct bTreeLinkedListNode *child; + int ik; + int in; //TODO in not needed, ik & logic is enough +} bt_ll_n; + +typedef void iter_single(struct btIterator *iter); + +/* using 16 as 8^16 can hold 2.8e14 elements (8 is min members in a btn)*/ +#define MAX_BTREE_DEPTH 16 +typedef struct btIterator { // 60B + 16*bt_ll_n(512) -> dont malloc + bt *btr; + bt_ll_n *bln; + int depth; + iter_single *iNode; // function to iterate on node's + iter_single *iLeaf; // function to iterate on leaf's + bool finished; + long high; // HIGH for INT & LONG + uint160 highy; // HIGH for U160 + uchar num_nodes; // \/-slot in nodes[] + bt_ll_n nodes[MAX_BTREE_DEPTH]; +} btIterator; + +typedef struct btSIter { // btIterator 500+ bytes -> STACK (globals) ALLOCATE + btIterator x; + bool missed; // CURRENT iteration is miss + bool nim; // NEXT iteration is miss + bool empty; + bool scan; + col_type_t ktype; + btEntry be; + ai_obj key; // static AI_OBJ for be.key + char dofree; +} btSIter; + +#define II_FAIL -1 +#define II_OK 0 +#define II_LEAF_EXIT 1 +#define II_ONLY_RIGHT 2 +#define II_MISS 3 +#define II_L_MISS 4 + +bt_ll_n *get_new_iter_child(btIterator *iter); +void to_child(btIterator *iter, bt_n* self); +int init_iterator(bt *btr, bt_data_t simkey, struct btIterator *iter); + +btSIter *btGetRangeIter (bt *btr, ai_obj *alow, ai_obj *ahigh, bool asc); +btSIter *btGetFullRangeIter(bt *btr, bool asc, cswc_t *w); +btSIter *btGetFullXthIter (bt *btr, ulong x, bool asc, cswc_t *w, long lim); +btSIter *btSetFullRangeIter(btSIter *iter, bt *btr, bool asc, cswc_t *w); +btSIter *btSetRangeIter (btSIter *iter, bt *btr, ai_obj *alow, ai_obj *ahigh, bool asc); +btEntry *btRangeNext (btSIter *iter, bool asc); +void btReleaseRangeIterator(btSIter *iter); +bool assignMinKey(bt *btr, ai_obj *key); +bool assignMaxKey(bt *btr, ai_obj *key); diff --git a/ai/include/bt_output.h b/ai/include/bt_output.h new file mode 100644 index 00000000..605d7428 --- /dev/null +++ b/ai/include/bt_output.h @@ -0,0 +1,34 @@ +/* + * bt_output.h + * + * Copyright (C) 2013 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+/*
+ * SYNOPSIS
+ * This file provides declarations for the B-Tree output functions.
+ */
+
+#pragma once
+
+#include <stdio.h>
+
+#include "bt.h"
+
+void bt_dump_info(FILE *fp, bt *btr);
+void bt_dumptree(FILE *fp, bt *btr, bool is_index, bool verbose);
diff --git a/ai/include/btree.h b/ai/include/btree.h
new file mode 100644
index 00000000..6b40272c
--- /dev/null
+++ b/ai/include/btree.h
@@ -0,0 +1,87 @@
+/*-
+ * Copyright 1997, 1998, 2001 John-Mark Gurney.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * + */ + +#pragma once + +#include "ai_types.h" + +struct btree; +struct btreenode; + +#define VOIDSIZE 8 /* force to 8, otherwise UU would not work on 32bit */ +#define U160SIZE AS_DIGEST_KEY_SZ + +typedef struct btree_specification { /* size 9B */ + unsigned char ktype; /* [STRING,INT,FLOAT,LONG]--------------------| */ + unsigned char btype; /* [data,index,node] | */ + unsigned char ksize; /* UU&INDEX(8), UL&LU(12), LL(16) | */ + unsigned int bflag; /* [OTHER_BT + BTFLAG_*_INDEX] | */ + unsigned short num; /*--------------------------------------------| */ +} __attribute__ ((packed)) bts_t; + +typedef void * bt_data_t; +typedef int (*bt_cmp_t)(bt_data_t k1, bt_data_t k2); + +// CONSTRUCTOR CONSTRUCTOR CONSTRUCTOR CONSTRUCTOR CONSTRUCTOR CONSTRUCTOR +struct btree *bt_create(bt_cmp_t cmp, bts_t *s, char dirty); + +// CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD +typedef struct data_with_dirt_t { + bt_data_t k; // the data + uint32 dr; // dirty-right +} dwd_t; +bool bt_insert (struct btree *btr, bt_data_t k, uint32 dr); +dwd_t bt_delete (struct btree *btr, bt_data_t k, bool leafd); + +// OPERATORS OPERATORS OPERATORS OPERATORS OPERATORS OPERATORS OPERATORS +bt_data_t bt_max (struct btree *btr); +bt_data_t bt_min (struct btree *btr); +bt_data_t bt_find (struct btree *btr, bt_data_t k, ai_obj *akey); + +// DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY +struct btreenode *addDStoBTN(struct btree *btr, struct btreenode *x, + struct btreenode *p, int pi, char dirty); + +uint32 getDR (struct btree *btr, struct btreenode *x, int i); +bool bt_exist (struct btree *btr, bt_data_t k, ai_obj *akey); + +typedef struct data_with_miss_t { + bt_data_t k; // the data + bool miss; + struct btreenode *x; // NOTE: used for DELETE an EVICTed row + int i; // NOTE: used for DELETE an EVICTed row + struct btreenode *p; // NOTE: used for DELETE an EVICTed row + int pi; // NOTE: used for DELETE an EVICTed row +} dwm_t; + +struct ai_obj; +dwm_t findnodekey(struct btree *btr, struct btreenode *x, bt_data_t k, ai_obj *akey); + +// ITERATOR ITERATOR ITERATOR ITERATOR ITERATOR ITERATOR ITERATOR ITERATOR +struct btIterator; +int bt_init_iterator(struct btree *br, bt_data_t k, struct btIterator *iter, ai_obj *alow); +void bt_destroy (struct btree *btr); diff --git a/ai/include/btreepriv.h b/ai/include/btreepriv.h new file mode 100644 index 00000000..f010eeeb --- /dev/null +++ b/ai/include/btreepriv.h @@ -0,0 +1,105 @@ +/*- + * Copyright 1997-1999, 2001 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#pragma once + +#include "btree.h" + +// BTREE TYPE FLAGS +#define BTFLAG_U160 0x00 +#define BTFLAG_ULONG_ULONG 0x01 +#define BTFLAG_U160_ULONG 0x02 + +struct btree { // 62 Bytes -> 64B + struct btreenode *root; + bt_cmp_t cmp; + + unsigned long msize; + unsigned long nsize; // sizeof underlying nbtr + unsigned long dsize; + + unsigned int numkeys; /* --- 8 bytes | */ + unsigned int numnodes; /* ------------| */ + + unsigned short keyofst; /* --- 8 bytes | */ //TODO can be computed + unsigned short nodeofst; /* | */ //TODO can be computed + unsigned short nbyte; /* | */ + unsigned short kbyte; /* ------------| */ + + unsigned char t; + unsigned char nbits; + bts_t s; // 9 bytes + + unsigned int dirty_left; // 4 bytes (num evicted before 1st key) + unsigned char dirty; // NOTE: bool: if ANY btn in btr is dirty +} __attribute__ ((packed)); + +// Aerospike Index local list ... this is to optimize for space for the high selectivity index. +typedef struct { + uint8_t capacity; + uint8_t used; + uint8_t data[]; +} __attribute__ ((__packed__)) ai_arr; + +/* + * Note: The "ai_arr" structure is limited to 8 bits for capacity / used. + */ +#define AI_ARR_MAX_SIZE 255 + +// Do not change order it is same as struct B-tree inside Aerospike Index ~~ +// pretty hacky stuff. Inside Aerospike Index code is_btree is checked +typedef struct { + union { + ai_arr *arr; + bt *nbtr; + } u; + bool is_btree; +} __attribute__ ((__packed__)) ai_nbtr; + +//NOTE: For Aerospike, not currently using EVICT, save one byte in bt_n +// This changes a 2049 allocation to 2048 -> which is IMPORTANT +typedef struct btreenode { // 9 bytes -> 16 bytes + unsigned int scion; /* 4 billion max scion */ + unsigned short n; /* 65 thousand max entries (per bt_n)*/ + unsigned char leaf; + // DIRTY: -1->CLEAN, + // 0->TreeDirty but BTN_clean, 1->ucharDR, 2->ushortDR, 3->uintDR + char dirty; +} __attribute__ ((packed)) bt_n; + +// BTREE access of KEYs & NODEs via position in bt_n +void *KEYS(bt *btr, bt_n *x, int i); +#define NODES(btr, x) ((bt_n **)((char *)x + btr->nodeofst)) + +#define GET_BTN_SIZE(leaf) \ + size_t nsize = leaf ? btr->kbyte : btr->nbyte; +#define GET_BTN_MSIZE(dirty) \ + size_t msize = (dirty == -1) ? nsize : nsize + sizeof(void *); +#define GET_BTN_SIZES(leaf, dirty) \ + GET_BTN_SIZE(leaf) GET_BTN_MSIZE(dirty) +#define GET_DS(x, nsize) (*((void **)((char *)x + nsize))) + +bt_n *findminnode(bt *btr, bt_n *x); diff --git a/ai/include/stream.h b/ai/include/stream.h new file mode 100644 index 00000000..bc6c8658 --- /dev/null +++ b/ai/include/stream.h @@ -0,0 +1,41 @@ +/* + * stream.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * This file implements stream parsing for rows + */ + +#pragma once + +#include "ai_obj.h" +#include "bt.h" + +int u160Cmp (void *s1, void *s2); +int llCmp (void *s1, void *s2); +int ylCmp (void *s1, void *s2); + +char *createBTKey(ai_obj *key, bool *med, uint32 *ksize, bt *btr, btk_t *btk); +void destroyBTKey(char *btkey, bool med); + +void convertStream2Key(uchar *stream, ai_obj *key, bt *btr); +uchar *parseStream(uchar *stream, bt *btr); +void *createStream(bt *btr, void *val, char *btkey, uint32 klen, uint32 *ssize, crs_t *crs); +bool destroyStream(bt *btr, uchar *ostream); diff --git a/ai/src/Makefile b/ai/src/Makefile new file mode 100644 index 00000000..64cccaf9 --- /dev/null +++ b/ai/src/Makefile @@ -0,0 +1,33 @@ +# Aerospike Server - Aerospike Index +# Makefile + +DEPTH = ../.. +include $(DEPTH)/make_in/Makefile.in + +HEADERS = ai_btree.h ai_types.h ai_obj.h bt.h bt_iterator.h bt_output.h btree.h btreepriv.h stream.h + +SOURCES = ai_btree.c ai_obj.c bt.c bt_code.c bt_iterator.c bt_output.c stream.c + +INCLUDES += $(INCLUDE_DIR:%=-I%) +INCLUDES += -I$(CF)/include -I$(AS)/include +INCLUDES += -I$(AS)/include +INCLUDES += -I$(COMMON)/target/$(PLATFORM)/include +INCLUDES += -I$(MOD_LUA)/target/$(PLATFORM)/include + +LIBRARY = $(LIBRARY_DIR)/libai.a + +OBJECTS = $(SOURCES:%.c=$(OBJECT_DIR)/%.o) +DEPENDENCIES = $(OBJECTS:%.o=%.d) + +.PHONY: all +all: $(LIBRARY) + +.PHONY: clean +clean: + $(RM) $(OBJECTS) $(LIBRARY) + $(RM) $(DEPENDENCIES) + +$(LIBRARY): $(OBJECTS) + $(AR) rs $(LIBRARY) $(OBJECTS) + +include $(DEPTH)/make_in/Makefile.targets diff --git a/ai/src/ai_btree.c b/ai/src/ai_btree.c new file mode 100644 index 00000000..c81ce4b7 --- /dev/null +++ b/ai/src/ai_btree.c @@ -0,0 +1,1178 @@ +/* + * ai_btree.c + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include +#include +#include +#include + +#include "ai_obj.h" +#include "ai_btree.h" +#include "bt_iterator.h" +#include "bt_output.h" +#include "stream.h" +#include "base/thr_sindex.h" +#include "base/cfg.h" +#include "fabric/partition.h" + +#include +#include +#include +#include + +#include "fault.h" + +#define AI_ARR_MAX_USED 32 + +/* + * Global determining whether to use array rather than B-Tree. + */ +bool g_use_arr = true; + +static void +cloneDigestFromai_obj(cf_digest *d, ai_obj *akey) +{ + memcpy(d, &akey->y, CF_DIGEST_KEY_SZ); +} + +static void +init_ai_objFromDigest(ai_obj *akey, cf_digest *d) +{ + init_ai_objU160(akey, *(uint160 *)d); +} + +const uint8_t INIT_CAPACITY = 1; + +static ai_arr * +ai_arr_new() +{ + ai_arr *arr = cf_malloc(sizeof(ai_arr) + (INIT_CAPACITY * CF_DIGEST_KEY_SZ)); + arr->capacity = INIT_CAPACITY; + arr->used = 0; + return arr; +} + +static void +ai_arr_move_to_tree(ai_arr *arr, bt *nbtr) +{ + for (int i = 0; i < arr->used; i++) { + ai_obj apk; + init_ai_objFromDigest(&apk, (cf_digest *)&arr->data[i * CF_DIGEST_KEY_SZ]); + if (!btIndNodeAdd(nbtr, &apk)) { + // what to do ?? + continue; + } + } +} + +/* + * Side effect if success full *arr will be freed + */ +static void +ai_arr_destroy(ai_arr *arr) +{ + if (!arr) return; + cf_free(arr); +} + +static int +ai_arr_size(ai_arr *arr) +{ + if (!arr) return 0; + return(sizeof(ai_arr) + (arr->capacity * CF_DIGEST_KEY_SZ)); +} + +/* + * Finds the digest in the AI array. + * Returns + * idx if found + * -1 if not found + */ +static int +ai_arr_find(ai_arr *arr, cf_digest *dig) +{ + for (int i = 0; i < arr->used; i++) { + if (0 == cf_digest_compare(dig, (cf_digest *)&arr->data[i * CF_DIGEST_KEY_SZ])) { + return i; + } + } + return -1; +} + +static ai_arr * +ai_arr_shrink(ai_arr *arr) +{ + int size = arr->capacity / 2; + + // Do not shrink if the capacity not greater than 4 + // or if the halving capacity is not a extra level + // over currently used + if ((arr->capacity <= 4) || + (size < arr->used * 2)) { + return arr; + } + + ai_arr * temp_arr = cf_realloc(arr, sizeof(ai_arr) + (size * CF_DIGEST_KEY_SZ)); + temp_arr->capacity = size; + return temp_arr; +} + +static ai_arr * +ai_arr_delete(ai_arr *arr, cf_digest *dig, bool *notfound) +{ + int idx = ai_arr_find(arr, dig); + // Nothing to delete + if (idx < 0) { + *notfound = true; + return arr; + } + if (idx != arr->used - 1) { + int dest_offset = idx * CF_DIGEST_KEY_SZ; + int src_offset = (arr->used - 1) * CF_DIGEST_KEY_SZ; + // move last element + memcpy(&arr->data[dest_offset], &arr->data[src_offset], CF_DIGEST_KEY_SZ); + } + arr->used--; + return ai_arr_shrink(arr); +} + +/* + * Returns + * arr pointer in case of successful operation + * NULL in case of failure + */ +static ai_arr * +ai_arr_expand(ai_arr *arr) +{ + int size = arr->capacity * 2; + + if (size > AI_ARR_MAX_SIZE) { + cf_crash(AS_SINDEX, "Refusing to expand ai_arr to %d (beyond limit of %d)", size, AI_ARR_MAX_SIZE); + } + + arr = cf_realloc(arr, sizeof(ai_arr) + (size * CF_DIGEST_KEY_SZ)); + //cf_info(AS_SINDEX, "EXPAND REALLOC to %d", size); + arr->capacity = size; + return arr; +} + +/* + * Returns + * arr in case of success + * NULL in case of failure + */ +static ai_arr * +ai_arr_insert(ai_arr *arr, cf_digest *dig, bool *found) +{ + int idx = ai_arr_find(arr, dig); + // already found + if (idx >= 0) { + *found = true; + return arr; + } + if (arr->used == arr->capacity) { + arr = ai_arr_expand(arr); + } + 
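	// Append the new digest at the tail; the array is kept unsorted and ai_arr_find() searches it linearly.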
	memcpy(&arr->data[arr->used * CF_DIGEST_KEY_SZ], dig, CF_DIGEST_KEY_SZ);
+	arr->used++;
+	return arr;
+}
+
+/*
+ * Returns the size diff
+ */
+static int
+anbtr_check_convert(ai_nbtr *anbtr, col_type_t sktype)
+{
+	// Nothing to do
+	if (anbtr->is_btree)
+		return 0;
+
+	ai_arr *arr = anbtr->u.arr;
+	if (arr && (arr->used >= AI_ARR_MAX_USED)) {
+		//cf_info(AS_SINDEX,"Flipped @ %d", arr->used);
+		ulong ba = ai_arr_size(arr);
+		// Allocate btree, move digests from arr to btree
+		bt *nbtr = createNBT(sktype);
+		if (!nbtr) {
+			cf_warning(AS_SINDEX, "btree allocation failure");
+			return 0;
+		}
+
+		ai_arr_move_to_tree(arr, nbtr);
+		ai_arr_destroy(anbtr->u.arr);
+
+		// Update anbtr
+		anbtr->u.nbtr = nbtr;
+		anbtr->is_btree = true;
+
+		ulong aa = nbtr->msize;
+		return (aa - ba);
+	}
+	return 0;
+}
+
+/*
+ * Returns -1 in case of failure,
+ * size of allocation in case of success.
+ */
+static int
+anbtr_check_init(ai_nbtr *anbtr, col_type_t sktype)
+{
+	bool create_arr = false;
+	bool create_nbtr = false;
+
+	if (anbtr->is_btree) {
+		if (anbtr->u.nbtr) {
+			create_nbtr = false;
+		} else {
+			create_nbtr = true;
+		}
+	} else {
+		if (anbtr->u.arr) {
+			create_arr = false;
+		} else {
+			if (g_use_arr) {
+				create_arr = true;
+			} else {
+				create_nbtr = true;
+			}
+		}
+	}
+
+	// create array or btree
+	if (create_arr) {
+		anbtr->u.arr = ai_arr_new();
+		return ai_arr_size(anbtr->u.arr);
+	} else if (create_nbtr) {
+		anbtr->u.nbtr = createNBT(sktype);
+		if (!anbtr->u.nbtr) {
+			return -1;
+		}
+		anbtr->is_btree = true;
+		return anbtr->u.nbtr->msize;
+	} else {
+		if (!anbtr->u.arr && !anbtr->u.nbtr) {
+			cf_warning(AS_SINDEX, "Something wrong!!!");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Insert operation for the nbtr does the following:
+ * 1. Sets up anbtr if it is not already set up.
+ * 2. Inserts into the arr or nbtr depending on the number of elements.
+ * 3.
Cuts over from arr to btr at AI_ARR_MAX_USED + * + * Parameter: ibtr : Btree of key + * acol : Secondary index key + * apk : value (primary key to be inserted) + * sktype : value type (U160 currently) + * + * Returns: + * AS_SINDEX_OK : In case of success + * AS_SINDEX_ERR : In case of failure + * AS_SINDEX_KEY_FOUND : If key already exists + */ +static int +reduced_iAdd(bt *ibtr, ai_obj *acol, ai_obj *apk, col_type_t sktype) +{ + ai_nbtr *anbtr = (ai_nbtr *)btIndFind(ibtr, acol); + ulong ba = 0, aa = 0; + bool allocated_anbtr = false; + if (!anbtr) { + anbtr = cf_malloc(sizeof(ai_nbtr)); + aa += sizeof(ai_nbtr); + memset(anbtr, 0, sizeof(ai_nbtr)); + allocated_anbtr = true; + } + + // Init the array + int ret = anbtr_check_init(anbtr, sktype); + if (ret < 0) { + if (allocated_anbtr) { + cf_free(anbtr); + } + return AS_SINDEX_ERR; + } else if (ret) { + ibtr->nsize += ret; + btIndAdd(ibtr, acol, (bt *)anbtr); + } + + // Convert from arr to nbtr if limit is hit + ibtr->nsize += anbtr_check_convert(anbtr, sktype); + + // If already a btree use it + if (anbtr->is_btree) { + bt *nbtr = anbtr->u.nbtr; + if (!nbtr) { + return AS_SINDEX_ERR; + } + + if (btIndNodeExist(nbtr, apk)) { + return AS_SINDEX_KEY_FOUND; + } + + ba += nbtr->msize; + if (!btIndNodeAdd(nbtr, apk)) { + return AS_SINDEX_ERR; + } + aa += nbtr->msize; + + } else { + ai_arr *arr = anbtr->u.arr; + if (!arr) { + return AS_SINDEX_ERR; + } + + ba += ai_arr_size(anbtr->u.arr); + bool found = false; + ai_arr *t_arr = ai_arr_insert(arr, (cf_digest *)&apk->y, &found); + if (found) { + return AS_SINDEX_KEY_FOUND; + } + anbtr->u.arr = t_arr; + aa += ai_arr_size(anbtr->u.arr); + } + ibtr->nsize += (aa - ba); // ibtr inherits nbtr + + return AS_SINDEX_OK; +} + +/* + * Delete operation for the nbtr does the following. Delete in the arr or nbtr + * based on state of anbtr + * + * Parameter: ibtr : Btree of key + * acol : Secondary index key + * apk : value (primary key to be inserted) + * + * Returns: + * AS_SINDEX_OK : In case of success + * AS_SINDEX_ERR : In case of failure + * AS_SINDEX_KEY_NOTFOUND : If key does not exist + */ +static int +reduced_iRem(bt *ibtr, ai_obj *acol, ai_obj *apk) +{ + ai_nbtr *anbtr = (ai_nbtr *)btIndFind(ibtr, acol); + ulong ba = 0, aa = 0; + if (!anbtr) { + return AS_SINDEX_KEY_NOTFOUND; + } + if (anbtr->is_btree) { + if (!anbtr->u.nbtr) return AS_SINDEX_ERR; + + // Remove from nbtr if found + bt *nbtr = anbtr->u.nbtr; + if (!btIndNodeExist(nbtr, apk)) { + return AS_SINDEX_KEY_NOTFOUND; + } + ba = nbtr->msize; + + // TODO - Needs to be cleaner, type convert from signed + // to unsigned. Should be 64 bit !! 
+ int nkeys_before = nbtr->numkeys; + int nkeys_after = btIndNodeDelete(nbtr, apk, NULL); + aa = nbtr->msize; + + if (nkeys_after == nkeys_before) { + return AS_SINDEX_KEY_NOTFOUND; + } + + // remove from ibtr + if (nkeys_after == 0) { + btIndDelete(ibtr, acol); + aa = 0; + bt_destroy(nbtr); + ba += sizeof(ai_nbtr); + cf_free(anbtr); + } + } else { + if (!anbtr->u.arr) return AS_SINDEX_ERR; + + // Remove from arr if found + bool notfound = false; + ba = ai_arr_size(anbtr->u.arr); + anbtr->u.arr = ai_arr_delete(anbtr->u.arr, (cf_digest *)&apk->y, ¬found); + if (notfound) return AS_SINDEX_KEY_NOTFOUND; + aa = ai_arr_size(anbtr->u.arr); + + // Remove from ibtr + if (anbtr->u.arr->used == 0) { + btIndDelete(ibtr, acol); + aa = 0; + ai_arr_destroy(anbtr->u.arr); + ba += sizeof(ai_nbtr); + cf_free(anbtr); + } + } + ibtr->nsize -= (ba - aa); + + return AS_SINDEX_OK; +} + +int +ai_btree_key_hash_from_sbin(as_sindex_metadata *imd, as_sindex_bin_data *b) +{ + uint64_t u; + + if (C_IS_DG(imd->sktype)) { + char *x = (char *) &b->digest; // x += 4; + u = ((* (uint128 *) x) % imd->nprts); + } else { + u = (((uint64_t) b->u.i64) % imd->nprts); + } + + return (int) u; +} + +int +ai_btree_key_hash(as_sindex_metadata *imd, void *skey) +{ + uint64_t u; + + if (C_IS_DG(imd->sktype)) { + char *x = (char *) ((cf_digest *)skey); // x += 4; + u = ((* (uint128 *) x) % imd->nprts); + } else { + u = ((*(uint64_t*)skey) % imd->nprts); + } + + return (int) u; +} + +/* + * Return 0 in case of success + * -1 in case of failure + */ +static int +btree_addsinglerec(as_sindex_metadata *imd, ai_obj * key, cf_digest *dig, cf_ll *recl, uint64_t *n_bdigs, + bool * can_partition_query, bool partitions_pre_reserved) +{ + // The digests which belongs to one of the query-able partitions are elligible to go into recl + uint32_t pid = as_partition_getid(dig); + as_namespace * ns = imd->si->ns; + if (partitions_pre_reserved) { + if (!can_partition_query[pid]) { + return 0; + } + } + else { + if (! client_replica_maps_is_partition_queryable(ns, pid)) { + return 0; + } + } + + bool create = (cf_ll_size(recl) == 0) ? 
true : false; + as_index_keys_arr * keys_arr = NULL; + if (!create) { + cf_ll_element * ele = cf_ll_get_tail(recl); + keys_arr = ((as_index_keys_ll_element*)ele)->keys_arr; + if (keys_arr->num == AS_INDEX_KEYS_PER_ARR) { + create = true; + } + } + if (create) { + keys_arr = as_index_get_keys_arr(); + if (!keys_arr) { + cf_warning(AS_SINDEX, "Fail to allocate sindex key value array"); + return -1; + } + as_index_keys_ll_element * node = cf_malloc(sizeof(as_index_keys_ll_element)); + node->keys_arr = keys_arr; + cf_ll_append(recl, (cf_ll_element *)node); + } + // Copy the digest (value) + memcpy(&keys_arr->pindex_digs[keys_arr->num], dig, CF_DIGEST_KEY_SZ); + + // Copy the key + if (C_IS_DG(imd->sktype)) { + memcpy(&keys_arr->sindex_keys[keys_arr->num].key.str_key, &key->y, CF_DIGEST_KEY_SZ); + } + else { + keys_arr->sindex_keys[keys_arr->num].key.int_key = key->l; + } + + keys_arr->num++; + *n_bdigs = *n_bdigs + 1; + return 0; +} + +/* + * Return 0 in case of success + * -1 in case of failure + */ +static int +add_recs_from_nbtr(as_sindex_metadata *imd, ai_obj *ikey, bt *nbtr, as_sindex_qctx *qctx, bool fullrng) +{ + int ret = 0; + ai_obj sfk, efk; + init_ai_obj(&sfk); + init_ai_obj(&efk); + btSIter *nbi; + btEntry *nbe; + btSIter stack_nbi; + + if (fullrng) { + nbi = btSetFullRangeIter(&stack_nbi, nbtr, 1, NULL); + } else { // search from LAST batches end-point + init_ai_objFromDigest(&sfk, &qctx->bdig); + assignMaxKey(nbtr, &efk); + nbi = btSetRangeIter(&stack_nbi, nbtr, &sfk, &efk, 1); + } + if (nbi) { + while ((nbe = btRangeNext(nbi, 1))) { + ai_obj *akey = nbe->key; + // FIRST can be REPEAT (last batch) + if (!fullrng && ai_objEQ(&sfk, akey)) { + continue; + } + if (btree_addsinglerec(imd, ikey, (cf_digest *)&akey->y, qctx->recl, &qctx->n_bdigs, + qctx->can_partition_query, qctx->partitions_pre_reserved)) { + ret = -1; + break; + } + if (qctx->n_bdigs == qctx->bsize) { + if (ikey) { + ai_objClone(qctx->bkey, ikey); + } + cloneDigestFromai_obj(&qctx->bdig, akey); + break; + } + } + btReleaseRangeIterator(nbi); + } else { + cf_warning(AS_QUERY, "Could not find nbtr iterator.. skipping !!"); + } + return ret; +} + +static int +add_recs_from_arr(as_sindex_metadata *imd, ai_obj *ikey, ai_arr *arr, as_sindex_qctx *qctx) +{ + bool ret = 0; + + for (int i = 0; i < arr->used; i++) { + if (btree_addsinglerec(imd, ikey, (cf_digest *)&arr->data[i * CF_DIGEST_KEY_SZ], qctx->recl, + &qctx->n_bdigs, qctx->can_partition_query, qctx->partitions_pre_reserved)) { + ret = -1; + break; + } + // do not break on hitting batch limit, if the tree converts to + // bt from arr, there is no way to know which digest were already + // returned when attempting subsequent batch. Return the entire + // thing. 
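+		// (The batch-size checks in the callers therefore use ">=" rather than "==" - see get_numeric_range_recl().)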
+ } + // mark nbtr as finished and copy the offset + qctx->nbtr_done = true; + if (ikey) { + ai_objClone(qctx->bkey, ikey); + } + + return ret; +} + +/* + * Return 0 in case of success + * -1 in case of failure + */ +static int +get_recl(as_sindex_metadata *imd, ai_obj *afk, as_sindex_qctx *qctx) +{ + as_sindex_pmetadata *pimd = &imd->pimd[qctx->pimd_idx]; + ai_nbtr *anbtr = (ai_nbtr *)btIndFind(pimd->ibtr, afk); + + if (!anbtr) { + return 0; + } + + if (anbtr->is_btree) { + if (add_recs_from_nbtr(imd, afk, anbtr->u.nbtr, qctx, qctx->new_ibtr)) { + return -1; + } + } else { + // If already entire batch is returned + if (qctx->nbtr_done) { + return 0; + } + if (add_recs_from_arr(imd, afk, anbtr->u.arr, qctx)) { + return -1; + } + } + return 0; +} + +/* + * Return 0 in case of success + * -1 in case of failure + */ +static int +get_numeric_range_recl(as_sindex_metadata *imd, uint64_t begk, uint64_t endk, as_sindex_qctx *qctx) +{ + ai_obj sfk; + init_ai_objLong(&sfk, qctx->new_ibtr ? begk : qctx->bkey->l); + ai_obj efk; + init_ai_objLong(&efk, endk); + as_sindex_pmetadata *pimd = &imd->pimd[qctx->pimd_idx]; + bool fullrng = qctx->new_ibtr; + int ret = 0; + btSIter *bi = btGetRangeIter(pimd->ibtr, &sfk, &efk, 1); + btEntry *be; + + if (bi) { + while ((be = btRangeNext(bi, 1))) { + ai_obj *ikey = be->key; + ai_nbtr *anbtr = be->val; + + if (!anbtr) { + ret = -1; + break; + } + + // figure out nbtr to deal with. If the key which was + // used last time vanishes work with next key. If the + // key exist but 'last' entry made to list in the last + // iteration; Move to next nbtr + if (!fullrng) { + if (!ai_objEQ(&sfk, ikey)) { + fullrng = 1; // bkey disappeared + } else if (qctx->nbtr_done) { + qctx->nbtr_done = false; + // If we are moving to the next key, we need + // to search the full range. + fullrng = 1; + continue; + } + } + + if (anbtr->is_btree) { + if (add_recs_from_nbtr(imd, ikey, anbtr->u.nbtr, qctx, fullrng)) { + ret = -1; + break; + } + } else { + if (add_recs_from_arr(imd, ikey, anbtr->u.arr, qctx)) { + ret = -1; + break; + } + } + + // Since add_recs_from_arr() returns entire thing and do not support the batch limit, + // >= operator is needed here. + if (qctx->n_bdigs >= qctx->bsize) { + break; + } + + // If it reaches here, this means last key could not fill the batch. + // So if we are to start a new key, search should be done on full range + // and the new nbtr is obviously not done. + fullrng = 1; + qctx->nbtr_done = false; + } + btReleaseRangeIterator(bi); + } + return ret; +} + +int +ai_btree_query(as_sindex_metadata *imd, as_sindex_range *srange, as_sindex_qctx *qctx) +{ + bool err = 1; + if (!srange->isrange) { // EQUALITY LOOKUP + ai_obj afk; + init_ai_obj(&afk); + if (C_IS_DG(imd->sktype)) { + init_ai_objFromDigest(&afk, &srange->start.digest); + } + else { + init_ai_objLong(&afk, srange->start.u.i64); + } + err = get_recl(imd, &afk, qctx); + } else { // RANGE LOOKUP + err = get_numeric_range_recl(imd, srange->start.u.i64, srange->end.u.i64, qctx); + } + return (err ? AS_SINDEX_ERR_NO_MEMORY : + (qctx->n_bdigs >= qctx->bsize) ? 
AS_SINDEX_CONTINUE : AS_SINDEX_OK); +} + +int +ai_btree_put(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, void *skey, cf_digest *value) +{ + ai_obj ncol; + if (C_IS_DG(imd->sktype)) { + init_ai_objFromDigest(&ncol, (cf_digest*)skey); + } + else { + // TODO - ai_obj type is LONG for both Geo and Long + init_ai_objLong(&ncol, *(ulong *)skey); + } + + ai_obj apk; + init_ai_objFromDigest(&apk, value); + + + uint64_t before = pimd->ibtr->msize + pimd->ibtr->nsize; + int ret = reduced_iAdd(pimd->ibtr, &ncol, &apk, COL_TYPE_DIGEST); + uint64_t after = pimd->ibtr->msize + pimd->ibtr->nsize; + cf_atomic64_add(&imd->si->ns->n_bytes_sindex_memory, (after - before)); + + if (ret && ret != AS_SINDEX_KEY_FOUND) { + cf_warning(AS_SINDEX, "Insert into the btree failed"); + return AS_SINDEX_ERR_NO_MEMORY; + } + return ret; +} + +int +ai_btree_delete(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, void * skey, cf_digest * value) +{ + int ret = AS_SINDEX_OK; + + if (!pimd->ibtr) { + return AS_SINDEX_KEY_NOTFOUND; + } + + ai_obj ncol; + if (C_IS_DG(imd->sktype)) { + init_ai_objFromDigest(&ncol, (cf_digest *)skey); + } + else { + // TODO - ai_obj type is LONG for both Geo and Long + init_ai_objLong(&ncol, *(ulong *)skey); + } + + ai_obj apk; + init_ai_objFromDigest(&apk, value); + + uint64_t before = pimd->ibtr->msize + pimd->ibtr->nsize; + ret = reduced_iRem(pimd->ibtr, &ncol, &apk); + uint64_t after = pimd->ibtr->msize + pimd->ibtr->nsize; + cf_atomic64_sub(&imd->si->ns->n_bytes_sindex_memory, (before - after)); + + return ret; +} + +/* + * Internal function which adds digests to the defrag_list + * Mallocs the nodes of defrag_list + * Returns : + * -1 : Error + * number of digests found : success + * + */ +static long +build_defrag_list_from_nbtr(as_namespace *ns, ai_obj *acol, bt *nbtr, ulong nofst, ulong *limit, uint64_t * tot_found, cf_ll *gc_list) +{ + int error = -1; + btEntry *nbe; + // STEP 1: go thru a portion of the nbtr and find to-be-deleted-PKs + // TODO: a range query may be smarter then using the Xth Iterator + btSIter *nbi = (nofst ? btGetFullXthIter(nbtr, nofst, 1, NULL, 0) : + btGetFullRangeIter(nbtr, 1, NULL)); + if (!nbi) { + return error; + } + + long found = 0; + long processed = 0; + while ((nbe = btRangeNext(nbi, 1))) { + ai_obj *akey = nbe->key; + int ret = as_sindex_can_defrag_record(ns, (cf_digest *) (&akey->y)); + + if (ret == AS_SINDEX_GC_SKIP_ITERATION) { + *limit = 0; + break; + } else if (ret == AS_SINDEX_GC_OK) { + + bool create = (cf_ll_size(gc_list) == 0) ? 
true : false; + objs_to_defrag_arr *dt; + + if (!create) { + cf_ll_element * ele = cf_ll_get_tail(gc_list); + dt = ((ll_sindex_gc_element*)ele)->objs_to_defrag; + if (dt->num == SINDEX_GC_NUM_OBJS_PER_ARR) { + create = true; + } + } + if (create) { + dt = as_sindex_gc_get_defrag_arr(); + if (!dt) { + *tot_found += found; + return -1; + } + ll_sindex_gc_element * node; + node = cf_malloc(sizeof(ll_sindex_gc_element)); + node->objs_to_defrag = dt; + cf_ll_append(gc_list, (cf_ll_element *)node); + } + cloneDigestFromai_obj(&(dt->acol_digs[dt->num].dig), akey); + ai_objClone(&(dt->acol_digs[dt->num].acol), acol); + + dt->num += 1; + found++; + } + processed++; + (*limit)--; + if (*limit == 0) break; + } + btReleaseRangeIterator(nbi); + *tot_found += found; + return processed; +} + +static long +build_defrag_list_from_arr(as_namespace *ns, ai_obj *acol, ai_arr *arr, ulong nofst, ulong *limit, uint64_t * tot_found, cf_ll *gc_list) +{ + long found = 0; + long processed = 0; + + for (ulong i = nofst; i < arr->used; i++) { + int ret = as_sindex_can_defrag_record(ns, (cf_digest *) &arr->data[i * CF_DIGEST_KEY_SZ]); + if (ret == AS_SINDEX_GC_SKIP_ITERATION) { + *limit = 0; + break; + } else if (ret == AS_SINDEX_GC_OK) { + bool create = (cf_ll_size(gc_list) == 0) ? true : false; + objs_to_defrag_arr *dt; + + if (!create) { + cf_ll_element * ele = cf_ll_get_tail(gc_list); + dt = ((ll_sindex_gc_element*)ele)->objs_to_defrag; + if (dt->num == SINDEX_GC_NUM_OBJS_PER_ARR) { + create = true; + } + } + if (create) { + dt = as_sindex_gc_get_defrag_arr(); + if (!dt) { + *tot_found += found; + return -1; + } + ll_sindex_gc_element * node; + node = cf_malloc(sizeof(ll_sindex_gc_element)); + node->objs_to_defrag = dt; + cf_ll_append(gc_list, (cf_ll_element *)node); + } + memcpy(&(dt->acol_digs[dt->num].dig), (cf_digest *) &arr->data[i * CF_DIGEST_KEY_SZ], CF_DIGEST_KEY_SZ); + ai_objClone(&(dt->acol_digs[dt->num].acol), acol); + + dt->num += 1; + found++; + } + processed++; + (*limit)--; + if (*limit == 0) { + break; + } + } + *tot_found += found; + return processed; +} + +/* + * Aerospike Index interface to build a defrag_list. + * + * Returns : + * AS_SINDEX_DONE ---> The current pimd has been scanned completely for defragging + * AS_SINDEX_CONTINUE ---> Current pimd sill may have some candidate digest to be defragged + * AS_SINDEX_ERR ---> Error. Abort this pimd. + * + * Notes : Caller has the responsibility to free the iterators. + * Requires a proper offset value from the caller. 
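+ *         On AS_SINDEX_CONTINUE, *nofst and icol are updated so that the next call resumes where this one stopped.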
+ */ +int +ai_btree_build_defrag_list(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, ai_obj *icol, + ulong *nofst, ulong limit, uint64_t * tot_processed, uint64_t * tot_found, cf_ll *gc_list) +{ + int ret = AS_SINDEX_ERR; + + if (!pimd || !imd) { + return ret; + } + + as_namespace *ns = imd->si->ns; + if (!ns) { + ns = as_namespace_get_byname((char *)imd->ns_name); + } + + if (!pimd || !pimd->ibtr || !pimd->ibtr->numkeys) { + goto END; + } + //Entry is range query, FROM previous icol TO maxKey(ibtr) + if (icol->type == COL_TYPE_INVALID) { + assignMinKey(pimd->ibtr, icol); // init first call + } + ai_obj iH; + assignMaxKey(pimd->ibtr, &iH); + btEntry *be = NULL; + btSIter *bi = btGetRangeIter(pimd->ibtr, icol, &iH, 1); + if (!bi) { + goto END; + } + + while ( true ) { + be = btRangeNext(bi, 1); + if (!be) { + ret = AS_SINDEX_DONE; + break; + } + ai_obj *acol = be->key; + ai_nbtr *anbtr = be->val; + long processed = 0; + if (!anbtr) { + break; + } + if (anbtr->is_btree) { + processed = build_defrag_list_from_nbtr(ns, acol, anbtr->u.nbtr, *nofst, &limit, tot_found, gc_list); + } else { + processed = build_defrag_list_from_arr(ns, acol, anbtr->u.arr, *nofst, &limit, tot_found, gc_list); + } + + if (processed < 0) { // error .. abort everything. + cf_detail(AS_SINDEX, "build_defrag_list returns an error. Aborting defrag on current pimd"); + ret = AS_SINDEX_ERR; + break; + } + *tot_processed += processed; + // This tree may have some more digest to defrag + if (limit == 0) { + *nofst = *nofst + processed; + ai_objClone(icol, acol); + cf_detail(AS_SINDEX, "Current pimd may need more iteration of defragging."); + ret = AS_SINDEX_CONTINUE; + break; + } + + // We have finished this tree. Yet we have not reached our limit to defrag. + // Goes to next iteration + *nofst = 0; + ai_objClone(icol, acol); + }; + btReleaseRangeIterator(bi); +END: + + return ret; +} + +/* + * Deletes the digest as in the passed in as gc_list, bound by n2del number of + * elements per iteration, with *deleted successful deletes. + */ +bool +ai_btree_defrag_list(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, cf_ll *gc_list, ulong n2del, ulong *deleted) +{ + // If n2del is zero here, that means caller do not want to defrag + if (n2del == 0) { + return false; + } + ulong success = 0; + as_namespace *ns = imd->si->ns; + // STEP 3: go thru the PKtoDeleteList and delete the keys + + uint64_t before = 0; + uint64_t after = 0; + + while (cf_ll_size(gc_list)) { + cf_ll_element * ele = cf_ll_get_head(gc_list); + ll_sindex_gc_element * node = (ll_sindex_gc_element * )ele; + objs_to_defrag_arr * dt = node->objs_to_defrag; + + // check before deleting. 
The digest may re-appear after the list + // creation and before deletion from the secondary index + + int i = 0; + while (dt->num != 0) { + i = dt->num - 1; + int ret = as_sindex_can_defrag_record(ns, &(dt->acol_digs[i].dig)); + if (ret == AS_SINDEX_GC_SKIP_ITERATION) { + goto END; + } else if (ret == AS_SINDEX_GC_OK) { + ai_obj apk; + init_ai_objFromDigest(&apk, &(dt->acol_digs[i].dig)); + ai_obj *acol = &(dt->acol_digs[i].acol); + cf_detail(AS_SINDEX, "Defragged %lu %ld", acol->l, *((uint64_t *)&apk.y)); + + before += pimd->ibtr->msize + pimd->ibtr->nsize; + if (reduced_iRem(pimd->ibtr, acol, &apk) == AS_SINDEX_OK) { + success++; + } + after += pimd->ibtr->msize + pimd->ibtr->nsize; + } + dt->num -= 1; + n2del--; + if (n2del == 0) { + goto END; + } + } + cf_ll_delete(gc_list, (cf_ll_element*)node); + } + +END: + cf_atomic64_sub(&imd->si->ns->n_bytes_sindex_memory, (before - after)); + *deleted += success; + return cf_ll_size(gc_list) ? true : false; +} + +void +ai_btree_create(as_sindex_metadata *imd) +{ + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + pimd->ibtr = createIBT(imd->sktype, -1); + if (! pimd->ibtr) { + cf_crash(AS_SINDEX, "Failed to allocate secondary index tree for ns:%s, indexname:%s", + imd->ns_name, imd->iname); + } + } +} + +static void +destroy_index(bt *ibtr, bt_n *n) +{ + if (! n->leaf) { + for (int i = 0; i <= n->n; i++) { + destroy_index(ibtr, NODES(ibtr, n)[i]); + } + } + + for (int i = 0; i < n->n; i++) { + void *be = KEYS(ibtr, n, i); + ai_nbtr *anbtr = (ai_nbtr *) parseStream(be, ibtr); + if (anbtr) { + if (anbtr->is_btree) { + bt_destroy(anbtr->u.nbtr); + } else { + ai_arr_destroy(anbtr->u.arr); + } + cf_free(anbtr); + } + } +} + +void +ai_btree_dump(as_sindex_metadata *imd, char *fname, bool verbose) +{ + FILE *fp = NULL; + if (!(fp = fopen(fname, "w"))) { + return; + } + + fprintf(fp, "Namespace: %s set: %s\n", imd->ns_name, imd->set ? imd->set : "None"); + + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + fprintf(fp, "INDEX: name: %s:%d (%p)\n", imd->iname, i, (void *) pimd->ibtr); + if (pimd->ibtr) { + bt_dumptree(fp, pimd->ibtr, 1, verbose); + } + } + + fclose(fp); +} + +uint64_t +ai_btree_get_numkeys(as_sindex_metadata *imd) +{ + uint64_t val = 0; + + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + PIMD_RLOCK(&pimd->slock); + val += pimd->ibtr->numkeys; + PIMD_RUNLOCK(&pimd->slock); + } + + return val; +} + +uint64_t +ai_btree_get_pimd_isize(as_sindex_pmetadata *pimd) +{ + // TODO - Why check of > 0 + return pimd->ibtr->msize > 0 ? pimd->ibtr->msize : 0; +} + +uint64_t +ai_btree_get_isize(as_sindex_metadata *imd) +{ + uint64_t size = 0; + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + PIMD_RLOCK(&pimd->slock); + size += ai_btree_get_pimd_isize(pimd); + PIMD_RUNLOCK(&pimd->slock); + } + return size; +} + +uint64_t +ai_btree_get_pimd_nsize(as_sindex_pmetadata *pimd) +{ + // TODO - Why check of > 0 + return pimd->ibtr->nsize > 0 ? pimd->ibtr->nsize : 0; +} + +uint64_t +ai_btree_get_nsize(as_sindex_metadata *imd) +{ + uint64_t size = 0; + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + PIMD_RLOCK(&pimd->slock); + size += ai_btree_get_pimd_nsize(pimd); + PIMD_RUNLOCK(&pimd->slock) + } + + return size; +} + +void +ai_btree_reinit_pimd(as_sindex_pmetadata * pimd, col_type_t sktype) +{ + if (! 
pimd->ibtr) { + cf_crash(AS_SINDEX, "IBTR is null"); + } + pimd->ibtr = createIBT(sktype, -1); +} + +void +ai_btree_reset_pimd(as_sindex_pmetadata *pimd) +{ + if (! pimd->ibtr) { + cf_crash(AS_SINDEX, "IBTR is null"); + } + pimd->ibtr = NULL; +} + +void +ai_btree_delete_ibtr(bt * ibtr) +{ + if (! ibtr) { + cf_crash(AS_SINDEX, "IBTR is null"); + } + destroy_index(ibtr, ibtr->root); +} diff --git a/ai/src/ai_obj.c b/ai/src/ai_obj.c new file mode 100644 index 00000000..ac70b03a --- /dev/null +++ b/ai/src/ai_obj.c @@ -0,0 +1,103 @@ +/* + * ai_obj.h + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * Aerospike Index Object Implementation. + */ + +#include +#include +#include +#include +#include +#include +#include // For MIN(). + +#include "ai_obj.h" +#include "stream.h" + +#include + +void init_ai_obj(ai_obj *a) +{ + bzero(a, sizeof(ai_obj)); + a->type = COL_TYPE_INVALID; +} + +void init_ai_objLong(ai_obj *a, ulong l) +{ + init_ai_obj(a); + a->l = l; + a->type = COL_TYPE_LONG; +} + +void init_ai_objU160(ai_obj *a, uint160 y) { + a->type = COL_TYPE_DIGEST; + a->y = y; +} + +void ai_objClone(ai_obj *dest, ai_obj *src) +{ + memcpy(dest, src, sizeof(ai_obj)); +} + +static int ai_objCmp(ai_obj *a, ai_obj *b) +{ + if (C_IS_L(a->type) || C_IS_G(a->type)) { + return (a->l == b->l) ? 0 : ((a->l > b->l) ? 1 : -1); + } else if (C_IS_DG(a->type)) { + return u160Cmp(&a->y, &b->y); + } else { + assert(!"ai_objCmp ERROR"); + } +} + +bool ai_objEQ(ai_obj *a, ai_obj *b) +{ + return !ai_objCmp(a, b); +} + +static void dump_ai_obj_internal(FILE *fp, ai_obj *a, bool as_digest) +{ + if (C_IS_L(a->type) || C_IS_G(a->type)) { + fprintf(fp, "\tLONG ai_obj: val: %lu\n", a->l); + } else if (C_IS_DG(a->type)) { + fprintf(fp, "\tU160 ai_obj:"); + if (as_digest) { + const int len = 20; + char digest_str[2 + (len * 2) + 1]; + digest_str[0] = '\0'; + generate_packed_hex_string((uint8_t *) &(a->y), len, digest_str); + fprintf(fp, "%s\n", digest_str); + } else { + DEBUG_U160(fp, a->y); + fprintf(fp, "\n"); + } + } else { + fprintf(fp, "\tUNINITIALISED ai_obj\n"); + } +} + + +void dump_ai_obj_as_digest(FILE *fp, ai_obj *a) +{ + dump_ai_obj_internal(fp, a, true); +} diff --git a/ai/src/bt.c b/ai/src/bt.c new file mode 100644 index 00000000..b43bc64f --- /dev/null +++ b/ai/src/bt.c @@ -0,0 +1,133 @@ +/* + * bt.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * Creation of different btree types and + * Public B-tree operations w/ stream abstractions under the covers. + */ + +#include +#include +#include +#include +#include + +#include "bt.h" +#include "bt_iterator.h" +#include "stream.h" + +#include + +bt *createIBT(col_type_t ktype, int imatch) { + bt_cmp_t cmp; + bts_t bts; + bts.ktype = ktype; + bts.btype = INDEX_BTREE; + bts.num = imatch; + if (C_IS_L(ktype)) { /* NOTE: under the covers: LL */ + bts.ksize = LL_SIZE; + cmp = llCmp; + bts.bflag = BTFLAG_ULONG_ULONG; + } else if (C_IS_G(ktype)) { /* NOTE: under the covers: LL */ + bts.ksize = LL_SIZE; + cmp = llCmp; + bts.bflag = BTFLAG_ULONG_ULONG; + } else if (C_IS_DG(ktype)) { /* NOTE: under the covers: YL */ + bts.ksize = YL_SIZE; + cmp = ylCmp; + bts.bflag = BTFLAG_U160_ULONG; + } else { /* STRING or FLOAT */ + assert(!"Unsupport Key Type"); + } + + return bt_create(cmp, &bts, 0); +} + +bt *createNBT(col_type_t ktype) { + bt_cmp_t cmp; + bts_t bts; + bts.ktype = ktype; + bts.btype = NODE_BTREE; + bts.num = -1; + if (C_IS_DG(ktype)) { + cmp = u160Cmp; + bts.ksize = U160SIZE; + bts.bflag = BTFLAG_U160; + } else { + assert(!"Unsupport Key Type"); + } + + return bt_create(cmp, &bts, 0); +} + +static void *abt_find(bt *btr, ai_obj *akey) { + DECLARE_BT_KEY(akey, 0) + uchar *stream = bt_find(btr, btkey, akey); + destroyBTKey(btkey, med); /* FREED 026 */ + return parseStream(stream, btr); +} +static bool abt_exist(bt *btr, ai_obj *akey) { //NOTE: Evicted Indexes are NULL + DECLARE_BT_KEY(akey, 0) + bool ret = bt_exist(btr, btkey, akey); + destroyBTKey(btkey, med); /* FREED 026 */ + return ret; +} +static bool abt_del(bt *btr, ai_obj *akey, bool leafd) { // DELETE the row + DECLARE_BT_KEY(akey, 0) + dwd_t dwd = bt_delete(btr, btkey, leafd); /* FREED 028 */ + if (!dwd.k) return 0; + uchar *stream = dwd.k; + destroyBTKey(btkey, med); /* FREED 026 */ + return destroyStream(btr, stream); /* DESTROYED 027 */ +} +static uint32 abt_insert(bt *btr, ai_obj *akey, void *val) { + crs_t crs; + uint32 ssize; + DECLARE_BT_KEY(akey, 0) + char *stream = createStream(btr, val, btkey, ksize, &ssize, &crs); // D 027 + if (!stream) return 0; + destroyBTKey(btkey, med); /* FREED 026 */ + if (!bt_insert(btr, stream, 0)) return 0; /* FREE ME 028 */ + return 1; +} + +/* INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX */ +void btIndAdd (bt *ibtr, ai_obj *ikey, bt *nbtr) { + abt_insert (ibtr, ikey, nbtr); +} +bt *btIndFind (bt *ibtr, ai_obj *ikey) { + return abt_find (ibtr, ikey); +} +int btIndDelete(bt *ibtr, ai_obj *ikey) { + abt_del (ibtr, ikey, 0); + return ibtr->numkeys; +} + +bool btIndNodeExist(bt *nbtr, ai_obj *apk) { + return abt_exist(nbtr, apk); +} +bool btIndNodeAdd(bt *nbtr, ai_obj *apk) { //DEBUG_NBT_ADD + return abt_insert(nbtr, apk, NULL); +} +int btIndNodeDelete(bt *nbtr, ai_obj *apk, ai_obj *ocol) { + abt_del (nbtr, ocol ? ocol : apk, 0); + return nbtr->numkeys; +} diff --git a/ai/src/bt_code.c b/ai/src/bt_code.c new file mode 100644 index 00000000..a6fe2561 --- /dev/null +++ b/ai/src/bt_code.c @@ -0,0 +1,1016 @@ +/* + * Copyright 1997-1999, 2001 John-Mark Gurney. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + */ + +#include +#include +#include +#include +#include +#include + +#include "bt.h" +#include "bt_iterator.h" +#include "stream.h" + +#include + +/* CACHE TODO LIST + 8.) U128PK/FK CACHE:[EVICT,MISS] support + + 11.) DS as stream -\/ + 7.) DS in rdbSave/Load (dependency on 11) + + 12.) slab allocator for ALL btn's + + 14.) btFind() in setUniqIndexVal() -> btFindD() + TESTING + + 18.) CREATE TABLE () DIRTY + + 19.) btreesplitchild dirty math (only set dirty if new split child has dirty) +*/ + +// DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG + +//#define DEBUG_DEL_CASE_STATS +//#define BT_MEM_PROFILE +#ifdef BT_MEM_PROFILE +static ulong tot_bt_data = 0; static ulong tot_bt_data_mem = 0; +static ulong tot_num_bt_ns = 0; static ulong tnbtnmem = 0; +static ulong tot_num_bts = 0; static ulong tot_num_bt_mem = 0; + #define BT_MEM_PROFILE_BT {tot_num_bts++; tot_num_bt_mem += size;} + #define BT_MEM_PROFILE_NODE {tot_num_bt_ns++; tnbtnmem += size;} +#else + #define BT_MEM_PROFILE_BT + #define BT_MEM_PROFILE_NODE +#endif + +/* PROTOYPES */ +static void release_dirty_stream(bt *btr, bt_n *x); +static int real_log2 (unsigned int a, int nbits); +static bt_data_t findminkey (bt *btr, bt_n *x); +static bt_data_t findmaxkey (bt *btr, bt_n *x); + +// HELPER HELPER HELPER HELPER HELPER HELPER HELPER HELPER HELPER HELPER +static ulong getNumKey(bt *btr, bt_n *x, int i) { //TODO U128 support + if (i < 0 || i >= x->n) return 0; + else { + ai_obj akey; void *be = KEYS(btr, x, i); + convertStream2Key(be, &akey, btr); + return akey.l; + } +} + +// MEMORY_MANAGEMENT MEMORY_MANAGEMENT MEMORY_MANAGEMENT MEMORY_MANAGEMENT +/* NOTE used-memory bookkeeping maintained at the Btree level */ +static void bt_increment_used_memory(bt *btr, size_t size) { //DEBUG_INCR_MEM + btr->msize += (ull)size; +} +static void bt_decrement_used_memory(bt *btr, size_t size) { //DEBUG_DECR_MEM + btr->msize -= (ull)size; +} +// DIRTY_STREAM DIRTY_STREAM DIRTY_STREAM DIRTY_STREAM DIRTY_STREAM +static uint32 get_dssize(bt *btr, char dirty) { + assert(dirty > 0); + uint32 drsize = (dirty == 3) ? sizeof(uint32) : + (dirty == 2) ? sizeof(ushort16) : sizeof(uchar); // 1 + //DEBUG_GETDSSIZE + return (btr->t * 2) * drsize; +} +static void alloc_ds(bt *btr, bt_n *x, size_t size, char dirty) { + assert(dirty != -1); + void **dsp = (void *)((char *)x + size); + if (!dirty) { *dsp = NULL; return; } + size_t dssize = get_dssize(btr, dirty); + void *ds = cf_malloc(dssize); bzero(ds, dssize); // FREEME 108 + bt_increment_used_memory(btr, dssize); + *dsp = ds; //DEBUG_ALLOC_DS +} +void incr_ds(bt *btr, bt_n *x) {//USE: when a DR is too big for its DS (incr_ds) + assert(x->dirty > 0); + GET_BTN_SIZE(x->leaf) + void *ods = GET_DS(x, nsize); + uint32 osize = get_dssize(btr, x->dirty); + uint32 num = (x->leaf ? 
(btr->t * 2) : btr->t); //DEBUG_RESIZE_DS_1 + alloc_ds(btr, x, nsize, x->dirty + 1); + void *nds = GET_DS(x, nsize); + if (x->dirty == 1) { + uchar *s_ds = (uchar *)ods; ushort16 *d_ds = (ushort16 *)nds; + for (uint32 i = 0; i < num; i++) d_ds[i] = (ushort16)s_ds[i]; + } else if (x->dirty == 2) { + ushort16 *s_ds = (ushort16 *)ods; uint32 *d_ds = (uint32 *)nds; + for (uint32 i = 0; i < num; i++) d_ds[i] = (uint32 )s_ds[i]; + } else assert(!"incr_ds ERROR"); + x->dirty++; //DEBUG_RESIZE_DS_2 + cf_free(ods); bt_decrement_used_memory(btr, osize); +} + +// BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE +// BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE +static bt_n *allocbtreenode(bt *btr, bool leaf, char dirty) { + btr->numnodes++; + GET_BTN_SIZES(leaf, dirty) BT_MEM_PROFILE_NODE //DEBUG_ALLOC_BTN + bt_n *x = cf_malloc(msize); bzero(x, msize); + bt_increment_used_memory(btr, msize); + x->leaf = -1; + x->dirty = dirty; + if (dirty != -1) alloc_ds(btr, x, nsize, dirty); + return x; +} +static bt *allocbtree() { + int size = sizeof(struct btree); + BT_MEM_PROFILE_BT + bt *btr = (bt *) cf_malloc(size); bzero(btr, size); // FREE ME 035 + bt_increment_used_memory(btr, size); //DEBUG_ALLOC_BTREE + return btr; +} + +static void release_dirty_stream(bt *btr, bt_n *x) { //DEBUG_BTF_BTN_DIRTY + assert(x->dirty > 0); + GET_BTN_SIZE(x->leaf) + bt_decrement_used_memory(btr, get_dssize(btr, x->dirty)); + void **dsp = GET_DS(x, nsize); cf_free(dsp); // FREED 108 + x->dirty = 0; +} +static void bt_free_btreenode(bt *btr, bt_n *x) { + GET_BTN_SIZES(x->leaf, x->dirty) bt_decrement_used_memory(btr, msize); + if (x->dirty > 0) release_dirty_stream(btr, x); + cf_free(x); // FREED 035 +} +static void bt_free_btree(bt *btr) { cf_free(btr); } + +// BT_CREATE BT_CREATE BT_CREATE BT_CREATE BT_CREATE BT_CREATE BT_CREATE +bt *bt_create(bt_cmp_t cmp, bts_t *s, char dirty) { + int n = BTREE_LONG_TYPE_DEGREE; + + if (C_IS_L(s->ktype) || C_IS_G(s->ktype)) { + n = BTREE_LONG_TYPE_DEGREE; + } + else if (C_IS_DG(s->ktype)) { + n = BTREE_STRING_TYPE_DEGREE; + } + + uchar t = (uchar)((int)(n + 1) / 2); + int kbyte = sizeof(bt_n) + n * s->ksize; + int nbyte = kbyte + (n + 1) * VOIDSIZE; + bt *btr = allocbtree(); + if (!btr) return NULL; + memcpy(&btr->s, s, sizeof(bts_t)); /* ktype, btype, ksize, bflag, num */ + btr->cmp = cmp; + btr->keyofst = sizeof(bt_n); + uint32 nodeofst = btr->keyofst + n * s->ksize; + btr->nodeofst = (ushort16)nodeofst; + btr->t = t; + int nbits = real_log2(n, sizeof(int) * 8) + 1; + nbits = 1 << (real_log2(nbits, sizeof(int) * 8) + 1); + btr->nbits = (uchar)nbits; + btr->nbyte = nbyte; + btr->kbyte = kbyte; + btr->dirty = dirty; + btr->root = allocbtreenode(btr, 1, dirty ? 0: -1); + if (!btr->root) return NULL; + btr->numnodes = 1; //printf("bt_create\n"); bt_dump_info(printf, btr); + return btr; +} + +// BINARY_SEARCH BINARY_SEARCH BINARY_SEARCH BINARY_SEARCH BINARY_SEARCH +/* This is the real log2 function. It is only called when we don't have + * a value in the table. 
-> which is basically never */ +static inline int real_log2(unsigned int a, int nbits) { + uint32 i = 0; + uint32 b = (nbits + 1) / 2; /* divide in half rounding up */ + while (b) { + i = (i << 1); + if (a >= (unsigned int)(1 << b)) { // select top half and mark this bit + a /= (1 << b); + i = i | 1; + } else { // select bottom half & dont set bit + a &= (1 << b) - 1; + } + b /= 2; + } + return i; +} + +#if 0 + +// TODO: global table is pain disabled for avoiding issue +// open it up later +/* Implement a lookup table for the log values. This will only allocate + * memory that we need. This is much faster than calling the log2 routine + * every time. Doing 1 million insert, searches, and deletes will generate + * ~58 million calls to log2. Using a lookup table IS NECESSARY! + -> memory usage of this is trivial, like less than 1KB */ +static inline int _log2(unsigned int a, int nbits) { + static char *table = NULL; + static uint32 alloced = 0; + uint32 i; + if (a >= alloced) { + table = cf_realloc(table, (a + 1) * sizeof *table); + for (i = alloced; i < a + 1; i++) table[i] = -1; + alloced = a + 1; + } + if (table[a] == -1) table[a] = real_log2(a, nbits); + return table[a]; +} +#endif + +static inline int _log2(unsigned int a, int nbits) { + return real_log2(a, nbits); +} + +static int findkindex(bt *btr, bt_n *x, bt_data_t k, int *r, btIterator *iter) { + if (x->n == 0) return -1; + int b, tr; + int *rr = r ? r : &tr ; /* rr: key is greater than current entry */ + int i = 0; + int a = x->n - 1; + while (a > 0) { + b = _log2(a, (int)btr->nbits); + int slot = (1 << b) + i; + bt_data_t k2 = KEYS(btr, x, slot); + if ((*rr = btr->cmp(k, k2)) < 0) { + a = (1 << b) - 1; + } else { + a -= (1 << b); + i |= (1 << b); + } + } + if ((*rr = btr->cmp(k, KEYS(btr, x, i))) < 0) i--; + if (iter) { iter->bln->in = iter->bln->ik = (i > 0) ? 
i : 0; } + return i; +} + +// KEY_SHUFFLING KEY_SHUFFLING KEY_SHUFFLING KEY_SHUFFLING KEY_SHUFFLING +// NOTE: KEYS are variable sizes: [4,8,12,16,20,24,32 bytes] +#define ISVOID(btr) (btr->s.ksize == VOIDSIZE) + +static inline void **AKEYS(bt *btr, bt_n *x, int i) { + int ofst = (i * btr->s.ksize); + char *v = (char *)x + btr->keyofst + ofst; //DEBUG_AKEYS + return (void **)v; +} +#define OKEYS(btr, x) ((void **)((char *)x + btr->keyofst)) +inline void *KEYS(bt *btr, bt_n *x, int i) { //DEBUG_KEYS + if ISVOID(btr) return OKEYS(btr, x)[i]; + else /* OTHER_BT */ return (void *) AKEYS(btr, x, i); +} + +// SCION SCION SCION SCION SCION SCION SCION SCION SCION SCION SCION SCION +static inline void incr_scion(bt_n *x, int n) { x->scion += n; } +static inline void decr_scion(bt_n *x, int n) { x->scion -= n; } +static inline void move_scion(bt *btr, bt_n *y, bt_n *z, int n) { + for (int i = 0; i < n; i++) { incr_scion(y, NODES(btr, z)[i]->scion); } +} +static inline int get_scion_range(bt *btr, bt_n *x, int beg, int end) { + if (x->dirty <= 0) return end - beg; + int scion = 0; + for (int i = beg; i < end; i++) scion += 1 + getDR(btr, x, i); + return scion; +} + +// DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY +typedef struct btn_pos { + bt_n *x; int i; +} bp_t; +typedef struct two_bp_gens { + bp_t p; /* parent */ bp_t c; /* child */ +} tbg_t; +static inline void free_bp(void *v) { cf_free(v); } + +typedef struct ll_ai_bp_element_s { + cf_ll_element ele; + bp_t * value; +} ll_ai_bp_element; + +void +ll_ai_bp_destroy_fn(cf_ll_element * ele) +{ + cf_free(((ll_ai_bp_element *)ele)->value); + cf_free((ll_ai_bp_element *)ele); +} +int +ll_ai_bp_reduce_fn(cf_ll_element *ele, void *udata) +{ + return CF_LL_REDUCE_DELETE; +} + +//TODO inline +bt_n *addDStoBTN(bt *btr, bt_n *x, bt_n *p, int pi, char dirty) { + bt_n *y = allocbtreenode(btr, x->leaf, dirty); + GET_BTN_SIZE(x->leaf) memcpy(y, x, nsize); + y->dirty = dirty; btr->dirty = 1; + if (x == btr->root) btr->root = y; + else NODES(btr, p)[pi] = y; // update parent NODE bookkeeping + bt_free_btreenode(btr, x); //DEBUG_ADD_DS_TO_BTN + return y; +} +uint32 getDR(bt *btr, bt_n *x, int i) { + if (x->dirty <= 0) return 0; + GET_BTN_SIZE(x->leaf) + void *dsp = GET_DS(x, nsize);; + if (x->dirty == 1) { + uchar *ds = (uchar *)dsp; return (uint32)ds[i]; + } else if (x->dirty == 2) { + ushort16 *ds = (ushort16 *)dsp; return (uint32)ds[i]; + } else if (x->dirty == 3) { + uint32 *ds = (uint32 *)dsp; return ds[i]; + } else assert(!"getDR ERROR"); +} +#define INCR_DS_SET_DR \ + { incr_ds(btr, x); __setDR(btr, x, i, dr); return; } + +static void __setDR(bt *btr, bt_n *x, int i, uint32 dr) { + uint32 odr; GET_BTN_SIZE(x->leaf) + void *dsp = GET_DS(x, nsize); + if (x->dirty == 1) { + uchar *ds = (uchar *)dsp; if (dr > UCHAR_MAX) INCR_DS_SET_DR + odr = ds[i]; ds[i] = dr; + } else if (x->dirty == 2) { + ushort16 *ds = (ushort16 *)dsp; if (dr > USHRT_MAX) INCR_DS_SET_DR + odr = ds[i]; ds[i] = dr; + } else if (x->dirty == 3) { + uint32 *ds = (uint32 *)dsp; + odr = ds[i]; ds[i] = dr; + } else assert(!"setDR ERROR"); + (void) odr; // silence compiler warnings +} +static bt_n *setDR(bt *btr, bt_n *x, int i, uint32 dr, bt_n *p, int pi) { + if (!dr) return x; + if (x->dirty <= 0) x = addDStoBTN(btr, x, p, pi, 1); + __setDR(btr, x, i, dr); return x; +} +static bt_n *zeroDR(bt *btr, bt_n *x, int i, bt_n *p, int pi) { + (void)p; (void) pi; // compiler warnings - these will be used later + if (x->dirty <= 0) return x; + __setDR(btr, x, i, 0); return x; +} 
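/*
 * Editor's note -- an illustrative sketch, not part of the original patch:
 * the DR ("dirty run") helpers above record, per key slot, how many
 * consecutively evicted keys logically follow that slot's key. The
 * dirty-stream starts at one byte per slot and is widened on demand.
 * Assuming a node x whose stream is currently uchar-wide (x->dirty == 1):
 *
 *   x = setDR(btr, x, i, 200, p, pi); // 200 <= UCHAR_MAX, stored directly
 *   x = setDR(btr, x, i, 300, p, pi); // 300 > UCHAR_MAX: __setDR() calls
 *                                     // incr_ds(), which widens the stream
 *                                     // to ushort16 (x->dirty becomes 2)
 *                                     // and then stores 300
 *   uint32 dr = getDR(btr, x, i);     // reads back 300 via the ushort16 branch
 *
 * Widths only ever grow (uchar -> ushort16 -> uint32); zeroDR() clears a
 * slot but never narrows the stream back down.
 */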
+static bt_n *incrDR(bt *btr, bt_n *x, int i, uint32 dr, bt_n *p, int pi) { + if (!dr) return x; + if (x->dirty <= 0) x = addDStoBTN(btr, x, p, pi, 1); + uint32 odr = getDR(btr, x, i); + odr += dr; + return setDR(btr, x, i, odr, p, pi); +} +static bt_n *overwriteDR(bt *btr, bt_n *x, int i, uint32 dr, bt_n *p, int pi) { + if (dr) return setDR (btr, x, i, dr, p, pi); + else return zeroDR(btr, x, i, p, pi); +} + +// DEL_CASE_DR DEL_CASE_DR DEL_CASE_DR DEL_CASE_DR DEL_CASE_DR DEL_CASE_DR +static bt_n *incrPrevDR(bt *btr, bt_n *x, int i, uint32 dr, + bt_n *p, int pi, cf_ll *plist) { + if (!dr) return x; //DEBUG_INCR_PREV_DR + if (i > 0) return incrDR(btr, x, i - 1, dr, p, pi); // prev sibling + else { + //TODO findminnode() is too inefficient -> needs to be a part of btr + if (x == findminnode(btr, btr->root)) { // MIN KEY + btr->dirty_left += dr; btr->dirty = 1; return x; + } + cf_ll_element * ele; + cf_ll_iterator * iter = cf_ll_getIterator(plist, true); + bt_n *rx = btr->root; int ri = 0; + + while ((ele = cf_ll_getNext(iter))) { + bp_t *bp = ((ll_ai_bp_element *)ele)->value; + if (bp->i) { rx = bp->x; ri = bp->i - 1; break; } + } + bt_n *prx = btr->root; int pri = 0; + if (rx != btr->root) { // get parent + ele = cf_ll_getNext(iter); + bp_t *bp = ((ll_ai_bp_element *)ele)->value; + prx = bp->x; pri = bp->i; + } + cf_ll_releaseIterator(iter); + //printf("rx: %p ri: %d prx: %p pri: %d\n", rx, ri, prx, pri); + incrDR(btr, rx, ri, dr, prx, pri); + return x; // x not modified (only rx) + } +} +static tbg_t get_prev_child_recurse(bt *btr, bt_n *x, int i) { + bt_n *xp = NODES(btr, x)[i]; //DEBUG_GET_C_REC_1 + if (!xp->leaf) return get_prev_child_recurse(btr, xp, xp->n); + tbg_t tbg; + tbg.p.x = x; tbg.p.i = i; + tbg.c.x = xp; tbg.c.i = xp->n - 1; //DEBUG_GET_C_REC_2 + return tbg; +} +static bt_n *incrCase2B(bt *btr, bt_n *x, int i, int dr) { //DEBUG_INCR_CASE2B + tbg_t tbg = get_prev_child_recurse(btr, x, i); //DEBUG_INCR_PREV + bt_n *nc = incrDR(btr, tbg.c.x, tbg.c.i, dr, tbg.p.x, tbg.p.i); + incr_scion(nc, dr); + return x; // x not modified (only tbg.c.x) +} + +// SET_BT_KEY SET_BT_KEY SET_BT_KEY SET_BT_KEY SET_BT_KEY SET_BT_KEY +static void setBTKeyRaw(bt *btr, bt_n *x, int i, void *src) { //PRIVATE + void **dest = AKEYS(btr, x, i); + if ISVOID(btr) *dest = src; + else memcpy(dest, src, btr->s.ksize); + //DEBUG_SET_KEY +} +static bt_n *setBTKey(bt *btr, bt_n *dx, int di, bt_n *sx, int si, + bool drt, bt_n *pd, int pdi, bt_n *ps, int psi) { + if (drt) { + uint32 dr = getDR (btr, sx, si); //DEBUG_SET_BTKEY + dx = overwriteDR(btr, dx, di, dr, pd, pdi); + sx = zeroDR (btr, sx, si, ps, psi); + } else sx = zeroDR (btr, sx, si, ps, psi); + setBTKeyRaw(btr, dx, di, KEYS(btr, sx, si)); return dx; +} + +static void mvXKeys(bt *btr, bt_n **dx, int di, + bt_n **sx, int si, uint32 num, uint32 ks, + bt_n *pd, int pdi, + bt_n *ps, int psi) { + if (!num) return; + bool x2x = (*dx == *sx); bool forward = (di >= si); + int i = forward ? (int)num - 1: 0; + int end = forward ? 
-1 : (int)num; + while (i != end) { // DS remove destDR from dx @i, add srcDR to sx @i + int sii = si + i; int dii = di + i; + uint32 drs = getDR(btr, *sx, sii), drd = getDR(btr, *dx, dii); + if (drs) { //DEBUG_MV_X_KEYS_1 + *dx = setDR (btr, *dx, dii, drs, pd, pdi); + if (x2x && *dx != *sx) *sx = *dx; + *sx = zeroDR(btr, *sx, sii, ps, psi); + if (x2x && *dx != *sx) *dx = *sx; + } else if (drd) { //DEBUG_MV_X_KEYS_2 + *dx = zeroDR(btr, *dx, dii, pd, pdi); + if (x2x && *dx != *sx) *sx = *dx; + } + bt_data_t *dest = AKEYS(btr, *dx, di); + bt_data_t *src = AKEYS(btr, *sx, si); + void *dk = (char *)dest + (i * ks); + void *sk = (char *)src + (i * ks); + memcpy(dk, sk, ks); + if (forward) i--; else i++; + } +} +static inline void mvXNodes(bt *btr, bt_n *x, int xofst, + bt_n *z, int zofst, int num) { + memmove(NODES(btr, x) + xofst, NODES(btr, z) + zofst, (num) * VOIDSIZE); +} + +//NOTE: trimBTN*() do not ever dirty btn's -- TODO they could UN-dirty +static bt_n *trimBTN(bt *btr, bt_n *x, bool drt, bt_n *p, int pi) { + //DEBUG_TRIM_BTN + if (drt) x = zeroDR(btr, x, x->n, p, pi); + x->n--; return x; +} +static bt_n *trimBTN_n(bt *btr, bt_n *x, int n, bool drt, bt_n *p, int pi) { + if (drt) { + for (int i = x->n; i >= (x->n - n); i--) x = zeroDR(btr, x, i, p, pi); + } + x->n -= n; return x; +} + +// INSERT INSERT INSERT INSERT INSERT INSERT INSERT INSERT INSERT INSERT +static bool btreesplitchild(bt *btr, bt_n *x, int i, bt_n *y, bt_n *p, int pi) { + ushort16 t = btr->t; //TODO dirtymath + bt_n *z = allocbtreenode(btr, y->leaf, y->dirty); if (!z) return 0; + z->leaf = y->leaf; /* duplicate leaf setting */ + for (int j = 0; j < t - 1; j++) { + z = setBTKey(btr, z, j, y, j + t, 1, p, pi, p, pi); + } + z->scion = get_scion_range(btr, z, 0, t - 1); decr_scion(y, z->scion); + z->n = t - 1; y = trimBTN_n(btr, y, t - 1, 0, p, pi); + if (!y->leaf) { // if it's an internal node, copy the ptr's too + for (int j = 0; j < t; j++) { + uint32_t scion = NODES(btr, y)[j + t]->scion; + decr_scion(y, scion); incr_scion(z, scion); + NODES(btr, z)[j] = NODES(btr, y)[j + t]; + } + } + for (int j = x->n; j > i; j--) { // move nodes in parent down one + NODES(btr, x)[j + 1] = NODES(btr, x)[j]; + } + NODES(btr, x)[i + 1] = z; // store new node + for (int j = x->n - 1; j >= i; j--) { // adjust the keys from previous move + x = setBTKey(btr, x, j + 1, x, j, 1, p, pi, p, pi); + } + decr_scion(y, 1 + getDR(btr, y, y->n - 1)); //NEXT LINE: store new key + x = setBTKey(btr, x, i, y, y->n - 1, 1, p, pi, p, pi); x->n++; + trimBTN(btr, y, 0, p, pi); + return 1; +} + +#define GETN(btr) ((2 * btr->t) - 1) +static bool bt_insertnonfull(bt *btr, bt_n *x, bt_data_t k, bt_n *p, int pi, + int dr) { + if (x->leaf) { /* we are a leaf, just add it in */ + int i = findkindex(btr, x, k, NULL, NULL); + if (i != x->n - 1) { + mvXKeys(btr, &x, i + 2, &x, i + 1, (x->n - i - 1), btr->s.ksize, + p, pi, p, pi); + } + x = overwriteDR(btr, x, i + 1, dr, p, pi); + setBTKeyRaw(btr, x, i + 1, k); x->n++; incr_scion(x, 1); + } else { /* not leaf */ + int i = findkindex(btr, x, k, NULL, NULL) + 1; + if (NODES(btr, x)[i]->n == GETN(btr)) { // if next node is full + if (!btreesplitchild(btr, x, i, NODES(btr, x)[i], x, i)) return 0; + if (btr->cmp(k, KEYS(btr, x, i)) > 0) i++; + } + bt_insertnonfull(btr, NODES(btr, x)[i], k, x, i, dr); incr_scion(x, 1); + } + return 1; +} +bool bt_insert(bt *btr, bt_data_t k, uint32 dr) { + bt_n *r = btr->root; + bt_n *p = r; + int pi = 0; + if (r->n == GETN(btr)) { /* NOTE: tree increase height */ + bt_n *s = 
allocbtreenode(btr, 0, r->dirty); if (!s) return 0; + btr->root = s; + s->leaf = 0; + s->n = 0; + incr_scion(s, r->scion); + NODES(btr, s)[0] = r; + if (!btreesplitchild(btr, s, 0, r, p, pi)) return 0; + p = r = s; + btr->numnodes++; + } + if (!bt_insertnonfull(btr, r, k, p, pi, dr)) return 0; + btr->numkeys++; + return 1; +} + +// DELETE DELETE DELETE DELETE DELETE DELETE DELETE DELETE DELETE DELETE +static bt_n *replaceKeyWithGhost(bt *btr, bt_n *x, int i, bt_data_t k, + uint32 dr, bt_n *p, int pi) { + //printf("replaceKeyWithGhost\n"); + ai_obj akey; convertStream2Key(k, &akey, btr); + crs_t crs; uint32 ssize; DECLARE_BT_KEY(&akey, x) + char *stream = createStream(btr, NULL, btkey, ksize, &ssize, &crs);//DEST027 + x = overwriteDR(btr, x, i, dr, p, pi); + setBTKeyRaw(btr, x, i, stream); + return x; +} + +#define ADD_BP(plist, p, pi) /* used to trace path to deleted key */ \ + if (plist) { \ + bp_t *bp = (bp_t *) cf_malloc(sizeof(bp_t)); /* FREE ME 109 */ \ + bp->x = p; bp->i = pi; \ + ll_ai_bp_element * node = cf_malloc(sizeof(ll_ai_bp_element)); \ + node->value = bp; \ + cf_ll_append(plist, (cf_ll_element *)node); \ + } + +#define CREATE_RETURN_DELETED_KEY(btr, kp, dr) \ + dwd_t dwd; bzero(&dwd, sizeof(dwd_t)); dwd.dr = dr; \ + if (BIG_BT(btr)) { memcpy(delbuf, kp, btr->s.ksize); } \ + dwd.k = BIG_BT(btr) ? delbuf : kp; + +/* NOTE: ksize > 8 bytes needs buffer for CASE 1 */ +#define MAX_KEY_SIZE (AS_DIGEST_KEY_SZ *2) + +#define DK_NONE 0 +#define DK_2A 1 +#define DK_2B 2 + +/* remove an existing key from the tree. KEY MUST EXIST + the s parameter: + 1.) for normal operation pass it as DK_NONE, + 2.) delete the max node, pass it as DK_2A, + 3.) delete the min node, pass it as DK_2B. + */ +typedef struct btds_t { + ulong leaf_del_hits; ulong leaf_del_noop; + ulong ndel; ulong del_calls; + ulong case1_del; + ulong case2A_del; ulong case2B_del; ulong case2C_del; + ulong case3_del; + ulong case3A1_del; ulong case3A2_del; ulong case3B1_del; ulong case3B2_del; +} btds_t; + +btds_t *btds = NULL; + +static dwd_t deletekey(bt *btr, bt_n *x, bt_data_t k, int s, bool drt, + bt_n *p, int pi, cf_ll *plist, void **c2Cp, + bool leafd, char delbuf[]) { btds->del_calls++; + bt_n *xp, *y, *z; bt_data_t kp; + int yn, zn, i = 0, r = -1, ks = btr->s.ksize; + if (s != DK_NONE) { /* min or max node deletion */ + if (x->leaf) r = 0; + else { + if (s == DK_2A) r = 1; // max node + else if (s == DK_2B) r = -1; // min node + } + if (s == DK_2A) i = x->n - 1; // max node/leaf + else if (s == DK_2B) i = -1; // min node/leaf + } else i = findkindex(btr, x, k, &r, NULL); //DEBUG_DEL_POST_S + + if (!drt) decr_scion(x, 1); // scion reduced by 1 every DELETE + + /* Case 1: + * If the key k is in node x and x is a leaf, delete the key k from x. 
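 *
 * (Editor's illustration, not in the original comment: with t = 2 and a
 * leaf holding [10,20,30], deleting 20 just shifts 30 left via mvXKeys()
 * and decrements x->n. No rebalancing is needed here -- the descent
 * through cases 2/3 below only enters a child that has at least t keys,
 * so removal at the leaf cannot underflow it.)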
*/ + if (x->leaf) { btds->case1_del++; + bool rgst = 0; + if (s == DK_2B) i++; //DEBUG_DEL_CASE_1 + kp = KEYS (btr, x, i); + int dr = getDR(btr, x, i); + CREATE_RETURN_DELETED_KEY(btr, kp, dr) + if (drt) { // CASE: EVICT + if (s == DK_NONE) { //NOTE: only place DR grows + x = incrPrevDR(btr, x, i, (dr + 1), p, pi, plist); + } else decr_scion(x, 1 + dr); //NOTE: key FOR Case2A/B + } else if (s == DK_NONE) { // CASE: DELETE NOT CASE2A/B + if (dr) { + if (NBT(btr)) { x = incrPrevDR(btr, x, i, dr, p, pi, plist); } + else { rgst = 1; // DELETE DataBT KEY w/ DR -> REPLACE w/ GHOST + x = replaceKeyWithGhost(btr, x, i, kp, dr, p, pi); + } + } + } else if (dr) decr_scion(x, dr); // CASE: DELETE CASE2A/B + if (!rgst) { // IF NO REPLACE_W_GHOST -> Remove from BTREE + mvXKeys(btr, &x, i, &x, i + 1, (x->n - i - 1), ks, p, pi, p, pi); + x = trimBTN(btr, x, drt, p, pi); + } + return dwd; + } + dwd_t dwde; bzero(&dwde, sizeof(dwd_t)); + if (r == 0) { /* (r==0) means key found, but in node */ //DEBUG_DEL_CASE_2 + kp = KEYS(btr, x, i); + if (!drt) { // ON DELETE + int dr = getDR(btr, x, i); + if (dr) { // IF DR -> REPLACE_W_GHOST, no recursive delete + x = replaceKeyWithGhost(btr, x, i, kp, dr, p, pi); + CREATE_RETURN_DELETED_KEY(btr, kp, dr) + return dwd; + } + } + /* Case 2: + * if the key k is in the node x, and x is an internal node */ + if ((yn = NODES(btr, x)[i]->n) >= btr->t) { //DEBUG_DEL_CASE_2a + btds->case2A_del++; + if (leafd) return dwde; + /* Case 2a: + * if the node y that precedes k in node x has at least t keys, + * then find the previous sequential key (kp) of k. + * Recursively delete kp, and replace k with kp in x. */ + xp = NODES(btr, x)[i]; + ADD_BP(plist, x, i) + //printf("CASE2A recurse: key: "); printKey(btr, x, i); + dwd_t dwd = deletekey(btr, xp, NULL, DK_2A, drt, + x, i, plist, c2Cp, leafd, delbuf); + //DEBUG_SET_BTKEY_2A + if (drt) x = incrDR(btr, x, i, ++dwd.dr, p, pi); + else x = setDR (btr, x, i, dwd.dr, p, pi); + setBTKeyRaw(btr, x, i, dwd.k); + dwd.k = kp; // swap back in KPs original value + return dwd; + } + if ((zn = NODES(btr, x)[i + 1]->n) >= btr->t) { //DEBUG_DEL_CASE_2b + btds->case2B_del++; + if (leafd) return dwde; + /* Case 2b: + * if the node z that follows k in node x has at least t keys, + * then find the next sequential key (kp) of k. Recursively delete + * kp, and replace k with kp in x. */ + xp = NODES(btr, x)[i + 1]; + ADD_BP(plist, x, i + 1) + //printf("CASE2B recurse: key: "); printKey(btr, x, i); + dwd_t dwd = deletekey(btr, xp, NULL, DK_2B, drt, + x, i + 1, plist, c2Cp, leafd, delbuf); + //DEBUG_SET_BTKEY_2B + if (drt) { // prev key inherits DR+1 + x = incrCase2B (btr, x, i, (getDR(btr, x, i) + 1)); + } + x = overwriteDR(btr, x, i, dwd.dr, p, pi); + setBTKeyRaw(btr, x, i, dwd.k); + dwd.k = kp; // swap back in KPs original value + return dwd; + } + if (yn == btr->t - 1 && zn == btr->t - 1) { //DEBUG_DEL_CASE_2c + btds->case2C_del++; + if (leafd) return dwde; + /* Case 2c: + * if both y and z have only t - 1 keys, merge k + * then all of z into y, so that x loses both k and + * the pointer to z, and y now contains 2t - 1 keys. 
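 *
 * (Editor's illustration, not in the original comment: with t = 2,
 * separator k in x, y = [a] and z = [b], the merge yields y = [a,k,b];
 * x loses k and its pointer to z, z is freed, and deletekey() then
 * recurses into the merged node y.)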
*/ + if (!*c2Cp) *c2Cp = KEYS(btr, x, i); //used in remove_key() + y = NODES(btr, x)[i]; + z = NODES(btr, x)[i + 1]; + dwd_t dwd; dwd.k = k; dwd.dr = getDR(btr, x, i); + incr_scion(y, 1 + dwd.dr); //DEBUG_SET_BTKEY_2C + y = setDR (btr, y, y->n, dwd.dr, x, i); + setBTKeyRaw(btr, y, y->n, dwd.k); y->n++; + incr_scion(y, get_scion_range(btr, z, 0, z->n)); + mvXKeys(btr, &y, y->n, &z, 0, z->n, ks, x, i, x, i + 1); + if (!y->leaf) { + move_scion(btr, y, z, z->n + 1); + mvXNodes (btr, y, y->n, z, 0, (z->n + 1)); + } + y->n += z->n; + mvXKeys (btr, &x, i, &x, i + 1, (x->n - i - 1), ks, p, pi, p, pi); + mvXNodes(btr, x, i + 1, x, i + 2, (x->n - i - 1)); + x = trimBTN(btr, x, drt, p, pi); + bt_free_btreenode(btr, z); + ADD_BP(plist, x, i) + //printf("CASE2C key: "); printKey(btr, x, i); + return deletekey(btr, y, k, s, drt, x, i, plist, c2Cp, leafd, delbuf); + } + } + /* Case 3: + * if k is not present in internal node x, determine the root xp of + * the appropriate subtree that must contain k, if k is in the tree + * at all. If xp has only t - 1 keys, execute step 3a or 3b as + * necessary to guarantee that we descend to a node containing at + * least t keys. Finish by recursing on the appropriate node of x. */ + i++; + if ((xp = NODES(btr, x)[i])->n == btr->t - 1) { /* case 3a-c are !x->leaf */ + /* Case 3a: + * If xp has only (t-1) keys but has a sibling(y) with at least t keys, + give xp an extra key by moving a key from x down into xp, + moving a key from xp's immediate left or right sibling(y) up into x, + & moving the appropriate node from the sibling(y) into xp. */ + if (i > 0 && (y = NODES(btr, x)[i - 1])->n >= btr->t) { + btds->case3A1_del++; + //printf("CASE3A1 key: "); printKey(btr, x, i); + if (leafd) return dwde; + /* left sibling has t keys */ //DEBUG_DEL_CASE_3a1 + mvXKeys(btr, &xp, 1, &xp, 0, xp->n, ks, x, i, x, i); + if (!xp->leaf) mvXNodes(btr, xp, 1, xp, 0, (xp->n + 1)); + incr_scion(xp, 1 + getDR(btr, x, i - 1)); + xp = setBTKey(btr, xp, 0, x, i - 1, drt, x, i, p, pi); xp->n++; + decr_scion(y, 1 + getDR(btr, y, y->n - 1)); + x = setBTKey(btr, x, i - 1, y, y->n - 1, drt, p, pi, x, i - 1); + if (!xp->leaf) { + int dscion = NODES(btr, y)[y->n]->scion; + incr_scion(xp, dscion); decr_scion(y, dscion); + NODES(btr, xp)[0] = NODES(btr, y)[y->n]; + } + y = trimBTN(btr, y, drt, x, i - 1); + } else if (i < x->n && (y = NODES(btr, x)[i + 1])->n >= btr->t) { + btds->case3A2_del++; + //printf("CASE3A2 key: "); printKey(btr, x, i); + if (leafd) return dwde; + /* right sibling has t keys */ //DEBUG_DEL_CASE_3a2 + incr_scion(xp, 1 + getDR(btr, x, i)); + xp = setBTKey(btr, xp, xp->n++, x, i, drt, x, i, p, pi); + decr_scion(y, 1 + getDR(btr, y, 0)); + x = setBTKey(btr, x, i, y, 0, drt, p, pi, x, i + 1); + if (!xp->leaf) { + int dscion = NODES(btr, y)[0]->scion; + incr_scion(xp, dscion); decr_scion(y, dscion); + NODES(btr, xp)[xp->n] = NODES(btr, y)[0]; + } + mvXKeys(btr, &y, 0, &y, 1, y->n - 1, ks, x, i + 1, x, i + 1); + if (!y->leaf) mvXNodes(btr, y, 0, y, 1, y->n); + y = trimBTN(btr, y, drt, x, i + 1); + } + /* Case 3b: + * If xp and all of xp's siblings have t - 1 keys, merge xp with + one sibling, which involves moving a key from x down into the + new merged node to become the median key for that node. 
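 *
 * (Editor's illustration, not in the original comment: with t = 2,
 * xp = [c], left sibling y = [a] and separator b in x, case 3b merges
 * them into y = [a,b,c]; x loses b, xp is freed, and the descent
 * continues into the merged node. The scion subtree counts are moved
 * along with the keys so offset-based lookups stay correct.)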
*/ + else if (i > 0 && (y = NODES(btr, x)[i - 1])->n == btr->t - 1) { + btds->case3B1_del++; + //printf("CASE3B1 key: "); printKey(btr, x, i); + if (leafd) return dwde; + /* merge i with left sibling */ //DEBUG_DEL_CASE_3b1 + incr_scion(y, 1 + getDR(btr, x, i - 1)); + y = setBTKey(btr, y, y->n++, x, i - 1, drt, x, i - 1, p, pi); + incr_scion(y, get_scion_range(btr, xp, 0, xp->n)); + mvXKeys(btr, &y, y->n, &xp, 0, xp->n, ks, x, i - 1, x, i); + if (!xp->leaf) { + move_scion(btr, y, xp, xp->n + 1); + mvXNodes (btr, y, y->n, xp, 0, (xp->n + 1)); + } + y->n += xp->n; + mvXKeys (btr, &x, i - 1, &x, i, (x->n - i), ks, p, pi, p, pi); + mvXNodes(btr, x, i, x, i + 1, (x->n - i)); + x = trimBTN(btr, x, drt, p, pi); + bt_free_btreenode(btr, xp); + xp = y; i--; // i-- for parent-arg in recursion (below) + } else if (i < x->n && (y = NODES(btr, x)[i + 1])->n == btr->t - 1) { + btds->case3B2_del++; + //printf("CASE3B2 key: "); printKey(btr, x, i); + if (leafd) return dwde; + /* merge i with right sibling */ //DEBUG_DEL_CASE_3b2 + incr_scion(xp, 1 + getDR(btr, x, i)); + xp = setBTKey(btr, xp, xp->n++, x, i, drt, x, i, p, pi); + incr_scion(xp, get_scion_range(btr, y, 0, y->n)); + mvXKeys(btr, &xp, xp->n, &y, 0, y->n, ks, x, i, x, i + 1); + if (!xp->leaf) { + move_scion(btr, xp, y, y->n + 1); + mvXNodes (btr, xp, xp->n, y, 0, (y->n + 1)); + } + xp->n += y->n; + mvXKeys (btr, &x, i, &x, i + 1, (x->n - i - 1), ks, p, pi, p, pi); + mvXNodes(btr, x, i + 1, x, i + 2, (x->n - i - 1)); + x = trimBTN(btr, x, drt, p, pi); + bt_free_btreenode(btr, y); + } + } //printf("RECURSE CASE 3\n"); + btds->case3_del++; + ADD_BP(plist, x, i) //DEBUG_DEL_POST_CASE_3 + dwd_t dwd = deletekey(btr, xp, k, s, drt, x, i, plist, c2Cp, leafd, delbuf); + // CASE2A/B pull keys up from depths, scion must be decremented + if (s != DK_NONE) { + if (drt) decr_scion(x, 1 + dwd.dr); + else decr_scion(x, dwd.dr); // DELETE already decr_scion()ed 1 + } + return dwd; +} + +#ifdef DEBUG_DEL_CASE_STATS +static void print_del_case_stats(bool leafd, dwd_t dwd, bt *btr) { + if (leafd) { + if (!dwd.k) btds->leaf_del_noop++; + else btds->leaf_del_hits++; + printf("deletes: %lu noop: %lu ratio: %f numkeys: %d\n", + btds->leaf_del_hits, btds->leaf_del_noop, + (btds->leaf_del_noop && btds->leaf_del_hits) ? 
+ (double)((double)btds->leaf_del_hits / + (double)btds->leaf_del_noop) : 0, + btr->numkeys); + } else + printf("ndel: %lu ncalls: %lu C1: %lu(%.2f) C2A: %lu(%.2f) " + "C2B: %lu(%.2f) C2C: %lu(%.2f) C3: %lu(%.2f) " + "C3A1: %lu(%.2f) C3A2: %lu(%.2f) C3B1: %lu(%.2f) " + "C3B2: %lu(%.2f)\n", + btds->ndel, btds->del_calls, + btds->case1_del, + (double)((double)btds->case1_del / (double)btds->del_calls), + btds->case2A_del, + (double)((double)btds->case2A_del / (double)btds->del_calls), + btds->case2B_del, + (double)((double)btds->case2B_del / (double)btds->del_calls), + btds->case2C_del, + (double)((double)btds->case2C_del / (double)btds->del_calls), + btds->case3_del, + (double)((double)btds->case3_del / (double)btds->del_calls), + btds->case3A1_del, + (double)((double)btds->case3A1_del / (double)btds->del_calls), + btds->case3A2_del, + (double)((double)btds->case3A2_del / (double)btds->del_calls), + btds->case3B1_del, + (double)((double)btds->case3B1_del / (double)btds->del_calls), + btds->case3B2_del, + (double)((double)btds->case3B2_del / (double)btds->del_calls)); + fflush(NULL); +} +#endif + +static dwd_t remove_key(bt *btr, bt_data_t k, bool drt, bool leafd) { + if (!btds) { btds = cf_malloc(sizeof(btds_t)); bzero(btds, sizeof(btds_t)); } + btds->ndel++; + if (!btr->root) { dwd_t dwde; bzero(&dwde, sizeof(dwd_t)); return dwde; } + void *c2Cp = NULL; /* NOTE: c2Cp gets lost in recursion */ //DEBUG_DEL_START + bt_n *p = btr->root; int pi = 0; + cf_ll plist_tmp; + cf_ll * plist = &plist_tmp; // NOTE: plist stores ancestor line during recursive delete + if (drt) { + cf_ll_init(plist, ll_ai_bp_destroy_fn, false); + ADD_BP(plist, p, pi);//FR110 + } else plist = NULL; + char delbuf[MAX_KEY_SIZE]; // NOTE: ksize > 8B needs buffer for CASE 1 + dwd_t dwd = deletekey(btr, btr->root, k, DK_NONE, drt, + p, pi, plist, &c2Cp, leafd, delbuf); +#ifdef DEBUG_DEL_CASE_STATS + print_del_case_stats(leafd, dwd, btr); +#endif + if (!dwd.k) return dwd; // leafd NO-OP + btr->numkeys--; //DEBUG_DEL_END + /* remove empty non-leaf node from root, */ + if (!btr->root->n && !btr->root->leaf) { /* NOTE: tree decrease height */ + btr->numnodes--; + bt_n *x = btr->root; + btr->root = NODES(btr, x)[0]; + bt_free_btreenode(btr, x); + } + if (c2Cp) dwd.k = c2Cp; + if (plist) { + cf_ll_reduce(plist, true, ll_ai_bp_reduce_fn, NULL); + plist = NULL; + }; // FREED 110 + return dwd; +} +dwd_t bt_delete(bt *btr, bt_data_t k, bool leafd) { + return remove_key(btr, k, 0, leafd); +} + +// ACCESSORS ACCESSORS ACCESSORS ACCESSORS ACCESSORS ACCESSORS ACCESSORS +static inline bool key_covers_miss(bt *btr, bt_n *x, int i, ai_obj *akey) { + if (!(C_IS_NUM(btr->s.ktype))) return 0; + if (i < 0) i = 0; + ulong mkey = getNumKey(btr, x, i); + ulong dr = (ulong)getDR(btr, x, i); + if (mkey && dr) { + ulong qkey = akey->l; + ulong span = mkey + dr; + //DEBUG_CURRKEY_MISS + if (qkey >= mkey && qkey <= span) return 1; + } + return 0; +} +#define SET_DWM_XIP { dwm.x = x; dwm.i = i; dwm.p = p; dwm.pi = pi; } +dwm_t findnodekey(bt *btr, bt_n *x, bt_data_t k, ai_obj *akey) { + int r = -1, i = 0; + bt_n *p = btr->root; int pi = 0; + dwm_t dwm; bzero(&dwm, sizeof(dwm_t)); SET_DWM_XIP + while (x) { + i = findkindex(btr, x, k, &r, NULL); //DEBUG_FIND_NODE_KEY + if (i >= 0 && !r) { SET_DWM_XIP dwm.k = KEYS(btr, x, i); return dwm; } + if (key_covers_miss(btr, x, i, akey)) { SET_DWM_XIP dwm.miss = 1; } + if (x->leaf) { dwm.k = NULL; return dwm; } + p = x; pi = i + 1; x = NODES(btr, x)[i + 1]; + } + return dwm; +} +bt_data_t bt_find(bt *btr, bt_data_t k, 
ai_obj *akey) { //Indexes still use this + dwm_t dwm = findnodekey(btr, btr->root, k, akey); + return dwm.k; +} + +static bool check_min_miss(bt *btr, ai_obj *alow) { + if (!btr->dirty_left) return 0; + ai_obj amin; convertStream2Key(bt_min(btr), &amin, btr); + return ai_objEQ(alow, &amin); +} +int bt_init_iterator(bt *btr, bt_data_t k, btIterator *iter, ai_obj *alow) { + if (!btr->root) return II_FAIL; + int r = -1; + bool lmiss = check_min_miss(btr, alow); + bool miss = 0; + uchar only_right = 1; + bt_n *x = btr->root; + while (x) { + int i = findkindex(btr, x, k, &r, iter); + if (i >= 0 && r == 0) return lmiss ? II_L_MISS : II_OK; + if (key_covers_miss(btr, x, i, alow)) miss = 1; //DEBUG_BT_II + if (miss) return II_MISS; + if (r < 0 || i != (x->n - 1)) only_right = 0; + if (x->leaf) { + if (i != (x->n - 1)) only_right = 0; + return only_right ? II_ONLY_RIGHT : II_LEAF_EXIT; + } + iter->bln->child = get_new_iter_child(iter); + x = NODES(btr, x)[i + 1]; + to_child(iter, x); + } + return II_FAIL; +} + +bool bt_exist(bt *btr, bt_data_t k, ai_obj *akey) { + int r = -1; + bt_n *x = btr->root; + while (x) { + int i = findkindex(btr, x, k, &r, NULL); + if (i >= 0 && r == 0) return 1; + if (key_covers_miss(btr, x, i, akey)) return 1; + if (x->leaf) return 0; + x = NODES(btr, x)[i + 1]; + } + return 0; +} + +static bt_data_t findminkey(bt *btr, bt_n *x) { + if (x->leaf) return KEYS(btr, x, 0); + else return findminkey(btr, NODES(btr, x)[0]); +} +bt_n *findminnode(bt *btr, bt_n *x) { + if (x->leaf) return x; + else return findminnode(btr, NODES(btr, x)[0]); +} +static bt_data_t findmaxkey(bt *btr, bt_n *x) { + if (x->leaf) return KEYS(btr, x, x->n - 1); + else return findmaxkey(btr, NODES(btr, x)[x->n]); +} +bt_data_t bt_min(bt *btr) { + if (!btr->root || !btr->numkeys) return NULL; + else return findminkey(btr, btr->root); +} +bt_data_t bt_max(bt *btr) { + if (!btr->root || !btr->numkeys) return NULL; + else return findmaxkey(btr, btr->root); +} + +// DESTRUCTOR DESTRUCTOR DESTRUCTOR DESTRUCTOR DESTRUCTOR DESTRUCTOR +static void destroy_bt_node(bt *btr, bt_n *x) { + if (!x->leaf) { + for (int i = 0; i <= x->n; i++) { + destroy_bt_node(btr, NODES(btr, x)[i]); + } + } + bt_free_btreenode(btr, x); /* memory management in btr */ +} +void bt_destroy(bt *btr) { + if (btr->root) { + if (btr->numkeys) destroy_bt_node (btr, btr->root); + else bt_free_btreenode(btr, btr->root); + btr->root = NULL; + } + bt_free_btree(btr); +} diff --git a/ai/src/bt_iterator.c b/ai/src/bt_iterator.c new file mode 100644 index 00000000..55971bf5 --- /dev/null +++ b/ai/src/bt_iterator.c @@ -0,0 +1,528 @@ +/* + * bt_iterator.c + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * This file implements Aerospike Index B-tree iterators. 
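 *
 * Typical range-scan usage (editor's sketch, not part of the original
 * change; "nbtr", "alow" and "ahigh" are illustrative names for an
 * existing data B-tree and caller-built ai_obj bounds):
 *
 *   btSIter *si = btGetRangeIter(nbtr, &alow, &ahigh, true); // ascending
 *   if (si) {
 *       btEntry *be;
 *       while ((be = btRangeNext(si, true))) {
 *           // be->key / be->val hold the decoded key and parsed value
 *       }
 *       btReleaseRangeIterator(si);
 *   }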
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include // For MAX() & MIN(). + +#include "ai_obj.h" +#include "bt_iterator.h" +#include "stream.h" +#include + +// HELPER_DEFINES HELPER_DEFINES HELPER_DEFINES HELPER_DEFINES HELPER_DEFINES +#define GET_NEW_CHILD(iter) \ + if (!iter->bln->child) { iter->bln->child = get_new_iter_child(iter); } + +#define SETITER8R(iter, btr, asc, l, lrev, n, nrev) \ + btSIter *siter = setIterator(iter, btr, asc ? l : lrev, asc ? n : nrev); + +#define CR8ITER8R(btr, asc, l, lrev, n, nrev) \ + btSIter *siter = createIterator(btr, asc ? l : lrev, asc ? n : nrev); + +bt_ll_n *get_new_iter_child(btIterator *iter) { //printf("get_newiterchild\n"); + assert(iter->num_nodes < MAX_BTREE_DEPTH); + bt_ll_n *nn = &(iter->nodes[iter->num_nodes]); + bzero(nn, sizeof(bt_ll_n)); + iter->num_nodes++; + return nn; +} + +void to_child(btIterator *iter, bt_n* self) { //printf("to_child\n"); + iter->depth++; + iter->bln->child->parent = iter->bln; + iter->bln->child->ik = 0; + iter->bln->child->in = 0; + iter->bln->child->self = self; + iter->bln = iter->bln->child; +} +static void toparentrecurse(btIterator *iter) { //printf("to_parent\n"); + if (!iter->bln->parent) { + iter->finished = 1; /* finished */ + return; + } + iter->depth--; + bt *btr = iter->btr; + void *child = KEYS(btr, iter->bln->self, iter->bln->ik); + iter->bln = iter->bln->parent; /* -> parent */ + void *parent = KEYS(btr, iter->bln->self, iter->bln->ik); + int x = btr->cmp(child, parent); + if (x > 0) { + if ((iter->bln->ik + 1) < iter->bln->self->n) iter->bln->ik++; + if ((iter->bln->in + 1) < iter->bln->self->n) iter->bln->in++; + else toparentrecurse(iter); + } +} +static void iter_leaf(btIterator *iter) { //printf("iter_leaf\n"); + if ((iter->bln->ik + 1) < iter->bln->self->n) iter->bln->ik++; + else toparentrecurse(iter); +} +static void tochildrecurse(btIterator *iter, bt_n* self) { + to_child(iter, self); + if (!iter->bln->self->leaf) { // depth-first + GET_NEW_CHILD(iter) + tochildrecurse(iter, NODES(iter->btr, iter->bln->self)[iter->bln->in]); + } +} +static void iter_node(btIterator *iter) { + if ((iter->bln->ik + 1) < iter->bln->self->n) iter->bln->ik++; + if ((iter->bln->in + 1) <= iter->bln->self->n) iter->bln->in++; + GET_NEW_CHILD(iter) + tochildrecurse(iter, NODES(iter->btr, iter->bln->self)[iter->bln->in]); +} + +static void *btNext(btSIter *siter, bt_n **rx, int *ri, bool asc) { + btIterator *iter = &(siter->x); + if (iter->finished) { + if (siter->scan) siter->missed = siter->nim; + return NULL; + } + if (asc) siter->missed = siter->nim; //Curr MISSED = LastLoop's NextIsMissed + bt_n *x = iter->bln->self; + if (rx) *rx = x; + int i = iter->bln->ik; + if (ri) *ri = i; + void *curr = KEYS(iter->btr, x, i); + siter->nim = getDR(iter->btr, x, i) ? 
1 : 0; + if (iter->bln->self->leaf) (*iter->iLeaf)(iter); + else (*iter->iNode)(iter); + return curr; +} + +void to_child_rev(btIterator *iter, bt_n* self) { + iter->depth++; + iter->bln->child->parent = iter->bln; + iter->bln->child->ik = self->n - 1; + iter->bln->child->in = self->n; + iter->bln->child->self = self; + iter->bln = iter->bln->child; +} +static void tochildrecurserev(btIterator *iter, bt_n* self) { + to_child_rev(iter, self); + if (!iter->bln->self->leaf) { // depth-first + GET_NEW_CHILD(iter) + tochildrecurserev(iter, + NODES(iter->btr, iter->bln->self)[iter->bln->in]); + } +} +static void toparentrecurserev(btIterator *iter) { + if (!iter->bln->parent) { + iter->finished = 1; /* finished */ + return; + } + iter->depth--; + bt *btr = iter->btr; + void *child = KEYS(btr, iter->bln->self, iter->bln->ik); + iter->bln = iter->bln->parent; /* -> parent */ + void *parent = KEYS(btr, iter->bln->self, iter->bln->ik); + int x = btr->cmp(child, parent); + if (x < 0) { + if (iter->bln->ik) iter->bln->ik--; + if (iter->bln->in) iter->bln->in--; + else toparentrecurserev(iter); + } + if (iter->bln->in == iter->bln->self->n) iter->bln->in--; +} +static void iter_leaf_rev(btIterator *iter) { //printf("iter_leaf_rev\n"); + if (iter->bln->ik) iter->bln->ik--; + else toparentrecurserev(iter); +} +static void iter_node_rev(btIterator *iter) { + GET_NEW_CHILD(iter) + tochildrecurserev(iter, NODES(iter->btr, iter->bln->self)[iter->bln->in]); +} + +// INIT_ITERATOR INIT_ITERATOR INIT_ITERATOR INIT_ITERATOR INIT_ITERATOR +static void *setIter(bt *btr, bt_data_t bkey, btSIter *siter, ai_obj *alow, + bt_n **rx, int *ri, bool asc) { + btIterator *iter = &(siter->x); + int ret = bt_init_iterator(btr, bkey, iter, alow); + //printf("setIter: ret: %d\n", ret); + if (ret == II_FAIL) return NULL; + siter->empty = 0; + if (ret == II_L_MISS) { + siter->nim = siter->missed = 1; + return NULL; + } + else if (ret == II_MISS) siter->nim = siter->missed = 1; + else if (ret != II_OK) { /* range queries, find nearest match */ + int x = btr->cmp(bkey, KEYS(btr, iter->bln->self, iter->bln->ik)); + if (x > 0) { + if (ret == II_ONLY_RIGHT) { // off end of B-tree + siter->empty = 1; + return NULL; + } else { // II_LEAF_EXIT + //printf("setIter: [II_LEAF_EXIT\n"); //TODO needed? 
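                // Editor's note: II_LEAF_EXIT means the descent ended in a
                // leaf just past the search key with no exact match, so one
                // btNext() call (below) advances to the nearest following
                // key -- the right starting point for a range query.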
+ return btNext(siter, rx, ri, asc); // find next + } + } + } + if (rx) *rx = iter->bln->self; + if (ri) *ri = iter->bln->ik; + return KEYS(iter->btr, iter->bln->self, iter->bln->ik); +} +static void init_iter(btIterator *iter, bt *btr, + iter_single *itl, iter_single *itn) { + iter->btr = btr; + iter->high = LONG_MIN; + iter->iLeaf = itl; + iter->iNode = itn; + iter->finished = 0; + iter->num_nodes = 0; + iter->bln = &(iter->nodes[0]); + iter->bln->ik = iter->bln->in = 0; + iter->num_nodes++; + iter->bln->self = btr->root; + iter->bln->parent = iter->bln->child = NULL; + iter->depth = 0; +} + +// AEROSPIKE MULTI_THREAD +static btSIter *newIter() { + btSIter *siter = cf_malloc(sizeof(btSIter)); + bzero(siter, sizeof(btSIter)); + return siter; +} + +static btSIter *getIterator() { + return newIter(); +} + +static void releaseIterator(btSIter *siter) { + if (siter) { + cf_free(siter); + } + return; +} + +static btSIter *createIterator(bt *btr, iter_single *itl, iter_single *itn) { + btSIter *siter = getIterator(); + siter->dofree = 1; + siter->missed = 0; + siter->nim = 0; + siter->empty = 1; + siter->scan = 0; + siter->ktype = btr->s.ktype; + init_ai_obj(&siter->key); + siter->be.key = &(siter->key); + siter->be.val = NULL; + init_iter(&siter->x, btr, itl, itn); + return siter; +} +//extra insertion + +static btSIter *setIterator(btSIter *iter, bt *btr, iter_single *itl, iter_single *itn) { + btSIter *siter = iter; + siter->dofree = 0; + siter->missed = 0; + siter->nim = 0; + siter->empty = 1; + siter->scan = 0; + siter->ktype = btr->s.ktype; + init_ai_obj(&siter->key); + siter->be.key = &(siter->key); + siter->be.val = NULL; + init_iter(&siter->x, btr, itl, itn); + return siter; +} +void btReleaseRangeIterator(btSIter *siter) { + if (!siter) return; + if (siter->dofree) { + releaseIterator(siter); + } +} +static void setHigh(btSIter *siter, ai_obj *high, col_type_t ktype) { + if (C_IS_L(ktype) || C_IS_G(ktype)) { + siter->x.high = high->l; + } + else if (C_IS_DG(ktype)) { + siter->x.highy = high->y; + } +} + +static bool streamToBTEntry(uchar *stream, btSIter *siter, bt_n *x, int i) { + if (!stream) return 0; + if (i < 0) i = 0; + convertStream2Key(stream, siter->be.key, siter->x.btr); + siter->be.val = parseStream(stream, siter->x.btr); + bool gost = IS_GHOST(siter->x.btr, siter->be.val); + if (gost) { + siter->missed = 1; // GHOST key + siter->nim = 0; + } + siter->be.dr = x ? getDR(siter->x.btr, x, i) : 0; + siter->be.stream = stream; + siter->be.x = x; + siter->be.i = i; //NOTE: used by bt_validate_dirty + //DUMP_STREAM_TO_BT_ENTRY + return 1; +} +btSIter *btGetRangeIter(bt *btr, ai_obj *alow, ai_obj *ahigh, bool asc) { + if (!btr->root || !btr->numkeys) return NULL; + btk_t btk; + bool med; + uint32 ksize; //bt_dumptree(btr, btr->ktype); + CR8ITER8R(btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + setHigh(siter, asc ? ahigh : alow, btr->s.ktype); + char *bkey = createBTKey(asc ? alow : ahigh, + &med, &ksize, btr, &btk); //D032 + if (!bkey) goto rangeiter_err; + bt_n *x = NULL; + int i = -1; + uchar *stream = setIter(btr, bkey, siter, asc ? 
alow : ahigh, &x, &i, asc); + destroyBTKey(bkey, med); /* DESTROYED 032 */ + if (!streamToBTEntry(stream, siter, x, i)) goto rangeiter_err; + return siter; + +rangeiter_err: + btReleaseRangeIterator(siter); + return NULL; +} + + +btSIter *btSetRangeIter(btSIter * iter, bt *btr, ai_obj *alow, ai_obj *ahigh, bool asc) { + if (!btr->root || !btr->numkeys) return NULL; + btk_t btk; + bool med; + uint32 ksize; //bt_dumptree(btr, btr->ktype); + SETITER8R(iter, btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + setHigh(siter, asc ? ahigh : alow, btr->s.ktype); + char *bkey = createBTKey(asc ? alow : ahigh, + &med, &ksize, btr, &btk); //D032 + if (!bkey) goto rangeiter_err; + bt_n *x = NULL; + int i = -1; + uchar *stream = setIter(btr, bkey, siter, asc ? alow : ahigh, &x, &i, asc); + destroyBTKey(bkey, med); /* DESTROYED 032 */ + if (!streamToBTEntry(stream, siter, x, i)) goto rangeiter_err; + return siter; + +rangeiter_err: + btReleaseRangeIterator(siter); + return NULL; +} +btEntry *btRangeNext(btSIter *siter, bool asc) { //printf("btRangeNext\n"); + //printf("btRangeNext: siter: %p\n", (void *)siter); + //if (siter) printf("btRangeNext: empty: %d\n", siter->empty); + if (!siter || siter->empty) return NULL; + bt_n *x = NULL; + int i = -1; + uchar *stream = btNext(siter, &x, &i, asc); + if (!streamToBTEntry(stream, siter, x, i)) return NULL; + if (C_IS_L(siter->ktype) || C_IS_G(siter->ktype)) { + long l = siter->key.l; + if (l == siter->x.high) siter->x.finished = 1; /* exact match */ + if (!asc) { + //printf("btRangeNext: DESC: l: %lu dr: %u\n", + // l, getDR(siter->x.btr, x, i)); + l += getDR(siter->x.btr, x, i); + } + bool over = asc ? (l > siter->x.high) : (l < siter->x.high); + if (over && siter->nim) { + siter->missed = 1; + } + //printf("btRangeNext: over: %d l: %lu high: %lu\n", + // over, l, siter->x.high); + return over ? NULL : &(siter->be); + } else if (C_IS_DG(siter->ktype)) { + uint160 yy = siter->key.y; + int ret = u160Cmp(&yy, &siter->x.highy); + if (!ret) siter->x.finished = 1; /* exact match */ + if (!asc) { //TODO is ENDIANness of memcpy() correct + uint32 low; + char *spot = ((char *)&yy) + 12; + memcpy(&low, spot, 4); + low += getDR(siter->x.btr, x, i); + memcpy(spot, &low, 4); + } + bool over = asc ? (ret > 0) : (ret < 0); + return over ? NULL : &(siter->be); + } else { + return NULL; + } +} + +// FULL_BTREE_ITERATOR FULL_BTREE_ITERATOR FULL_BTREE_ITERATOR +bool assignMinKey(bt *btr, ai_obj *akey) { //TODO combine w/ setIter() + void *e = bt_min(btr); + if (!e) return 0; // iter can be initialised + convertStream2Key(e, akey, btr); + return 1; // w/ this lookup +} +bool assignMaxKey(bt *btr, ai_obj *akey) { + void *e = bt_max(btr); + if (!e) return 0; + convertStream2Key(e, akey, btr); + return 1; +} +btSIter *btGetFullRangeIter(bt *btr, bool asc, cswc_t *w) { + cswc_t W; // used in setHigh() + if (!btr->root || !btr->numkeys) return NULL; + if (!w) w = &W; + ai_obj *aL = &w->wf.alow, *aH = &w->wf.ahigh; + if (!assignMinKey(btr, aL) || !assignMaxKey(btr, aH)) return NULL; + btk_t btk; + bool med; + uint32 ksize; + CR8ITER8R(btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + siter->scan = 1; + setHigh(siter, asc ? aH : aL, btr->s.ktype); + char *bkey = createBTKey(asc ? aL : aH, + &med, &ksize, btr, &btk); //DEST 030 + if (!bkey) goto frangeiter_err; + bt_n *x = NULL; + int i = -1; + uchar *stream = setIter(btr, bkey, siter, asc ? 
aL : aH, &x, &i, asc); + destroyBTKey(bkey, med); /* DESTROYED 030 */ + if (!stream && siter->missed) return siter;//IILMISS + if (!streamToBTEntry(stream, siter, x, i)) goto frangeiter_err; + if (btr->dirty_left) siter->missed = 1; // FULL means 100% FULL + return siter; + +frangeiter_err: + btReleaseRangeIterator(siter); + return NULL; +} + +btSIter *btSetFullRangeIter(btSIter *iter, bt *btr, bool asc, cswc_t *w) { + cswc_t W; // used in setHigh() + if (!btr->root || !btr->numkeys) return NULL; + if (!w) w = &W; + ai_obj *aL = &w->wf.alow, *aH = &w->wf.ahigh; + if (!assignMinKey(btr, aL) || !assignMaxKey(btr, aH)) return NULL; + btk_t btk; + bool med; + uint32 ksize; + SETITER8R(iter, btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + siter->scan = 1; + setHigh(siter, asc ? aH : aL, btr->s.ktype); + char *bkey = createBTKey(asc ? aL : aH, + &med, &ksize, btr, &btk); //DEST 030 + if (!bkey) goto frangeiter_err; + bt_n *x = NULL; + int i = -1; + uchar *stream = setIter(btr, bkey, siter, asc ? aL : aH, &x, &i, asc); + destroyBTKey(bkey, med); /* DESTROYED 030 */ + if (!stream && siter->missed) return siter;//IILMISS + if (!streamToBTEntry(stream, siter, x, i)) goto frangeiter_err; + if (btr->dirty_left) siter->missed = 1; // FULL means 100% FULL + return siter; + +frangeiter_err: + btReleaseRangeIterator(siter); + return NULL; +} + +typedef struct four_longs { + long cnt; + long ofst; + long diff; + long over; +} fol_t; + +#define INIT_ITER_BEENTRY(siter, btr, x, i) \ + { uchar *iistream = KEYS(btr, x, i); streamToBTEntry(iistream, siter, x, i); } +static bool btScionFind(btSIter *siter, bt_n *x, ulong ofst, bt *btr, bool asc, + cswc_t *w, long lim) { + int i = asc ? 0 : x->n; + int fin = asc ? x->n + 1 : -1; + while (i != fin) { + if (x->leaf) break; + uint32_t scion = NODES(btr, x)[i]->scion; + if (scion >= ofst) { + bool i_end_n = (i == siter->x.bln->self->n); + siter->x.bln->in = i; + siter->x.bln->ik = (i_end_n) ? i - 1 : i; + if (scion == ofst) { + if (!asc) { + siter->x.bln->in = siter->x.bln->ik = i - 1; + } + return 1; + } + siter->x.bln->child = get_new_iter_child(&siter->x); + to_child(&siter->x, NODES(btr, x)[i]); + bt_n *kid = NODES(btr, x)[i]; + if (!kid->leaf) { + btScionFind(siter, kid, ofst, btr, asc, w, lim); + return 1; + } else x = kid; + break; + } else ofst -= (scion + 1); // +1 for NODE itself + i = asc ? i + 1 : i - 1; // loop increment + } + // Now Find the rest of the OFFSET (respecting DRs) + uint32 n = siter->x.bln->self->n; + i = asc ? 0 : n - 1; + fin = asc ? MIN(ofst, n) : MAX(-1, (n - ofst)); + int last = asc ? n - 1 : 0; + ulong cnt = 0; + //TODO findminnode() is too inefficient -> needs to be a part of btr + bt_n *minx = findminnode(btr, btr->root); + int btdl = btr->dirty_left; + int dr = 0; + while (i != fin) { + dr = getDR(btr, x, i); + cnt += dr; + if (!i && x == minx) cnt += btdl; + if (cnt >= ofst) break; + cnt++; + i = asc ? 
i + 1 : i - 1; // loop increment + } + if (i == fin && i == last) { + if (cnt >= x->scion) return 0; + } + else if (cnt < ofst) return 0; //OFST 2big + siter->x.bln->ik = i; + INIT_ITER_BEENTRY(siter, btr, x, siter->x.bln->ik); + if (asc) { + if ((ofst + dr) != cnt) siter->missed = 1; + } + else { + if (!i && x == minx) { + if (ofst != (cnt - btdl)) siter->missed = 1; + } + else { + if (ofst != cnt) siter->missed = 1; + } + } + return 1; +} +btSIter *btGetFullXthIter(bt *btr, ulong oofst, bool asc, cswc_t *w, long lim) { + ulong ofst = oofst; + cswc_t W; // used in setHigh() + if (!btr->root || !btr->numkeys) return NULL; + if (!w) w = &W; + ai_obj *aL = &w->wf.alow, *aH = &w->wf.ahigh; + if (!assignMinKey(btr, aL) || !assignMaxKey(btr, aH)) return NULL; + CR8ITER8R(btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + setHigh(siter, asc ? aH : aL, btr->s.ktype); + if (btScionFind(siter, btr->root, ofst, btr, asc, w, lim)) siter->empty = 0; + return siter; +} diff --git a/ai/src/bt_output.c b/ai/src/bt_output.c new file mode 100644 index 00000000..440fbf01 --- /dev/null +++ b/ai/src/bt_output.c @@ -0,0 +1,178 @@ +/* + * Copyright 1997-1998, 2001 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + */ + +#include +#include +#include +#include + +#include "bt_output.h" +#include "bt_iterator.h" +#include "stream.h" + +#define PRINT_EVICTED_KEYS + +#define DEBUG_BT_TYPE(fp, btr) \ + fprintf(fp, "btr: %p NBT: %d NONE: %d " \ + "LL: %d YL: %d " \ + "BIG: %d ksize: %d\n", \ + btr, NBT(btr), NONE_BT(btr), \ + LL(btr), YL(btr), \ + BIG_BT(btr), btr->s.ksize); + +static int treeheight(bt *btr) +{ + bt_n *x = btr->root; + if (!x) { + return 0; + } + + int ret = 0; + while (x && !x->leaf) { + x = NODES(btr, x)[0]; + ret++; + } + + return ++ret; +} + +void bt_dump_info(FILE *fp, bt *btr) +{ + fprintf(fp, "BT: %p t: %d nbits: %d nbyte: %d kbyte: %d " + "ksize: %d koff: %d noff: %d numkeys: %d numnodes: %d " + "height: %d btr: %p btype: %d ktype: %d bflag: %d " + "num: %d root: %p dirty_left: %u msize: %ld dsize: %ld " + "dirty: %u\n", + btr, btr->t, btr->nbits, btr->nbyte, btr->kbyte, btr->s.ksize, + btr->keyofst, btr->nodeofst, btr->numkeys, btr->numnodes, + treeheight(btr), (void *)btr, btr->s.btype, btr->s.ktype, + btr->s.bflag, btr->s.num, btr->root, + btr->dirty_left, btr->msize, btr->dsize, btr->dirty); + DEBUG_BT_TYPE(fp, btr); +} + +static void bt_dump_array(FILE *fp, ai_arr *arr, bool verbose) +{ + fprintf(fp, "Array: capacity: %d used: %d\n", arr->capacity, arr->used); + if (verbose) { + for (int i = 0; i < arr->used; i++) { + const int len = 20; + char digest_str[2 + (len * 2) + 1]; + digest_str[0] = '\0'; + generate_packed_hex_string((uint8_t *) &arr->data[i * CF_DIGEST_KEY_SZ], len, digest_str); + fprintf(fp, "\tData[%d]: %s\n", i, digest_str); + } + } +} + +static void bt_dump_nbtr(FILE *fp, ai_nbtr *nbtr, bool is_index, bool verbose) +{ + if (nbtr->is_btree) { + bt_dumptree(fp, nbtr->u.nbtr, is_index, verbose); + } else { + bt_dump_array(fp, nbtr->u.arr, verbose); + } +} + +static 
void dump_tree_node(FILE *fp, bt *btr, bt_n *x, int depth, bool is_index, int slot, bool verbose) +{ + if (!x->leaf) { + fprintf(fp, "%d: NODE: ", depth); + if (x->dirty > 0) { + GET_BTN_SIZE(x->leaf); + void *ds = GET_DS(x, nsize); + fprintf(fp, "slot: %d n: %d scion: %d -> (%p) ds: %p dirty: %u\n", + slot, x->n, x->scion, (void *)x, ds, x->dirty); + } else { + fprintf(fp, "slot: %d n: %d scion: %d -> (%p)\n", + slot, x->n, x->scion, (void *) x); + } + } else if (verbose) { + if (x->dirty > 0) { + GET_BTN_SIZE(x->leaf) void *ds = GET_DS(x, nsize); + fprintf(fp, "%d: LEAF: slot: %d n: %d scion: %d -> (%p) ds: %p dirty: %u\n", + depth, slot, x->n, x->scion, (void *)x, ds, x->dirty); + } else { + fprintf(fp, "%d: LEAF: slot: %d n: %d scion: %d -> (%p)\n", + depth, slot, x->n, x->scion, (void *)x); + } + if (btr->dirty_left) { + if (findminnode(btr, btr->root) == x) { +#ifdef PRINT_EVICTED_KEYS + for (uint32 i = 1; i <= btr->dirty_left; i++) { + fprintf(fp, "\t\t\t\t\tEVICTED KEY:\t\t\t%u\n", i); + } +#else + fprintf(fp, "\t\tDL: %u\n", btr->dirty_left); +#endif + } + } + } + + for (int i = 0; i < x->n; i++) { + void *be = KEYS(btr, x, i); + ai_obj akey; + convertStream2Key(be, &akey, btr); + void *rrow = parseStream(be, btr); + if (is_index) { + fprintf(fp, "\tINDEX-KEY: "); + dump_ai_obj_as_digest(fp, &akey); + if (!rrow) { fprintf(fp, "\t\tTOTAL EVICTION\n"); } + else { bt_dump_nbtr(fp, (ai_nbtr *) rrow, 0, verbose); } + } else if (verbose) { + bool key_printed = 0; + if (LL(btr)) { + fprintf(fp, "\t\tLL: PTR: %p\t", rrow); + } else { + bool gost = IS_GHOST(btr, rrow); + if (gost) { fprintf(fp, "\t\tROW [%d]: %p \tGHOST-", i, rrow); } + else { fprintf(fp, "\t\tROW [%d]: %p\t", i, rrow); } + } + if (!key_printed) { + fprintf(fp, "KEY: "); + dump_ai_obj_as_digest(fp, &akey); + } + if (x->dirty > 0) { +#ifdef PRINT_EVICTED_KEYS + uint32 dr = getDR(btr, x, i); + if (dr) { fprintf(fp, "\t\t\t\tDR: %d\n", dr); } + else { + ulong beg = akey.l; + for (ulong j = 1; j <= (ulong)dr; j++) { + fprintf(fp, "\t\t\t\t\tEVICTED KEY:\t\t\t%lu\n", beg + j); + } + } +#else + fprintf(fp, "\t\t\t\tDR: %d\n", getDR(btr, x, i)); +#endif + } + } + } + if (!x->leaf && verbose) { + depth++; + for (int i = 0; i <= x->n; i++) { + fprintf(fp, "\t\tNPTR[%d]: %p\n", i, NODES(btr, x)[i]); + dump_tree_node(fp, btr, NODES(btr, x)[i], depth, is_index, i, verbose); + } + } +} + +void bt_dumptree(FILE *fp, bt *btr, bool is_index, bool verbose) +{ + bt_dump_info(fp, btr); + if (btr->root && btr->numkeys > 0) { + dump_tree_node(fp, btr, btr->root, 0, is_index, 0, verbose); + } + fprintf(fp, "\n"); +} diff --git a/ai/src/stream.c b/ai/src/stream.c new file mode 100644 index 00000000..9fe35bef --- /dev/null +++ b/ai/src/stream.c @@ -0,0 +1,166 @@ +/* + * stream.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * This file implements stream parsing for rows. + */ + +#include +#include +#include +#include +#include +#include + +#include "ai_obj.h" +#include "bt.h" +#include "stream.h" + +#include + +/* COMPARE COMPARE COMPARE COMPARE COMPARE COMPARE COMPARE COMPARE */ +int u160Cmp(void *s1, void *s2) { + char *p1 = (char *)s1; + char *p2 = (char *)s2; + uint128 x1, x2; + memcpy(&x1, p1 + 4, 16); + memcpy(&x2, p2 + 4, 16); + if (x1 == x2) { + uint32 u1; + memcpy(&u1, p1, 4); + uint32 u2; + memcpy(&u2, p2, 4); + return u1 == u2 ? 0 : (u1 > u2) ? 1 : -1; + } else return (x1 > x2) ? 1 : -1; +} + +static inline int LCmp(void *s1, void *s2) { + llk *ll1 = (llk *)s1; + llk *ll2 = (llk *)s2; + long l1 = ll1->key; + long l2 = ll2->key; + return l1 == l2 ? 0 : (l1 > l2) ? 1 : -1; +} + +int llCmp(void *s1, void *s2) { + return LCmp(s1, s2); +} + +static inline int YCmp(void *s1, void *s2) { + ylk *yl1 = (ylk *)s1; + ylk *yl2 = (ylk *)s2; + uint160 y1 = yl1->key; + uint160 y2 = yl2->key; + return u160Cmp(&y1, &y2); +} +int ylCmp(void *s1, void *s2) { + return YCmp(s1, s2); +} + +void destroyBTKey(char *btkey, bool med) { + if (med) cf_free(btkey); +} + +char *createBTKey(ai_obj *akey, bool *med, uint32 *ksize, bt *btr, btk_t *btk) { + *med = 0; + *ksize = VOIDSIZE; + + if (NBT_DG(btr)) { + return (char *)&akey->y; + } else if (LL(btr)) { + btk->LL.key = akey->l; + return (char *)&btk->LL; + } else if (YL(btr)) { + btk->YL.key = akey->y; + return (char *)&btk->YL; + } + + assert(! "Unsupport Btree type"); + return NULL; +} + +uchar *parseStream(uchar *stream, bt *btr) { + if (!stream || NBT_DG(btr)) { + return NULL; + } else if (LL(btr)) { + return (uchar *)(*(llk *)(stream)).val; + } else if (YL(btr)) { + return (uchar *)(long)(*(ylk *)(stream)).val; + } + assert(! "Unsupported Btree type"); + return NULL; +} + +void convertStream2Key(uchar *stream, ai_obj *key, bt *btr) { + init_ai_obj(key); + if (NBT_DG(btr)) { + key->type = COL_TYPE_DIGEST; + memcpy(&key->y, stream, AS_DIGEST_KEY_SZ); + } else if (LL(btr)) { + key->type = COL_TYPE_LONG; + key->l = ((llk *)stream)->key; + } else if (YL(btr)) { + key->type = COL_TYPE_DIGEST; + key->y = ((ylk *)stream)->key; + } else { + assert(! "Unsupported Btree type"); + } +} + +static void *OBT_createStream(bt *btr, void *val, char *btkey, crs_t *crs) { + + if (LL(btr)) { + llk *ll = (llk *)btkey; + crs->LL_StreamPtr.key = ll->key; + crs->LL_StreamPtr.val = (ulong) val; + return &crs->LL_StreamPtr; + } else if (YL(btr)) { + ylk *yl = (ylk *)btkey; + crs->YL_StreamPtr.key = yl->key; + crs->YL_StreamPtr.val = (ulong) val; + return &crs->YL_StreamPtr; + } + + assert(! "OBT_createStream ERROR"); + return NULL; +} + +void *createStream(bt *btr, void *val, char *btkey, uint32 klen, uint32 *size, + crs_t *crs) { + *size = 0; + if (NBT(btr)) { + return btkey; + } else if (OTHER_BT(btr)) { + return OBT_createStream(btr, val, btkey, crs); + } + + assert(! "Unsupported Btree type"); + return NULL; +} + +bool destroyStream(bt *btr, uchar *ostream) { + if (!ostream || NBT(btr) || OTHER_BT(btr)) { + return 0; + } + + assert(! 
"Unsupported Btree Type"); + return 1; +} diff --git a/apidocs/Makefile b/apidocs/Makefile new file mode 100644 index 00000000..34da3990 --- /dev/null +++ b/apidocs/Makefile @@ -0,0 +1,10 @@ + +.default: docs + +.PHONY: docs +docs: + doxygen src/doxyfile + +.PHONY: docs-clean +docs-clean: + rm -rf target diff --git a/apidocs/src/doxyfile b/apidocs/src/doxyfile new file mode 100644 index 00000000..aa958e58 --- /dev/null +++ b/apidocs/src/doxyfile @@ -0,0 +1,1792 @@ +# Doxyfile 1.8.1.2 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" "). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or sequence of words) that should +# identify the project. Note that if you do not use Doxywizard you need +# to put quotes around the project name if it contains spaces. + +PROJECT_NAME = "Aerospike Server" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer +# a quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify an logo or icon that is +# included in the documentation. The maximum height of the logo should not +# exceed 55 pixels and the maximum width should not exceed 200 pixels. +# Doxygen will copy the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = target + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = NO + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. 
This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding +# "class=itcl::class" will allow you to use the command class in the +# itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. 
+ +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this +# tag. The format is ext=language, where ext is a file extension, and language +# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, +# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make +# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C +# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions +# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all +# comments according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you +# can mix doxygen, HTML, and XML commands with Markdown formatting. +# Disable only in case of backward compatibilities issues. + +MARKDOWN_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and +# unions are shown inside the group in which they are included (e.g. using +# @ingroup) instead of on a separate page (for HTML and Man pages) or +# section (for LaTeX and RTF). 
+ +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and +# unions with only public data fields will be shown inline in the documentation +# of the scope in which they are defined (i.e. file, namespace, or group +# documentation), provided this scope is documented. If set to NO (the default), +# structs, classes, and unions are shown on a separate page (for HTML and Man +# pages) or section (for LaTeX and RTF). + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = YES + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols. + +SYMBOL_CACHE_SIZE = 0 + +# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be +# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given +# their name and scope. Since this can be an expensive process and often the +# same symbol appear multiple times in the code, doxygen keeps a cache of +# pre-resolved symbols. If the cache is too small doxygen will become slower. +# If the cache is too large, memory is wasted. The cache size is given by this +# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal scope will be included in the documentation. 
+ +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = YES + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden. + +EXTRACT_ANON_NSPACES = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = YES + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = YES + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. 
+ +FORCE_LOCAL_INCLUDES = YES + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = YES + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = YES + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to +# do proper type resolution of all parameters of a function it will reject a +# match between the prototype and the implementation of a member function even +# if there is only one candidate or it is obvious which candidate to choose +# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen +# will still accept a match between prototype and implementation in such cases. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or macro consists of for it to appear in +# the documentation. 
If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and macros in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. +# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. +# This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. + +SHOW_NAMESPACES = NO + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of +# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. +# You can optionally specify a file name after the option, if omitted +# DoxygenLayout.xml will be used as the name of the layout file. + +LAYOUT_FILE = src/layout.xml + +# The CITE_BIB_FILES tag can be used to specify one or more bib files +# containing the references data. This must be a list of .bib files. The +# .bib extension is automatically appended if omitted. Using this command +# requires the bibtex tool to be installed. See also +# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style +# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this +# feature you need bibtex and perl available in the search path. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled.
+ +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# The WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = ../as/src ../cf/src ../modules/common/src ../modules/mod-lua/src + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh +# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py +# *.f90 *.f *.for *.vhd *.vhdl + +FILE_PATTERNS = *.h *.c + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = citrusleaf + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input.
+ +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +#EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where <filter> +# is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. +# If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. +# Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. +# The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty or if +# none of the patterns match the file name, INPUT_FILTER is applied. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) +# and it is also possible to disable source filtering for a specific pattern +# using *.ext= (so without naming a filter). This option only has effect when +# FILTER_SOURCE_FILES is enabled. + +FILTER_SOURCE_PATTERNS = + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated.
Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C, C++ and Fortran comments will always remain visible. + +STRIP_CODE_COMMENTS = NO + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = YES + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = aerospike_ AEROSPIKE_ as_ AS_ + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. 
+ +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. Note that when using a custom header you are responsible +# for the proper inclusion of any scripts and style sheets that doxygen +# needs, which is dependent on the configuration options used. +# It is advised to generate a default header using "doxygen -w html +# header.html footer.html stylesheet.css YourConfigFile" and then modify +# that header. Note that the header is subject to change so you typically +# have to redo this when upgrading to a newer version of doxygen or when +# changing the value of configuration settings such as GENERATE_TREEVIEW! + +HTML_HEADER = src/header.html + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = src/footer.html + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# style sheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = src/style.css + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that +# the files will be copied as-is; there are no commands or markers available. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. +# Doxygen will adjust the colors in the style sheet and background images +# according to this color. Hue is specified as an angle on a colorwheel, +# see http://en.wikipedia.org/wiki/Hue for more information. +# For instance the value 0 represents red, 60 is yellow, 120 is green, +# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. +# The allowed range is 0 to 359. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of +# the colors in the HTML output. For a value of 0 the output will use +# grayscales only. A value of 255 will produce the most vivid colors. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to +# the luminance component of the colors in the HTML output. Values below +# 100 gradually make the output lighter, whereas values above 100 make +# the output darker. The value divided by 100 is the actual gamma applied, +# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, +# and 100 does not change the gamma. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. 
Setting +# this to NO can help when comparing the output of multiple runs. + +HTML_TIMESTAMP = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of +# entries shown in the various tree structured indices initially; the user +# can expand and collapse entries dynamically later on. Doxygen will expand +# the tree to such a level that at most the specified number of entries are +# visible (unless a fully collapsed tree already exceeds this amount). +# So setting the number of entries 1 will produce a full collapsed tree by +# default. 0 is a special value representing an infinite number of entries +# and will result in a full expanded tree by default. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. + +DOCSET_FEEDNAME = "Aerospike API" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. + +DOCSET_PUBLISHER_ID = com.aerospike + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. + +DOCSET_PUBLISHER_NAME = Aerospike Inc. + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO).
+ +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated +# that can be used as input for Qt's qhelpgenerator to generate a +# Qt Compressed Help (.qch) of the generated HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to +# add. For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see +# "Qt Help Project / Custom Filters" at +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. See +# "Qt Help Project / Filter Attributes" at +# http://doc.trolltech.com/qthelpproject.html#filter-attributes + +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files +# will be generated, which together with the HTML files, form an Eclipse help +# plugin. To install this plugin and make it available under the help contents +# menu in Eclipse, the contents of the directory containing the HTML and XML +# files needs to be copied into the plugins directory of eclipse. The name of +# the directory within the plugins directory should be the same as +# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before +# the help appears. + +GENERATE_ECLIPSEHELP = YES + +# A unique identifier for the eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have +# this name. + +ECLIPSE_DOC_ID = com.aerospike + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) +# at top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. Since the tabs have the same information as the +# navigation tree you can set this option to YES if you already set +# GENERATE_TREEVIEW to YES.
+ +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. +# Since the tree basically has the same information as the tab index you +# could consider setting DISABLE_INDEX to YES when enabling this option. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values +# (range [0,1..20]) that doxygen will group on one line in the generated HTML +# documentation. Note that a value of 0 will completely suppress the enum +# values from appearing in the overview section. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open +# links to external symbols imported via tag files in a separate window. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are +# not supported properly for IE 6.0, but are supported on all modern browsers. +# Note that when changing this option you need to delete any form_*.png files +# in the HTML output before the changes have effect. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax +# (see http://www.mathjax.org) which uses client side Javascript for the +# rendering instead of using prerendered bitmaps. Use this if you do not +# have LaTeX installed or if you want the formulas to look prettier in the HTML +# output. When enabled you may also need to install MathJax separately and +# configure the path to it using the MATHJAX_RELPATH option. + +USE_MATHJAX = NO + +# When MathJax is enabled you need to specify the location relative to the +# HTML output directory using the MATHJAX_RELPATH option. The destination +# directory should contain the MathJax.js script. For instance, if the mathjax +# directory is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to +# the MathJax Content Delivery Network so you can quickly see the result without +# installing MathJax. +# However, it is strongly recommended to install a local +# copy of MathJax from http://www.mathjax.org before deployment. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension +# names that should be enabled during MathJax rendering. + +MATHJAX_EXTENSIONS = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box +# for the HTML output.
The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using +# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets +# (GENERATE_DOCSET) there is already a search function so this one should +# typically be disabled. For large projects the javascript based search engine +# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. + +SEARCHENGINE = YES + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a PHP enabled web server instead of at the web client +# using Javascript. Doxygen will generate the search PHP script and index +# file to put on the web server. The advantage of the server +# based approach is that it scales better to large projects and allows +# full text search. The disadvantages are that it is more difficult to setup +# and does not have live searching capabilities. + +SERVER_BASED_SEARCH = NO + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. +# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4 + +# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for +# the generated latex document. The footer should contain everything after +# the last chapter. If it is left blank doxygen will generate a +# standard footer. Notice: only use this tag if you know what you are doing! + +LATEX_FOOTER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). 
The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER. + +LATEX_SOURCE_CODE = NO + +# The LATEX_BIB_STYLE tag can be used to specify the style to use for the +# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See +# http://en.wikipedia.org/wiki/BibTeX for more info. + +LATEX_BIB_STYLE = plain + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load style sheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. 
+ +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. +# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
+# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# pointed to by INCLUDE_PATH will be searched when a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition that +# overrules the definition found in the source code. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all references to function-like macros +# that are alone on a line, have an all uppercase name, and do not end with a +# semicolon, because these will confuse the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. For each +# tag file the location of the external documentation should be added. The +# format of a tag file without this location is as follows: +# +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths +# or URLs. 
Note that each tag file must have a unique name (where the name does
+# NOT include the path). If a tag file is not located in the directory in which
+# doxygen is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with
+# base or super classes. Setting the tag to NO turns the diagrams off. Note
+# that this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = NO
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default).
+
+HAVE_DOT = YES
+
+# The DOT_NUM_THREADS tag specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class reference variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = YES
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside
+# the class node. If there are many fields or methods and many nodes the
+# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS
+# threshold limits the number of items for each type to make the size more
+# manageable. Set this to 0 for no limit. Note that the threshold may be
+# exceeded by 50% before the limit is enforced.
+
+UML_LIMIT_NUM_FIELDS = 10
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = YES
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = YES
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG = YES
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+ +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/apidocs/src/footer.html b/apidocs/src/footer.html new file mode 100644 index 00000000..80222ad9 --- /dev/null +++ b/apidocs/src/footer.html @@ -0,0 +1,20 @@ + + + + + + + + + diff --git a/apidocs/src/header.html b/apidocs/src/header.html new file mode 100644 index 00000000..f5dbcbb5 --- /dev/null +++ b/apidocs/src/header.html @@ -0,0 +1,52 @@ + + + + + +$projectname: $title +$title + + + +$treeview +$search +$mathjax + + + +
+ + +
+ + + + + + + + + + + + + + + + + + + + + +
+
$projectname +  $projectnumber +
+
$projectbrief
+
+
$projectbrief
+
$searchbox
+
+ + diff --git a/apidocs/src/layout.xml b/apidocs/src/layout.xml new file mode 100644 index 00000000..0e7ea327 --- /dev/null +++ b/apidocs/src/layout.xml @@ -0,0 +1,187 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apidocs/src/style.css b/apidocs/src/style.css new file mode 100644 index 00000000..403c886a --- /dev/null +++ b/apidocs/src/style.css @@ -0,0 +1,1174 @@ +/* The standard CSS for doxygen */ + +body, table, div, p, dl { + font: 400 11pt Helvetica, sans-serif; +} + +/* @group Heading Levels */ + +h1 { + font-size: 150%; +} + +.title { + font-size: 1.4em; + font-weight: bold; + margin: 10px 2px; +} + +h2 { + border-bottom: 1px solid #879ECB; + color: #354C7B; + font-size: 1.2em; + font-weight: normal; + margin-top: 1.75em; + padding-top: 8px; + padding-bottom: 4px; + width: 100%; +} + +#titlearea * { + background: #FFF; + width: 100%; + font-size: 16pt; +} + +#projectname { + +} + +h3 { + font-size: 100%; +} + +h1, h2, h3, h4, h5, h6 { + -webkit-transition: text-shadow 0.5s linear; + -moz-transition: text-shadow 0.5s linear; + -ms-transition: text-shadow 0.5s linear; + -o-transition: text-shadow 0.5s linear; + transition: text-shadow 0.5s linear; + margin-right: 15px; +} + +h1.glow, h2.glow, h3.glow, h4.glow, h5.glow, h6.glow { + text-shadow: 0 0 15px cyan; +} + +dt { + font-weight: bold; +} + +div.multicol { + -moz-column-gap: 1em; + -webkit-column-gap: 1em; + -moz-column-count: 3; + -webkit-column-count: 3; +} + +p.startli, p.startdd, p.starttd { + margin-top: 2px; +} + +p.endli { + margin-bottom: 0px; +} + +p.enddd { + margin-bottom: 4px; +} + +p.endtd { + margin-bottom: 2px; +} + +/* @end */ + +caption { + font-weight: bold; +} + +span.legend { + font-size: 70%; + text-align: center; +} + +h3.version { + font-size: 90%; + text-align: center; +} + +div.qindex, div.navtab{ + background-color: #EBEFF6; + border: 1px solid #A3B4D7; + text-align: center; +} + +div.qindex, div.navpath { + width: 100%; + line-height: 140%; +} + +div.navtab { + margin-right: 15px; +} + +/* @group Link Styling */ + +a { + color: #3D578C; + font-weight: normal; + text-decoration: none; +} + +.contents a:visited { + color: #4665A2; +} + +a:hover { + text-decoration: underline; +} + +a.qindex { + font-weight: bold; +} + +a.qindexHL { + font-weight: bold; + background-color: #9CAFD4; + color: #ffffff; + border: 1px double #869DCA; +} + +.contents a.qindexHL:visited { + color: #ffffff; +} + +a.el { + font-weight: bold; +} + +a.elRef { +} + +a.code, a.code:visited { + color: #4665A2; +} + +a.codeRef, a.codeRef:visited { + color: #4665A2; +} + +/* @end */ + +dl.el { + margin-left: -1cm; +} + +pre.fragment { + border: 1px solid #C4CFE5; + background-color: #FBFCFD; + padding: 4px 6px; + margin: 4px 8px 4px 2px; + overflow: auto; + word-wrap: break-word; + font-size: 9pt; + line-height: 125%; + font-family: monospace, fixed; + font-size: 105%; +} + +div.fragment { + padding: 4px; + margin: 4px; + background-color: #FBFCFD; + border: 1px solid #C4CFE5; +} + +div.line { + font-family: monospace, fixed; + font-size: 13px; + min-height: 13px; + line-height: 1.0; + text-wrap: unrestricted; + white-space: -moz-pre-wrap; /* Moz */ + white-space: -pre-wrap; /* Opera 4-6 */ + 
white-space: -o-pre-wrap; /* Opera 7 */ + white-space: pre-wrap; /* CSS3 */ + word-wrap: break-word; /* IE 5.5+ */ + text-indent: -53px; + padding-left: 53px; + padding-bottom: 0px; + margin: 0px; + -webkit-transition-property: background-color, box-shadow; + -webkit-transition-duration: 0.5s; + -moz-transition-property: background-color, box-shadow; + -moz-transition-duration: 0.5s; + -ms-transition-property: background-color, box-shadow; + -ms-transition-duration: 0.5s; + -o-transition-property: background-color, box-shadow; + -o-transition-duration: 0.5s; + transition-property: background-color, box-shadow; + transition-duration: 0.5s; +} + +div.line.glow { + background-color: cyan; + box-shadow: 0 0 10px cyan; +} + + +span.lineno { + padding-right: 4px; + text-align: right; + border-right: 2px solid #0F0; + background-color: #E8E8E8; + white-space: pre; +} +span.lineno a { + background-color: #D8D8D8; +} + +span.lineno a:hover { + background-color: #C8C8C8; +} + +div.ah { + background-color: black; + font-weight: bold; + color: #ffffff; + margin-bottom: 3px; + margin-top: 3px; + padding: 0.2em; + border: solid thin #333; + border-radius: 0.5em; + -webkit-border-radius: .5em; + -moz-border-radius: .5em; + box-shadow: 2px 2px 3px #999; + -webkit-box-shadow: 2px 2px 3px #999; + -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px; + background-image: -webkit-gradient(linear, left top, left bottom, from(#eee), to(#000),color-stop(0.3, #444)); + background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000); +} + +div.groupHeader { + margin-left: 16px; + margin-top: 12px; + font-weight: bold; +} + +div.groupText { + margin-left: 16px; + font-style: italic; +} + +body { + background-color: white; + color: black; + margin: 0; +} + +div.contents { + margin-top: 10px; + margin-left: 12px; + margin-right: 8px; +} + +td.indexkey { + background-color: #EBEFF6; + font-weight: bold; + border: 1px solid #C4CFE5; + margin: 2px 0px 2px 0; + padding: 2px 10px; + white-space: nowrap; + vertical-align: top; +} + +td.indexvalue { + background-color: #EBEFF6; + border: 1px solid #C4CFE5; + padding: 2px 10px; + margin: 2px 0px; +} + +tr.memlist { + background-color: #EEF1F7; +} + +p.formulaDsp { + text-align: center; +} + +img.formulaDsp { + +} + +img.formulaInl { + vertical-align: middle; +} + +div.center { + text-align: center; + margin-top: 0px; + margin-bottom: 0px; + padding: 0px; +} + +div.center img { + border: 0px; +} + +address.footer { + text-align: right; + padding-right: 12px; +} + +img.footer { + border: 0px; + vertical-align: middle; +} + +/* @group Code Colorization */ + +span.keyword { + color: #008000 +} + +span.keywordtype { + color: #604020 +} + +span.keywordflow { + color: #e08000 +} + +span.comment { + color: #800000 +} + +span.preprocessor { + color: #806020 +} + +span.stringliteral { + color: #002080 +} + +span.charliteral { + color: #008080 +} + +span.vhdldigit { + color: #ff00ff +} + +span.vhdlchar { + color: #000000 +} + +span.vhdlkeyword { + color: #700070 +} + +span.vhdllogic { + color: #ff0000 +} + +blockquote { + background-color: #F7F8FB; + border-left: 2px solid #9CAFD4; + margin: 0 24px 0 4px; + padding: 0 12px 0 16px; +} + +/* @end */ + +/* +.search { + color: #003399; + font-weight: bold; +} + +form.search { + margin-bottom: 0px; + margin-top: 0px; +} + +input.search { + font-size: 75%; + color: #000080; + font-weight: normal; + background-color: #e8eef2; +} +*/ + +td.tiny { + font-size: 75%; +} + +.dirtab { + padding: 4px; + border-collapse: collapse; + border: 
1px solid #A3B4D7; +} + +th.dirtab { + background: #EBEFF6; + font-weight: bold; +} + +hr { + height: 0px; + border: none; + border-top: 1px solid #4A6AAA; +} + +hr.footer { + height: 1px; +} + +/* @group Member Descriptions */ + +table.memberdecls { + border-spacing: 0px; + padding: 0px; +} + +.memberdecls td, .fieldtable tr { + -webkit-transition-property: background-color, box-shadow; + -webkit-transition-duration: 0.5s; + -moz-transition-property: background-color, box-shadow; + -moz-transition-duration: 0.5s; + -ms-transition-property: background-color, box-shadow; + -ms-transition-duration: 0.5s; + -o-transition-property: background-color, box-shadow; + -o-transition-duration: 0.5s; + transition-property: background-color, box-shadow; + transition-duration: 0.5s; +} + +.memberdecls td.glow, .fieldtable tr.glow { + background-color: cyan; + box-shadow: 0 0 15px cyan; +} + +.mdescLeft, .mdescRight, +.memItemLeft, .memItemRight, +.memTemplItemLeft, .memTemplItemRight, .memTemplParams { + background-color: #F9FAFC; + border: none; + margin: 4px; + padding: 1px 0 0 8px; +} + +.mdescLeft, .mdescRight { + padding: 0px 8px 4px 8px; + color: #555; +} + +.memItemLeft, .memItemRight, .memTemplParams { + border-bottom: 1px solid #DEE4F0; +} + +.memItemLeft, .memTemplItemLeft { + white-space: nowrap; +} + +.memItemRight { + width: 100%; +} + +.memTemplParams { + color: #4665A2; + white-space: nowrap; +} + +/* @end */ + +/* @group Member Details */ + +/* Styles for detailed member documentation */ + +.memtemplate { + font-size: 80%; + color: #4665A2; + font-weight: normal; + margin-left: 9px; +} + +.memnav { + background-color: #EBEFF6; + border: 1px solid #A3B4D7; + text-align: center; + margin: 2px; + margin-right: 15px; + padding: 2px; +} + +.mempage { + width: 100%; +} + +.memitem { + padding: 0; + margin-bottom: 10px; + margin-right: 5px; + -webkit-transition: box-shadow 0.5s linear; + -moz-transition: box-shadow 0.5s linear; + -ms-transition: box-shadow 0.5s linear; + -o-transition: box-shadow 0.5s linear; + transition: box-shadow 0.5s linear; + display: table !important; + width: 100%; +} + +.memitem.glow { + box-shadow: 0 0 15px cyan; +} + +.memname { + font-weight: bold; + margin-left: 6px; +} + +.memname td { + vertical-align: bottom; +} + +.memproto, dl.reflist dt { + border-top: 1px solid #A8B8D9; + border-left: 1px solid #A8B8D9; + border-right: 1px solid #A8B8D9; + padding: 6px 0px 6px 0px; + color: #253555; + font-weight: bold; + text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9); + background-image:url('nav_f.png'); + background-repeat:repeat-x; + background-color: #E2E8F2; + /* opera specific markup */ + box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); + border-top-right-radius: 4px; + border-top-left-radius: 4px; + /* firefox specific markup */ + -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px; + -moz-border-radius-topright: 4px; + -moz-border-radius-topleft: 4px; + /* webkit specific markup */ + -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); + -webkit-border-top-right-radius: 4px; + -webkit-border-top-left-radius: 4px; + +} + +.memdoc, dl.reflist dd { + border-bottom: 1px solid #A8B8D9; + border-left: 1px solid #A8B8D9; + border-right: 1px solid #A8B8D9; + padding: 6px 10px 2px 10px; + background-color: #FBFCFD; + border-top-width: 0; + background-image:url('nav_g.png'); + background-repeat:repeat-x; + background-color: #FFFFFF; + /* opera specific markup */ + border-bottom-left-radius: 4px; + border-bottom-right-radius: 4px; + box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); + /* 
firefox specific markup */ + -moz-border-radius-bottomleft: 4px; + -moz-border-radius-bottomright: 4px; + -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px; + /* webkit specific markup */ + -webkit-border-bottom-left-radius: 4px; + -webkit-border-bottom-right-radius: 4px; + -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); +} + +dl.reflist dt { + padding: 5px; +} + +dl.reflist dd { + margin: 0px 0px 10px 0px; + padding: 5px; +} + +.paramkey { + text-align: right; +} + +.paramtype { + white-space: nowrap; +} + +.paramname { + color: #602020; + white-space: nowrap; +} +.paramname em { + font-style: normal; +} +.paramname code { + line-height: 14px; +} + +.params, .retval, .exception, .tparams { + margin-left: 0px; + padding-left: 0px; +} + +.params .paramname, .retval .paramname { + font-weight: bold; + vertical-align: top; +} + +.params .paramtype { + font-style: italic; + vertical-align: top; +} + +.params .paramdir { + font-family: "courier new",courier,monospace; + vertical-align: top; +} + +table.mlabels { + border-spacing: 0px; +} + +td.mlabels-left { + width: 100%; + padding: 0px; +} + +td.mlabels-right { + vertical-align: bottom; + padding: 0px; + white-space: nowrap; +} + +span.mlabels { + margin-left: 8px; +} + +span.mlabel { + background-color: #728DC1; + border-top:1px solid #5373B4; + border-left:1px solid #5373B4; + border-right:1px solid #C4CFE5; + border-bottom:1px solid #C4CFE5; + text-shadow: none; + color: white; + margin-right: 4px; + padding: 2px 3px; + border-radius: 3px; + font-size: 7pt; + white-space: nowrap; +} + + + +/* @end */ + +/* these are for tree view when not used as main index */ + +div.directory { + margin: 10px 0px; + border-top: 1px solid #A8B8D9; + border-bottom: 1px solid #A8B8D9; + width: 100%; +} + +.directory table { + border-collapse:collapse; +} + +.directory td { + margin: 0px; + padding: 0px; + vertical-align: top; +} + +.directory td.entry { + white-space: nowrap; + padding-right: 6px; +} + +.directory td.entry a { + outline:none; +} + +.directory td.entry a img { + border: none; +} + +.directory td.desc { + width: 100%; + padding-left: 6px; + padding-right: 6px; + padding-top: 3px; + border-left: 1px solid rgba(0,0,0,0.05); +} + +.directory tr.even { + padding-left: 6px; + background-color: #F7F8FB; +} + +.directory img { + vertical-align: -30%; +} + +.directory .levels { + white-space: nowrap; + width: 100%; + text-align: right; + font-size: 9pt; +} + +.directory .levels span { + cursor: pointer; + padding-left: 2px; + padding-right: 2px; + color: #3D578C; +} + +div.dynheader { + margin-top: 8px; + -webkit-touch-callout: none; + -webkit-user-select: none; + -khtml-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; +} + +address { + font-style: normal; + color: #2A3D61; +} + +table.doxtable { + border-collapse:collapse; + margin-top: 4px; + margin-bottom: 4px; +} + +table.doxtable td, table.doxtable th { + border: 1px solid #2D4068; + padding: 3px 7px 2px; +} + +table.doxtable th { + background-color: #374F7F; + color: #FFFFFF; + font-size: 110%; + padding-bottom: 4px; + padding-top: 5px; +} + +table.fieldtable { + width: 100%; + margin-bottom: 10px; + border: 1px solid #A8B8D9; + border-spacing: 0px; + -moz-border-radius: 4px; + -webkit-border-radius: 4px; + border-radius: 4px; + -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px; + -webkit-box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15); + box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15); +} + +.fieldtable td, .fieldtable th { + padding: 3px 7px 2px; +} + 
+.fieldtable td.fieldtype, .fieldtable td.fieldname { + white-space: nowrap; + border-right: 1px solid #A8B8D9; + border-bottom: 1px solid #A8B8D9; + vertical-align: top; +} + +.fieldtable td.fielddoc { + border-bottom: 1px solid #A8B8D9; + width: 100%; +} + +.fieldtable tr:last-child td { + border-bottom: none; +} + +.fieldtable th { + background-image:url('nav_f.png'); + background-repeat:repeat-x; + background-color: #E2E8F2; + font-size: 90%; + color: #253555; + padding-bottom: 4px; + padding-top: 5px; + text-align:left; + -moz-border-radius-topleft: 4px; + -moz-border-radius-topright: 4px; + -webkit-border-top-left-radius: 4px; + -webkit-border-top-right-radius: 4px; + border-top-left-radius: 4px; + border-top-right-radius: 4px; + border-bottom: 1px solid #A8B8D9; +} + + +.tabsearch { + top: 0px; + left: 10px; + height: 36px; + background-image: url('tab_b.png'); + z-index: 101; + overflow: hidden; + font-size: 13px; +} + +.navpath ul +{ + font-size: 11px; + background-image:url('tab_b.png'); + background-repeat:repeat-x; + height:30px; + line-height:30px; + color:#8AA0CC; + border:solid 1px #C2CDE4; + overflow:hidden; + margin:0px; + padding:0px; +} + +.navpath li +{ + list-style-type:none; + float:left; + padding-left:10px; + padding-right:15px; + background-image:url('bc_s.png'); + background-repeat:no-repeat; + background-position:right; + color:#364D7C; +} + +.navpath li.navelem a +{ + height:32px; + display:block; + text-decoration: none; + outline: none; + font-family: 'Lucida Grande',Geneva,Helvetica,Arial,sans-serif; +} + +.navpath li.navelem a:hover +{ + color:#6884BD; +} + +.navpath li.footer +{ + list-style-type:none; + float:right; + padding-left:10px; + padding-right:15px; + background-image:none; + background-repeat:no-repeat; + background-position:right; + color:#364D7C; + font-size: 8pt; +} + + +div.summary +{ + float: right; + font-size: 8pt; + padding-right: 5px; + width: 50%; + text-align: right; +} + +div.summary a +{ + white-space: nowrap; +} + +div.ingroups +{ + font-size: 8pt; + width: 50%; + text-align: left; +} + +div.ingroups a +{ + white-space: nowrap; +} + +div.header +{ + background-image:url('nav_h.png'); + background-repeat:repeat-x; + background-color: #F9FAFC; + margin: 0px; + border-bottom: 1px solid #C4CFE5; +} + +div.headertitle +{ + padding: 5px 5px 5px 10px; +} + +dl +{ + padding: 0 0 0 10px; +} + +/* dl.note, dl.warning, dl.attention, dl.pre, dl.post, dl.invariant, dl.deprecated, dl.todo, dl.test, dl.bug */ +dl.section +{ + margin-left: 0px; + padding-left: 0px; +} + +dl.note +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #D0C000; +} + +dl.warning, dl.attention +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #FF0000; +} + +dl.pre, dl.post, dl.invariant +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #00D000; +} + +dl.deprecated +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #505050; +} + +dl.todo +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #00C0E0; +} + +dl.test +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #3030E0; +} + +dl.bug +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #C08050; +} + +dl.section dd { + margin-bottom: 6px; +} + + +#projectlogo +{ + text-align: center; + vertical-align: bottom; + border-collapse: separate; +} + +#projectlogo img +{ + border: 0px none; +} + 
+#projectname +{ + font-size: 1.5em; + font-weight: 600; + margin: 0px; + padding: 2px 0px; +} + +#projectbrief +{ + font-size: 1.2em; + margin: 0px; + padding: 0px; +} + +#projectnumber +{ + font-size: 1em; + margin: 0px; + padding: 0px; +} + +#titlearea +{ + padding: 0px; + margin: 0px; + width: 100%; + border-bottom: 1px solid #5373B4; +} + +.image +{ + text-align: center; +} + +.dotgraph +{ + text-align: center; +} + +.mscgraph +{ + text-align: center; +} + +.caption +{ + font-weight: bold; +} + +div.zoom +{ + border: 1px solid #90A5CE; +} + +dl.citelist { + margin-bottom:50px; +} + +dl.citelist dt { + color:#334975; + float:left; + font-weight:bold; + margin-right:10px; + padding:5px; +} + +dl.citelist dd { + margin:2px 0; + padding:5px 0; +} + +div.toc { + padding: 14px 25px; + background-color: #F4F6FA; + border: 1px solid #D8DFEE; + border-radius: 7px 7px 7px 7px; + float: right; + height: auto; + margin: 0 20px 10px 10px; + width: 200px; +} + +div.toc li { + background: url("bdwn.png") no-repeat scroll 0 5px transparent; + font: 10px/1.2 Verdana,DejaVu Sans,Geneva,sans-serif; + margin-top: 5px; + padding-left: 10px; + padding-top: 2px; +} + +div.toc h3 { + font: bold 12px/1.2 Arial,FreeSans,sans-serif; + color: #4665A2; + border-bottom: 0 none; + margin: 0; +} + +div.toc ul { + list-style: none outside none; + border: medium none; + padding: 0px; +} + +div.toc li.level1 { + margin-left: 0px; +} + +div.toc li.level2 { + margin-left: 15px; +} + +div.toc li.level3 { + margin-left: 30px; +} + +div.toc li.level4 { + margin-left: 45px; +} + +.inherit_header { + font-weight: bold; + color: gray; + cursor: pointer; + -webkit-touch-callout: none; + -webkit-user-select: none; + -khtml-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; +} + +.inherit_header td { + padding: 6px 0px 2px 5px; +} + +.inherit { + display: none; +} + +tr.heading h2 { + margin-top: 12px; + margin-bottom: 4px; +} + +@media print +{ + #top { display: none; } + #side-nav { display: none; } + #nav-path { display: none; } + body { overflow:visible; } + h1, h2, h3, h4, h5, h6 { page-break-after: avoid; } + .summary { display: none; } + .memitem { page-break-inside: avoid; } + #doc-content + { + margin-left:0 !important; + height:auto !important; + width:auto !important; + overflow:inherit; + display:inline; + } +} + diff --git a/as/Makefile b/as/Makefile new file mode 100644 index 00000000..e914d9ec --- /dev/null +++ b/as/Makefile @@ -0,0 +1,9 @@ +# Aerospike Server +# Makefile + +.PHONY: default +default: all + @echo "done." + +%: + $(MAKE) -C src $@ diff --git a/as/etc/README.sample.conf.md b/as/etc/README.sample.conf.md new file mode 100644 index 00000000..97909cb1 --- /dev/null +++ b/as/etc/README.sample.conf.md @@ -0,0 +1,15 @@ +# Aerospike Server Sample Configuration Files + +This directory contains sample Aerospike Server configuration files for +various use cases. + +To use a sample configuration, first copy the appropriate file to be +`/etc/aerospike/aerospike.conf`, and then modify it for your particular +environment and use case. 
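+
+For example, to start from the mesh sample (a minimal sketch; the source path
+of the sample files below is illustrative and depends on where your package
+installs them):
+
+```sh
+# Back up any existing configuration first (target path taken from this README).
+[ -f /etc/aerospike/aerospike.conf ] && \
+    cp /etc/aerospike/aerospike.conf /etc/aerospike/aerospike.conf.bak
+# Copy the sample into place, then edit the mesh-seed-address-port entries
+# for your cluster before starting the server.
+cp aerospike_mesh.conf /etc/aerospike/aerospike.conf
+```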
+ +## List of Sample Configuration Files + +| Filename | Description | +| ------------------- | -------------------------------------------------- | +| aerospike_mesh.conf | Sample using TCP mesh for clustering | +| aerospike_ssd.conf | Sample using SSD devices for storage | diff --git a/as/etc/aerospike-server.sysconfig b/as/etc/aerospike-server.sysconfig new file mode 100644 index 00000000..204b3160 --- /dev/null +++ b/as/etc/aerospike-server.sysconfig @@ -0,0 +1,4 @@ +ASD_CONFIG_FILE=/etc/aerospike/aerospike.conf + +# Uncomment to start with cold start +#ASD_COLDSTART="--cold-start" diff --git a/as/etc/aerospike-server.tmpfiles b/as/etc/aerospike-server.tmpfiles new file mode 100644 index 00000000..e5f83b62 --- /dev/null +++ b/as/etc/aerospike-server.tmpfiles @@ -0,0 +1 @@ +d /run/aerospike 0755 aerospike aerospike - diff --git a/as/etc/aerospike.conf b/as/etc/aerospike.conf new file mode 100644 index 00000000..f3731133 --- /dev/null +++ b/as/etc/aerospike.conf @@ -0,0 +1,67 @@ +# Aerospike database configuration file. + +service { + user root + group root + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + pidfile /var/run/aerospike/asd.pid + proto-fd-max 15000 +} + +logging { + # Log file must be an absolute path. + file /var/log/aerospike/aerospike.log { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. 
+# } +} diff --git a/as/etc/aerospike.service.d/aerospike.conf b/as/etc/aerospike.service.d/aerospike.conf new file mode 100644 index 00000000..e69de29b diff --git a/as/etc/aerospike.service.d/aerospike.conf.coldstart b/as/etc/aerospike.service.d/aerospike.conf.coldstart new file mode 100644 index 00000000..07dfca62 --- /dev/null +++ b/as/etc/aerospike.service.d/aerospike.conf.coldstart @@ -0,0 +1,2 @@ +[Service] +Environment="ASD_OPTIONS=--cold-start" diff --git a/as/etc/aerospike.service.d/aerospike.conf.default b/as/etc/aerospike.service.d/aerospike.conf.default new file mode 100644 index 00000000..e69de29b diff --git a/as/etc/aerospike.service.head b/as/etc/aerospike.service.head new file mode 100644 index 00000000..da04d6bc --- /dev/null +++ b/as/etc/aerospike.service.head @@ -0,0 +1,14 @@ +[Unit] +Description=Aerospike Server +After=network.target +Wants=network.target + +[Service] +LimitNOFILE=100000 +TimeoutSec=15 +User=root +Group=root +EnvironmentFile=/etc/sysconfig/aerospike +PermissionsStartOnly=True +ExecStartPre=/usr/bin/asd-systemd-helper +ExecStart=/usr/bin/asd $ASD_OPTIONS --config-file $ASD_CONFIG_FILE --fgdaemon diff --git a/as/etc/aerospike.service.tail b/as/etc/aerospike.service.tail new file mode 100644 index 00000000..140e4113 --- /dev/null +++ b/as/etc/aerospike.service.tail @@ -0,0 +1,3 @@ + +[Install] +WantedBy=multi-user.target diff --git a/as/etc/aerospike.service.telemetry b/as/etc/aerospike.service.telemetry new file mode 100644 index 00000000..62362af1 --- /dev/null +++ b/as/etc/aerospike.service.telemetry @@ -0,0 +1,2 @@ +ExecStartPre=-/bin/systemctl start aerospike_telemetry +ExecStopPost=-/bin/systemctl stop aerospike_telemetry diff --git a/as/etc/aerospike_dev.conf b/as/etc/aerospike_dev.conf new file mode 100644 index 00000000..1715d249 --- /dev/null +++ b/as/etc/aerospike_dev.conf @@ -0,0 +1,81 @@ +# Aerospike database developer configuration file. + +service { + run-as-daemon false # To work with gdb, and make console logging visible. + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + + # The number of concurrent connections to the database is limited by + # proto-fd-max, and by the system's maximum number of open file descriptors. + # See "man limits.conf" for how to set the system's "nofile" limit. + proto-fd-max 1024 + + work-directory run/work + pidfile run/asd.pid +} + +mod-lua { + user-path run/work/usr/udf/lua + system-path run/work/sys/udf/lua +} + +logging { + # Log file must be an absolute path. + file run/log/aerospike.log { + context any info + } + + console { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. 
+# } +} diff --git a/as/etc/aerospike_mesh.conf b/as/etc/aerospike_mesh.conf new file mode 100644 index 00000000..5dbc147f --- /dev/null +++ b/as/etc/aerospike_mesh.conf @@ -0,0 +1,70 @@ +# Aerospike database configuration file for deployments using mesh heartbeats. + +service { + user root + group root + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + pidfile /var/run/aerospike/asd.pid + proto-fd-max 15000 +} + +logging { + # Log file must be an absolute path. + file /var/log/aerospike/aerospike.log { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode mesh + port 3002 # Heartbeat port for this node. + + # List one or more other nodes, one ip-address & port per line: + mesh-seed-address-port 10.10.10.10 3002 +# mesh-seed-address-port 10.10.10.11 3002 +# mesh-seed-address-port 10.10.10.12 3002 +# mesh-seed-address-port 10.10.10.13 3002 +# mesh-seed-address-port 10.10.10.14 3002 + + interval 250 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. +# } +} diff --git a/as/etc/aerospike_mesh_systemd.conf b/as/etc/aerospike_mesh_systemd.conf new file mode 100644 index 00000000..4c5b6046 --- /dev/null +++ b/as/etc/aerospike_mesh_systemd.conf @@ -0,0 +1,66 @@ +# Aerospike database configuration file for deployments using mesh heartbeats with systemd. + +service { + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + proto-fd-max 15000 +} + +logging { + console { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode mesh + port 3002 # Heartbeat port for this node. + + # List one or more other nodes, one ip-address & port per line: + mesh-seed-address-port 10.10.10.10 3002 +# mesh-seed-address-port 10.10.10.11 3002 +# mesh-seed-address-port 10.10.10.12 3002 +# mesh-seed-address-port 10.10.10.13 3002 +# mesh-seed-address-port 10.10.10.14 3002 + + interval 250 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. +# } +} diff --git a/as/etc/aerospike_ssd.conf b/as/etc/aerospike_ssd.conf new file mode 100644 index 00000000..c79a7251 --- /dev/null +++ b/as/etc/aerospike_ssd.conf @@ -0,0 +1,65 @@ +# Aerospike database configuration file for deployments using raw storage. 
+ +service { + user root + group root + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + pidfile /var/run/aerospike/asd.pid + proto-fd-max 15000 +} + +logging { + # Log file must be an absolute path. + file /var/log/aerospike/aerospike.log { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + # Warning - legacy data in defined raw partition devices will be erased. + # These partitions must not be mounted by the file system. + storage-engine device { + # Use one or more lines like those below with actual device paths. +# device /dev/sdb +# device /dev/sdc + + # The 2 lines below optimize for SSD. + scheduler-mode noop + write-block-size 128K + + # Use the line below to store data in memory in addition to devices. +# data-in-memory true + } +} + diff --git a/as/etc/aerospike_ssd_systemd.conf b/as/etc/aerospike_ssd_systemd.conf new file mode 100644 index 00000000..06392cd7 --- /dev/null +++ b/as/etc/aerospike_ssd_systemd.conf @@ -0,0 +1,61 @@ +# Aerospike database configuration file for deployments using raw storage with systemd. + +service { + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + proto-fd-max 15000 +} + +logging { + console { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + # Warning - legacy data in defined raw partition devices will be erased. + # These partitions must not be mounted by the file system. + storage-engine device { + # Use one or more lines like those below with actual device paths. +# device /dev/sdb +# device /dev/sdc + + # The 2 lines below optimize for SSD. + scheduler-mode noop + write-block-size 128K + + # Use the line below to store data in memory in addition to devices. +# data-in-memory true + } +} + diff --git a/as/etc/aerospike_systemd.conf b/as/etc/aerospike_systemd.conf new file mode 100644 index 00000000..58fa30c0 --- /dev/null +++ b/as/etc/aerospike_systemd.conf @@ -0,0 +1,63 @@ +# Aerospike database configuration file for use with systemd. + +service { + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + proto-fd-max 15000 +} + +logging { + console { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. 
+ + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. +# } +} diff --git a/as/etc/aerospike_telemetry.service b/as/etc/aerospike_telemetry.service new file mode 100644 index 00000000..694de3eb --- /dev/null +++ b/as/etc/aerospike_telemetry.service @@ -0,0 +1,11 @@ +[Unit] +Description=Aerospike Telemetry Agent +After=network.target +Wants=network.target + +[Service] +User=aerospike +Group=aerospike +EnvironmentFile=/etc/sysconfig/aerospike_telemetry +PermissionsStartOnly=True +ExecStart=/opt/aerospike/telemetry/telemetry.py $TELEMETRY_CONFIG_FILE start --fgdaemon diff --git a/as/etc/aerospike_telemetry.sysconfig b/as/etc/aerospike_telemetry.sysconfig new file mode 100644 index 00000000..6c1d364a --- /dev/null +++ b/as/etc/aerospike_telemetry.sysconfig @@ -0,0 +1 @@ +TELEMETRY_CONFIG_FILE=/etc/aerospike/telemetry.conf diff --git a/as/etc/asd-systemd-helper b/as/etc/asd-systemd-helper new file mode 100644 index 00000000..4f19836c --- /dev/null +++ b/as/etc/asd-systemd-helper @@ -0,0 +1,36 @@ +#!/bin/bash +mem=`/sbin/sysctl -n kernel.shmall` +min=4294967296 +if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmall too low, setting to 4G pages = 16TB" + /sbin/sysctl -w kernel.shmall=$min + fi +fi + +mem=`/sbin/sysctl -n kernel.shmmax` +min=1073741824 +if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmmax too low, setting to 1GB" + /sbin/sysctl -w kernel.shmmax=$min + fi +fi + +set_socket_buffer_limit() { + name=${1}; path=${2}; size=${3} + curr=$(cat ${path}) + + if [ ${curr} -lt ${size} ]; then + echo "Increasing ${name} socket buffer limit (${path}): ${curr} -> ${size}" + echo ${size} >${path} + fi +} + +set_socket_buffer_limit read /proc/sys/net/core/rmem_max 15728640 +set_socket_buffer_limit write /proc/sys/net/core/wmem_max 5242880 + +if [ -f /etc/aerospike/initfns ] +then + . /etc/aerospike/initfns +fi diff --git a/as/etc/init-script b/as/etc/init-script new file mode 100644 index 00000000..8112d6d8 --- /dev/null +++ b/as/etc/init-script @@ -0,0 +1,193 @@ +#!/bin/sh +# chkconfig: 2345 85 15 +# description: Starts and stops the Aerospike daemon + +### BEGIN INIT INFO +# Provides: aerospike +# Required-Start: $remote_fs $network +# Required-Stop: $remote_fs $network +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Aerospike Clustered Data Service +### END INIT INFO + +. /etc/rc.d/init.d/functions +. /etc/sysconfig/network +[ "$NETWORKING" = "no" ] && exit 0 + +ASD=/usr/bin/asd +ASDN=$(basename $ASD) +LOCKFILE=/var/lock/subsys/aerospike +CONFIG_FILE=/etc/aerospike/aerospike.conf +CMD="$ASD --config-file $CONFIG_FILE" +PIDDIR="/var/run/aerospike" +ASD_USER="aerospike" +ASD_GROUP=$ASD_USER +STOP_TIMEOUT=${STOP_TIMEOUT-30} +EDITION="@EDITION@" + +INITFNS=/etc/aerospike/initfns +if [ -f $INITFNS ]; then . $INITFNS; fi +if [ -n $LD_PRELOAD ]; then export LD_PRELOAD; fi + +# in production, the corefiles are so huge as to prevent +# quick restarts of servers. 
Turn this on only if requested +# DAEMON_COREFILE_LIMIT="unlimited" + +set_shmall() { + mem=`/sbin/sysctl -n kernel.shmall` + min=4294967296 + if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmall too low, setting to 4G pages = 16TB" + /sbin/sysctl -w kernel.shmall=$min + fi + fi +} + +set_shmmax() { + mem=`/sbin/sysctl -n kernel.shmmax` + min=1073741824 + if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmmax too low, setting to 1GB" + /sbin/sysctl -w kernel.shmmax=$min + fi + fi +} + +set_socket_buffer_limit() { + name=${1}; path=${2}; size=${3} + curr=$(cat ${path}) + + if [ ${curr} -lt ${size} ]; then + echo "Increasing ${name} socket buffer limit (${path}): ${curr} -> ${size}" + echo ${size} >${path} + fi +} + +set_socket_buffer_limits() { + set_socket_buffer_limit read /proc/sys/net/core/rmem_max 15728640 + set_socket_buffer_limit write /proc/sys/net/core/wmem_max 5242880 +} + +#We are adding create_piddir as /var/run is tmpfs on some distributions. +#This causes the piddir to be removed on reboot +#adding this to centos init for parity +create_piddir() { + if [ ! -d $PIDDIR ] + then + (mkdir $PIDDIR && chown $ASD_USER:$ASD_GROUP $PIDDIR) &> /dev/null + fi +} + +start() { + ulimit -n 100000 + logger -t aerospike "ulimit -n="`ulimit -n` + [ -x $ASD ] || exit 0 + set_shmall + set_shmmax + set_socket_buffer_limits + create_piddir + echo -n $"Starting and checking aerospike: " + daemon "$CMD && pgrep $ASDN &> /dev/null" + retval=$? + echo + [ $retval -eq 0 ] && touch $LOCKFILE + return $retval +} + +coldstart() { + ulimit -n 100000 + logger -t aerospike "ulimit -n="`ulimit -n` + [ -x $ASD ] || exit 0 + set_shmall + set_shmmax + set_socket_buffer_limits + create_piddir + echo -n $"Cold-starting aerospike: " + daemon "$CMD --cold-start && pgrep $ASDN &> /dev/null" + retval=$? + echo + [ $retval -eq 0 ] && touch $LOCKFILE + return $retval +} + +stop() { + echo -n $"Stopping aerospike: " + killproc -d ${STOP_TIMEOUT} $ASDN + retval=$? + echo + [ $retval -eq 0 ] && rm -f $LOCKFILE + return $retval +} + +rh_status() { + status $ASDN +} + +rh_status_quiet() { + status $ASDN >/dev/null 2>&1 +} + +do_telemetry_start () { + if [ $EDITION = "community" ]; + then + /sbin/service aerospike_telemetry start >/dev/null 2>&1 + fi +} + +do_telemetry_stop () { + if [ $EDITION = "community" ]; + then + /sbin/service aerospike_telemetry stop >/dev/null 2>&1 + fi +} + +case "$1" in + start) + rh_status_quiet + if [ $? == 0 ]; + then + { echo -n "Already "; $0 status; } + else + $1 + fi + + do_telemetry_start + ;; + coldstart) + rh_status_quiet + if [ $? == 0 ]; + then + { echo -n "Already "; $0 status; } + else + $1 + fi + + do_telemetry_start + ;; + stop) + rh_status_quiet + if [ $? 
== 3 ]; + then + { echo -n "Already "; $0 status; } + else + $1 + fi + + do_telemetry_stop + ;; + status) + rh_status + ;; + restart) + $0 stop + sleep 3 + $0 start + ;; + *) + echo $"Usage: $0 {start|stop|status|coldstart|restart}" + exit 2 + ;; +esac diff --git a/as/etc/init-script.deb b/as/etc/init-script.deb new file mode 100755 index 00000000..d4afd3b7 --- /dev/null +++ b/as/etc/init-script.deb @@ -0,0 +1,162 @@ +#!/bin/bash +# chkconfig: 2345 85 15 +# description: Starts and stops the Aerospike daemon + +### BEGIN INIT INFO +# Provides: aerospike +# Required-Start: $remote_fs $network +# Required-Stop: $remote_fs $network +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Aerospike Clustered Data Service +### END INIT INFO + +ASD=/usr/bin/asd +ASDN=$(basename $ASD) +CONFIG_FILE=/etc/aerospike/aerospike.conf +OPTS="--config-file $CONFIG_FILE" +COLD_OPTS="$OPTS --cold-start" +PIDDIR="/var/run/aerospike" +PIDFILE=$PIDDIR/asd.pid +ASD_USER="aerospike" +ASD_GROUP=$ASD_USER +EDITION="@EDITION@" + +INITFNS=/etc/aerospike/initfns +if [ -f $INITFNS ]; then . $INITFNS; fi + +. /lib/lsb/init-functions + +set_shmall() { + mem=`/sbin/sysctl -n kernel.shmall` + min=4294967296 + if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmall too low, setting to 4G pages = 16TB" + /sbin/sysctl -w kernel.shmall=$min + fi + fi +} + +set_shmmax() { + mem=`/sbin/sysctl -n kernel.shmmax` + min=1073741824 + if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmmax too low, setting to 1GB" + /sbin/sysctl -w kernel.shmmax=$min + fi + fi +} + +set_socket_buffer_limit() { + name=${1}; path=${2}; size=${3} + curr=$(cat ${path}) + + if [ ${curr} -lt ${size} ]; then + echo "Increasing ${name} socket buffer limit (${path}): ${curr} -> ${size}" + echo ${size} >${path} + fi +} + +set_socket_buffer_limits() { + set_socket_buffer_limit read /proc/sys/net/core/rmem_max 15728640 + set_socket_buffer_limit write /proc/sys/net/core/wmem_max 5242880 +} + +#We are adding create_piddir as /var/run is tmpfs on Debian 7+/Ubuntu 12+. This causes +#the piddir to be removed on reboot +create_piddir() { + if [ ! -d $PIDDIR ] + then + (mkdir $PIDDIR && chown $ASD_USER:$ASD_GROUP $PIDDIR) &> /dev/null + fi +} + +start() { + start-stop-daemon --start --quiet --name $ASDN --pidfile $PIDFILE --exec $ASD -- $OPTS +} + +coldstart() { + start-stop-daemon --start --quiet --name $ASDN --pidfile $PIDFILE --exec $ASD -- $COLD_OPTS +} + +stop() { + [ -f $PIDFILE ] && pid=`cat $PIDFILE` + start-stop-daemon --stop --quiet --pidfile $PIDFILE --name $ASDN + rv=$? + [ $pid ] && while [ -e /proc/$pid ]; do sleep 0.1; done + return $rv +} + +do_telemetry_start () { + if [ $EDITION = "community" ]; + then + /usr/sbin/service aerospike_telemetry start >/dev/null 2>&1 + fi +} + +do_telemetry_stop () { + if [ $EDITION = "community" ]; + then + /usr/sbin/service aerospike_telemetry stop >/dev/null 2>&1 + fi +} + +case "$1" in + start|coldstart) + ulimit -n 100000 + logger -t aerospike "ulimit -n=" `ulimit -n` + set_shmall + set_shmmax + set_socket_buffer_limits + create_piddir + + [ -n "$LD_PRELOAD" ] && export LD_PRELOAD + log_daemon_msg "${1^}ing aerospike" + $1 + case $? in + 0) + log_end_msg 0 + ;; + 1) + echo "aerospike already started" + log_end_msg 0 + ;; + *) + log_end_msg 1 + ;; + esac + + do_telemetry_start + ;; + stop) + log_daemon_msg "Stopping aerospike" + $1 + case $? 
in + 0) + log_end_msg 0 + ;; + 1) + echo "aerospike already stopped" + log_end_msg 0 + ;; + *) + log_end_msg 1 + ;; + esac + + do_telemetry_stop + ;; + status) + status_of_proc -p $PIDFILE $ASDN aerospike + ;; + restart) + [ -n "`pgrep $ASDN`" ] && $0 stop + $0 start + ;; + *) + echo $"Usage: $0 {start|stop|status|coldstart|restart}" + exit 2 + ;; +esac diff --git a/as/etc/init-telemetry-script b/as/etc/init-telemetry-script new file mode 100644 index 00000000..3c3b2ff2 --- /dev/null +++ b/as/etc/init-telemetry-script @@ -0,0 +1,49 @@ +#!/bin/bash +# chkconfig: 2345 85 15 +# description: Starts and stops the Aerospike Telemetry Agent + +### BEGIN INIT INFO +# Provides: aerospike_telemetry +# Required-Start: $remote_fs $network +# Required-Stop: $remote_fs $network +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Aerospike Telemetry Agent +### END INIT INFO + +# Source function library. +. /etc/rc.d/init.d/functions + +DIR=/opt/aerospike/telemetry +DAEMON=$DIR/telemetry.py +CONFIG="/etc/aerospike/telemetry.conf" + +start() { + python $DAEMON $CONFIG start +} + +stop() { + python $DAEMON $CONFIG stop +} + +status() { + python $DAEMON $CONFIG status +} + +restart() { + python $DAEMON $CONFIG restart +} + +try-restart() { + python $DAEMON $CONFIG try-restart +} + +case "$1" in + start|stop|status|restart|try-restart) + ${1} + ;; + *) + echo "Usage: $0 {start|stop|status|restart|try-restart}" + exit 2 + ;; +esac diff --git a/as/etc/init-telemetry-script.deb b/as/etc/init-telemetry-script.deb new file mode 100644 index 00000000..876cf4f7 --- /dev/null +++ b/as/etc/init-telemetry-script.deb @@ -0,0 +1,49 @@ +#!/bin/bash +# chkconfig: 2345 85 15 +# description: Starts and stops the Aerospike Telemetry Agent + +### BEGIN INIT INFO +# Provides: aerospike_telemetry +# Required-Start: $remote_fs $network +# Required-Stop: $remote_fs $network +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Aerospike Telemetry Agent +### END INIT INFO + +# Source function library. +. 
/lib/lsb/init-functions + +DIR=/opt/aerospike/telemetry +DAEMON=$DIR/telemetry.py +CONFIG="/etc/aerospike/telemetry.conf" + +start() { + python $DAEMON $CONFIG start +} + +stop() { + python $DAEMON $CONFIG stop +} + +status() { + python $DAEMON $CONFIG status +} + +restart() { + python $DAEMON $CONFIG restart +} + +try-restart() { + python $DAEMON $CONFIG try-restart +} + +case "$1" in + start|stop|status|restart|try-restart) + ${1} + ;; + *) + echo "Usage: $0 {start|stop|status|restart|try-restart}" + exit 2 + ;; +esac diff --git a/as/etc/irqbalance-ban.sh b/as/etc/irqbalance-ban.sh new file mode 100755 index 00000000..bd934147 --- /dev/null +++ b/as/etc/irqbalance-ban.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +if [ -d ${1}/net ]; then + echo 'ban=true' +fi diff --git a/as/etc/logrotate_asd b/as/etc/logrotate_asd new file mode 100644 index 00000000..bbaa54fe --- /dev/null +++ b/as/etc/logrotate_asd @@ -0,0 +1,12 @@ +/var/log/aerospike/aerospike.log { + daily + rotate 90 + dateext + compress + olddir /var/log/aerospike + missingok + notifempty + postrotate + (kill -HUP `pgrep asd`) > /dev/null 2>&1 || true + endscript +} diff --git a/as/etc/logrotate_telemetry b/as/etc/logrotate_telemetry new file mode 100644 index 00000000..69548fc3 --- /dev/null +++ b/as/etc/logrotate_telemetry @@ -0,0 +1,12 @@ +/var/log/aerospike/telemetry.log { + daily + rotate 5 + dateext + compress + olddir /var/log/aerospike + missingok + notifempty + postrotate + service aerospike_telemetry try-restart > /dev/null 2>&1 || true + endscript +} diff --git a/as/etc/telemetry.conf b/as/etc/telemetry.conf new file mode 100644 index 00000000..bf14cdad --- /dev/null +++ b/as/etc/telemetry.conf @@ -0,0 +1,13 @@ +[asd] +config-file = /etc/aerospike/aerospike.conf + +[logging] +logfile = /var/log/aerospike/telemetry.log +loglevel = info + +[main] +disable = false +interval = 600 +home-url = https://telemetry.aerospike.com +user = aerospike +group = aerospike diff --git a/as/etc/telemetry_dev.conf b/as/etc/telemetry_dev.conf new file mode 100644 index 00000000..3e105053 --- /dev/null +++ b/as/etc/telemetry_dev.conf @@ -0,0 +1,13 @@ +[asd] +config-file = as/etc/aerospike_dev.conf + +[logging] +logfile = run/log/telemetry.log +loglevel = info + +[main] +disable = false +interval = 600 +home-url = https://telemetry.aerospike.com +user = aerospike +group = aerospike diff --git a/as/etc/valgrind.supp b/as/etc/valgrind.supp new file mode 100644 index 00000000..e4ba17e4 --- /dev/null +++ b/as/etc/valgrind.supp @@ -0,0 +1,190 @@ +# I hope one can put comments in here +# this supression file allows backtraces under valgrind +# Put the following block in `~/valgrind.supp`, then run `valgrind --suppressions=/home/bob/valgrind.supp` +# (note that `valgrind` doesn't understand `~` in pathnames). 
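+#
+# For reference, each record below uses the standard Memcheck suppression
+# syntax (nothing in the format itself is Aerospike-specific):
+#
+# {
+#    suppression-name
+#    Memcheck:kind           e.g. Leak, Param, Addr4
+#    fun:frame               call-stack frames, innermost first; obj:/path
+#    ...                     entries and "..." wildcard frames also accepted
+# }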
+ +# malloc known supressions +# +{ + alloc-namespaces + Memcheck:Leak + fun:malloc + fun:cf_malloc_at + fun:cf_rc_alloc_at + fun:as_namespace_create + fun:as_config_init + fun:main +} + +{ + index-trees + Memcheck:Leak + fun:malloc + fun:cf_malloc_at + fun:cf_rc_alloc_at + fun:as_index_tree_create + fun:as_partition_reinit + fun:as_partition_balance_new + fun:as_paxos_init + fun:main +} + +## +# we always send uninit data to the network, so it says, and it's always ok +# +{ + SendToUninit-xxx + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response + fun:send_response + fun:send_success + fun:send_result + fun:udf_apply_record + fun:udf_rw_local + fun:internal_rw_start + fun:as_rw_start + fun:as_write_start + fun:thr_tsvc +} + +{ + SendToUninit-222 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response + fun:thr_tsvc_read + fun:rw_complete + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread + obj:* +} +{ + SendToUninit-333 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response + fun:send_response.isra.3.constprop.4 + fun:udf_apply_record + fun:udf_rw_local + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread +} + +{ + udf_write_1 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response + fun:send_response + fun:udf_apply_record + fun:udf_rw_local + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread + obj:* +} + +{ + udf_write_2 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:thr_tsvc_read + fun:rw_complete + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread + obj:* +} + +{ + udf_write_3 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response + fun:send_response + fun:send_result + fun:udf_rw_local + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread + obj:* +} + + +# +# known issues - reasonably well investigated +# + +{ + libc-execinfo-backtrace + Memcheck:Addr4 + obj:/lib/tls/i686/cmov/libc-2.7.so + obj:/lib/ld-2.7.so + fun:__libc_dlopen_mode + obj:/lib/tls/i686/cmov/libc-2.7.so + fun:pthread_once + fun:cf_fault_event +} + +{ + storage-files-write + Memcheck:Param + write(buf) + obj:/usr/lib/debug/libpthread-2.8.90.so + fun:write_bins + fun:as_storage_record_close_files + fun:as_storage_record_close + fun:write_local + fun:as_write_start + fun:thr_tsvc + fun:start_thread + fun:clone + obj:* +} + +{ + storage-header-write + Memcheck:Param + write(buf) + obj:/lib64/libpthread-2.11.1.so + fun:as_storage_write_header + fun:as_storage_info_flush_ssd + fun:init_ssd_devices + fun:as_storage_namespace_init_ssd + fun:as_storage_namespace_init + fun:main + obj:* +} + +{ + uninitalized_fabric_message + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:fabric_process_writable + fun:fabric_worker_fn + fun:start_thread + fun:clone +} + + diff --git a/as/include/base/aggr.h b/as/include/base/aggr.h new file mode 100644 index 00000000..aeb29a95 --- /dev/null +++ b/as/include/base/aggr.h @@ -0,0 +1,54 @@ +/* + * aggr.h + * + * Copyright (C) 2014-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "aerospike/as_rec.h"
+#include "aerospike/as_result.h"
+#include "aerospike/as_stream.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_ll.h"
+
+#include "ai_btree.h"
+
+#include "transaction/udf.h"
+
+struct as_namespace_s;
+struct as_partition_reservation_s;
+struct udf_record_s;
+
+typedef struct {
+	as_stream_status (* ostream_write) (void *, as_val *);
+	void (* set_error) (void *, int);
+	struct as_partition_reservation_s * (* ptn_reserve) (void *, struct as_namespace_s *, uint32_t, struct as_partition_reservation_s *);
+	void (* ptn_release) (void *, struct as_partition_reservation_s *);
+	bool (* pre_check) (void *, struct udf_record_s *, void *);
+} as_aggr_hooks;
+
+typedef struct {
+	udf_def def;
+	const as_aggr_hooks * aggr_hooks;
+} as_aggr_call;
+
+int as_aggr_process(struct as_namespace_s *ns, as_aggr_call *ag_call, cf_ll *ap_recl, void *udata, as_result *ap_res);
diff --git a/as/include/base/as_stap.h b/as/include/base/as_stap.h
new file mode 100644
index 00000000..9919e0cc
--- /dev/null
+++ b/as/include/base/as_stap.h
@@ -0,0 +1,52 @@
+/*
+ * as_stap.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#if defined(USE_SYSTEMTAP)
+#include <sys/sdt.h>
+#include "probes.h"
+#else
+#define ASD_TRANS_DEMARSHAL(arg1,arg2,arg3)
+#define ASD_QUERY_STARTING(arg1,arg2)
+#define ASD_QUERY_QTRSETUP_STARTING(arg1,arg2)
+#define ASD_QUERY_QTRSETUP_FINISHED(arg1,arg2)
+#define ASD_QUERY_INIT(arg1,arg2)
+#define ASD_QUERY_DONE(arg1,arg2,arg3)
+#define ASD_QUERY_TRANS_DONE(arg1,arg2,arg3)
+#define ASD_QUERY_QTR_ALLOC(arg1,arg2,arg3)
+#define ASD_QUERY_QTR_FREE(arg1,arg2,arg3)
+#define ASD_QUERY_IOREQ_STARTING(arg1,arg2)
+#define ASD_QUERY_IOREQ_FINISHED(arg1,arg2)
+#define ASD_QUERY_IO_STARTING(arg1,arg2)
+#define ASD_QUERY_IO_NOTMATCH(arg1,arg2)
+#define ASD_QUERY_IO_ERROR(arg1,arg2)
+#define ASD_QUERY_IO_FINISHED(arg1,arg2)
+#define ASD_QUERY_NETIO_STARTING(arg1,arg2)
+#define ASD_QUERY_NETIO_FINISHED(arg1,arg2)
+#define ASD_QUERY_ADDFIN(arg1,arg2)
+#define ASD_QUERY_SENDPACKET_STARTING(arg1,arg2,arg3)
+#define ASD_QUERY_SENDPACKET_CONTINUE(arg1,arg2)
+#define ASD_QUERY_SENDPACKET_FINISHED(arg1)
+#define ASD_SINDEX_MSGRANGE_STARTING(arg1,arg2)
+#define ASD_SINDEX_MSGRANGE_FINISHED(arg1,arg2)
+#endif
diff --git a/as/include/base/batch.h b/as/include/base/batch.h
new file mode 100644
index 00000000..36e9cc33
--- /dev/null
+++ b/as/include/base/batch.h
@@ -0,0 +1,40 @@
+/*
+ * batch.h
+ *
+ * Copyright (C) 2008-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include "base/transaction.h"
+#include "dynbuf.h"
+
+typedef struct as_batch_shared_s as_batch_shared;
+
+int as_batch_init();
+int as_batch_queue_task(as_transaction* tr);
+void as_batch_add_result(as_transaction* tr, uint16_t n_bins, as_bin** bins, as_msg_op** ops);
+void as_batch_add_proxy_result(as_batch_shared* shared, uint32_t index, cf_digest* digest, cl_msg* cmsg, size_t size);
+void as_batch_add_error(as_batch_shared* shared, uint32_t index, int result_code);
+int as_batch_threads_resize(uint32_t threads);
+void as_batch_queues_info(cf_dyn_buf* db);
+int as_batch_unused_buffers();
+void as_batch_destroy();
+
+as_file_handle* as_batch_get_fd_h(as_batch_shared* shared);
diff --git a/as/include/base/cdt.h b/as/include/base/cdt.h
new file mode 100644
index 00000000..f70d1552
--- /dev/null
+++ b/as/include/base/cdt.h
@@ -0,0 +1,492 @@
+/*
+ * cdt.h
+ *
+ * Copyright (C) 2015-2018 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "aerospike/as_msgpack.h"
+
+#include "base/datamodel.h"
+#include "base/proto.h"
+
+#include "dynbuf.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+#define CDT_MAX_PACKED_INT_SZ (sizeof(uint64_t) + 1)
+#define CDT_MAX_STACK_OBJ_SZ (1024 * 1024)
+#define CDT_MAX_PARAM_LIST_COUNT (1024 * 1024)
+
+typedef struct rollback_alloc_s {
+	cf_ll_buf *ll_buf;
+	size_t malloc_list_sz;
+	size_t malloc_list_cap;
+	bool malloc_ns;
+	void *malloc_list[];
+} rollback_alloc;
+
+#define define_rollback_alloc(__name, __alloc_buf, __rollback_size, __malloc_ns) \
+	uint8_t __name ## __mem[sizeof(rollback_alloc) + sizeof(void *) * (__alloc_buf ? 0 : __rollback_size)]; \
+	rollback_alloc *__name = (rollback_alloc *)__name ## __mem; \
+	__name->ll_buf = __alloc_buf; \
+	__name->malloc_list_sz = 0; \
+	__name->malloc_list_cap = (__alloc_buf ? 0 : __rollback_size); \
+	__name->malloc_ns = __malloc_ns;
+
+typedef struct cdt_process_state_s {
+	as_cdt_optype type;
+	as_unpacker pk;
+	uint32_t ele_count;
+} cdt_process_state;
+
+typedef struct cdt_payload_s {
+	const uint8_t *ptr;
+	uint32_t sz;
+} cdt_payload;
+
+typedef struct result_data_s {
+	as_bin *result;
+	rollback_alloc *alloc;
+	result_type_t type;
+	as_cdt_op_flags flags;
+	bool is_multi;
+} cdt_result_data;
+
+typedef struct cdt_modify_data_s {
+	as_bin *b;
+	as_bin *result;
+	cf_ll_buf *alloc_buf;
+
+	int ret_code;
+} cdt_modify_data;
+
+typedef struct cdt_read_data_s {
+	const as_bin *b;
+	as_bin *result;
+
+	int ret_code;
+} cdt_read_data;
+
+typedef struct cdt_container_builder_s {
+	as_particle *particle;
+	uint8_t *write_ptr;
+	uint32_t *sz;
+	uint32_t ele_count;
+} cdt_container_builder;
+
+typedef struct cdt_op_table_entry_s {
+	uint32_t count;
+	uint32_t opt_args;
+	const char *name;
+	const as_cdt_paramtype *args;
+} cdt_op_table_entry;
+
+typedef struct cdt_calc_delta_s {
+	int64_t incr_int;
+	double incr_double;
+
+	as_val_t type;
+
+	int64_t value_int;
+	double value_double;
+} cdt_calc_delta;
+
+typedef struct msgpacked_index_s {
+	uint8_t *ptr;
+	uint32_t ele_sz;
+	uint32_t ele_count;
+} msgpacked_index;
+
+typedef struct offset_index_s {
+	msgpacked_index _;
+
+	const uint8_t *contents;
+	uint32_t content_sz;
+	bool is_partial;
+} offset_index;
+
+// Value order index.
+typedef struct order_index_s {
+	msgpacked_index _;
+	uint32_t max_idx;
+} order_index;
+
+typedef struct order_index_find_s {
+	uint32_t start;
+	uint32_t count;
+	uint32_t target;
+	uint32_t result;
+	bool found;
+} order_index_find;
+
+typedef msgpack_compare_t (*order_heap_compare_fn)(const void *ptr, uint32_t index0, uint32_t index1);
+
+// Value order heap.
+typedef struct order_heap_s {
+	order_index _;
+	const void *userdata;
+	order_heap_compare_fn cmp_fn;
+	msgpack_compare_t cmp;
+	uint32_t filled;
+} order_heap;
+
+typedef struct cdt_packed_op_s {
+	// Input.
+	const uint8_t *packed;
+	uint32_t packed_sz;
+
+	// Parsed.
+	uint32_t ele_count;
+	const uint8_t *contents;
+	uint32_t content_sz;
+
+	// Result.
+	uint32_t new_ele_count;
+} cdt_packed_op;
+
+struct order_index_adjust_s;
+typedef uint32_t (*order_index_adjust_func)(const struct order_index_adjust_s *via, uint32_t src);
+
+typedef struct order_index_adjust_s {
+	order_index_adjust_func f;
+	uint32_t upper;
+	uint32_t lower;
+	int32_t delta;
+} order_index_adjust;
+
+typedef enum {
+	CDT_FIND_ITEMS_IDXS_FOR_LIST_VALUE,
+	CDT_FIND_ITEMS_IDXS_FOR_MAP_KEY,
+	CDT_FIND_ITEMS_IDXS_FOR_MAP_VALUE
+} cdt_find_items_idxs_type;
+
+#define define_offset_index(__name, __contents, __content_sz, __ele_count) \
+	offset_index __name; \
+	offset_index_init(&__name, NULL, __ele_count, __contents, __content_sz); \
+	uint8_t __name ## __offset_index_mem__[offset_index_size(&__name)]; \
+	__name._.ptr = __name ## __offset_index_mem__; \
+	offset_index_set_filled(&__name, 1)
+
+#define cond_vla_order_index2(__name, __max_idx, __alloc_count, __cond) \
+	union { \
+		order_index ordidx; \
+		uint8_t mem_temp[sizeof(order_index) + ((__cond) ? order_index_calc_size(__max_idx, __alloc_count) : 0)]; \
+	} __name; \
+	order_index_init2(&__name.ordidx, __name.mem_temp + sizeof(order_index), __max_idx, __alloc_count)
+
+#define define_order_index(__name, __ele_count) \
+	order_index __name; \
+	uint8_t __name ## __order_index_mem__[order_index_calc_size(__ele_count, __ele_count)]; \
+	order_index_init(&__name, __name ## __order_index_mem__, __ele_count)
+
+#define define_order_index2(__name, __max_idx, __alloc_count) \
+	order_index __name; \
+	uint8_t __name ## __order_index_mem__[order_index_calc_size(__max_idx, __alloc_count)]; \
+	order_index_init2(&__name, __name ## __order_index_mem__, __max_idx, __alloc_count)
+
+#define define_int_list_builder(__name, __alloc, __count) \
+	cdt_container_builder __name; \
+	cdt_int_list_builder_start(&__name, __alloc, __count)
+
+#define define_cdt_idx_mask(__name, __ele_count) \
+	uint64_t __name[cdt_idx_mask_count(__ele_count)]; \
+	cdt_idx_mask_init(__name, __ele_count)
+
+#define cond_define_cdt_idx_mask(__name, __ele_count, __cond) \
+	uint64_t __name[__cond ? cdt_idx_mask_count(__ele_count) : 1]; \
+	if (__cond) { \
+		cdt_idx_mask_init(__name, __ele_count); \
+	}
+
+#define define_build_order_heap_by_range(__name, __idx, __count, __ele_count, __udata, __cmp_fn, __success) \
+	order_heap __name; \
+	uint8_t __name ## __order_heap_mem__[order_index_calc_size(__ele_count, __ele_count)]; \
+	bool __success = order_heap_init_build_by_range(&__name, __name ## __order_heap_mem__, __idx, __count, __ele_count, __cmp_fn, __udata)
+
+#define VA_NARGS_SEQ 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+#define VA_NARGS_EXTRACT_N(_9, _8, _7, _6, _5, _4, _3, _2, _1, _0, N, ...) N
+#define VA_NARGS_SEQ2N(...) VA_NARGS_EXTRACT_N(__VA_ARGS__)
+#define VA_NARGS(...) VA_NARGS_SEQ2N(_, ##__VA_ARGS__, VA_NARGS_SEQ)
+
+// Get around needing to pass last named arg to va_start().
+#define CDT_OP_TABLE_GET_PARAMS(state, ...) cdt_process_state_get_params(state, VA_NARGS(__VA_ARGS__), __VA_ARGS__)
+
+static const uint8_t msgpack_nil[1] = {0xC0};
+
+
+//==========================================================
+// Function declarations.
+// + +bool calc_index_count(int64_t in_index, uint64_t in_count, uint32_t ele_count, uint32_t *out_index, uint32_t *out_count, bool is_multi); + +// cdt_result_data +bool result_data_set_not_found(cdt_result_data *rd, int64_t index); +void result_data_set_list_int2x(cdt_result_data *rd, int64_t i1, int64_t i2); +int result_data_set_index_rank_count(cdt_result_data *rd, uint32_t start, uint32_t count, uint32_t ele_count); +int result_data_set_range(cdt_result_data *rd, uint32_t start, uint32_t count, uint32_t ele_count); +void result_data_set_by_irc(cdt_result_data *rd, const order_index *ordidx, const order_index *idx_map, uint32_t total_count); +void result_data_set_by_itemlist_irc(cdt_result_data *rd, const order_index *items_ord, order_index *ranks, uint32_t total_count); +void result_data_set_int_list_by_mask(cdt_result_data *rd, const uint64_t *mask, uint32_t count, uint32_t ele_count); + +// as_bin +void as_bin_set_int(as_bin *b, int64_t value); +void as_bin_set_double(as_bin *b, double value); +void as_bin_set_unordered_empty_list(as_bin *b, rollback_alloc *alloc_buf); +void as_bin_set_empty_packed_map(as_bin *b, rollback_alloc *alloc_buf, uint8_t flags); + +// cdt_delta_value +bool cdt_calc_delta_init(cdt_calc_delta *cdv, const cdt_payload *delta_value, bool is_decrement); +bool cdt_calc_delta_add(cdt_calc_delta *cdv, as_unpacker *pk_value); +void cdt_calc_delta_pack_and_result(cdt_calc_delta *cdv, cdt_payload *value, as_bin *result); + +// cdt_payload +bool cdt_payload_is_int(const cdt_payload *payload); +int64_t cdt_payload_get_int64(const cdt_payload *payload); +void cdt_payload_pack_int(cdt_payload *packed, int64_t value); +void cdt_payload_pack_double(cdt_payload *packed, double value); + +// cdt_process_state +bool cdt_process_state_init(cdt_process_state *cdt_state, const as_msg_op *op); +bool cdt_process_state_get_params(cdt_process_state *state, size_t n, ...); +const char *cdt_process_state_get_op_name(const cdt_process_state *state); + +// cdt_process_state_packed_list +bool cdt_process_state_packed_list_modify_optype(cdt_process_state *state, cdt_modify_data *cdt_udata); +bool cdt_process_state_packed_list_read_optype(cdt_process_state *state, cdt_read_data *cdt_udata); + +void cdt_container_builder_add(cdt_container_builder *builder, const uint8_t *buf, uint32_t sz); +void cdt_container_builder_add_n(cdt_container_builder *builder, const uint8_t *buf, uint32_t count, uint32_t sz); +void cdt_container_builder_add_int64(cdt_container_builder *builder, int64_t value); +void cdt_container_builder_add_int_range(cdt_container_builder *builder, uint32_t start, uint32_t count, uint32_t ele_count, bool reverse); +void cdt_container_builder_set_result(cdt_container_builder *builder, cdt_result_data *result); + +void cdt_list_builder_start(cdt_container_builder *builder, rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t max_sz); +void cdt_map_builder_start(cdt_container_builder *builder, rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t content_max_sz, uint8_t flags); + +// cdt_process_state_packed_map +bool cdt_process_state_packed_map_modify_optype(cdt_process_state *state, cdt_modify_data *cdt_udata); +bool cdt_process_state_packed_map_read_optype(cdt_process_state *state, cdt_read_data *cdt_udata); + +// rollback_alloc +void rollback_alloc_push(rollback_alloc *packed_alloc, void *ptr); +uint8_t *rollback_alloc_reserve(rollback_alloc *alloc_buf, size_t sz); +void rollback_alloc_rollback(rollback_alloc *alloc_buf); +bool rollback_alloc_from_msgpack(rollback_alloc 
*alloc_buf, as_bin *b, const cdt_payload *seg); + +// msgpacked_index +void msgpacked_index_set(msgpacked_index *idxs, uint32_t index, uint32_t value); +void msgpacked_index_incr(msgpacked_index *idxs, uint32_t index); +void msgpacked_index_set_ptr(msgpacked_index *idxs, uint8_t *ptr); +void *msgpacked_index_get_mem(const msgpacked_index *idxs, uint32_t index); +uint32_t msgpacked_index_size(const msgpacked_index *idxs); +uint32_t msgpacked_index_ptr2value(const msgpacked_index *idxs, const void *ptr); +uint32_t msgpacked_index_get(const msgpacked_index *idxs, uint32_t index); +void msgpacked_index_print(const msgpacked_index *idxs, const char *name); +bool msgpacked_index_find_index_sorted(const msgpacked_index *sorted_indexes, uint32_t find_index, uint32_t count, uint32_t *where); + +// offset_index +void offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, uint32_t ele_count, const uint8_t *contents, uint32_t content_sz); +void offset_index_set(offset_index *offidx, uint32_t index, uint32_t value); +bool offset_index_set_next(offset_index *offidx, uint32_t index, uint32_t value); +void offset_index_set_filled(offset_index *offidx, uint32_t ele_filled); +void offset_index_set_ptr(offset_index *offidx, uint8_t *idx_mem, const uint8_t *packed_mem); +void offset_index_copy(offset_index *dest, const offset_index *src, uint32_t d_start, uint32_t s_start, uint32_t count, int delta); +void offset_index_append_size(offset_index *offidx, uint32_t delta); + +bool offset_index_find_items(offset_index *full_offidx, cdt_find_items_idxs_type find_type, as_unpacker *items_pk, order_index *items_ordidx_r, bool inverted, uint64_t *rm_mask, uint32_t *rm_count_r, order_index *rm_ranks_r); + +void *offset_index_get_mem(const offset_index *offidx, uint32_t index); +uint32_t offset_index_size(const offset_index *offidx); +bool offset_index_is_null(const offset_index *offidx); +bool offset_index_is_valid(const offset_index *offidx); +bool offset_index_is_full(const offset_index *offidx); +uint32_t offset_index_get_const(const offset_index *offidx, uint32_t idx); +uint32_t offset_index_get_delta_const(const offset_index *offidx, uint32_t index); +uint32_t offset_index_get_filled(const offset_index *offidx); + +void offset_index_print(const offset_index *offidx, const char *name); +void offset_index_delta_print(const offset_index *offidx, const char *name); + +// order_index +void order_index_init(order_index *ordidx, uint8_t *ptr, uint32_t ele_count); +void order_index_init2(order_index *ordidx, uint8_t *ptr, uint32_t max_idx, uint32_t ele_count); +void order_index_init_ref(order_index *dst, const order_index *src, uint32_t start, uint32_t count); +void order_index_set(order_index *ordidx, uint32_t index, uint32_t value); +void order_index_set_ptr(order_index *ordidx, uint8_t *ptr); +void order_index_incr(order_index *ordidx, uint32_t index); +void order_index_clear(order_index *ordidx); +bool order_index_sorted_mark_dup_eles(order_index *ordidx, const offset_index *full_offidx, uint32_t *count_r, uint32_t *sz_r); + +uint32_t order_index_size(const order_index *ordidx); +bool order_index_is_null(const order_index *ordidx); +bool order_index_is_valid(const order_index *ordidx); +bool order_index_is_filled(const order_index *ordidx); + +void *order_index_get_mem(const order_index *ordidx, uint32_t index); +uint32_t order_index_ptr2value(const order_index *ordidx, const void *ptr); +uint32_t order_index_get(const order_index *ordidx, uint32_t index); + +bool order_index_find_rank_by_value(const 
order_index *ordidx, const cdt_payload *value, const offset_index *full_offidx, order_index_find *find); + +uint32_t order_index_get_ele_size(const order_index *ordidx, uint32_t count, const offset_index *full_offidx); +uint8_t *order_index_write_eles(const order_index *ordidx, uint32_t count, const offset_index *full_offidx, uint8_t *ptr, bool invert); + +uint32_t order_index_adjust_value(const order_index_adjust *via, uint32_t src); +void order_index_copy(order_index *dest, const order_index *src, uint32_t d_start, uint32_t s_start, uint32_t count, const order_index_adjust *adjust); +size_t order_index_calc_size(uint32_t max_idx, uint32_t ele_count); + +void order_index_print(const order_index *ordidx, const char *name); + +// order_heap +bool order_heap_init_build_by_range(order_heap *heap, uint8_t *heap_mem, uint32_t idx, uint32_t count, uint32_t ele_count, order_heap_compare_fn cmp_fn, const void *udata); +void order_heap_swap(order_heap *heap, uint32_t index1, uint32_t index2); +bool order_heap_remove_top(order_heap *heap); +bool order_heap_replace_top(order_heap *heap, uint32_t value); +bool order_heap_heapify(order_heap *heap, uint32_t index); +bool order_heap_build(order_heap *heap, bool init); +bool order_heap_order_at_end(order_heap *heap, uint32_t count); +void order_heap_reverse_end(order_heap *heap, uint32_t count); + +void order_heap_print(const order_heap *heap); + +// cdt_idx_mask +size_t cdt_idx_mask_count(uint32_t ele_count); +void cdt_idx_mask_init(uint64_t *mask, uint32_t ele_count); +void cdt_idx_mask_set(uint64_t *mask, uint32_t idx); +void cdt_idx_mask_set_by_ordidx(uint64_t *mask, const order_index *ordidx, uint32_t start, uint32_t count, bool inverted); +void cdt_idx_mask_set_by_irc(uint64_t *mask, const order_index *rankcount, const order_index *idx_map, bool inverted); +void cdt_idx_mask_invert(uint64_t *mask, uint32_t ele_count); + +uint64_t cdt_idx_mask_get(const uint64_t *mask, uint32_t idx); + +bool cdt_idx_mask_is_set(const uint64_t *mask, uint32_t idx); + +uint32_t cdt_idx_mask_find(const uint64_t *mask, uint32_t start, uint32_t end, bool is_find0); +uint8_t *cdt_idx_mask_write_eles(const uint64_t *mask, uint32_t count, const offset_index *full_offidx, uint8_t *ptr, bool invert); +uint32_t cdt_idx_mask_get_content_sz(const uint64_t *mask, uint32_t count, const offset_index *full_offidx); + +void cdt_idx_mask_print(const uint64_t *mask, uint32_t ele_count, const char *name); + +// list +bool list_full_offset_index_fill_all(offset_index *offidx); +bool list_order_index_sort(order_index *ordidx, const offset_index *full_offidx, as_cdt_sort_flags flags); + +bool list_param_parse(const cdt_payload *items, as_unpacker *pk, uint32_t *count_r); + +// Debugging support +void print_hex(const uint8_t *packed, uint32_t packed_sz, char *buf, uint32_t buf_sz); +void print_packed(const uint8_t *packed, uint32_t sz, const char *name); +void cdt_bin_print(const as_bin *b, const char *name); + + +//========================================================== +// Inline functions. 
+// + +static inline bool +result_data_is_inverted(cdt_result_data *rd) +{ + return (rd->flags & AS_CDT_OP_FLAG_INVERTED) != 0; +} + +static inline void +result_data_set(cdt_result_data *rd, uint64_t result_type, bool is_multi) +{ + rd->type = (result_type_t)(result_type & AS_CDT_OP_FLAG_RESULT_MASK); + rd->flags = (as_cdt_op_flags)(result_type & (~AS_CDT_OP_FLAG_RESULT_MASK)); + rd->is_multi = is_multi; +} + +static inline void +result_data_set_int(cdt_result_data *rd, int64_t value) +{ + if (rd) { + as_bin_set_int(rd->result, value); + } +} + +static inline bool +result_data_is_return_elements(const cdt_result_data *rd) +{ + return (rd->type == RESULT_TYPE_KEY || rd->type == RESULT_TYPE_VALUE || + rd->type == RESULT_TYPE_MAP); +} + +static inline bool +result_data_is_return_index(const cdt_result_data *rd) +{ + return (rd->type == RESULT_TYPE_INDEX || rd->type == RESULT_TYPE_REVINDEX); +} + +static inline bool +result_data_is_return_index_range(const cdt_result_data *rd) +{ + return (rd->type == RESULT_TYPE_INDEX_RANGE || + rd->type == RESULT_TYPE_REVINDEX_RANGE); +} + +static inline bool +result_data_is_return_rank(const cdt_result_data *rd) +{ + return (rd->type == RESULT_TYPE_REVRANK || rd->type == RESULT_TYPE_RANK); +} + +static inline bool +result_data_is_return_rank_range(const cdt_result_data *rd) +{ + return (rd->type == RESULT_TYPE_REVRANK_RANGE || + rd->type == RESULT_TYPE_RANK_RANGE); +} + +static inline void +order_heap_set(order_heap *heap, uint32_t index, uint32_t value) +{ + order_index_set((order_index *)heap, index, value); +} + +static inline uint32_t +order_heap_get(const order_heap *heap, uint32_t index) +{ + return order_index_get((const order_index *)heap, index); +} + +// Calculate index given index and max_index. +static inline int64_t +calc_index(int64_t index, uint32_t max_index) +{ + return index < 0 ? (int64_t)max_index + index : index; +} + +static inline void +cdt_int_list_builder_start(cdt_container_builder *builder, + rollback_alloc *alloc_buf, uint32_t ele_count) +{ + cdt_list_builder_start(builder, alloc_buf, ele_count, + CDT_MAX_PACKED_INT_SZ * ele_count); +} diff --git a/as/include/base/cfg.h b/as/include/base/cfg.h new file mode 100644 index 00000000..8278bdf3 --- /dev/null +++ b/as/include/base/cfg.h @@ -0,0 +1,284 @@ +/* + * cfg.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. 
+//
+
+#include <grp.h>
+#include <pthread.h>
+#include <pwd.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "xdr_config.h"
+
+#include "aerospike/mod_lua_config.h"
+#include "citrusleaf/cf_atomic.h"
+
+#include "enhanced_alloc.h"
+#include "hardware.h"
+#include "node.h"
+#include "socket.h"
+#include "tls.h"
+
+#include "base/security_config.h"
+#include "fabric/clustering.h"
+#include "fabric/fabric.h"
+#include "fabric/hb.h"
+#include "fabric/hlc.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+struct as_namespace_s;
+
+
+//==========================================================
+// Typedefs and constants.
+//
+
+#ifndef AS_NAMESPACE_SZ
+#define AS_NAMESPACE_SZ 2
+#endif
+
+#define AS_CLUSTER_NAME_SZ 65
+
+#define MAX_DEMARSHAL_THREADS 256
+#define MAX_BATCH_THREADS 256
+#define MAX_TLS_SPECS 10
+
+// Declare bools with PAD_BOOL so they can't share a 4-byte space with other
+// bools, chars or shorts. This prevents adjacent bools set concurrently in
+// different threads (albeit very unlikely) from interfering with each other.
+// Add others (e.g. PAD_UINT8, PAD_UINT16 ...) as needed.
+#define PGLUE(a, b) a##b
+#define PBOOL(line) bool PGLUE(pad_, line)[3]; bool
+#define PAD_BOOL PBOOL(__LINE__)
+
+typedef struct as_config_s {
+
+	// The order here matches that in the configuration parser's enum,
+	// cfg_case_id. This is for organizational sanity.
+
+	//--------------------------------------------
+	// service context.
+	//
+
+	// Normally visible, in canonical configuration file order:
+
+	uid_t uid;
+	gid_t gid;
+	uint32_t paxos_single_replica_limit; // cluster size at which, and below, the cluster will run with replication factor 1
+	char* pidfile;
+	int n_proto_fd_max;
+
+	// Normally hidden:
+
+	// Note - advertise-ipv6 affects a cf_socket_ee.c global, so can't be here.
+	cf_topo_auto_pin auto_pin;
+	int n_batch_threads;
+	uint32_t batch_max_buffers_per_queue; // maximum number of buffers allowed in a buffer queue at any one time, fail batch if full
+	uint32_t batch_max_requests; // maximum count of database requests in a single batch
+	uint32_t batch_max_unused_buffers; // maximum number of buffers allowed in buffer pool at any one time
+	uint32_t batch_priority; // number of records between an enforced context switch, used by old batch only
+	uint32_t n_batch_index_threads;
+	char cluster_name[AS_CLUSTER_NAME_SZ];
+	as_clustering_config clustering_config;
+	PAD_BOOL fabric_benchmarks_enabled;
+	PAD_BOOL svc_benchmarks_enabled;
+	PAD_BOOL info_hist_enabled;
+	const char* feature_key_file;
+	uint32_t hist_track_back; // total time span in seconds over which to cache data
+	uint32_t hist_track_slice; // period in seconds at which to cache histogram data
+	char* hist_track_thresholds; // comma-separated bucket (ms) values to track
+	int n_info_threads;
+	// Note - log-local-time affects a cf_fault.c global, so can't be here.
+ uint32_t migrate_max_num_incoming; + uint32_t n_migrate_threads; + char* node_id_interface; + uint32_t nsup_delete_sleep; // sleep this many microseconds between generating delete transactions, default 0 + uint32_t nsup_period; + PAD_BOOL nsup_startup_evict; + int proto_fd_idle_ms; // after this many milliseconds, connections are aborted unless transaction is in progress + int proto_slow_netio_sleep_ms; // dynamic only + uint32_t query_bsize; + uint64_t query_buf_size; // dynamic only + uint32_t query_bufpool_size; + PAD_BOOL query_in_transaction_thr; + uint32_t query_long_q_max_size; + PAD_BOOL query_enable_histogram; + PAD_BOOL partitions_pre_reserved; // query will reserve all partitions up front + uint32_t query_priority; + uint64_t query_sleep_us; + uint64_t query_rec_count_bound; + PAD_BOOL query_req_in_query_thread; + uint32_t query_req_max_inflight; + uint32_t query_short_q_max_size; + uint32_t query_threads; + uint32_t query_threshold; + uint64_t query_untracked_time_ms; + uint32_t query_worker_threads; + PAD_BOOL run_as_daemon; + uint32_t scan_max_active; // maximum number of active scans allowed + uint32_t scan_max_done; // maximum number of finished scans kept for monitoring + uint32_t scan_max_udf_transactions; // maximum number of active transactions per UDF background scan + uint32_t scan_threads; // size of scan thread pool + uint32_t n_service_threads; + uint32_t sindex_builder_threads; // secondary index builder thread pool size + uint32_t sindex_gc_max_rate; // Max sindex entries processed per second for gc + uint32_t sindex_gc_period; // same as nsup_period for sindex gc + uint32_t ticker_interval; + uint64_t transaction_max_ns; + uint32_t transaction_pending_limit; // 0 means no limit + uint32_t n_transaction_queues; + uint32_t transaction_retry_ms; + uint32_t n_transaction_threads_per_queue; + char* work_directory; + + // For special debugging or bug-related repair: + + cf_alloc_debug debug_allocations; // how to instrument the memory allocation API + PAD_BOOL fabric_dump_msgs; // whether to log information about existing "msg" objects and queues + uint32_t prole_extra_ttl; // seconds beyond expiry time after which we garbage collect, 0 for no garbage collection + + //-------------------------------------------- + // network::service context. + // + + // Normally visible, in canonical configuration file order: + + cf_serv_spec service; // client service + + // Normally hidden: + + cf_serv_spec tls_service; // TLS client service + + //-------------------------------------------- + // network::heartbeat context. + // + + cf_serv_spec hb_serv_spec; // literal binding address spec parsed from config + cf_serv_spec hb_tls_serv_spec; // literal binding address spec for TLS parsed from config + cf_addr_list hb_multicast_groups; // literal multicast groups parsed from config + as_hb_config hb_config; + + //-------------------------------------------- + // network::fabric context. 
+ // + + // Normally visible, in canonical configuration file order: + + cf_serv_spec fabric; // fabric service + cf_serv_spec tls_fabric; // TLS fabric service + + // Normally hidden: + + uint32_t n_fabric_channel_fds[AS_FABRIC_N_CHANNELS]; + uint32_t n_fabric_channel_recv_threads[AS_FABRIC_N_CHANNELS]; + PAD_BOOL fabric_keepalive_enabled; + int fabric_keepalive_intvl; + int fabric_keepalive_probes; + int fabric_keepalive_time; + uint32_t fabric_latency_max_ms; // time window for ordering + uint32_t fabric_recv_rearm_threshold; + uint32_t n_fabric_send_threads; + + //-------------------------------------------- + // network::info context. + // + + // Normally visible, in canonical configuration file order: + + cf_serv_spec info; // info service + + //-------------------------------------------- + // Remaining configuration top-level contexts. + // + + mod_lua_config mod_lua; + as_sec_config sec_cfg; + + uint32_t n_tls_specs; + cf_tls_spec tls_specs[MAX_TLS_SPECS]; + + + //====================================================== + // Not (directly) configuration. Many should probably be + // relocated... + // + + // Global variable that just shouldn't be here. + cf_node self_node; + + // Global variables that just shouldn't be here. + cf_node xdr_clmap[AS_CLUSTER_SZ]; // cluster map as known to XDR + xdr_node_lst xdr_peers_lst[AS_CLUSTER_SZ]; // last XDR shipping info of other nodes + uint64_t xdr_self_lastshiptime[DC_MAX_NUM]; // last XDR shipping by this node + + // Namespaces. + struct as_namespace_s* namespaces[AS_NAMESPACE_SZ]; + uint32_t n_namespaces; + + // To speed up transaction enqueue's determination of whether to "inline": + uint32_t n_namespaces_inlined; + uint32_t n_namespaces_not_inlined; + +} as_config; + + +//========================================================== +// Public API. +// + +as_config* as_config_init(const char* config_file); +void as_config_post_process(as_config* c, const char* config_file); + +void as_config_cluster_name_get(char* cluster_name); +bool as_config_cluster_name_set(const char* cluster_name); +bool as_config_cluster_name_matches(const char* cluster_name); + +bool as_config_error_enterprise_only(); + +extern as_config g_config; + + +//========================================================== +// Private API - for enterprise separation only. +// + +// Parsed configuration file line. +typedef struct cfg_line_s { + int num; + char* name_tok; + char* val_tok_1; + char* val_tok_2; + char* val_tok_3; +} cfg_line; + +void cfg_enterprise_only(const cfg_line* p_line); +void cfg_post_process(); +cf_tls_spec* cfg_link_tls(const char* which, char** our_name); diff --git a/as/include/base/datamodel.h b/as/include/base/datamodel.h new file mode 100644 index 00000000..66fe9779 --- /dev/null +++ b/as/include/base/datamodel.h @@ -0,0 +1,1207 @@ +/* + * datamodel.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * core data model structures and definitions
+ */
+
+#pragma once
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "arenax.h"
+#include "dynbuf.h"
+#include "hist.h"
+#include "hist_track.h"
+#include "linear_hist.h"
+#include "msg.h"
+#include "node.h"
+#include "shash.h"
+#include "vmapx.h"
+
+#include "base/cfg.h"
+#include "base/proto.h"
+#include "base/rec_props.h"
+#include "base/transaction_policy.h"
+#include "base/truncate.h"
+#include "fabric/hb.h"
+#include "fabric/partition.h"
+#include "storage/storage.h"
+
+
+#define AS_STORAGE_MAX_DEVICES (64) // maximum devices per namespace
+#define AS_STORAGE_MAX_FILES (64) // maximum files per namespace
+#define AS_STORAGE_MAX_DEVICE_SIZE (2L * 1024L * 1024L * 1024L * 1024L) // 2TB, due to rblock_id in as_index
+
+#define OBJ_SIZE_HIST_NUM_BUCKETS 100
+#define TTL_HIST_NUM_BUCKETS 100
+
+#define MAX_ALLOWED_TTL (3600 * 24 * 365 * 10) // 10 years
+
+// [0-1] for partition-id
+// [1-2] for tree sprigs and locks
+// [2-3] for the olock
+// [4-7] for rw_request hash
+#define DIGEST_SCRAMBLE_BYTE1 4
+// [8-11] for SSD device hash
+#define DIGEST_STORAGE_BASE_BYTE 8
+
+/* SYNOPSIS
+ * Data model
+ *
+ * Objects are stored in a hierarchy: namespace:record:bin:particle.
+ * The records in a namespace are further partitioned for distribution
+ * amongst the participating nodes in the cluster.
+ */
+
+
+/* Forward declarations */
+typedef struct as_namespace_s as_namespace;
+typedef struct as_index_s as_record;
+typedef struct as_bin_s as_bin;
+typedef struct as_index_ref_s as_index_ref;
+typedef struct as_set_s as_set;
+typedef struct as_treex_s as_treex;
+
+struct as_index_tree_s;
+
+
+/* AS_ID_[NAMESPACE,SET,BIN,INAME]_SZ
+ * The maximum length, in bytes, of an identification field; by convention,
+ * these values are null-terminated UTF-8 */
+#define AS_ID_NAMESPACE_SZ 32
+#define AS_ID_BIN_SZ 15 // size used in storage format
+#define AS_ID_INAME_SZ 256
+#define VMAP_BIN_NAME_MAX_SZ ((AS_ID_BIN_SZ + 3) & ~3) // round up to multiple of 4
+#define MAX_BIN_NAMES 0x10000 // no need for more - numeric ID is 16 bits
+#define BIN_NAMES_QUOTA (MAX_BIN_NAMES / 2) // don't add more names than this via client transactions
+
+/*
+ * Compare two 16-bit generation counts, allowing wrap-arounds.
+ * Works correctly, if:
+ *
+ * - rhs is ahead of lhs, but rhs isn't ahead more than 32,768.
+ * - lhs is ahead of rhs, but lhs isn't ahead more than 32,767.
+ */
+
+static inline bool
+as_gen_less_than(uint16_t lhs, uint16_t rhs)
+{
+	return (uint16_t)(lhs - rhs) >= 32768;
+}
+
+
+/* as_particle_type
+ * Particles are typed, which reflects their contents:
+ * NULL: no associated content (not sure I really need this internally?)
+ * INTEGER: a signed, 64-bit integer + * FLOAT: a floating point + * STRING: a null-terminated UTF-8 string + * BLOB: arbitrary-length binary data + * TIMESTAMP: milliseconds since 1 January 1970, 00:00:00 GMT + * DIGEST: an internal Aerospike key digest */ +typedef enum { + AS_PARTICLE_TYPE_NULL = 0, + AS_PARTICLE_TYPE_INTEGER = 1, + AS_PARTICLE_TYPE_FLOAT = 2, + AS_PARTICLE_TYPE_STRING = 3, + AS_PARTICLE_TYPE_BLOB = 4, + AS_PARTICLE_TYPE_TIMESTAMP = 5, + AS_PARTICLE_TYPE_UNUSED_6 = 6, + AS_PARTICLE_TYPE_JAVA_BLOB = 7, + AS_PARTICLE_TYPE_CSHARP_BLOB = 8, + AS_PARTICLE_TYPE_PYTHON_BLOB = 9, + AS_PARTICLE_TYPE_RUBY_BLOB = 10, + AS_PARTICLE_TYPE_PHP_BLOB = 11, + AS_PARTICLE_TYPE_ERLANG_BLOB = 12, + AS_PARTICLE_TYPE_MAP = 19, + AS_PARTICLE_TYPE_LIST = 20, + AS_PARTICLE_TYPE_GEOJSON = 23, + AS_PARTICLE_TYPE_MAX = 24, + AS_PARTICLE_TYPE_BAD = AS_PARTICLE_TYPE_MAX +} as_particle_type; + +/* as_particle + * The common part of a particle + * this is poor man's subclassing - IE, how to do a subclassed interface in C + * Go look in particle.c to see all the subclass implementation and structure */ +typedef struct as_particle_s { + uint8_t metadata; // used by the iparticle for is_integer and inuse, as well as version in multi bin mode only + // used by *particle for type + uint8_t data[]; +} __attribute__ ((__packed__)) as_particle; + +// Bit Flag constants used for the particle state value (4 bits, 16 values) +#define AS_BIN_STATE_UNUSED 0 +#define AS_BIN_STATE_INUSE_INTEGER 1 +#define AS_BIN_STATE_RECYCLE_ME 2 // was - hidden bin +#define AS_BIN_STATE_INUSE_OTHER 3 +#define AS_BIN_STATE_INUSE_FLOAT 4 + +typedef struct as_particle_iparticle_s { + uint8_t version: 4; // now unused - and can't be used in single-bin config + uint8_t state: 4; // see AS_BIN_STATE_... 
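+	// Note - this bitfield byte overlays the leading iparticle byte of as_bin,
+	// which is why code in this file casts as_bin * to as_particle_iparticle *
+	// to read and write the state.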
+ uint8_t data[]; +} __attribute__ ((__packed__)) as_particle_iparticle; + +/* Particle function declarations */ + +static inline bool +is_embedded_particle_type(as_particle_type type) +{ + return type == AS_PARTICLE_TYPE_INTEGER || type == AS_PARTICLE_TYPE_FLOAT; +} + +extern as_particle_type as_particle_type_from_asval(const as_val *val); +extern as_particle_type as_particle_type_from_msgpack(const uint8_t *packed, uint32_t packed_size); + +extern uint32_t as_particle_size_from_asval(const as_val *val); + +extern uint32_t as_particle_asval_client_value_size(const as_val *val); +extern uint32_t as_particle_asval_to_client(const as_val *val, as_msg_op *op); + +// as_bin particle function declarations + +extern void as_bin_particle_destroy(as_bin *b, bool free_particle); +extern uint32_t as_bin_particle_size(as_bin *b); + +// wire: +extern int as_bin_particle_alloc_modify_from_client(as_bin *b, const as_msg_op *op); +extern int as_bin_particle_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op); +extern int as_bin_particle_alloc_from_client(as_bin *b, const as_msg_op *op); +extern int as_bin_particle_stack_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op); +extern int as_bin_particle_alloc_from_pickled(as_bin *b, const uint8_t **p_pickled, const uint8_t *end); +extern int as_bin_particle_stack_from_pickled(as_bin *b, cf_ll_buf *particles_llb, const uint8_t **p_pickled, const uint8_t *end); +extern int as_bin_particle_compare_from_pickled(const as_bin *b, uint8_t **p_pickled); +extern uint32_t as_bin_particle_client_value_size(const as_bin *b); +extern uint32_t as_bin_particle_to_client(const as_bin *b, as_msg_op *op); +extern uint32_t as_bin_particle_pickled_size(const as_bin *b); +extern uint32_t as_bin_particle_to_pickled(const as_bin *b, uint8_t *pickled); + +// Different for CDTs - the operations may return results, so we don't use the +// normal APIs and particle table functions. 
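+// (Note the extra as_bin *result parameter below - a single CDT modify op may
+// both rewrite the target bin and hand back a result value in the same call.)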
+extern int as_bin_cdt_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result); +extern int as_bin_cdt_alloc_modify_from_client(as_bin *b, as_msg_op *op, as_bin *result); +extern int as_bin_cdt_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op, as_bin *result); + +// as_val: +extern int as_bin_particle_replace_from_asval(as_bin *b, const as_val *val); +extern void as_bin_particle_stack_from_asval(as_bin *b, uint8_t* stack, const as_val *val); +extern as_val *as_bin_particle_to_asval(const as_bin *b); + +// msgpack: +extern int as_bin_particle_alloc_from_msgpack(as_bin *b, const uint8_t *packed, uint32_t packed_size); + +// flat: +extern int as_bin_particle_cast_from_flat(as_bin *b, uint8_t *flat, uint32_t flat_size); +extern int as_bin_particle_replace_from_flat(as_bin *b, const uint8_t *flat, uint32_t flat_size); +extern uint32_t as_bin_particle_flat_size(as_bin *b); +extern uint32_t as_bin_particle_to_flat(const as_bin *b, uint8_t *flat); + +// odd as_bin particle functions for specific particle types + +// integer: +extern int64_t as_bin_particle_integer_value(const as_bin *b); +extern void as_bin_particle_integer_set(as_bin *b, int64_t i); + +// string: +extern uint32_t as_bin_particle_string_ptr(const as_bin *b, char **p_value); + +// geojson: +typedef void * geo_region_t; +#define MAX_REGION_CELLS 32 +#define MAX_REGION_LEVELS 30 +extern size_t as_bin_particle_geojson_cellids(const as_bin *b, uint64_t **pp_cells); +extern bool as_particle_geojson_match(as_particle *p, uint64_t cellid, geo_region_t region, bool is_strict); +extern bool as_particle_geojson_match_asval(const as_val *val, uint64_t cellid, geo_region_t region, bool is_strict); +char const *as_geojson_mem_jsonstr(const as_particle *p, size_t *p_jsonsz); + +// list: +struct cdt_payload_s; +struct rollback_alloc_s; +extern void as_bin_particle_list_get_packed_val(const as_bin *b, struct cdt_payload_s *packed); +extern int as_bin_cdt_packed_read(const as_bin *b, const as_msg_op *op, as_bin *result); +extern int as_bin_cdt_packed_modify(as_bin *b, const as_msg_op *op, as_bin *result, cf_ll_buf *particles_llb); + + +/* as_bin + * A bin container - null name means unused */ +struct as_bin_s { + as_particle iparticle; // 1 byte + as_particle *particle; // for embedded particle this is value, not pointer + + // Never read or write these bytes in single-bin configuration: + uint16_t id; // ID of bin name + uint8_t unused; // pad to 12 bytes (multiple of 4) - legacy +} __attribute__ ((__packed__)) ; + +// For data-in-memory namespaces in multi-bin mode, we keep an array of as_bin +// structs in memory, accessed via this struct. +typedef struct as_bin_space_s { + uint16_t n_bins; + as_bin bins[]; +} __attribute__ ((__packed__)) as_bin_space; + +// TODO - Do we really need to pad as_bin to 12 bytes for thread safety? +// Do we ever write & read adjacent as_bin structures in a bins array from +// different threads when not under the record lock? And if we're worried about +// 4-byte alignment for that or any other reason, wouldn't we also have to pad +// after n_bins in as_bin_space? + +// For data-in-memory namespaces in multi-bin mode, if we're storing extra +// record metadata, we access it via this struct. In this case the index points +// here instead of directly to an as_bin_space. +typedef struct as_rec_space_s { + as_bin_space* bin_space; + + // So far the key is the only extra record metadata we store in memory. 
+ uint32_t key_size; + uint8_t key[]; +} __attribute__ ((__packed__)) as_rec_space; + +// For copying as_bin structs without the last 3 bytes. +static inline void +as_single_bin_copy(as_bin *to, const as_bin *from) +{ + to->iparticle = from->iparticle; + to->particle = from->particle; +} + +static inline bool +as_bin_inuse(const as_bin *b) +{ + return (((as_particle_iparticle *)b)->state); +} + +static inline uint8_t +as_bin_state(const as_bin *b) +{ + return ((as_particle_iparticle *)b)->state; +} + +static inline void +as_bin_state_set(as_bin *b, uint8_t val) +{ + ((as_particle_iparticle *)b)->state = val; +} + +static inline void +as_bin_state_set_from_type(as_bin *b, as_particle_type type) +{ + switch (type) { + case AS_PARTICLE_TYPE_NULL: + ((as_particle_iparticle *)b)->state = AS_BIN_STATE_UNUSED; + break; + case AS_PARTICLE_TYPE_INTEGER: + ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_INTEGER; + break; + case AS_PARTICLE_TYPE_FLOAT: + ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_FLOAT; + break; + case AS_PARTICLE_TYPE_TIMESTAMP: + // TODO - unsupported + ((as_particle_iparticle *)b)->state = AS_BIN_STATE_UNUSED; + break; + default: + ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_OTHER; + break; + } +} + +static inline bool +as_bin_inuse_has(as_storage_rd *rd) +{ + // In-use bins are at the beginning - only need to check the first bin. + return (rd->n_bins && as_bin_inuse(rd->bins)); +} + +static inline void +as_bin_set_empty(as_bin *b) +{ + as_bin_state_set(b, AS_BIN_STATE_UNUSED); +} + +static inline void +as_bin_set_empty_shift(as_storage_rd *rd, uint32_t i) +{ + // Shift the bins over, so there's no space between used bins. + // This can overwrite the "emptied" bin, and that's fine. + + uint16_t j; + + for (j = i + 1; j < rd->n_bins; j++) { + if (! as_bin_inuse(&rd->bins[j])) { + break; + } + } + + uint16_t n = j - (i + 1); + + if (n) { + memmove(&rd->bins[i], &rd->bins[i + 1], n * sizeof(as_bin)); + } + + // Mark the last bin that was *formerly* in use as null. + as_bin_set_empty(&rd->bins[j - 1]); +} + +static inline void +as_bin_set_empty_from(as_storage_rd *rd, uint16_t from) { + for (uint16_t i = from; i < rd->n_bins; i++) { + as_bin_set_empty(&rd->bins[i]); + } +} + +static inline void +as_bin_set_all_empty(as_storage_rd *rd) { + as_bin_set_empty_from(rd, 0); +} + +static inline bool +as_bin_is_embedded_particle(const as_bin *b) { + return ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_INTEGER || + ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_FLOAT; +} + +static inline bool +as_bin_is_external_particle(const as_bin *b) { + return ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_OTHER; +} + +static inline as_particle * +as_bin_get_particle(as_bin *b) { + return as_bin_is_embedded_particle(b) ? &b->iparticle : b->particle; +} + +// "Embedded" types like integer are stored directly, but other bin types +// ("other") must follow an indirection to get the actual type. 
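+// For instance, an integer bin keeps its 8-byte value in the as_bin itself -
+// the "particle" field holds the value, not a pointer - while a string bin's
+// "particle" field points to separately allocated memory whose leading
+// metadata byte records the actual type.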
+static inline uint8_t +as_bin_get_particle_type(const as_bin *b) { + switch (((as_particle_iparticle *)b)->state) { + case AS_BIN_STATE_INUSE_INTEGER: + return AS_PARTICLE_TYPE_INTEGER; + case AS_BIN_STATE_INUSE_FLOAT: + return AS_PARTICLE_TYPE_FLOAT; + case AS_BIN_STATE_INUSE_OTHER: + return b->particle->metadata; + default: + return AS_PARTICLE_TYPE_NULL; + } +} + + +/* Bin function declarations */ +extern int16_t as_bin_get_id(as_namespace *ns, const char *name); +extern uint16_t as_bin_get_or_assign_id(as_namespace *ns, const char *name); +extern uint16_t as_bin_get_or_assign_id_w_len(as_namespace *ns, const char *name, size_t len); +extern const char* as_bin_get_name_from_id(as_namespace *ns, uint16_t id); +extern bool as_bin_name_within_quota(as_namespace *ns, const char *name); +extern void as_bin_init(as_namespace *ns, as_bin *b, const char *name); +extern void as_bin_copy(as_namespace *ns, as_bin *to, const as_bin *from); +extern int as_storage_rd_load_n_bins(as_storage_rd *rd); +extern int as_storage_rd_load_bins(as_storage_rd *rd, as_bin *stack_bins); +extern uint16_t as_bin_inuse_count(as_storage_rd *rd); +extern void as_bin_get_all_p(as_storage_rd *rd, as_bin **bin_ptrs); +extern as_bin *as_bin_get_by_id(as_storage_rd *rd, uint32_t id); +extern as_bin *as_bin_get(as_storage_rd *rd, const char *name); +extern as_bin *as_bin_get_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len); +extern as_bin *as_bin_create(as_storage_rd *rd, const char *name); +extern as_bin *as_bin_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result); +extern as_bin *as_bin_get_or_create(as_storage_rd *rd, const char *name); +extern as_bin *as_bin_get_or_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result); +extern int32_t as_bin_get_index(as_storage_rd *rd, const char *name); +extern int32_t as_bin_get_index_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len); +extern void as_bin_destroy(as_storage_rd *rd, uint16_t i); +extern void as_bin_allocate_bin_space(as_storage_rd *rd, int32_t delta); + + +typedef enum { + AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_UNDEF = 0, + AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION = 1, + AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME = 2, + AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_CP = 3 +} conflict_resolution_pol; + +/* Record function declarations */ +extern uint32_t clock_skew_stop_writes_sec(); +extern void handle_clock_skew(as_namespace* ns, uint64_t skew_ms); +extern uint16_t plain_generation(uint16_t regime_generation, const as_namespace* ns); +extern void as_record_set_lut(as_record *r, uint32_t regime, uint64_t now_ms, const as_namespace* ns); +extern void as_record_increment_generation(as_record *r, const as_namespace* ns); +extern bool as_record_is_live(const as_record *r); +extern int as_record_get_create(struct as_index_tree_s *tree, cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns); +extern int as_record_get(struct as_index_tree_s *tree, cf_digest *keyd, as_index_ref *r_ref); +extern int as_record_get_live(struct as_index_tree_s *tree, cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns); +extern int as_record_exists(struct as_index_tree_s *tree, cf_digest *keyd); +extern int as_record_exists_live(struct as_index_tree_s *tree, cf_digest *keyd, as_namespace *ns); +extern void as_record_rescue(as_index_ref *r_ref, as_namespace *ns); + +extern void as_record_destroy_bins_from(as_storage_rd *rd, uint16_t from); +extern void as_record_destroy_bins(as_storage_rd *rd); 
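+
+// An illustrative sketch (hypothetical helper, not part of this header) of
+// how the bin declarations above combine to write an integer bin on an open
+// record - single-bin handling and proper result codes are omitted:
+static inline int
+example_write_integer_bin(as_storage_rd *rd, const char *name, int64_t value)
+{
+	as_bin *b = as_bin_get_or_create(rd, name);
+
+	if (! b) {
+		return -1; // e.g. couldn't assign a bin-name ID
+	}
+
+	// Integer is an embedded type - the value lands in the as_bin itself.
+	as_bin_particle_integer_set(b, value);
+
+	return 0;
+}
+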
+extern void as_record_free_bin_space(as_record *r);
+
+extern void as_record_destroy(as_record *r, as_namespace *ns);
+extern void as_record_done(as_index_ref *r_ref, as_namespace *ns);
+
+void as_record_drop_stats(as_record* r, as_namespace* ns);
+
+extern void as_record_allocate_key(as_record* r, const uint8_t* key, uint32_t key_size);
+extern void as_record_remove_key(as_record* r);
+extern int as_record_resolve_conflict(conflict_resolution_pol policy, uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut);
+extern uint8_t *as_record_pickle(as_storage_rd *rd, size_t *len_r);
+extern int as_record_write_from_pickle(as_storage_rd *rd);
+extern int as_record_set_set_from_msg(as_record *r, as_namespace *ns, as_msg *m);
+
+static inline bool
+as_record_pickle_is_binless(const uint8_t *buf)
+{
+	return *(uint16_t *)buf == 0;
+}
+
+// For enterprise split only.
+int record_resolve_conflict_cp(uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut);
+
+static inline int
+resolve_last_update_time(uint64_t left, uint64_t right)
+{
+	return left == right ? 0 : (right > left ? 1 : -1);
+}
+
+typedef struct as_remote_record_s {
+	cf_node src;
+	as_partition_reservation *rsv;
+	cf_digest *keyd;
+
+	uint8_t *record_buf;
+	size_t record_buf_sz;
+
+	uint32_t generation;
+	uint32_t void_time;
+	uint64_t last_update_time;
+
+	const char *set_name;
+	size_t set_name_len;
+
+	const uint8_t *key;
+	size_t key_size;
+
+	uint8_t repl_state; // relevant only for enterprise edition
+} as_remote_record;
+
+int as_record_replace_if_better(as_remote_record *rr, bool is_repl_write, bool skip_sindex, bool do_xdr_write);
+
+// A simpler call that gives seconds in the right epoch.
+#define as_record_void_time_get() cf_clepoch_seconds()
+bool as_record_is_expired(const as_record *r); // TODO - eventually inline
+
+static inline bool
+as_record_is_doomed(const as_record *r, struct as_namespace_s *ns)
+{
+	return as_record_is_expired(r) || as_truncate_record_is_truncated(r, ns);
+}
+
+#define AS_SINDEX_MAX 256
+
+#define MIN_PARTITIONS_PER_INDEX 1
+#define MAX_PARTITIONS_PER_INDEX 256
+#define DEFAULT_PARTITIONS_PER_INDEX 32
+#define MAX_PARTITIONS_PER_INDEX_CHAR 3 // Number of characters in max partitions per index
+
+// States of the as_sindex structure which hangs from the ns.
+#define AS_SINDEX_INACTIVE 1 // On init, pre-loading
+#define AS_SINDEX_ACTIVE 2 // On creation and afterwards
+#define AS_SINDEX_DESTROY 3 // On destroy
+// Dummy sindex state - used when ai_btree_create() returns an error; such a
+// sindex is not available for any DML operations.
+#define AS_SINDEX_NOTCREATED 4 // Unused flag
+#define AS_SINDEX_FLAG_WACTIVE 0x01 // Set on AI btree creation of sindex, never reset
+#define AS_SINDEX_FLAG_RACTIVE 0x02 // Set when sindex scan of database is completed
+#define AS_SINDEX_FLAG_DESTROY_CLEANUP 0x04 // Called for AI clean-up during si deletion
+#define AS_SINDEX_FLAG_MIGRATE_CLEANUP 0x08 // Unused
+#define AS_SINDEX_FLAG_POPULATING 0x10 // Indicates current si scan job, reset when scan is done.
+
+struct as_sindex_s;
+struct as_sindex_config_s;
+
+#define AS_SET_MAX_COUNT 0x3FF // 10 bits' worth of IDs, minus 1 (ID 0 means no set)
+#define AS_BINID_HAS_SINDEX_SIZE (MAX_BIN_NAMES / (sizeof(uint32_t) * CHAR_BIT))
+
+
+// TODO - would be nice to put this in as_index.h:
+// Callback invoked when as_index is destroyed.
+typedef void (*as_index_value_destructor) (struct as_index_s* v, void* udata); + +// TODO - would be nice to put this in as_index.h: +typedef struct as_index_tree_shared_s { + as_index_value_destructor destructor; + void* destructor_udata; + + // Number of lock pairs and sprigs per partition tree. + uint32_t n_lock_pairs; + uint32_t n_sprigs; + + // Bit-shifts used to calculate indexes from digest bits. + uint32_t locks_shift; + uint32_t sprigs_shift; + + // Offset into as_index_tree struct's variable-sized data. + uint32_t sprigs_offset; +} as_index_tree_shared; + + +struct as_namespace_s { + + char name[AS_ID_NAMESPACE_SZ]; + uint32_t id; // this is 1-based + uint32_t namehash; + + //-------------------------------------------- + // Persistent memory. + // + + // Persistent memory "base" block ID for this namespace. + uint32_t xmem_id; + + // Pointer to the persistent memory "base" block. + uint8_t* xmem_base; + + // Pointer to partition tree info in persistent memory "treex" block. + as_treex* xmem_roots; + + // Pointer to arena structure (not stages) in persistent memory base block. + cf_arenax* arena; + + // Pointer to bin name vmap in persistent memory base block. + cf_vmapx* p_bin_name_vmap; + + // Pointer to set information vmap in persistent memory base block. + cf_vmapx* p_sets_vmap; + + // Temporary array of sets to hold config values until sets vmap is ready. + as_set* sets_cfg_array; + uint32_t sets_cfg_count; + + // Configuration flags relevant for warm or cool restart. + uint32_t xmem_flags; + + //-------------------------------------------- + // Cold start. + // + + // If true, read storage devices to build index at startup. + bool cold_start; + + // Flag for ticker during initial loading of records from device. + bool loading_records; + + // For cold start eviction. + pthread_mutex_t cold_start_evict_lock; + uint32_t cold_start_record_add_count; + cf_atomic32 cold_start_threshold_void_time; + uint32_t cold_start_max_void_time; + + //-------------------------------------------- + // Memory management. + // + + // JEMalloc arena to be used for long-term storage in this namespace (-1 if nonexistent.) + int jem_arena; + + // Cached partition ownership info for clients. + client_replica_map* replica_maps; + + // Common partition tree information. Contains two configuration items. + as_index_tree_shared tree_shared; + + //-------------------------------------------- + // Storage management. + // + + // This is typecast to (drv_ssds*) in storage code. + void* storage_private; + + uint64_t ssd_size; // discovered (and rounded) size of drive + int storage_last_avail_pct; // most recently calculated available percent + int storage_max_write_q; // storage_max_write_cache is converted to this + uint32_t saved_defrag_sleep; // restore after defrag at startup is done + uint32_t defrag_lwm_size; // storage_defrag_lwm_pct % of storage_write_block_size + + // For data-not-in-memory, we optionally cache swbs after writing to device. + // To track fraction of reads from cache: + cf_atomic32 n_reads_from_cache; + cf_atomic32 n_reads_from_device; + + uint8_t storage_encryption_key[32]; + + //-------------------------------------------- + // Truncate records. + // + + as_truncate truncate; + + //-------------------------------------------- + // Secondary index. 
+	//
+
+	int sindex_cnt;
+	uint32_t n_setless_sindexes;
+	struct as_sindex_s* sindex; // array with AS_SINDEX_MAX metadata elements
+	cf_shash* sindex_set_binid_hash;
+	cf_shash* sindex_iname_hash;
+	uint32_t binid_has_sindex[AS_BINID_HAS_SINDEX_SIZE];
+
+	//--------------------------------------------
+	// Configuration.
+	//
+
+	uint32_t cfg_replication_factor;
+	uint32_t replication_factor; // indirect config - can become less than cfg_replication_factor
+	uint64_t memory_size;
+	uint64_t default_ttl;
+
+	PAD_BOOL enable_xdr;
+	PAD_BOOL sets_enable_xdr; // namespace-level flag to enable set-based xdr shipping
+	PAD_BOOL ns_forward_xdr_writes; // namespace-level flag to enable forwarding of xdr writes
+	PAD_BOOL ns_allow_nonxdr_writes; // namespace-level flag to allow non-xdr writes
+	PAD_BOOL ns_allow_xdr_writes; // namespace-level flag to allow xdr writes
+
+	uint32_t cold_start_evict_ttl;
+	conflict_resolution_pol conflict_resolution_policy;
+	PAD_BOOL cp; // relevant only for enterprise edition
+	PAD_BOOL cp_allow_drops; // relevant only for enterprise edition
+	PAD_BOOL data_in_index; // with single-bin, allows warm restart for data-in-memory (with storage-engine device)
+	PAD_BOOL write_dup_res_disabled;
+	PAD_BOOL disallow_null_setname;
+	PAD_BOOL batch_sub_benchmarks_enabled;
+	PAD_BOOL read_benchmarks_enabled;
+	PAD_BOOL udf_benchmarks_enabled;
+	PAD_BOOL udf_sub_benchmarks_enabled;
+	PAD_BOOL write_benchmarks_enabled;
+	PAD_BOOL proxy_hist_enabled;
+	uint32_t evict_hist_buckets;
+	uint32_t evict_tenths_pct;
+	uint32_t hwm_disk_pct;
+	uint32_t hwm_memory_pct;
+	uint64_t max_ttl;
+	uint32_t migrate_order;
+	uint32_t migrate_retransmit_ms;
+	uint32_t migrate_sleep;
+	cf_atomic32 obj_size_hist_max; // TODO - doesn't need to be atomic, really.
+ uint32_t rack_id; + as_read_consistency_level read_consistency_level; + PAD_BOOL single_bin; // restrict the namespace to objects with exactly one bin + uint32_t stop_writes_pct; + uint32_t tomb_raider_eligible_age; // relevant only for enterprise edition + uint32_t tomb_raider_period; // relevant only for enterprise edition + as_write_commit_level write_commit_level; + cf_vector xdr_dclist_v; + + as_storage_type storage_type; + + char* storage_devices[AS_STORAGE_MAX_DEVICES]; + char* storage_shadows[AS_STORAGE_MAX_DEVICES]; + char* storage_files[AS_STORAGE_MAX_FILES]; + uint64_t storage_filesize; + char* storage_scheduler_mode; // relevant for devices only, not files + uint32_t storage_write_block_size; + PAD_BOOL storage_data_in_memory; + + PAD_BOOL storage_cold_start_empty; + PAD_BOOL storage_commit_to_device; // relevant only for enterprise edition + uint32_t storage_commit_min_size; // relevant only for enterprise edition + uint32_t storage_defrag_lwm_pct; + uint32_t storage_defrag_queue_min; + uint32_t storage_defrag_sleep; + int storage_defrag_startup_minimum; + PAD_BOOL storage_disable_odirect; + PAD_BOOL storage_benchmarks_enabled; // histograms are per-drive except device-read-size & device-write-size + PAD_BOOL storage_enable_osync; + char* storage_encryption_key_file; + uint64_t storage_flush_max_us; + uint64_t storage_fsync_max_us; + uint64_t storage_max_write_cache; + uint32_t storage_min_avail_pct; + cf_atomic32 storage_post_write_queue; // number of swbs/device held after writing to device + uint32_t storage_tomb_raider_sleep; // relevant only for enterprise edition + uint32_t storage_write_threads; + + uint32_t sindex_num_partitions; + + PAD_BOOL geo2dsphere_within_strict; + uint16_t geo2dsphere_within_min_level; + uint16_t geo2dsphere_within_max_level; + uint16_t geo2dsphere_within_max_cells; + uint16_t geo2dsphere_within_level_mod; + uint32_t geo2dsphere_within_earth_radius_meters; + + //-------------------------------------------- + // Statistics and histograms. + // + + // Object counts. + + cf_atomic64 n_objects; + cf_atomic64 n_tombstones; // relevant only for enterprise edition + + // Consistency info. + + uint32_t n_dead_partitions; + uint32_t n_unavailable_partitions; + bool clock_skew_stop_writes; + + // Expiration & eviction (nsup) stats. + + cf_atomic32 stop_writes; + cf_atomic32 hwm_breached; + + uint64_t non_expirable_objects; + + cf_atomic64 n_expired_objects; + cf_atomic64 n_evicted_objects; + + cf_atomic64 evict_ttl; + + uint32_t nsup_cycle_duration; // seconds taken for most recent nsup cycle + uint32_t nsup_cycle_sleep_pct; // fraction of most recent nsup cycle that was spent sleeping + + // Memory usage stats. + + cf_atomic_int n_bytes_memory; + cf_atomic64 n_bytes_sindex_memory; + + // Persistent storage stats. + + float cache_read_pct; + + // Migration stats. 
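+	// (In these counters, tx = partitions this node is emigrating and rx =
+	// partitions it is immigrating.)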
+ + cf_atomic_int migrate_tx_partitions_imbalance; // debug only + cf_atomic_int migrate_tx_instance_count; // debug only + cf_atomic_int migrate_rx_instance_count; // debug only + cf_atomic_int migrate_tx_partitions_active; + cf_atomic_int migrate_rx_partitions_active; + cf_atomic_int migrate_tx_partitions_initial; + cf_atomic_int migrate_tx_partitions_remaining; + cf_atomic_int migrate_rx_partitions_initial; + cf_atomic_int migrate_rx_partitions_remaining; + cf_atomic_int migrate_signals_active; + cf_atomic_int migrate_signals_remaining; + cf_atomic_int appeals_tx_active; // relevant only for enterprise edition + cf_atomic_int appeals_rx_active; // relevant only for enterprise edition + cf_atomic_int appeals_tx_remaining; // relevant only for enterprise edition + + // Per-record migration stats: + cf_atomic_int migrate_records_skipped; // relevant only for enterprise edition + cf_atomic_int migrate_records_transmitted; + cf_atomic_int migrate_record_retransmits; + cf_atomic_int migrate_record_receives; + cf_atomic_int appeals_records_exonerated; // relevant only for enterprise edition + + // From-client transaction stats. + + cf_atomic64 n_client_tsvc_error; + cf_atomic64 n_client_tsvc_timeout; + + cf_atomic64 n_client_proxy_complete; + cf_atomic64 n_client_proxy_error; + cf_atomic64 n_client_proxy_timeout; + + cf_atomic64 n_client_read_success; + cf_atomic64 n_client_read_error; + cf_atomic64 n_client_read_timeout; + cf_atomic64 n_client_read_not_found; + + cf_atomic64 n_client_write_success; + cf_atomic64 n_client_write_error; + cf_atomic64 n_client_write_timeout; + + // Subset of n_client_write_... above, respectively. + cf_atomic64 n_xdr_write_success; + cf_atomic64 n_xdr_write_error; + cf_atomic64 n_xdr_write_timeout; + + cf_atomic64 n_client_delete_success; + cf_atomic64 n_client_delete_error; + cf_atomic64 n_client_delete_timeout; + cf_atomic64 n_client_delete_not_found; + + // Subset of n_client_delete_... above, respectively. + cf_atomic64 n_xdr_delete_success; + cf_atomic64 n_xdr_delete_error; + cf_atomic64 n_xdr_delete_timeout; + cf_atomic64 n_xdr_delete_not_found; + + cf_atomic64 n_client_udf_complete; + cf_atomic64 n_client_udf_error; + cf_atomic64 n_client_udf_timeout; + + cf_atomic64 n_client_lang_read_success; + cf_atomic64 n_client_lang_write_success; + cf_atomic64 n_client_lang_delete_success; + cf_atomic64 n_client_lang_error; + + // Batch sub-transaction stats. + + cf_atomic64 n_batch_sub_tsvc_error; + cf_atomic64 n_batch_sub_tsvc_timeout; + + cf_atomic64 n_batch_sub_proxy_complete; + cf_atomic64 n_batch_sub_proxy_error; + cf_atomic64 n_batch_sub_proxy_timeout; + + cf_atomic64 n_batch_sub_read_success; + cf_atomic64 n_batch_sub_read_error; + cf_atomic64 n_batch_sub_read_timeout; + cf_atomic64 n_batch_sub_read_not_found; + + // Internal-UDF sub-transaction stats. + + cf_atomic64 n_udf_sub_tsvc_error; + cf_atomic64 n_udf_sub_tsvc_timeout; + + cf_atomic64 n_udf_sub_udf_complete; + cf_atomic64 n_udf_sub_udf_error; + cf_atomic64 n_udf_sub_udf_timeout; + + cf_atomic64 n_udf_sub_lang_read_success; + cf_atomic64 n_udf_sub_lang_write_success; + cf_atomic64 n_udf_sub_lang_delete_success; + cf_atomic64 n_udf_sub_lang_error; + + // Transaction retransmit stats. 
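+	// (dup_res = the duplicate-resolution phase of a transaction,
+	// repl_write = the replica-write phase.)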
+ + uint64_t n_retransmit_client_read_dup_res; + + uint64_t n_retransmit_client_write_dup_res; + uint64_t n_retransmit_client_write_repl_write; + + uint64_t n_retransmit_client_delete_dup_res; + uint64_t n_retransmit_client_delete_repl_write; + + uint64_t n_retransmit_client_udf_dup_res; + uint64_t n_retransmit_client_udf_repl_write; + + uint64_t n_retransmit_batch_sub_dup_res; + + uint64_t n_retransmit_udf_sub_dup_res; + uint64_t n_retransmit_udf_sub_repl_write; + + // Scan stats. + + cf_atomic64 n_scan_basic_complete; + cf_atomic64 n_scan_basic_error; + cf_atomic64 n_scan_basic_abort; + + cf_atomic64 n_scan_aggr_complete; + cf_atomic64 n_scan_aggr_error; + cf_atomic64 n_scan_aggr_abort; + + cf_atomic64 n_scan_udf_bg_complete; + cf_atomic64 n_scan_udf_bg_error; + cf_atomic64 n_scan_udf_bg_abort; + + // Query stats. + + cf_atomic64 query_reqs; + cf_atomic64 query_fail; + cf_atomic64 query_short_queue_full; + cf_atomic64 query_long_queue_full; + cf_atomic64 query_short_reqs; + cf_atomic64 query_long_reqs; + + cf_atomic64 n_lookup; + cf_atomic64 n_lookup_success; + cf_atomic64 n_lookup_abort; + cf_atomic64 n_lookup_errs; + cf_atomic64 lookup_response_size; + cf_atomic64 lookup_num_records; + + cf_atomic64 n_aggregation; + cf_atomic64 n_agg_success; + cf_atomic64 n_agg_abort; + cf_atomic64 n_agg_errs; + cf_atomic64 agg_response_size; + cf_atomic64 agg_num_records; + + cf_atomic64 n_query_udf_bg_success; + cf_atomic64 n_query_udf_bg_failure; + + // Geospatial query stats: + cf_atomic64 geo_region_query_count; // number of region queries + cf_atomic64 geo_region_query_cells; // number of cells used by region queries + cf_atomic64 geo_region_query_points; // number of valid points found + cf_atomic64 geo_region_query_falsepos; // number of false positives found + + // Re-replication stats - relevant only for enterprise edition. + + cf_atomic64 n_re_repl_success; + cf_atomic64 n_re_repl_error; + cf_atomic64 n_re_repl_timeout; + + // Special errors that deserve their own counters: + + cf_atomic64 n_fail_xdr_forbidden; + cf_atomic64 n_fail_key_busy; + cf_atomic64 n_fail_generation; + cf_atomic64 n_fail_record_too_big; + + // Special non-error counters: + + cf_atomic64 n_deleted_last_bin; + + // One-way automatically activated histograms. + + cf_hist_track* read_hist; + cf_hist_track* write_hist; + cf_hist_track* udf_hist; + cf_hist_track* query_hist; + histogram* query_rec_count_hist; + histogram* re_repl_hist; // relevant only for enterprise edition + + PAD_BOOL read_hist_active; + PAD_BOOL write_hist_active; + PAD_BOOL udf_hist_active; + PAD_BOOL query_hist_active; + PAD_BOOL query_rec_count_hist_active; + PAD_BOOL re_repl_hist_active; // relevant only for enterprise edition + + // Activate-by-config histograms. + + histogram* proxy_hist; + + histogram* read_start_hist; + histogram* read_restart_hist; + histogram* read_dup_res_hist; + histogram* read_repl_ping_hist; + histogram* read_local_hist; + histogram* read_response_hist; + + histogram* write_start_hist; + histogram* write_restart_hist; + histogram* write_dup_res_hist; + histogram* write_master_hist; // split this? + histogram* write_repl_write_hist; + histogram* write_response_hist; + + histogram* udf_start_hist; + histogram* udf_restart_hist; + histogram* udf_dup_res_hist; + histogram* udf_master_hist; // split this? 
+ histogram* udf_repl_write_hist; + histogram* udf_response_hist; + + histogram* batch_sub_start_hist; + histogram* batch_sub_restart_hist; + histogram* batch_sub_dup_res_hist; + histogram* batch_sub_repl_ping_hist; + histogram* batch_sub_read_local_hist; + histogram* batch_sub_response_hist; + + histogram* udf_sub_start_hist; + histogram* udf_sub_restart_hist; + histogram* udf_sub_dup_res_hist; + histogram* udf_sub_master_hist; // split this? + histogram* udf_sub_repl_write_hist; + histogram* udf_sub_response_hist; + + histogram* device_read_size_hist; + histogram* device_write_size_hist; + + // Histograms of master object storage sizes. (Meaningful for drive-backed + // namespaces only.) + linear_hist* obj_size_hist; + linear_hist* set_obj_size_hists[AS_SET_MAX_COUNT + 1]; + + // Histograms used for general eviction and expiration. + linear_hist* evict_hist; // not just for info + linear_hist* ttl_hist; + linear_hist* set_ttl_hists[AS_SET_MAX_COUNT + 1]; + + //-------------------------------------------- + // Data partitions. + // + + as_partition partitions[AS_PARTITIONS]; + + //-------------------------------------------- + // Information for rebalancing. + // + + uint32_t cluster_size; + cf_node succession[AS_CLUSTER_SZ]; + as_partition_version cluster_versions[AS_CLUSTER_SZ][AS_PARTITIONS]; + uint32_t rack_ids[AS_CLUSTER_SZ]; // is observed-rack-ids in CP mode + + // Observed nodes - relevant only for enterprise edition. + uint32_t observed_cluster_size; + cf_node observed_succession[AS_CLUSTER_SZ]; + + // Roster management - relevant only for enterprise edition. + uint32_t smd_roster_generation; + uint32_t smd_roster_count; + cf_node smd_roster[AS_CLUSTER_SZ]; + uint32_t smd_roster_rack_ids[AS_CLUSTER_SZ]; + uint32_t roster_generation; + uint32_t roster_count; + cf_node roster[AS_CLUSTER_SZ]; + uint32_t roster_rack_ids[AS_CLUSTER_SZ]; + + // Master regimes - relevant only for enterprise edition. + uint32_t eventual_regime; + uint32_t rebalance_regime; + uint32_t rebalance_regimes[AS_CLUSTER_SZ]; +}; + +#define AS_SET_NAME_MAX_SIZE 64 // includes space for null-terminator + +#define INVALID_SET_ID 0 + +#define IS_SET_EVICTION_DISABLED(p_set) (cf_atomic32_get(p_set->disable_eviction) == 1) +#define DISABLE_SET_EVICTION(p_set, on_off) (cf_atomic32_set(&p_set->disable_eviction, on_off ? 1 : 0)) + +typedef enum { + AS_SET_ENABLE_XDR_DEFAULT = 0, + AS_SET_ENABLE_XDR_TRUE = 1, + AS_SET_ENABLE_XDR_FALSE = 2 +} as_set_enable_xdr_flag; + +// Caution - changing this struct could break warm or cool restart. 
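+// (These structs live in the persistent-memory sets vmap - p_sets_vmap in
+// as_namespace_s - which is presumably why the struct below is explicitly
+// padded to a fixed 128-byte size.)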
+struct as_set_s {
+	char name[AS_SET_NAME_MAX_SIZE];
+	cf_atomic64 n_objects;
+	cf_atomic64 n_tombstones; // relevant only for enterprise edition
+	cf_atomic64 n_bytes_memory; // for data-in-memory only - the set's total record data size
+	cf_atomic64 stop_writes_count; // restrict number of records in a set
+	uint64_t truncate_lut; // records with last-update-time less than this are truncated
+	cf_atomic32 disable_eviction; // don't evict anything in this set (note - expiration still works)
+	cf_atomic32 enable_xdr; // white-list (AS_SET_ENABLE_XDR_TRUE) or black-list (AS_SET_ENABLE_XDR_FALSE) a set for XDR replication
+	uint32_t n_sindexes;
+	uint8_t padding[12];
+};
+
+static inline bool
+as_set_stop_writes(as_set *p_set) {
+	uint64_t n_objects = cf_atomic64_get(p_set->n_objects);
+	uint64_t stop_writes_count = cf_atomic64_get(p_set->stop_writes_count);
+
+	return stop_writes_count != 0 && n_objects >= stop_writes_count;
+}
+
+// These bin functions must be below the definition of struct as_namespace_s:
+
+static inline void
+as_bin_set_id_from_name_buf(as_namespace *ns, as_bin *b, const uint8_t *buf,
+		int len) {
+	if (! ns->single_bin) {
+		b->id = as_bin_get_or_assign_id_w_len(ns, (const char *)buf, len);
+	}
+}
+
+static inline void
+as_bin_set_id_from_name(as_namespace *ns, as_bin *b, const char *name) {
+	if (! ns->single_bin) {
+		b->id = as_bin_get_or_assign_id(ns, name);
+	}
+}
+
+static inline size_t
+as_bin_memcpy_name(as_namespace *ns, uint8_t *buf, as_bin *b) {
+	size_t len = 0;
+
+	if (! ns->single_bin) {
+		const char *name = as_bin_get_name_from_id(ns, b->id);
+
+		len = strlen(name);
+		memcpy(buf, name, len);
+	}
+
+	return len;
+}
+
+// Forward reference:
+struct as_msg_field_s;
+
+/* Namespace function declarations */
+extern as_namespace *as_namespace_create(char *name);
+extern void as_namespaces_init(bool cold_start_cmd, uint32_t instance);
+extern void as_namespaces_setup(bool cold_start_cmd, uint32_t instance, uint32_t stage_capacity);
+extern bool as_namespace_configure_sets(as_namespace *ns);
+extern as_namespace *as_namespace_get_byname(char *name);
+extern as_namespace *as_namespace_get_byid(uint32_t id);
+extern as_namespace *as_namespace_get_bybuf(uint8_t *name, size_t len);
+extern as_namespace *as_namespace_get_bymsgfield(struct as_msg_field_s *fp);
+extern const char *as_namespace_get_set_name(as_namespace *ns, uint16_t set_id);
+extern uint16_t as_namespace_get_set_id(as_namespace *ns, const char *set_name);
+extern uint16_t as_namespace_get_create_set_id(as_namespace *ns, const char *set_name);
+extern int as_namespace_set_set_w_len(as_namespace *ns, const char *set_name, size_t len, uint16_t *p_set_id, bool apply_restrictions);
+extern int as_namespace_get_create_set_w_len(as_namespace *ns, const char *set_name, size_t len, as_set **pp_set, uint16_t *p_set_id);
+extern as_set *as_namespace_get_set_by_name(as_namespace *ns, const char *set_name);
+extern as_set* as_namespace_get_set_by_id(as_namespace* ns, uint16_t set_id);
+extern as_set* as_namespace_get_record_set(as_namespace *ns, const as_record *r);
+extern void as_namespace_get_set_info(as_namespace *ns, const char *set_name, cf_dyn_buf *db);
+extern void as_namespace_adjust_set_memory(as_namespace *ns, uint16_t set_id, int64_t delta_bytes);
+extern void as_namespace_release_set_id(as_namespace *ns, uint16_t set_id);
+extern void as_namespace_get_bins_info(as_namespace *ns, cf_dyn_buf *db, bool show_ns);
+extern void as_namespace_get_hist_info(as_namespace *ns, char *set_name, char *hist_name,
+		cf_dyn_buf *db, bool show_ns);
+
+static inline bool
+as_namespace_cool_restarts(const as_namespace *ns)
+{
+	return ns->storage_data_in_memory && ! ns->data_in_index;
+}
+
+static inline const char*
+as_namespace_start_mode_str(const as_namespace *ns)
+{
+	return as_namespace_cool_restarts(ns) ? "cool" : "warm";
+}
+
+// Persistent Memory Management
+
+struct as_treex_s {
+	uint64_t root_h: 40;
+} __attribute__ ((__packed__));
+
+void as_namespace_xmem_trusted(as_namespace *ns);
+
+// Not namespace class functions, but they live in namespace.c:
+uint32_t as_mem_check();
+
+// XXX POST-JUMP - remove in "six months".
+static inline uint32_t
+truncate_void_time(as_namespace *ns, uint32_t void_time)
+{
+	uint32_t max_void_time = as_record_void_time_get() + (uint32_t)ns->max_ttl;
+	return void_time > max_void_time ? max_void_time : void_time;
+}
diff --git a/as/include/base/features.h b/as/include/base/features.h
new file mode 100644
index 00000000..062ce760
--- /dev/null
+++ b/as/include/base/features.h
@@ -0,0 +1,30 @@
+/*
+ * features.h
+ *
+ * Copyright (C) 2018 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+
+//==========================================================
+// Public API.
+//
+
+const char *as_features_info();
diff --git a/as/include/base/index.h b/as/include/base/index.h
new file mode 100644
index 00000000..09c9d61c
--- /dev/null
+++ b/as/include/base/index.h
@@ -0,0 +1,337 @@
+/*
+ * index.h
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "arenax.h"
+#include "cf_mutex.h"
+
+#include "base/datamodel.h"
+
+
+//==========================================================
+// Index tree node - as_index, also known as as_record.
+//
+// There's one for every record. Contains metadata, and
+// points to record data in memory and/or on storage device.
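+// Note - the struct is packed to exactly 64 bytes, and the tree links
+// (left_h, right_h) are 40-bit arena handles rather than pointers.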
+//
+
+typedef struct as_index_s {
+
+	// offset: 0
+	cf_atomic32 rc;
+
+	// offset: 4
+	cf_digest keyd;
+
+	// offset: 24
+	uint64_t right_h: 40;
+	uint64_t left_h: 40;
+
+	// offset: 34
+	// Don't use the free bits here for record info - this is accessed outside
+	// the record lock.
+	uint16_t color: 1;
+	uint16_t unused_but_unsafe: 15;
+
+	// Everything below here is used under the record lock.
+
+	// offset: 36
+	uint32_t tombstone: 1;
+	uint32_t cenotaph: 1;
+	uint32_t void_time: 30;
+
+	// offset: 40
+	uint64_t last_update_time: 40;
+	uint64_t generation: 16;
+
+	// offset: 47
+	// Used by the storage engines.
+	uint64_t rblock_id: 34;		// can address 2^34 * 128B = 2TB drive
+	uint64_t n_rblocks: 14;		// is enough for 1MB / 128B = 8K rblocks
+	uint64_t file_id: 6;		// can spec 2^6 = 64 drives
+
+	uint64_t set_id_bits: 10;	// do not use directly, used for set-ID
+
+	// offset: 55
+	// In single-bin mode for data-in-memory namespaces, this offset is cast to
+	// an as_bin, but only 4 bits get used (for the iparticle state). The other
+	// 4 bits are used for replication state and index flags.
+	uint8_t repl_state: 2;
+	uint8_t unused_flag: 1;
+	uint8_t key_stored: 1;
+	uint8_t single_bin_state: 4; // used indirectly, only in single-bin mode
+
+	// offset: 56
+	// For data-not-in-memory namespaces, these 8 bytes are currently unused.
+	// For data-in-memory namespaces: in single-bin mode the as_bin is embedded
+	// here (these 8 bytes plus the 4 single_bin_state bits above), but in
+	// multi-bin mode this is a pointer to either of:
+	// - an as_bin_space containing n_bins and an array of as_bin structs
+	// - an as_rec_space containing an as_bin_space pointer and other metadata
+	void* dim;
+
+	// final size: 64
+
+} __attribute__ ((__packed__)) as_index;
+
+#define AS_INDEX_SINGLE_BIN_OFFSET 55 // can't use offsetof() with bit fields
+
+
+//==========================================================
+// Accessor functions for bits in as_index.
+//
+
+// Size in bytes of as_index, currently the same for all namespaces.
+static inline
+uint32_t as_index_size_get(as_namespace *ns)
+{
+	return (uint32_t)sizeof(as_index);
+}
+
+// Fast way to clear the record portion of as_index.
+// Note - relies on current layout and size of as_index!
+static inline
+void as_index_clear_record_info(as_index *index) {
+	*(uint32_t*)((uint8_t*)index + 36) = 0;
+
+	uint64_t *p_clear = (uint64_t*)((uint8_t*)index + 40);
+
+	*p_clear++ = 0;
+	*p_clear++ = 0;
+	*p_clear = 0;
+}
+
+// Generation 0 is never written, and generation plays no role in record
+// destruction, so it works to flag both "half created" and deleted records.
+static inline
+void as_index_invalidate_record(as_index *index) {
+	index->generation = 0;
+}
+
+static inline
+bool as_index_is_valid_record(as_index *index) {
+	return index->generation != 0;
+}
+
+
+//------------------------------------------------
+// Single bin, as_bin_space & as_rec_space.
+//
+
+static inline
+as_bin *as_index_get_single_bin(const as_index *index) {
+	// We only use 4 bits of the first byte for the bin state.
+	return (as_bin*)((uint8_t *)index + AS_INDEX_SINGLE_BIN_OFFSET);
+}
+
+static inline
+as_bin_space* as_index_get_bin_space(const as_index *index) {
+	return index->key_stored == 1 ?
+ ((as_rec_space*)index->dim)->bin_space : (as_bin_space*)index->dim; +} + +static inline +void as_index_set_bin_space(as_index* index, as_bin_space* bin_space) { + if (index->key_stored == 1) { + ((as_rec_space*)index->dim)->bin_space = bin_space; + } + else { + index->dim = (void*)bin_space; + } +} + + +//------------------------------------------------ +// Set-ID bits. +// + +static inline +uint16_t as_index_get_set_id(const as_index *index) { + return index->set_id_bits; +} + +static inline +void as_index_set_set_id(as_index *index, uint16_t set_id) { + // TODO - check that it fits in the 10 bits ??? + index->set_id_bits = set_id; +} + +static inline +bool as_index_has_set(const as_index *index) { + return index->set_id_bits != 0; +} + + +//------------------------------------------------ +// Set-ID helpers. +// + +static inline +int as_index_set_set_w_len(as_index *index, as_namespace *ns, + const char *set_name, size_t len, bool apply_restrictions) { + uint16_t set_id; + int rv = as_namespace_set_set_w_len(ns, set_name, len, &set_id, + apply_restrictions); + + if (rv != 0) { + return rv; + } + + as_index_set_set_id(index, set_id); + return 0; +} + +static inline +int as_index_set_set(as_index *index, as_namespace *ns, const char *set_name, + bool apply_restrictions) { + return as_index_set_set_w_len(index, ns, set_name, strlen(set_name), + apply_restrictions); +} + +static inline +const char *as_index_get_set_name(as_index *index, as_namespace *ns) { + // TODO - don't really need this check - remove? + if (! as_index_has_set(index)) { + return NULL; + } + + return as_namespace_get_set_name(ns, as_index_get_set_id(index)); +} + + +//========================================================== +// Handling as_index objects. +// + +// Container for as_index pointer with lock and location. +struct as_index_ref_s { + bool skip_lock; + as_index *r; + cf_arenax_handle r_h; + cf_mutex *olock; +}; + + +//========================================================== +// Index tree. +// + +typedef struct as_index_tree_s { + // Data common to all trees in a namespace. + as_index_tree_shared *shared; + + // Where we allocate from and free to. Left out of 'shared' since we may + // later use multiple arenas per namespace. + cf_arenax *arena; + + // Variable length data, dependent on configuration. + uint8_t data[]; +} as_index_tree; + + +//========================================================== +// as_index_tree variable length data components. +// + +typedef struct as_lock_pair_s { + // Note: reduce_lock's scope is always inside of lock's scope. + cf_mutex lock; // insert, delete vs. insert, delete, get + cf_mutex reduce_lock; // insert, delete vs. reduce +} as_lock_pair; + +typedef struct as_sprig_s { + cf_arenax_handle root_h; + uint64_t n_elements; +} as_sprig; + +static inline as_lock_pair * +tree_locks(as_index_tree *tree) +{ + return (as_lock_pair*)tree->data; +} + +static inline as_sprig * +tree_sprigs(as_index_tree *tree) +{ + return (as_sprig*)(tree->data + tree->shared->sprigs_offset); +} + + +//------------------------------------------------ +// as_index_tree public API. 
+//
+
+void as_index_tree_gc_init();
+int as_index_tree_gc_queue_size();
+
+as_index_tree *as_index_tree_create(as_index_tree_shared *shared, cf_arenax *arena);
+as_index_tree *as_index_tree_resume(as_index_tree_shared *shared, cf_arenax *arena, as_treex *treex);
+void as_index_tree_shutdown(as_index_tree *tree, as_treex *treex);
+int as_index_tree_release(as_index_tree *tree);
+uint64_t as_index_tree_size(as_index_tree *tree);
+
+typedef void (*as_index_reduce_fn) (as_index_ref *value, void *udata);
+
+void as_index_reduce(as_index_tree *tree, as_index_reduce_fn cb, void *udata);
+void as_index_reduce_partial(as_index_tree *tree, uint64_t sample_count, as_index_reduce_fn cb, void *udata);
+
+void as_index_reduce_live(as_index_tree *tree, as_index_reduce_fn cb, void *udata);
+void as_index_reduce_partial_live(as_index_tree *tree, uint64_t sample_count, as_index_reduce_fn cb, void *udata);
+
+int as_index_exists(as_index_tree *tree, cf_digest *keyd);
+int as_index_get_vlock(as_index_tree *tree, cf_digest *keyd, as_index_ref *index_ref);
+int as_index_get_insert_vlock(as_index_tree *tree, cf_digest *keyd, as_index_ref *index_ref);
+int as_index_delete(as_index_tree *tree, cf_digest *keyd);
+
+#define as_index_reserve(_r) cf_atomic32_incr(&(_r->rc))
+#define as_index_release(_r) cf_atomic32_decr(&(_r->rc))
+
+
+//------------------------------------------------
+// Private API - for enterprise separation only.
+//
+
+// Container for sprig-level function parameters.
+typedef struct as_index_sprig_s {
+	as_index_value_destructor destructor;
+	void *destructor_udata;
+
+	cf_arenax *arena;
+
+	as_lock_pair *pair;
+	as_sprig *sprig;
+} as_index_sprig;
+
+#define SENTINEL_H 0
+
+#define RESOLVE_H(__h) ((as_index*)cf_arenax_resolve(isprig->arena, __h))
+
+// Flag to indicate full index reduce.
+#define AS_REDUCE_ALL (-1L)
diff --git a/as/include/base/job_manager.h b/as/include/base/job_manager.h
new file mode 100644
index 00000000..39d3bb69
--- /dev/null
+++ b/as/include/base/job_manager.h
@@ -0,0 +1,171 @@
+/*
+ * job_manager.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_queue.h"
+#include "citrusleaf/cf_queue_priority.h"
+
+struct as_job_s;
+struct as_job_manager_s;
+struct as_mon_jobstat_s;
+struct as_namespace_s;
+struct as_partition_reservation_s;
+
+//----------------------------------------------------------
+// as_priority_thread_pool - class header.
+// TODO - move to common.
+// + +typedef struct as_priority_thread_pool_s { + pthread_mutex_t lock; + cf_queue_priority* dispatch_queue; + cf_queue* complete_queue; + uint32_t n_threads; +} as_priority_thread_pool; + +typedef void (*as_priority_thread_pool_task_fn)(void* task); + +// Same as cf_queue_priority scheme, so no internal conversion needed: +#define THREAD_POOL_PRIORITY_LOW CF_QUEUE_PRIORITY_LOW +#define THREAD_POOL_PRIORITY_MEDIUM CF_QUEUE_PRIORITY_MEDIUM +#define THREAD_POOL_PRIORITY_HIGH CF_QUEUE_PRIORITY_HIGH + +bool as_priority_thread_pool_init(as_priority_thread_pool* pool, uint32_t n_threads); +void as_priority_thread_pool_shutdown(as_priority_thread_pool* pool); +bool as_priority_thread_pool_resize(as_priority_thread_pool* pool, uint32_t n_threads); +bool as_priority_thread_pool_queue_task(as_priority_thread_pool* pool, as_priority_thread_pool_task_fn task_fn, void* task, int priority); +bool as_priority_thread_pool_remove_task(as_priority_thread_pool* pool, void* task); +void as_priority_thread_pool_change_task_priority(as_priority_thread_pool* pool, void* task, int new_priority); + +//---------------------------------------------------------- +// as_job - base class header. +// + +typedef void (*as_job_slice_fn)(struct as_job_s* _job, struct as_partition_reservation_s* rsv); +typedef void (*as_job_finish_fn)(struct as_job_s* _job); +typedef void (*as_job_destroy_fn)(struct as_job_s* _job); +typedef void (*as_job_info_fn)(struct as_job_s* _job, struct as_mon_jobstat_s* stat); + +typedef struct as_job_vtable_s { + as_job_slice_fn slice_fn; + as_job_finish_fn finish_fn; + as_job_destroy_fn destroy_fn; + as_job_info_fn info_mon_fn; +} as_job_vtable; + +typedef enum { + RSV_WRITE = 0, + RSV_MIGRATE = 1 +} as_job_rsv_type; + +// Same as cf_queue_priority scheme, so no internal conversion needed: +#define AS_JOB_PRIORITY_LOW THREAD_POOL_PRIORITY_LOW +#define AS_JOB_PRIORITY_MEDIUM THREAD_POOL_PRIORITY_MEDIUM +#define AS_JOB_PRIORITY_HIGH THREAD_POOL_PRIORITY_HIGH + +// Same as proto result codes so connected scans don't have to convert: +#define AS_JOB_FAIL_UNKNOWN AS_PROTO_RESULT_FAIL_UNKNOWN +#define AS_JOB_FAIL_PARAMETER AS_PROTO_RESULT_FAIL_PARAMETER +#define AS_JOB_FAIL_CLUSTER_KEY AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH +#define AS_JOB_FAIL_USER_ABORT AS_PROTO_RESULT_FAIL_SCAN_ABORT +#define AS_JOB_FAIL_FORBIDDEN AS_PROTO_RESULT_FAIL_FORBIDDEN + +// These result codes can't make it back to the client, but show in monitor: +#define AS_JOB_FAIL_RESPONSE_ERROR (-1) +#define AS_JOB_FAIL_RESPONSE_TIMEOUT (-2) + +typedef struct as_job_s { + // Mandatory interface for derived classes: + as_job_vtable vtable; + + // Parent: + struct as_job_manager_s* mgr; + + // Which partitions to reduce: + as_job_rsv_type rsv_type; + + // Unique identifier: + uint64_t trid; + + // Job scope: + struct as_namespace_s* ns; + uint16_t set_id; + + // Handle active phase: + pthread_mutex_t requeue_lock; + int priority; + cf_atomic32 active_rc; + volatile int next_pid; + volatile int abandoned; + + // For tracking: + uint64_t start_ms; + uint64_t finish_ms; + cf_atomic64 n_records_read; +} as_job; + +void as_job_init(as_job* _job, const as_job_vtable* vtable, + struct as_job_manager_s* manager, as_job_rsv_type rsv_type, + uint64_t trid, struct as_namespace_s* ns, uint16_t set_id, + int priority); +void as_job_slice(void* task); +void as_job_finish(as_job* _job); +void as_job_destroy(as_job* _job); +void as_job_info(as_job* _job, struct as_mon_jobstat_s* stat); +void as_job_active_reserve(as_job* _job); +void 
as_job_active_release(as_job* _job); + +//---------------------------------------------------------- +// as_job_manager - class header. +// + +typedef struct as_job_manager_s { + pthread_mutex_t lock; + cf_queue* active_jobs; + cf_queue* finished_jobs; + as_priority_thread_pool thread_pool; + + // Manager configuration: + uint32_t max_active; + uint32_t max_done; +} as_job_manager; + +void as_job_manager_init(as_job_manager* mgr, uint32_t max_active, uint32_t max_done, uint32_t n_threads); +int as_job_manager_start_job(as_job_manager* mgr, as_job* _job); +void as_job_manager_requeue_job(as_job_manager* mgr, as_job* _job); +void as_job_manager_finish_job(as_job_manager* mgr, as_job* _job); +void as_job_manager_abandon_job(as_job_manager* mgr, as_job* _job, int reason); +bool as_job_manager_abort_job(as_job_manager* mgr, uint64_t trid); +int as_job_manager_abort_all_jobs(as_job_manager* mgr); +bool as_job_manager_change_job_priority(as_job_manager* mgr, uint64_t trid, int priority); +void as_job_manager_limit_active_jobs(as_job_manager* mgr, uint32_t max_active); +void as_job_manager_limit_finished_jobs(as_job_manager* mgr, uint32_t max_done); +void as_job_manager_resize_thread_pool(as_job_manager* mgr, uint32_t n_threads); +struct as_mon_jobstat_s* as_job_manager_get_job_info(as_job_manager* mgr, uint64_t trid); +struct as_mon_jobstat_s* as_job_manager_get_info(as_job_manager* mgr, int* size); +int as_job_manager_get_active_job_count(as_job_manager* mgr); diff --git a/as/include/base/json_init.h b/as/include/base/json_init.h new file mode 100644 index 00000000..66f2475d --- /dev/null +++ b/as/include/base/json_init.h @@ -0,0 +1,34 @@ +/* + * json_init.h + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +/* SYNOPSIS + * This module handles initialization of the Jansson JSON API by + * setting the memory allocation functions to be used internally + * by Jansson to the CF allocation-related functions. + */ + +/* + * Initialize the JSON module by setting the memory allocation functions. + */ +void as_json_init(); diff --git a/as/include/base/monitor.h b/as/include/base/monitor.h new file mode 100644 index 00000000..82e37e43 --- /dev/null +++ b/as/include/base/monitor.h @@ -0,0 +1,103 @@ +/* + * monitor.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * Long-running job monitoring interface.
+ *
+ * This file implements the generic interface for long-running jobs in
+ * Aerospike, like query / scan / batch etc. The idea is to be able to see
+ * what is going on in the system.
+ *
+ * Each module which needs to show up in the monitoring needs to register
+ * and implement the interfaces.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "dynbuf.h"
+
+#include "base/datamodel.h"
+
+
+#define AS_MON_OK 0
+#define AS_MON_ERR -1
+#define AS_MON_EXIST -2
+#define TRID_LIST_SIZE 1000
+
+typedef enum {
+	QUERY_MOD = 0,
+	SCAN_MOD = 1,
+	SBLD_MOD = 2
+} as_mon_module_slot;
+
+extern const char * AS_MON_MODULES[];
+
+// Stat for a currently running job.
+typedef struct as_mon_jobstat_s {
+	uint64_t trid;
+	char job_type[32];
+	char ns[AS_ID_NAMESPACE_SZ];
+	char set[AS_SET_NAME_MAX_SIZE];
+	uint32_t priority;
+	char status[64];
+	float progress_pct;
+	uint64_t run_time;
+	uint64_t time_since_done;
+	uint64_t recs_read;
+	uint64_t net_io_bytes;
+	float cpu;
+	char jdata[512];
+} as_mon_jobstat;
+
+typedef struct as_mon_cb_s {
+	as_mon_jobstat *(*get_jobstat) (uint64_t trid);
+	as_mon_jobstat *(*get_jobstat_all) (int * size);
+
+	// Per transaction:
+	int (*set_priority) (uint64_t trid, uint32_t priority);
+	int (*kill) (uint64_t trid);
+	int (*suspend) (uint64_t trid);
+
+	// Per module:
+
+	// Number of pending transactions of this job type allowed in the queue -
+	// incoming transactions beyond this will be rejected.
+	int (*set_pendingmax) (int);
+
+	// Set the number of transactions that can be in flight at any point in
+	// time.
+	int (*set_maxinflight) (int);
+
+	// An individual transaction's priority is capped by the job type's max
+	// priority.
+	int (*set_maxpriority) (int);
+} as_mon_cb;
+
+// Structure to register a module with the as_mon interface.
+typedef struct as_mon_s {
+	char *type;
+	as_mon_cb cb;
+} as_mon;
+
+void as_mon_info_cmd(const char *module, char *cmd, uint64_t trid, uint32_t priority, cf_dyn_buf *db);
+int as_mon_init();
diff --git a/as/include/base/packet_compression.h b/as/include/base/packet_compression.h
new file mode 100644
index 00000000..1af088f5
--- /dev/null
+++ b/as/include/base/packet_compression.h
@@ -0,0 +1,81 @@
+/*
+ * packet_compression.h
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef enum compression_type_e {
+	COMPRESSION_ZLIB = 1
+} compression_type;
+
+/**
+ * Function to decompress the given data.
+ * Expected arguments:
+ * @param type Type of compression
+ * @param buf_len Length of buffer to be decompressed
+ * @param buf Pointer to buffer to be decompressed
+ * @param out_buf_len Length of buffer to hold decompressed data
+ * @param out_buf Pointer to buffer to hold decompressed data
+ * @return 0 if successful
+ */
+int
+as_decompress(compression_type type, size_t buf_len, const uint8_t *buf, size_t *out_buf_len, uint8_t *out_buf);
+
+/**
+ * Function to get back the decompressed packet from a
+ * PROTO_TYPE_AS_MSG_COMPRESSED packet.
+ * Packet : Header - Original size of message - Compressed message
+ * @param buf Pointer to PROTO_TYPE_AS_MSG_COMPRESSED packet.
+ * @param output_packet Pointer holding address of decompressed packet.
+ * @param output_packet_size Size of output_packet buffer
+ */
+int
+as_packet_decompression(uint8_t *buf, uint8_t **output_packet, size_t *output_packet_size);
+
+/*
+ * Function to compress the given data.
+ * Expected arguments:
+ * 1. Type of compression
+ *    1 for zlib
+ * 2. Length of buffer to be compressed - mandatory
+ * 3. Pointer to buffer to be compressed - mandatory
+ * 4. Length of buffer to hold compressed data - mandatory
+ * 5. Pointer to buffer to hold compressed data - mandatory
+ * 6. Compression level - optional, default Z_DEFAULT_COMPRESSION
+ *    Z_NO_COMPRESSION 0
+ *    Z_BEST_SPEED 1
+ *    Z_BEST_COMPRESSION 9
+ *    Z_DEFAULT_COMPRESSION (-1)
+ */
+int
+as_compress(int argc, uint8_t *argv[]);
+
+/*
+ * Function to create a packet to send compressed data.
+ * Packet : Header - Original size of message - Compressed message.
+ * Input : buf - pointer to data to be compressed
+ *         buf_sz - size of the data to be compressed
+ * Output : compressed_packet - pointer holding address of compressed packet
+ *          compressed_packet_sz - size of the compressed packet
+ */
+int
+as_packet_compression(uint8_t *buf, size_t buf_sz, uint8_t **compressed_packet, size_t *compressed_packet_sz);
diff --git a/as/include/base/particle.h b/as/include/base/particle.h
new file mode 100644
index 00000000..591e5578
--- /dev/null
+++ b/as/include/base/particle.h
@@ -0,0 +1,98 @@
+/*
+ * particle.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "aerospike/as_val.h"
+
+#include "base/datamodel.h"
+
+
+//------------------------------------------------
+// Particle interface specification - functions.
+//
+
+// Destructor, etc.
diff --git a/as/include/base/particle.h b/as/include/base/particle.h
new file mode 100644
index 00000000..591e5578
--- /dev/null
+++ b/as/include/base/particle.h
@@ -0,0 +1,98 @@
+/*
+ * particle.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include "aerospike/as_val.h"
+#include "base/datamodel.h"
+
+//------------------------------------------------
+// Particle interface specification - functions.
+//
+
+// Destructor, etc.
+typedef void (*as_particle_destructor_fn) (as_particle *p);
+typedef uint32_t (*as_particle_size_fn) (const as_particle *p);
+
+// Handle "wire" format.
+typedef int32_t (*as_particle_concat_size_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+typedef int (*as_particle_append_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+typedef int (*as_particle_prepend_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+typedef int (*as_particle_incr_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+typedef int32_t (*as_particle_size_from_wire_fn) (const uint8_t *wire_value, uint32_t value_size);
+typedef int (*as_particle_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+typedef int (*as_particle_compare_from_wire_fn) (const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size);
+typedef uint32_t (*as_particle_wire_size_fn) (const as_particle *p);
+typedef uint32_t (*as_particle_to_wire_fn) (const as_particle *p, uint8_t *wire);
+
+// Handle as_val translation.
+typedef uint32_t (*as_particle_size_from_asval_fn) (const as_val *val);
+typedef void (*as_particle_from_asval_fn) (const as_val *val, as_particle **pp);
+typedef as_val *(*as_particle_to_asval_fn) (const as_particle *p);
+typedef uint32_t (*as_particle_asval_wire_size_fn) (const as_val *val);
+typedef uint32_t (*as_particle_asval_to_wire_fn) (const as_val *val, uint8_t *wire);
+
+// Handle msgpack translation.
+typedef uint32_t (*as_particle_size_from_msgpack_fn) (const uint8_t *packed, uint32_t packed_size);
+typedef void (*as_particle_from_msgpack_fn) (const uint8_t *packed, uint32_t packed_size, as_particle **pp);
+
+// Handle on-device "flat" format.
+typedef int32_t (*as_particle_size_from_flat_fn) (const uint8_t *flat, uint32_t flat_size);
+typedef int (*as_particle_cast_from_flat_fn) (uint8_t *flat, uint32_t flat_size, as_particle **pp);
+typedef int (*as_particle_from_flat_fn) (const uint8_t *flat, uint32_t flat_size, as_particle **pp);
+typedef uint32_t (*as_particle_flat_size_fn) (const as_particle *p);
+typedef uint32_t (*as_particle_to_flat_fn) (const as_particle *p, uint8_t *flat);
+
+//------------------------------------------------
+// Particle interface specification - vtable.
+//
+
+typedef struct as_particle_vtable_s {
+	as_particle_destructor_fn destructor_fn;
+	as_particle_size_fn size_fn;
+
+	as_particle_concat_size_from_wire_fn concat_size_from_wire_fn;
+	as_particle_append_from_wire_fn append_from_wire_fn;
+	as_particle_prepend_from_wire_fn prepend_from_wire_fn;
+	as_particle_incr_from_wire_fn incr_from_wire_fn;
+	as_particle_size_from_wire_fn size_from_wire_fn;
+	as_particle_from_wire_fn from_wire_fn;
+	as_particle_compare_from_wire_fn compare_from_wire_fn;
+	as_particle_wire_size_fn wire_size_fn;
+	as_particle_to_wire_fn to_wire_fn;
+
+	as_particle_size_from_asval_fn size_from_asval_fn;
+	as_particle_from_asval_fn from_asval_fn;
+	as_particle_to_asval_fn to_asval_fn;
+	as_particle_asval_wire_size_fn asval_wire_size_fn;
+	as_particle_asval_to_wire_fn asval_to_wire_fn;
+
+	as_particle_size_from_msgpack_fn size_from_msgpack_fn;
+	as_particle_from_msgpack_fn from_msgpack_fn;
+
+	as_particle_size_from_flat_fn size_from_flat_fn; // TODO - unused - remove?
+	as_particle_cast_from_flat_fn cast_from_flat_fn;
+	as_particle_from_flat_fn from_flat_fn;
+	as_particle_flat_size_fn flat_size_fn;
+	as_particle_to_flat_fn to_flat_fn;
+} as_particle_vtable;
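To make the vtable concrete: each particle type supplies one function per slot. As a sketch (assuming the per-type vtables are collected somewhere in particle.c, which this patch excerpt does not show), the INTEGER handlers declared in particle_integer.h below would be wired up like this:

// Illustrative only - partial wiring of the INTEGER particle handlers
// into the vtable shape defined above.
static const as_particle_vtable integer_vtable = {
	.destructor_fn     = integer_destruct,
	.size_fn           = integer_size,
	.size_from_wire_fn = integer_size_from_wire,
	.from_wire_fn      = integer_from_wire,
	.wire_size_fn      = integer_wire_size,
	.to_wire_fn        = integer_to_wire,
	.to_asval_fn       = integer_to_asval,
	// ... remaining slots filled with the matching integer_* functions
};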
diff --git a/as/include/base/particle_blob.h b/as/include/base/particle_blob.h
new file mode 100644
index 00000000..62b90884
--- /dev/null
+++ b/as/include/base/particle_blob.h
@@ -0,0 +1,63 @@
+/*
+ * particle_blob.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include "aerospike/as_val.h"
+#include "base/datamodel.h"
+
+// The BLOB particle interface function declarations are in this header file
+// since BLOB functions are used by other particles derived from BLOB.
+
+// Destructor, etc.
+void blob_destruct(as_particle *p);
+uint32_t blob_size(const as_particle *p);
+
+// Handle "wire" format.
+int32_t blob_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int blob_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int blob_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int blob_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int32_t blob_size_from_wire(const uint8_t *wire_value, uint32_t value_size);
+int blob_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int blob_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size);
+uint32_t blob_wire_size(const as_particle *p);
+uint32_t blob_to_wire(const as_particle *p, uint8_t *wire);
+
+// Handle as_val translation.
+uint32_t blob_size_from_asval(const as_val *val);
+void blob_from_asval(const as_val *val, as_particle **pp);
+as_val *blob_to_asval(const as_particle *p);
+uint32_t blob_asval_wire_size(const as_val *val);
+uint32_t blob_asval_to_wire(const as_val *val, uint8_t *wire);
+
+// Handle msgpack translation.
+uint32_t blob_size_from_msgpack(const uint8_t *packed, uint32_t packed_size);
+void blob_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp);
+
+// Handle on-device "flat" format.
+int32_t blob_size_from_flat(const uint8_t *flat, uint32_t flat_size);
+int blob_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp);
+int blob_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp);
+uint32_t blob_flat_size(const as_particle *p);
+uint32_t blob_to_flat(const as_particle *p, uint8_t *flat);
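The note above about derivation is the point of exporting these symbols: a derived particle's vtable can delegate to the blob_* handlers and override only the slots that differ. A hedged sketch, where my_type_to_asval is a hypothetical type-specific override:

// Illustrative only - a BLOB-derived particle reusing the exported handlers.
static const as_particle_vtable derived_vtable = {
	.destructor_fn = blob_destruct,
	.size_fn       = blob_size,
	.from_wire_fn  = blob_from_wire,
	.to_wire_fn    = blob_to_wire,
	.to_asval_fn   = my_type_to_asval, // hypothetical override
	// ... all other slots delegate to the blob_* functions above
};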
diff --git a/as/include/base/particle_integer.h b/as/include/base/particle_integer.h
new file mode 100644
index 00000000..2ebe2e5d
--- /dev/null
+++ b/as/include/base/particle_integer.h
@@ -0,0 +1,63 @@
+/*
+ * particle_integer.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include "aerospike/as_val.h"
+#include "base/datamodel.h"
+
+// The INTEGER particle interface function declarations are in this header file
+// since INTEGER functions are used by other particles derived from INTEGER.
+
+// Destructor, etc.
+void integer_destruct(as_particle *p);
+uint32_t integer_size(const as_particle *p);
+
+// Handle "wire" format.
+int32_t integer_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int integer_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int integer_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int integer_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int32_t integer_size_from_wire(const uint8_t *wire_value, uint32_t value_size);
+int integer_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int integer_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size);
+uint32_t integer_wire_size(const as_particle *p);
+uint32_t integer_to_wire(const as_particle *p, uint8_t *wire);
+
+// Handle as_val translation.
+uint32_t integer_size_from_asval(const as_val *val);
+void integer_from_asval(const as_val *val, as_particle **pp);
+as_val *integer_to_asval(const as_particle *p);
+uint32_t integer_asval_wire_size(const as_val *val);
+uint32_t integer_asval_to_wire(const as_val *val, uint8_t *wire);
+
+// Handle msgpack translation.
+uint32_t integer_size_from_msgpack(const uint8_t *packed, uint32_t packed_size);
+void integer_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp);
+
+// Handle on-device "flat" format.
+int32_t integer_size_from_flat(const uint8_t *flat, uint32_t flat_size);
+int integer_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp);
+int integer_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp);
+uint32_t integer_flat_size(const as_particle *p);
+uint32_t integer_to_flat(const as_particle *p, uint8_t *flat);
diff --git a/as/include/base/predexp.h b/as/include/base/predexp.h
new file mode 100644
index 00000000..93454107
--- /dev/null
+++ b/as/include/base/predexp.h
@@ -0,0 +1,57 @@
+/*
+ * predexp.h
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * predicate expression declarations
+ */
+
+#pragma once
+
+#include "base/datamodel.h"
+#include "base/index.h"
+
+// A "compiled" predicate expression
+typedef struct predexp_eval_base_s predexp_eval_t;
+
+// A named variable
+typedef struct predexp_var_s as_predexp_var_t;
+
+// Arguments to predicate expressions
+typedef struct predexp_args_s {
+	as_namespace* ns;      // always present
+	as_record* md;         // always present
+	as_predexp_var_t* vl;  // always present
+	as_storage_rd* rd;     // NULL during metadata phase
+} predexp_args_t;
+
+extern predexp_eval_t* predexp_build(as_msg_field* pfp);
+
+// Called with NULL rd
+extern bool predexp_matches_metadata(predexp_eval_t* eval,
+		predexp_args_t* argsp);
+
+// Called with both md and rd.
+extern bool predexp_matches_record(predexp_eval_t* eval,
+		predexp_args_t* argsp);
+
+extern void predexp_destroy(predexp_eval_t* eval);
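These declarations imply a two-phase evaluation: build the expression once from the wire field, test it against record metadata while rd is still NULL, and only re-test with the record open if the metadata phase passes. A sketch of that flow, where pfp, ns, r, vl and rd all come from the surrounding transaction code:

// Illustrative call flow only.
predexp_eval_t *eval = predexp_build(pfp);

predexp_args_t args = { .ns = ns, .md = r, .vl = vl, .rd = NULL };

if (eval && predexp_matches_metadata(eval, &args)) {
	args.rd = rd; // record now open - bin data available
	if (predexp_matches_record(eval, &args)) {
		// record satisfies the predicate
	}
}

predexp_destroy(eval);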
diff --git a/as/include/base/proto.h b/as/include/base/proto.h
new file mode 100644
index 00000000..9b16a912
--- /dev/null
+++ b/as/include/base/proto.h
@@ -0,0 +1,693 @@
+/*
+ * proto.h
+ *
+ * Copyright (C) 2008-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * wire protocol definition
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_vector.h"
+
+#include "dynbuf.h"
+#include "socket.h"
+
+
+// Forward declarations.
+struct as_bin_s;
+struct as_index_s;
+struct as_storage_rd_s;
+struct as_namespace_s;
+struct as_file_handle_s;
+struct as_transaction_s;
+
+// These numbers match cl_types.h on the client
+
+#define AS_PROTO_RESULT_OK 0
+#define AS_PROTO_RESULT_FAIL_UNKNOWN 1 // unknown failure - consider retry
+#define AS_PROTO_RESULT_FAIL_NOT_FOUND 2
+#define AS_PROTO_RESULT_FAIL_GENERATION 3
+#define AS_PROTO_RESULT_FAIL_PARAMETER 4
+#define AS_PROTO_RESULT_FAIL_RECORD_EXISTS 5 // if 'WRITE_ADD', could fail because already exists
+#define AS_PROTO_RESULT_FAIL_UNUSED_6 6 // recycle - was AS_PROTO_RESULT_FAIL_BIN_EXISTS
+#define AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH 7
+#define AS_PROTO_RESULT_FAIL_OUT_OF_SPACE 8
+#define AS_PROTO_RESULT_FAIL_TIMEOUT 9
+#define AS_PROTO_RESULT_FAIL_ALWAYS_FORBIDDEN 10 // operation not allowed for current (static) configuration
+#define AS_PROTO_RESULT_FAIL_UNAVAILABLE 11 // returned when a node is down and the partition isn't available
+#define AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE 12 // op and bin type incompatibility
+#define AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG 13
+#define AS_PROTO_RESULT_FAIL_KEY_BUSY 14
+#define AS_PROTO_RESULT_FAIL_SCAN_ABORT 15
+#define AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE 16 // asked to do something we don't do for a particular configuration
+#define AS_PROTO_RESULT_FAIL_UNUSED_17 17 // recycle - was AS_PROTO_RESULT_FAIL_BIN_NOT_FOUND
+#define AS_PROTO_RESULT_FAIL_DEVICE_OVERLOAD 18
+#define AS_PROTO_RESULT_FAIL_KEY_MISMATCH 19
+#define AS_PROTO_RESULT_FAIL_NAMESPACE 20
+#define AS_PROTO_RESULT_FAIL_BIN_NAME 21
+#define AS_PROTO_RESULT_FAIL_FORBIDDEN 22 // operation temporarily not possible
+#define AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND 23
+#define AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS 24
+#define AS_PROTO_RESULT_FAIL_ENTERPRISE_ONLY 25 // attempting enterprise functionality on community build
+
+// Security result codes. Must be <= 255, to fit in one byte. Defined here to
+// ensure no overlap with other result codes.
+#define AS_SEC_RESULT_OK_LAST 50 // the last message
+	// Security message errors.
+#define AS_SEC_ERR_NOT_SUPPORTED 51 // security features not supported
+#define AS_SEC_ERR_NOT_ENABLED 52 // security features not enabled
+#define AS_SEC_ERR_SCHEME 53 // security scheme not supported
+#define AS_SEC_ERR_COMMAND 54 // unrecognized command
+#define AS_SEC_ERR_FIELD 55 // can't parse field
+#define AS_SEC_ERR_STATE 56 // e.g. unexpected command
+	// Security procedure errors.
+#define AS_SEC_ERR_USER 60 // no user or unknown user
+#define AS_SEC_ERR_USER_EXISTS 61 // user already exists
+#define AS_SEC_ERR_PASSWORD 62 // no password or bad password
+#define AS_SEC_ERR_EXPIRED_PASSWORD 63 // expired password
+#define AS_SEC_ERR_FORBIDDEN_PASSWORD 64 // forbidden password (e.g. recently used)
+#define AS_SEC_ERR_CREDENTIAL 65 // no credential or bad credential
+	// ... room for more ...
+#define AS_SEC_ERR_ROLE 70 // no role(s) or unknown role(s)
+#define AS_SEC_ERR_ROLE_EXISTS 71 // role already exists
+#define AS_SEC_ERR_PRIVILEGE 72 // no privileges or unknown privileges
+	// Permission errors.
+#define AS_SEC_ERR_NOT_AUTHENTICATED 80 // socket not authenticated
+#define AS_SEC_ERR_ROLE_VIOLATION 81 // role (privilege) violation
+
+// UDF Errors (100 - 109)
+#define AS_PROTO_RESULT_FAIL_UDF_EXECUTION 100
+
+// Batch Errors (150 - 159)
+#define AS_PROTO_RESULT_FAIL_BATCH_DISABLED 150 // batch functionality has been disabled
+#define AS_PROTO_RESULT_FAIL_BATCH_MAX_REQUESTS 151 // batch-max-requests has been exceeded
+#define AS_PROTO_RESULT_FAIL_BATCH_QUEUES_FULL 152 // all batch queues are full
+
+// Geo Errors (160 - 169)
+#define AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON 160 // Invalid GeoJSON on insert/update
+
+// Secondary Index Query Failure Codes (200 - 219)
+#define AS_PROTO_RESULT_FAIL_INDEX_FOUND 200
+#define AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND 201
+#define AS_PROTO_RESULT_FAIL_INDEX_OOM 202
+#define AS_PROTO_RESULT_FAIL_INDEX_NOTREADABLE 203
+#define AS_PROTO_RESULT_FAIL_INDEX_GENERIC 204
+#define AS_PROTO_RESULT_FAIL_INDEX_NAME_MAXLEN 205
+#define AS_PROTO_RESULT_FAIL_INDEX_MAXCOUNT 206
+
+#define AS_PROTO_RESULT_FAIL_QUERY_USERABORT 210
+#define AS_PROTO_RESULT_FAIL_QUERY_QUEUEFULL 211
+#define AS_PROTO_RESULT_FAIL_QUERY_TIMEOUT 212
+#define AS_PROTO_RESULT_FAIL_QUERY_CBERROR 213
+#define AS_PROTO_RESULT_FAIL_QUERY_NETIO_ERR 214
+#define AS_PROTO_RESULT_FAIL_QUERY_DUPLICATE 215
+
+/* SYNOPSIS
+ * Aerospike wire protocol
+ *
+ * Version 2
+ *
+ * Aerospike uses a message-oriented wire protocol to transfer information.
+ * Each message consists of a header, which determines the type and the length
+ * to follow. This is called the 'proto_msg'.
+ *
+ * These messages are vectored out to the correct handler. Over TCP, they can be
+ * pipelined (but not out of order). If we wish to support out-of-order responses,
+ * we should upgrade the protocol.
+ *
+ * The most common type of message is the as_msg, a message which reads or writes
+ * a single row to the data store.
+ */
+
+#define PROTO_VERSION 2
+
+#define PROTO_TYPE_INFO 1 // ascii-format message for determining server info
+#define PROTO_TYPE_SECURITY 2
+#define PROTO_TYPE_AS_MSG 3
+#define PROTO_TYPE_AS_MSG_COMPRESSED 4
+#define PROTO_TYPE_INTERNAL_XDR 5
+#define PROTO_TYPE_MAX 6 // if you see 6, it's illegal
+
+#define PROTO_SIZE_MAX (128 * 1024 * 1024) // used simply for validation, as we've been corrupting msgp's
+
+#define PROTO_FIELD_LENGTH_MAX 1024
+#define PROTO_OP_LENGTH_MAX 131072
+
+typedef struct as_proto_s {
+	uint8_t version;
+	uint8_t type;
+	uint64_t sz: 48;
+	uint8_t data[];
+} __attribute__ ((__packed__)) as_proto;
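On the receive side, this header is the only thing that can be trusted before the body arrives, so it is validated in isolation. A sketch using the inline helpers declared later in this file - buf is assumed to hold at least sizeof(as_proto) bytes, and the swap step is the usual network-to-host byte-order fix-up:

as_proto *proto = (as_proto *)buf;

as_proto_swap(proto); // byte-order fix-up, including the 48-bit sz field

if (proto->version != PROTO_VERSION || ! as_proto_is_valid_type(proto)
		|| proto->sz > PROTO_SIZE_MAX) {
	// bad or corrupt header - drop the connection
}

size_t needed = as_proto_size_get(proto); // header + body bytes to consume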
+
+/*
+ * The zlib decompression API needs the original size of the compressed data,
+ * so we need to transfer it to the other end.
+ * This structure packs together -
+ * header + original size of data + compressed data
+ */
+typedef struct as_comp_proto_s {
+	as_proto proto;  // Protocol header
+	uint64_t org_sz; // Original size of compressed data held in 'data'
+	uint8_t data[];  // Compressed data
+} as_comp_proto;
+
+/* as_msg_field
+ * Aerospike message field */
+typedef struct as_msg_field_s {
+#define AS_MSG_FIELD_TYPE_NAMESPACE 0
+#define AS_MSG_FIELD_TYPE_SET 1
+#define AS_MSG_FIELD_TYPE_KEY 2
+#define AS_MSG_FIELD_TYPE_DIGEST_RIPE 4
+#define AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY 6
+#define AS_MSG_FIELD_TYPE_TRID 7
+#define AS_MSG_FIELD_TYPE_SCAN_OPTIONS 8
+#define AS_MSG_FIELD_TYPE_SOCKET_TIMEOUT 9
+
+#define AS_MSG_FIELD_TYPE_INDEX_NAME 21
+#define AS_MSG_FIELD_TYPE_INDEX_RANGE 22
+#define AS_MSG_FIELD_TYPE_INDEX_TYPE 26
+
+// UDF RANGE: 30-39
+#define AS_MSG_FIELD_TYPE_UDF_FILENAME 30
+#define AS_MSG_FIELD_TYPE_UDF_FUNCTION 31
+#define AS_MSG_FIELD_TYPE_UDF_ARGLIST 32
+#define AS_MSG_FIELD_TYPE_UDF_OP 33
+
+#define AS_MSG_FIELD_TYPE_QUERY_BINLIST 40
+#define AS_MSG_FIELD_TYPE_BATCH 41
+#define AS_MSG_FIELD_TYPE_BATCH_WITH_SET 42
+#define AS_MSG_FIELD_TYPE_PREDEXP 43
+
+	/* NB: field_sz is sizeof(type) + sizeof(data) */
+	uint32_t field_sz; // get the data size through the accessor function, don't worry, it's a small macro
+	uint8_t type;      // ordering matters :-( see as_transaction_prepare
+	uint8_t data[];
+} __attribute__((__packed__)) as_msg_field;
+
+// For as_transaction::field_types, a bit-field to mark which fields are in the
+// as_msg.
+#define AS_MSG_FIELD_BIT_NAMESPACE 0x00000001
+#define AS_MSG_FIELD_BIT_SET 0x00000002
+#define AS_MSG_FIELD_BIT_KEY 0x00000004
+#define AS_MSG_FIELD_BIT_DIGEST_RIPE 0x00000008
+#define AS_MSG_FIELD_BIT_DIGEST_RIPE_ARRAY 0x00000010
+#define AS_MSG_FIELD_BIT_TRID 0x00000020
+#define AS_MSG_FIELD_BIT_SCAN_OPTIONS 0x00000040
+#define AS_MSG_FIELD_BIT_SOCKET_TIMEOUT 0x00000080
+#define AS_MSG_FIELD_BIT_INDEX_NAME 0x00000100
+#define AS_MSG_FIELD_BIT_INDEX_RANGE 0x00000200
+#define AS_MSG_FIELD_BIT_INDEX_TYPE 0x00000400
+#define AS_MSG_FIELD_BIT_UDF_FILENAME 0x00000800
+#define AS_MSG_FIELD_BIT_UDF_FUNCTION 0x00001000
+#define AS_MSG_FIELD_BIT_UDF_ARGLIST 0x00002000
+#define AS_MSG_FIELD_BIT_UDF_OP 0x00004000
+#define AS_MSG_FIELD_BIT_QUERY_BINLIST 0x00008000
+#define AS_MSG_FIELD_BIT_BATCH 0x00010000
+#define AS_MSG_FIELD_BIT_BATCH_WITH_SET 0x00020000
+#define AS_MSG_FIELD_BIT_PREDEXP 0x00040000
+
+// as_msg ops
+
+#define AS_MSG_OP_READ 1 // read the value in question
+#define AS_MSG_OP_WRITE 2 // write the value in question
+
+// Prospective CDT top-level ops:
+#define AS_MSG_OP_CDT_READ 3
+#define AS_MSG_OP_CDT_MODIFY 4
+
+#define AS_MSG_OP_INCR 5 // arithmetically add a value to an existing value, works only on integers
+// Unused - 6
+// Unused - 7
+// Unused - 8
+#define AS_MSG_OP_APPEND 9 // append a value to an existing value, works on strings and blobs
+#define AS_MSG_OP_PREPEND 10 // prepend a value to an existing value, works on strings and blobs
+#define AS_MSG_OP_TOUCH 11 // touch a value without doing anything else to it - will increment the generation
+
+#define AS_MSG_OP_MC_INCR 129 // Memcache-compatible version of the increment command
+#define AS_MSG_OP_MC_APPEND 130 // append the value to an existing value, works only on strings for now
+#define AS_MSG_OP_MC_PREPEND 131 // prepend a value to an existing value, works only on strings for now
+#define AS_MSG_OP_MC_TOUCH 132 // Memcache-compatible touch - does not change generation
+
+#define OP_IS_MODIFY(op) ( \
+	   (op) == AS_MSG_OP_INCR \
+	|| (op) == AS_MSG_OP_APPEND \
+	|| (op) == AS_MSG_OP_PREPEND \
+	|| (op) == AS_MSG_OP_MC_INCR \
+	|| (op) == AS_MSG_OP_MC_APPEND \
+	|| (op) == AS_MSG_OP_MC_PREPEND \
+	)
+
+#define OP_IS_TOUCH(op) ((op) == AS_MSG_OP_TOUCH || (op) == AS_MSG_OP_MC_TOUCH)
+
+typedef struct as_msg_op_s {
+	uint32_t op_sz;
+	uint8_t op;
+	uint8_t particle_type;
+	uint8_t version; // now unused
+	uint8_t name_sz;
+	uint8_t name[]; // UTF-8
+	// there's also a value here but you can't have two variable size arrays
+} __attribute__((__packed__)) as_msg_op;
+
+static inline uint8_t * as_msg_op_get_value_p(as_msg_op *op)
+{
+	return (uint8_t*)op + sizeof(as_msg_op) + op->name_sz;
+}
+
+static inline uint32_t as_msg_op_get_value_sz(const as_msg_op *op)
+{
+	return op->op_sz - (4 + op->name_sz);
+}
+
+static inline uint32_t as_msg_field_get_value_sz(as_msg_field *f)
+{
+	return f->field_sz - 1;
+}
+
+static inline uint32_t as_msg_field_get_strncpy(as_msg_field *f, char *dst, int sz)
+{
+	int fsz = f->field_sz - 1;
+
+	if (sz > fsz) {
+		memcpy(dst, f->data, fsz);
+		dst[fsz] = 0;
+		return fsz;
+	}
+	else {
+		memcpy(dst, f->data, sz - 1);
+		dst[sz - 1] = 0;
+		return sz - 1;
+	}
+}
+
+typedef struct as_msg_s {
+	/*00 [x00] (08) */ uint8_t header_sz;       // number of bytes in this header - 22
+	/*01 [x01] (09) */ uint8_t info1;           // bitfield about this request
+	/*02 [x02] (10) */ uint8_t info2;           // filled up, need another
+	/*03 [x03] (11) */ uint8_t info3;           // nice extra space. Mmm, tasty extra space.
+	/*04 [x04] (12) */ uint8_t unused;
+	/*05 [x05] (13) */ uint8_t result_code;
+	/*06 [x06] (14) */ uint32_t generation;
+	/*10 [x0A] (18) */ uint32_t record_ttl;
+	/*14 [x0E] (22) */ uint32_t transaction_ttl;
+	/*18 [x12] (26) */ uint16_t n_fields;       // number of fields
+	/*20 [x14] (28) */ uint16_t n_ops;          // number of operations
+	/*22 [x16] (30) */ uint8_t data[];          // data contains first the fields, then the ops
+} __attribute__((__packed__)) as_msg;
+
+/* cl_msg
+ * Aerospike message
+ * sz: size of the payload, not including the header */
+typedef struct cl_msg_s {
+	as_proto proto;
+	as_msg msg;
+} __attribute__((__packed__)) cl_msg;
+
+#define AS_MSG_INFO1_READ (1 << 0) // contains a read operation
+#define AS_MSG_INFO1_GET_ALL (1 << 1) // get all bins, period
+// (Note: Bit 2 is unused.)
+#define AS_MSG_INFO1_BATCH (1 << 3) // new batch protocol
+#define AS_MSG_INFO1_XDR (1 << 4) // operation is being performed by XDR
+#define AS_MSG_INFO1_GET_NO_BINS (1 << 5) // get record metadata only - no bin metadata or data
+#define AS_MSG_INFO1_CONSISTENCY_LEVEL_B0 (1 << 6) // read consistency level - bit 0
+#define AS_MSG_INFO1_CONSISTENCY_LEVEL_B1 (1 << 7) // read consistency level - bit 1
+
+#define AS_MSG_INFO2_WRITE (1 << 0) // contains a write semantic
+#define AS_MSG_INFO2_DELETE (1 << 1) // delete record
+#define AS_MSG_INFO2_GENERATION (1 << 2) // pay attention to the generation
+#define AS_MSG_INFO2_GENERATION_GT (1 << 3) // apply write if new generation > old, good for restore
+#define AS_MSG_INFO2_DURABLE_DELETE (1 << 4) // op resulting in record deletion leaves tombstone (Enterprise only)
+#define AS_MSG_INFO2_CREATE_ONLY (1 << 5) // write record only if it doesn't exist
+// (Note: Bit 6 is unused.)
+#define AS_MSG_INFO2_RESPOND_ALL_OPS (1 << 7) // all bin ops (read, write, or modify) require a response, in request order
+
+#define AS_MSG_INFO3_LAST (1 << 0) // this is the last of a multi-part message
+#define AS_MSG_INFO3_COMMIT_LEVEL_B0 (1 << 1) // write commit level - bit 0
+#define AS_MSG_INFO3_COMMIT_LEVEL_B1 (1 << 2) // write commit level - bit 1
+#define AS_MSG_INFO3_UPDATE_ONLY (1 << 3) // update existing record only, do not create new record
+#define AS_MSG_INFO3_CREATE_OR_REPLACE (1 << 4) // completely replace existing record, or create new record
+#define AS_MSG_INFO3_REPLACE_ONLY (1 << 5) // completely replace existing record, do not create new record
+#define AS_MSG_INFO3_LINEARIZE_READ (1 << 6) // enterprise only
+// (Note: Bit 7 is unused.)
+
+#define AS_MSG_FIELD_SCAN_UNUSED_2 (0x02) // was - whether to send ldt bin data back to the client
+#define AS_MSG_FIELD_SCAN_DISCONNECTED_JOB (0x04) // for sproc jobs that won't be sending results back to the client [UNUSED]
+#define AS_MSG_FIELD_SCAN_FAIL_ON_CLUSTER_CHANGE (0x08) // if we should fail when cluster is migrating or cluster changes
+#define AS_MSG_FIELD_SCAN_PRIORITY(__cl_byte) ((0xF0 & __cl_byte)>>4) // 4 bit value indicating the scan priority
+
+static inline as_msg_field *
+as_msg_field_get_next(as_msg_field *mf)
+{
+	return (as_msg_field*)(((uint8_t*)mf) + sizeof(mf->field_sz) + mf->field_sz);
+}
+
+static inline uint8_t *
+as_msg_field_skip(as_msg_field *mf)
+{
+	// At least 1 byte always follows field_sz.
+	return mf->field_sz == 0 ? NULL : (uint8_t*)mf + sizeof(mf->field_sz) + mf->field_sz;
+}
+
+/* as_msg_field_get
+ * Retrieve a specific field from a message */
+static inline as_msg_field *
+as_msg_field_get(const as_msg *msg, uint8_t type)
+{
+	uint16_t n;
+	as_msg_field *fp = NULL;
+
+	fp = (as_msg_field*)msg->data;
+
+	for (n = 0; n < msg->n_fields; n++) {
+
+		if (fp->type == type) {
+			break;
+		}
+
+		fp = as_msg_field_get_next(fp);
+	}
+
+	if (n == msg->n_fields) {
+		return NULL;
+	}
+	else {
+		return fp;
+	}
+}
+
+static inline as_msg_op *
+as_msg_op_get_next(as_msg_op *op)
+{
+	return (as_msg_op*)(((uint8_t*)op) + sizeof(uint32_t) + op->op_sz);
+}
+
+static inline uint8_t *
+as_msg_op_skip(as_msg_op *op)
+{
+	// At least 4 bytes always follow op_sz.
+	return (uint32_t)op->name_sz + 4 > op->op_sz ?
+			NULL : (uint8_t*)op + sizeof(op->op_sz) + op->op_sz;
+}
+
+/* as_msg_op_iterate
+ * Iterator for all ops in a message.
+ * First time through: pass NULL as current, you'll get the first op.
+ * Next time through: pass the current op as current, you'll get NULL when
+ * there are no more.
+ */
+static inline as_msg_op *
+as_msg_op_iterate(as_msg *msg, as_msg_op *current, int *n)
+{
+	// Skip over the fields the first time.
+	if (!
current) { + if (msg->n_ops == 0) { + return 0; // short cut + } + + as_msg_field *mf = (as_msg_field*)msg->data; + + for (uint16_t i = 0; i < msg->n_fields; i++) { + mf = as_msg_field_get_next(mf); + } + + current = (as_msg_op*)mf; + *n = 0; + + return current; + } + + (*n)++; + + if (*n >= msg->n_ops) { + return 0; + } + + return as_msg_op_get_next(current); +} + +static inline size_t +as_proto_size_get(const as_proto *proto) +{ + return sizeof(as_proto) + proto->sz; +} + +static inline bool +as_proto_is_valid_type(const as_proto *proto) +{ + return proto->type != 0 && proto->type < PROTO_TYPE_MAX; +} + +static inline bool +as_proto_wrapped_is_valid(const as_proto *proto, size_t size) +{ + return proto->version == PROTO_VERSION && + proto->type == PROTO_TYPE_AS_MSG && // currently we only wrap as_msg + as_proto_size_get(proto) == size; +} + +void as_proto_swap(as_proto *proto); +void as_msg_swap_header(as_msg *m); +void as_msg_swap_field(as_msg_field *mf); +void as_msg_swap_op(as_msg_op *op); + +cl_msg *as_msg_create_internal(const char *ns_name, const cf_digest *keyd, + uint8_t info1, uint8_t info2, uint8_t info3); + +cl_msg *as_msg_make_response_msg(uint32_t result_code, uint32_t generation, + uint32_t void_time, as_msg_op **ops, struct as_bin_s **bins, + uint16_t bin_count, struct as_namespace_s *ns, cl_msg *msgp_in, + size_t *msg_sz_in, uint64_t trid); +int32_t as_msg_make_response_bufbuilder(cf_buf_builder **bb_r, + struct as_storage_rd_s *rd, bool no_bin_data, bool include_key, + bool skip_empty_records, cf_vector *select_bins); +cl_msg *as_msg_make_val_response(bool success, const as_val *val, + uint32_t result_code, uint32_t generation, uint32_t void_time, + uint64_t trid, size_t *p_msg_sz); +void as_msg_make_val_response_bufbuilder(const as_val *val, + cf_buf_builder **bb_r, uint32_t val_sz, bool); + +int as_msg_send_reply(struct as_file_handle_s *fd_h, uint32_t result_code, + uint32_t generation, uint32_t void_time, as_msg_op **ops, + struct as_bin_s **bins, uint16_t bin_count, struct as_namespace_s *ns, + uint64_t trid); +int as_msg_send_ops_reply(struct as_file_handle_s *fd_h, cf_dyn_buf *db); +bool as_msg_send_fin(cf_socket *sock, uint32_t result_code); +size_t as_msg_send_fin_timeout(cf_socket *sock, uint32_t result_code, + int32_t timeout); + +// Async IO +typedef int (* as_netio_finish_cb) (void *udata, int retcode); +typedef int (* as_netio_start_cb) (void *udata, int seq); +typedef struct as_netio_s { + as_netio_finish_cb finish_cb; + as_netio_start_cb start_cb; + void * data; + // fd and buffer + struct as_file_handle_s * fd_h; + cf_buf_builder * bb_r; + uint32_t offset; + uint32_t seq; + bool slow; + uint64_t start_time; +} as_netio; + +void as_netio_init(); +int as_netio_send(as_netio *io, bool slow, bool blocking); + +#define AS_NETIO_OK 0 +#define AS_NETIO_CONTINUE 1 +#define AS_NETIO_ERR 2 +#define AS_NETIO_IO_ERR 3 + +// These values correspond to client protocol values - do not change them! 
+typedef enum as_udf_op { + AS_UDF_OP_KVS = 0, + AS_UDF_OP_AGGREGATE = 1, + AS_UDF_OP_BACKGROUND = 2, + AS_UDF_OP_FOREGROUND = 3 // not supported yet +} as_udf_op; + +#define CDT_MAGIC 0xC0 // so we know it can't be (first byte of) msgpack list/map + +typedef enum as_cdt_paramtype_e { + AS_CDT_PARAM_NONE = 0, + + AS_CDT_PARAM_INDEX = 1, + AS_CDT_PARAM_COUNT = 2, + AS_CDT_PARAM_PAYLOAD = 3, + AS_CDT_PARAM_FLAGS = 4, +} as_cdt_paramtype; + +typedef enum result_type_e { + RESULT_TYPE_NONE = 0, + RESULT_TYPE_INDEX = 1, + RESULT_TYPE_REVINDEX = 2, + RESULT_TYPE_RANK = 3, + RESULT_TYPE_REVRANK = 4, + RESULT_TYPE_COUNT = 5, + RESULT_TYPE_KEY = 6, + RESULT_TYPE_VALUE = 7, + RESULT_TYPE_MAP = 8, + RESULT_TYPE_INDEX_RANGE = 9, + RESULT_TYPE_REVINDEX_RANGE = 10, + RESULT_TYPE_RANK_RANGE = 11, + RESULT_TYPE_REVRANK_RANGE = 12, +} result_type_t; + +typedef enum { + AS_CDT_OP_FLAG_RESULT_MASK = 0x0000ffff, + AS_CDT_OP_FLAG_INVERTED = 0x00010000 +} as_cdt_op_flags; + +typedef enum { + AS_CDT_SORT_ASCENDING = 0, + AS_CDT_SORT_DESCENDING = 1, + AS_CDT_SORT_DROP_DUPLICATES = 2 +} as_cdt_sort_flags; + +typedef enum { + AS_CDT_LIST_MODIFY_DEFAULT = 0x00, + AS_CDT_LIST_ADD_UNIQUE = 0x01, + AS_CDT_LIST_INSERT_BOUNDED = 0x02 +} as_cdt_list_modify_flags; + +typedef enum as_cdt_optype_e { + // ------------------------------------------------------------------------ + // List Operation + + AS_CDT_OP_LIST_SET_TYPE = 0, + + // Adds + AS_CDT_OP_LIST_APPEND = 1, + AS_CDT_OP_LIST_APPEND_ITEMS = 2, + AS_CDT_OP_LIST_INSERT = 3, + AS_CDT_OP_LIST_INSERT_ITEMS = 4, + + // Removes + AS_CDT_OP_LIST_POP = 5, + AS_CDT_OP_LIST_POP_RANGE = 6, + AS_CDT_OP_LIST_REMOVE = 7, + AS_CDT_OP_LIST_REMOVE_RANGE = 8, + + // Modifies + AS_CDT_OP_LIST_SET = 9, + AS_CDT_OP_LIST_TRIM = 10, + AS_CDT_OP_LIST_CLEAR = 11, + AS_CDT_OP_LIST_INCREMENT = 12, + + AS_CDT_OP_LIST_SORT = 13, + + // Reads + AS_CDT_OP_LIST_SIZE = 16, + AS_CDT_OP_LIST_GET = 17, + AS_CDT_OP_LIST_GET_RANGE = 18, + + // GET_BYs + AS_CDT_OP_LIST_GET_BY_INDEX = 19, + AS_CDT_OP_LIST_GET_BY_VALUE = 20, + AS_CDT_OP_LIST_GET_BY_RANK = 21, + + AS_CDT_OP_LIST_GET_ALL_BY_VALUE = 22, + AS_CDT_OP_LIST_GET_ALL_BY_VALUE_LIST = 23, + + AS_CDT_OP_LIST_GET_BY_INDEX_RANGE = 24, + AS_CDT_OP_LIST_GET_BY_VALUE_INTERVAL = 25, + AS_CDT_OP_LIST_GET_BY_RANK_RANGE = 26, + + // REMOVE_BYs + AS_CDT_OP_LIST_REMOVE_BY_INDEX = 32, + AS_CDT_OP_LIST_REMOVE_BY_VALUE = 33, + AS_CDT_OP_LIST_REMOVE_BY_RANK = 34, + + AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE = 35, + AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE_LIST = 36, + + AS_CDT_OP_LIST_REMOVE_BY_INDEX_RANGE = 37, + AS_CDT_OP_LIST_REMOVE_BY_VALUE_INTERVAL = 38, + AS_CDT_OP_LIST_REMOVE_BY_RANK_RANGE = 39, + + // ------------------------------------------------------------------------ + // Map Operation + + // Create and flags + AS_CDT_OP_MAP_SET_TYPE = 64, + + // Modify Ops + AS_CDT_OP_MAP_ADD = 65, + AS_CDT_OP_MAP_ADD_ITEMS = 66, + AS_CDT_OP_MAP_PUT = 67, + AS_CDT_OP_MAP_PUT_ITEMS = 68, + AS_CDT_OP_MAP_REPLACE = 69, + AS_CDT_OP_MAP_REPLACE_ITEMS = 70, + AS_CDT_OP_MAP_RESERVED_0 = 71, + AS_CDT_OP_MAP_RESERVED_1 = 72, + + AS_CDT_OP_MAP_INCREMENT = 73, + AS_CDT_OP_MAP_DECREMENT = 74, + + AS_CDT_OP_MAP_CLEAR = 75, + + AS_CDT_OP_MAP_REMOVE_BY_KEY = 76, + AS_CDT_OP_MAP_REMOVE_BY_INDEX = 77, + AS_CDT_OP_MAP_REMOVE_BY_VALUE = 78, + AS_CDT_OP_MAP_REMOVE_BY_RANK = 79, + + AS_CDT_OP_MAP_RESERVED_2 = 80, + AS_CDT_OP_MAP_REMOVE_BY_KEY_LIST = 81, + AS_CDT_OP_MAP_REMOVE_ALL_BY_VALUE = 82, + AS_CDT_OP_MAP_REMOVE_BY_VALUE_LIST = 83, + + AS_CDT_OP_MAP_REMOVE_BY_KEY_INTERVAL = 84, + 
AS_CDT_OP_MAP_REMOVE_BY_INDEX_RANGE = 85,
+	AS_CDT_OP_MAP_REMOVE_BY_VALUE_INTERVAL = 86,
+	AS_CDT_OP_MAP_REMOVE_BY_RANK_RANGE = 87,
+
+	// Read ops
+	AS_CDT_OP_MAP_SIZE = 96,
+
+	AS_CDT_OP_MAP_GET_BY_KEY = 97,
+	AS_CDT_OP_MAP_GET_BY_INDEX = 98,
+	AS_CDT_OP_MAP_GET_BY_VALUE = 99,
+	AS_CDT_OP_MAP_GET_BY_RANK = 100,
+
+	AS_CDT_OP_MAP_RESERVED_3 = 101,
+	AS_CDT_OP_MAP_GET_ALL_BY_VALUE = 102,
+
+	AS_CDT_OP_MAP_GET_BY_KEY_INTERVAL = 103,
+	AS_CDT_OP_MAP_GET_BY_INDEX_RANGE = 104,
+	AS_CDT_OP_MAP_GET_BY_VALUE_INTERVAL = 105,
+	AS_CDT_OP_MAP_GET_BY_RANK_RANGE = 106,
+
+	AS_CDT_OP_MAP_GET_BY_KEY_LIST = 107,
+	AS_CDT_OP_MAP_GET_BY_VALUE_LIST = 108,
+
+} as_cdt_optype;
+
+#define AS_CDT_OP_LIST_LAST AS_CDT_OP_LIST_REMOVE_BY_RANK_RANGE
diff --git a/as/include/base/rec_props.h b/as/include/base/rec_props.h
new file mode 100644
index 00000000..14f4f7dd
--- /dev/null
+++ b/as/include/base/rec_props.h
@@ -0,0 +1,79 @@
+/*
+ * rec_props.h
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * A list of record properties.
+ */
+
+#pragma once
+
+
+//==========================================================
+// Includes
+//
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+//==========================================================
+// Typedefs
+//
+
+// Values stored on drive - be careful.
+typedef enum {
+	CL_REC_PROPS_FIELD_SET_NAME = 0,
+	CL_REC_PROPS_FIELD_UNUSED_1 = 1,
+	CL_REC_PROPS_FIELD_KEY = 2,
+	CL_REC_PROPS_FIELD_LAST_PLUS_1
+} as_rec_props_field_id;
+
+//------------------------------------------------
+// Class Member Data
+//
+typedef struct as_rec_props_s {
+	uint8_t* p_data;
+	uint32_t size;
+} as_rec_props;
+
+
+//==========================================================
+// Public API
+//
+
+void as_rec_props_clear(as_rec_props *_this);
+int as_rec_props_get_value(const as_rec_props *_this,
+		as_rec_props_field_id id, uint32_t *p_value_size, uint8_t **pp_value);
+uint32_t as_rec_props_sizeof_field(uint32_t value_size);
+void as_rec_props_init(as_rec_props *_this, uint8_t *p_data);
+void as_rec_props_init_malloc(as_rec_props *_this, uint32_t malloc_size);
+void as_rec_props_add_field(as_rec_props *_this,
+		as_rec_props_field_id id, uint32_t value_size, const uint8_t *p_value);
+void as_rec_props_add_field_null_terminate(as_rec_props *_this,
+		as_rec_props_field_id id, uint32_t value_len, const uint8_t *p_value);
+
+size_t as_rec_props_size_all(const uint8_t *set_name, size_t set_name_len,
+		const uint8_t *key, size_t key_size);
+void as_rec_props_fill_all(as_rec_props *_this, uint8_t *p_data,
+		const uint8_t *set_name, size_t set_name_len, const uint8_t *key,
+		size_t key_size);
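The size_all/fill_all pair above is the intended two-step usage: compute the size first, allocate, then fill. A sketch - set_name, set_name_len, key and key_size are placeholders, and cf_malloc stands in for whatever allocator the caller uses:

// Illustrative only - pack a set name and key into rec-props.
size_t size = as_rec_props_size_all(set_name, set_name_len, key, key_size);

as_rec_props props;
as_rec_props_fill_all(&props, cf_malloc(size), set_name, set_name_len,
		key, key_size);

// props.p_data / props.size now describe the packed fields, ready to be
// stored with the record.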
diff --git a/as/include/base/scan.h b/as/include/base/scan.h
new file mode 100644
index 00000000..f71f3f32
--- /dev/null
+++ b/as/include/base/scan.h
@@ -0,0 +1,58 @@
+/*
+ * scan.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include <stdint.h>
+
+#include "dynbuf.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+struct as_mon_jobstat_s;
+struct as_namespace_s;
+struct as_transaction_s;
+
+
+//==========================================================
+// Public API.
+//
+
+void as_scan_init();
+int as_scan(struct as_transaction_s *tr, struct as_namespace_s *ns);
+void as_scan_limit_active_jobs(uint32_t max_active);
+void as_scan_limit_finished_jobs(uint32_t max_done);
+void as_scan_resize_thread_pool(uint32_t n_threads);
+int as_scan_get_active_job_count();
+int as_scan_list(char* name, cf_dyn_buf* db);
+struct as_mon_jobstat_s* as_scan_get_jobstat(uint64_t trid);
+struct as_mon_jobstat_s* as_scan_get_jobstat_all(int* size);
+int as_scan_abort(uint64_t trid);
+int as_scan_abort_all();
+int as_scan_change_job_priority(uint64_t trid, uint32_t priority);
diff --git a/as/include/base/secondary_index.h b/as/include/base/secondary_index.h
new file mode 100644
index 00000000..8fecf337
--- /dev/null
+++ b/as/include/base/secondary_index.h
@@ -0,0 +1,691 @@
+/*
+ * secondary_index.h
+ *
+ * Copyright (C) 2012-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * SYNOPSIS
+ * Abstraction to support secondary indexes with multiple implementations.
+ */
+
+#pragma once
+
+#include "base/datamodel.h"
+#include "base/monitor.h"
+#include "base/proto.h"
+#include "base/system_metadata.h"
+#include "base/transaction.h"
+#include "fabric/partition.h"
+
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_ll.h"
+
+#include "dynbuf.h"
+#include "hist.h"
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "storage/storage.h"
+
+
+/*
+ * HARD LIMIT ON SIZES
+ */
+// **************************************************************************************************
+#define AS_SINDEX_MAX_STRING_KSIZE 2048
+#define AS_SINDEX_MAX_GEOJSON_KSIZE (1024 * 1024)
+#define OLD_SINDEX_SMD_KEY_SIZE (AS_ID_INAME_SZ + AS_ID_NAMESPACE_SZ)
+#define SINDEX_SMD_KEY_SIZE (AS_ID_NAMESPACE_SZ + AS_SET_NAME_MAX_SIZE + AS_SINDEX_MAX_PATH_LENGTH + 1 + 2 + 2)
+#define SINDEX_SMD_VALUE_SIZE (AS_SMD_MAJORITY_CONSENSUS_KEYSIZE)
+#define OLD_SINDEX_MODULE "sindex_module"
+#define SINDEX_MODULE "sindex"
+#define AS_SINDEX_MAX_PATH_LENGTH 256
+#define AS_SINDEX_MAX_DEPTH 10
+#define AS_SINDEX_TYPE_STR_SIZE 20 // LIST / MAPKEYS / MAPVALUES / DEFAULT(NONE)
+#define AS_SINDEXDATA_STR_SIZE (AS_SINDEX_MAX_PATH_LENGTH + 1 + 8) // binpath + separator (,) + keytype (string/numeric)
+#define AS_INDEX_KEYS_ARRAY_QUEUE_HIGHWATER 512
+#define AS_INDEX_KEYS_PER_ARR 51
+// **************************************************************************************************
+
+/*
+ * Return status codes for index object functions.
+ *
+ * NB: When adding an error code, add its string to as_sindex_err_str
+ * in secondary_index.c.
+ *
+ * Negative codes below -10 are surfaced all the way to the client.
+ *
+ * Positive codes (< 10) are internal.
+ */
+// **************************************************************************************************
+typedef enum {
+	AS_SINDEX_ERR_INAME_MAXLEN = -17,
+	AS_SINDEX_ERR_MAXCOUNT = -16,
+	AS_SINDEX_ERR_SET_MISMATCH = -15,
+	AS_SINDEX_ERR_UNKNOWN_KEYTYPE = -14,
+	AS_SINDEX_ERR_BIN_NOTFOUND = -13,
+	AS_SINDEX_ERR_TYPE_MISMATCH = -11,
+
+	// Needed when attempting index create/query
+	AS_SINDEX_ERR_FOUND = -6,
+	AS_SINDEX_ERR_NOTFOUND = -5,
+	AS_SINDEX_ERR_NO_MEMORY = -4,
+	AS_SINDEX_ERR_PARAM = -3,
+	AS_SINDEX_ERR_NOT_READABLE = -2,
+	AS_SINDEX_ERR = -1,
+	AS_SINDEX_OK = 0,
+
+	// Internal - not needed outside
+	AS_SINDEX_CONTINUE = 1,
+	AS_SINDEX_DONE = 2,
+	// Needed when inserting object in the btree.
+	AS_SINDEX_KEY_FOUND = 3,
+	AS_SINDEX_KEY_NOTFOUND = 4
+} as_sindex_status;
+// **************************************************************************************************
+
+/*
+ * SINDEX OP TYPES.
+ */
+// **************************************************************************************************
+typedef enum {
+	AS_SINDEX_OP_UPDATE = 0,
+	AS_SINDEX_OP_DELETE = 1,
+	AS_SINDEX_OP_INSERT = 2,
+	AS_SINDEX_OP_READ = 3
+} as_sindex_op;
+// **************************************************************************************************
+
+/*
+ * SINDEX GC RETURN ENUMS
+ */
+// **************************************************************************************************
+typedef enum {
+	AS_SINDEX_GC_OK = 0,
+	AS_SINDEX_GC_ERROR = 1,
+	AS_SINDEX_GC_SKIP_ITERATION = 2
+} as_sindex_gc_status;
+// **************************************************************************************************
+
+/*
+ * SECONDARY INDEX KEY TYPES same as COL_TYPE*
+ */
+// **************************************************************************************************
+typedef uint8_t as_sindex_ktype;
+// **************************************************************************************************
+
+/*
+ * SINDEX TYPES.
+ * These must stay in sync with the clients.
+ * Do not change the order of this enum.
+ */
+// **************************************************************************************************
+typedef enum {
+	AS_SINDEX_ITYPE_DEFAULT = 0,
+	AS_SINDEX_ITYPE_LIST = 1,
+	AS_SINDEX_ITYPE_MAPKEYS = 2,
+	AS_SINDEX_ITYPE_MAPVALUES = 3,
+	AS_SINDEX_ITYPE_MAX = 4
+} as_sindex_type;
+#define AS_SINDEX_ITYPE_MAX_TO_STR_SZ 2
+// **************************************************************************************************
+
+/*
+ * STRUCTURES FROM ALCHEMY
+ */
+// *****************************
+struct btree;
+// **************************************************************************************************
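The client/internal split above matters at module boundaries: a typical pattern with the helper declared later in this header is to map any non-OK status into the client error space at the last moment. A sketch, where some_sindex_operation is hypothetical:

int rv = some_sindex_operation(); // returns an as_sindex_status value

if (rv != AS_SINDEX_OK) {
	// Translate to a client-visible protocol error code.
	return as_sindex_err_to_clienterr(rv, __FILE__, __LINE__);
}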
+
+/*
+ * STATS AND CONFIG STRUCTURE
+ * Stats are collected about memory utilization based on simple index
+ * overhead. Any insert or delete from the secondary index updates this
+ * number, and memory management has to use this info.
+ */
+// **************************************************************************************************
+typedef struct as_sindex_stat_s {
+	cf_atomic64        n_objects;
+	int                n_keys;
+	cf_atomic64        mem_used;
+
+	cf_atomic64        n_reads;
+	cf_atomic64        read_errs;
+
+	cf_atomic64        n_writes;
+	cf_atomic64        write_errs;
+	histogram *        _write_hist;         // Histogram to track time spent writing to the sindex
+	histogram *        _si_prep_hist;
+
+	cf_atomic64        n_deletes;
+	cf_atomic64        delete_errs;
+	histogram *        _delete_hist;        // Histogram to track time spent deleting from sindex
+
+	// Background thread stats
+	cf_atomic64        loadtime;
+	cf_atomic64        recs_pending;
+
+	cf_atomic64        n_defrag_records;
+	cf_atomic64        defrag_time;
+
+	// Query Stats
+	histogram *        _query_hist;         // Histogram to track query latency
+	histogram *        _query_batch_lookup; // Histogram to track latency of batch request from sindex tree.
+	histogram *        _query_batch_io;     // Histogram to track time spent doing I/O per batch
+	// --aggregation stats
+	cf_atomic64        n_aggregation;
+	cf_atomic64        agg_response_size;
+	cf_atomic64        agg_num_records;
+	cf_atomic64        agg_errs;
+	// --lookup stats
+	cf_atomic64        n_lookup;
+	cf_atomic64        lookup_response_size;
+	cf_atomic64        lookup_num_records;
+	cf_atomic64        lookup_errs;
+
+	histogram *        _query_rcnt_hist;    // Histogram to track record counts from queries
+	histogram *        _query_diff_hist;    // Histogram to track the false positives found by queries
+} as_sindex_stat;
+
+typedef struct as_sindex_config_s {
+	volatile uint16_t flag; // TODO change_name
+} as_sindex_config;
+
+// **************************************************************************************************
+
+
+/*
+ * SINDEX METADATA
+ */
+// **************************************************************************************************
+typedef struct as_sindex_physical_metadata_s {
+	pthread_rwlock_t slock;
+	struct btree *ibtr;
+} as_sindex_pmetadata;
+
+
+typedef struct as_sindex_path_s {
+	as_particle_type type; // MAP/LIST
+	union {
+		int index;         // For index of lists.
+		char * key_str;    // For string type keys in maps.
+		uint64_t key_int;  // For integer type keys in maps.
+	} value;
+	as_particle_type mapkey_type; // This could be either string or integer type
+} as_sindex_path;
+
+typedef struct as_sindex_metadata_s {
+	pthread_rwlock_t      slock;
+	// Protected by lock
+	as_sindex_pmetadata * pimd;
+	uint32_t              flag;
+
+	// Static Data. Does not need protection
+	struct as_sindex_s  * si;
+	char                * ns_name;
+	char                * set;
+	char                * iname;
+	char                * bname;
+	uint32_t              binid;  // Redundant info to aid search
+	as_sindex_ktype       sktype; // Same as Aerospike Index type
+	as_sindex_type        itype;
+	as_sindex_path        path[AS_SINDEX_MAX_DEPTH];
+	int                   path_length;
+	char                * path_str;
+	int                   nprts;  // Aerospike Index Number of Index partitions
+} as_sindex_metadata;
+
+/*
+ * This structure right now hangs from the namespace structure for the
+ * Aerospike Index B-tree.
+ */
+typedef struct as_sindex_s {
+	int simatch; // self, shash match by name
+	// Protected by SI_GWLOCK
+	uint8_t state;
+
+	// TODO : shift to imd
+	volatile uint16_t flag;
+	// No need to be volatile - slightly stale info about
+	// this is ok, and it is not checked in a busy loop.
+	bool enable_histogram; // default false
+
+	as_namespace *ns;
+
+	// Protected by si reference
+	struct as_sindex_metadata_s *imd;
+	struct as_sindex_metadata_s *recreate_imd;
+
+	as_sindex_stat stats;
+	as_sindex_config config;
+} as_sindex;
+
+// **************************************************************************************************
+/*
+ * SBINS STRUCTURES
+ */
+typedef struct sbin_value_pool_s {
+	uint32_t used_sz;
+	uint8_t *value;
+} sbin_value_pool;
+
+#define AS_SINDEX_VALUESZ_ON_STACK (16 * 1000)
+#define SINDEX_BINS_SETUP(skey_bin, size) \
+	sbin_value_pool value_pool; \
+	value_pool.value = alloca(AS_SINDEX_VALUESZ_ON_STACK); \
+	value_pool.used_sz = 0; \
+	as_sindex_bin skey_bin[(size)]; \
+	for (int id = 0; id < (size); id++) { \
+		skey_bin[id].si = NULL; \
+		skey_bin[id].stack_buf = &value_pool; \
+	}
+
+/*
+ * Used as structure to call into the secondary index's sindex_* interface.
+ * TODO: as_sindex_bin is not an appropriate name for this structure -
+ * maybe as_sindex_transaction.
+ */
+typedef struct as_sindex_bin_s {
+	union {                  // used if only one value is stored in the sbin -
+		int64_t int_val;     // accessing a value on the stack is much faster
+		cf_digest str_val;   // than accessing any other value
+	} value;
+	uint64_t num_values;
+	void * values;           // if there is more than one value in the sbin, points to them
+	as_particle_type type;   // the type of data which is going to get indexed (STRING or INTEGER)
+	as_sindex_op op;         // whether to delete or insert these values from/into the secondary index tree
+	bool to_free;            // whether the values are malloced
+	as_sindex * si;          // the si this bin is pointing to
+	sbin_value_pool * stack_buf;
+	uint32_t heap_capacity;
+} as_sindex_bin;
+
+// TODO: Reorganise this structure.
+// No need of union.
+typedef struct as_sindex_bin_data_s {
+	uint32_t id;
+	as_particle_type type; // this type is the citrusleaf type
+	// Union is to support sindex for other datatypes in future.
+	// Currently sindex is supported only for int64 and string.
+	union {
+		int64_t i64;
+	} u;
+	cf_digest digest;
+} as_sindex_bin_data;
+
+// Caution: Using this will waste 12 bytes per long type skey
+typedef struct as_sindex_key_s {
+	union {
+		cf_digest str_key;
+		uint64_t int_key;
+	} key;
+} as_sindex_key;
+// **************************************************************************************************
+
+
+// **************************************************************************************************
+
+/*
+ * STRUCTURES FOR QUERY MODULE
+ */
+// **************************************************************************************************
+struct ai_obj;
+typedef struct as_sindex_query_context_s {
+	uint64_t bsize;
+	cf_ll *recl;
+	uint64_t n_bdigs;
+
+	int range_index;
+
+	// Physical Tree offset
+	bool new_ibtr;   // If new tree
+	int pimd_idx;
+
+	// IBTR offset
+	bool nbtr_done;  // If nbtr was finished,
+	                 // next iteration starts
+	                 // from key next to bkey
+	struct ai_obj *bkey; // offset in ibtr
+
+	// NBTR offset
+	cf_digest bdig;
+
+	// If true all query-able partitions will be reserved before processing the query
+	bool partitions_pre_reserved;
+	// Cache information about query-able partitions
+	bool can_partition_query[AS_PARTITIONS];
+} as_sindex_qctx;
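Putting the sbin pieces together, a write-path sketch: SINDEX_BINS_SETUP stack-allocates the sbins and their value pool, the sbins are populated from a changed bin, applied, and any heap-spilled values freed. Here ns, set_name, b and keyd come from the surrounding record code, and the array size of 4 is an arbitrary choice for the sketch; the update/free helpers are declared later in this header.

// Illustrative only.
SINDEX_BINS_SETUP(sbins, 4);

int n = as_sindex_sbins_from_bin(ns, set_name, b, sbins, AS_SINDEX_OP_DELETE);

if (n > 0) {
	as_sindex_update_by_sbin(ns, set_name, sbins, n, &keyd);
	as_sindex_sbin_freeall(sbins, n);
}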
+
+/*
+ * The range structure used to define the lower and upper limits,
+ * along with the key types.
+ *
+ * [0, endl]
+ * [startl, -1(inf)]
+ * [startl, endl]
+ */
+typedef struct as_sindex_range_s {
+	uint8_t num_binval;
+	bool isrange;
+	as_sindex_bin_data start;
+	as_sindex_bin_data end;
+	as_sindex_type itype;
+	char bin_path[AS_SINDEX_MAX_PATH_LENGTH];
+	uint64_t cellid;      // target of regions-containing-point query
+	geo_region_t region;  // target of points-in-region query
+} as_sindex_range;
+
+/*
+ * sindex_keys are used by secondary index queries to validate the keys
+ * against the values of bins.
+ * All the jobs which run over these queries (e.g. aggregation) also use them.
+ */
+typedef struct as_index_keys_arr_s {
+	uint32_t num;
+	cf_digest pindex_digs[AS_INDEX_KEYS_PER_ARR];
+	as_sindex_key sindex_keys[AS_INDEX_KEYS_PER_ARR];
+} __attribute__ ((packed)) as_index_keys_arr;
+
+typedef struct as_index_keys_ll_element_s {
+	cf_ll_element ele;
+	as_index_keys_arr * keys_arr;
+} as_index_keys_ll_element;
+
+
+// **************************************************************************************************
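For orientation, the query-side declarations later in this header compose roughly as follows: parse the range from the message, resolve and reserve the matching sindex, then drive batches through the qctx. This is a hedged sketch of the flow, not the server's exact call sequence; msgp and set_name are placeholders, and the qctx field values are arbitrary.

// Illustrative only.
as_sindex_range range;

if (as_sindex_range_from_msg(ns, msgp, &range) == AS_SINDEX_OK) {
	as_sindex *si = as_sindex_from_range(ns, set_name, &range);

	if (si && as_sindex_can_query(si)) {
		AS_SINDEX_RESERVE(si);

		as_sindex_qctx qctx = { .bsize = 100, .new_ibtr = true };
		as_sindex_query(si, &range, &qctx); // batches digests into qctx
		                                    // (recl setup is query-module internal)

		AS_SINDEX_RELEASE(si);
	}
}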
+
+
+// APIs exposed to other modules
+// TODO - return values are actually enums.
+
+/*
+ * MODULE INIT AND SHUTDOWN
+ */
+// **************************************************************************************************
+
+/* Index abstraction layer functions. */
+/*
+ * Initialize an instantiation of the index abstraction layer
+ * using the array of index type-specific parameters passed in.
+ *
+ * All indexes created during this instantiation will use these type-specific
+ * parameters (e.g., maximum data structure sizes, allocation policies, and any
+ * other tuning parameters.)
+ *
+ * Call once before creating any type of index object.
+ */
+extern int as_sindex_init(as_namespace *ns);
+
+/*
+ * Terminate an instantiation of the index abstraction layer.
+ *
+ * Do not use any "sindex" functions after calling this function, so free your indexes beforehand.
+ */
+extern int as_sindex_reinit(char *name, char *params, cf_dyn_buf *db);
+// **************************************************************************************************
+
+/*
+ * INDEX BOOT
+ */
+// **************************************************************************************************
+extern int as_sindex_populate_done(as_sindex *si);
+extern int as_sindex_boot_populateall_done(as_namespace *ns);
+extern int as_sindex_boot_populateall();
+// **************************************************************************************************
+
+/*
+ * DDL AND METADATA QUERY
+ */
+// **************************************************************************************************
+extern int as_sindex_create(as_namespace *ns, as_sindex_metadata *imd);
+extern int as_sindex_destroy(as_namespace *ns, as_sindex_metadata *imd);
+extern int as_sindex_recreate(as_sindex_metadata *imd);
+extern void as_sindex_destroy_pmetadata(as_sindex *si);
+// **************************************************************************************************
+
+
+/*
+ * CREATION AND UPDATE OF SINDEX BIN
+ */
+// **************************************************************************************************
+extern int as_sindex_sbins_from_rd(as_storage_rd *rd, uint16_t from_bin, uint16_t to_bin,
+		as_sindex_bin sbins[], as_sindex_op op);
+extern int as_sindex_sbins_from_bin(as_namespace *ns, const char *set, const as_bin *b,
+		as_sindex_bin * start_sbin, as_sindex_op op);
+extern int as_sindex_update_by_sbin(as_namespace *ns, const char *set, as_sindex_bin *start_sbin,
+		int num_sbins, cf_digest * pkey);
+extern uint32_t as_sindex_sbins_populate(as_sindex_bin *sbins, as_namespace *ns, const char *set_name,
+		const as_bin *b_old, const as_bin *b_new);
+// **************************************************************************************************
+
+
+/*
+ * DMLs USING RECORDS
+ */
+// **************************************************************************************************
+int as_sindex_put_rd(as_sindex *si, as_storage_rd *rd);
+void as_sindex_putall_rd(as_namespace *ns, as_storage_rd *rd);
+// **************************************************************************************************
+
+
+/*
+ * UTILS
+ */
+// **************************************************************************************************
+extern int as_sindex_ns_has_sindex(as_namespace *ns);
+extern const char * as_sindex_err_str(int err_code);
+extern uint8_t as_sindex_err_to_clienterr(int err, char *fname, int lineno);
+extern bool as_sindex_isactive(as_sindex *si);
+extern int as_sindex_get_err(int op_code, char *filename, int lineno);
+extern as_sindex_status as_sindex__delete_from_set_binid_hash(as_namespace * ns,
+		as_sindex_metadata * imd);
+extern as_val * as_sindex_extract_val_from_path(as_sindex_metadata * imd, as_val * v);
+extern as_sindex_gc_status as_sindex_can_defrag_record(as_namespace *ns, cf_digest *keyd);
+extern as_sindex_status as_sindex_extract_bin_path(as_sindex_metadata * imd, char * path_str);
+int as_sindex_create_check_params(as_namespace* ns, as_sindex_metadata* imd);
+bool as_sindex_delete_checker(as_namespace *ns, as_sindex_metadata *imd);
+as_particle_type as_sindex_pktype(as_sindex_metadata * imd);
+extern const char * as_sindex_ktype_str(as_sindex_ktype type);
+extern as_sindex_ktype as_sindex_ktype_from_string(const char * type_str);
+int as_sindex_arr_lookup_by_set_binid_lockfree(as_namespace * ns,
+		const char *set, int binid, as_sindex ** si_arr);
+void
+
+
+/*
+ * UTILS
+ */
+// **************************************************************************************************
+extern int as_sindex_ns_has_sindex(as_namespace *ns);
+extern const char * as_sindex_err_str(int err_code);
+extern uint8_t as_sindex_err_to_clienterr(int err, char *fname, int lineno);
+extern bool as_sindex_isactive(as_sindex *si);
+extern int as_sindex_get_err(int op_code, char *filename, int lineno);
+extern as_sindex_status as_sindex__delete_from_set_binid_hash(as_namespace * ns,
+		as_sindex_metadata * imd);
+extern as_val * as_sindex_extract_val_from_path(as_sindex_metadata * imd, as_val * v);
+extern as_sindex_gc_status as_sindex_can_defrag_record(as_namespace *ns, cf_digest *keyd);
+extern as_sindex_status as_sindex_extract_bin_path(as_sindex_metadata * imd, char * path_str);
+int as_sindex_create_check_params(as_namespace* ns, as_sindex_metadata* imd);
+bool as_sindex_delete_checker(as_namespace *ns, as_sindex_metadata *imd);
+as_particle_type as_sindex_pktype(as_sindex_metadata * imd);
+extern const char * as_sindex_ktype_str(as_sindex_ktype type);
+extern as_sindex_ktype as_sindex_ktype_from_string(const char * type_str);
+int as_sindex_arr_lookup_by_set_binid_lockfree(as_namespace * ns,
+		const char *set, int binid, as_sindex ** si_arr);
+void as_sindex_delete_set(as_namespace * ns, char * set_name);
+// **************************************************************************************************
+
+/*
+ * INFO AND CONFIGS
+ */
+// **************************************************************************************************
+extern int as_sindex_list_str(as_namespace *ns, cf_dyn_buf *db);
+extern int as_sindex_stats_str(as_namespace *ns, char * iname, cf_dyn_buf *db);
+extern int as_sindex_set_config(as_namespace *ns, as_sindex_metadata *imd, char *params);
+extern void as_sindex_dump(char *nsname, char *iname, char *fname, bool verbose);
+extern void as_sindex_gconfig_default(struct as_config_s *c);
+extern int as_info_parse_params_to_sindex_imd(char* params, as_sindex_metadata *imd, cf_dyn_buf* db,
+		bool is_create, bool *is_smd_op, char * cmd);
+void as_sindex__config_default(as_sindex *si);
+void as_sindex_ticker_start(as_namespace * ns, as_sindex * si);
+void as_sindex_ticker(as_namespace * ns, as_sindex * si, uint64_t n_obj_scanned, uint64_t start_time);
+void as_sindex_ticker_done(as_namespace * ns, as_sindex * si, uint64_t start_time);
+// **************************************************************************************************
+
+/*
+ * HISTOGRAMS
+ */
+// **************************************************************************************************
+extern int as_sindex_histogram_enable(as_namespace *ns, char * iname, bool enable);
+extern int as_sindex_histogram_dumpall(as_namespace *ns);
+#define SINDEX_HIST_INSERT_DATA_POINT(si, type, start_time_ns) \
+do { \
+	if (si->enable_histogram && start_time_ns != 0) { \
+		if (si->stats._ ##type) { \
+			histogram_insert_data_point(si->stats._ ##type, start_time_ns); \
+		} \
+	} \
+} while(0);
+
+#define SINDEX_HIST_INSERT_RAW(si, type, value) \
+do { \
+	if (si->enable_histogram) { \
+		if (si->stats._ ##type) { \
+			histogram_insert_raw(si->stats._ ##type, value); \
+		} \
+	} \
+} while(0);
+
+
+// **************************************************************************************************
+
+/*
+ * UTILS FOR QUERIES
+ */
+// **************************************************************************************************
+extern int as_sindex_query(as_sindex *si, as_sindex_range *range, as_sindex_qctx *qctx);
+extern int as_sindex_range_free(as_sindex_range **srange);
+extern int as_sindex_rangep_from_msg(as_namespace *ns, as_msg *msgp, as_sindex_range **srange);
+extern int as_sindex_range_from_msg(as_namespace *ns, as_msg *msgp, as_sindex_range *srange);
+extern bool as_sindex_can_query(as_sindex *si);
+extern as_sindex * as_sindex_from_msg(as_namespace *ns, as_msg *msgp);
+extern as_sindex * as_sindex_from_range(as_namespace *ns, char *set, as_sindex_range *srange);
+extern int as_index_keys_reduce_fn(cf_ll_element *ele, void *udata);
+extern void as_index_keys_destroy_fn(cf_ll_element *ele);
+// **************************************************************************************************
+
+
+/*
+ * RESERVE, RELEASE AND FREE
+ */
+// **************************************************************************************************
+#define AS_SINDEX_RESERVE(si) \
+	as_sindex_reserve((si), __FILE__, __LINE__);
+#define AS_SINDEX_RELEASE(si) \
+	as_sindex_release((si), __FILE__, __LINE__);
+extern int as_sindex_reserve(as_sindex *si, char *fname, int lineno);
+extern void as_sindex_release(as_sindex *si, char *fname, int lineno);
+extern int as_sindex_imd_free(as_sindex_metadata *imd);
+extern int as_sindex_sbin_free(as_sindex_bin *sbin);
+extern int as_sindex_sbin_freeall(as_sindex_bin *sbin, int numval);
+void as_sindex_release_arr(as_sindex *si_arr[], int si_arr_sz);
+// **************************************************************************************************
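+/*
+ * A minimal usage sketch (not part of this patch) of the reserve/release
+ * macros above - the __FILE__/__LINE__ recorded by AS_SINDEX_RESERVE and
+ * AS_SINDEX_RELEASE make reference-count leaks traceable in the logs. How
+ * 'si' is obtained is illustrative; any lookup that does not itself reserve
+ * would do.
+ *
+ *     as_sindex *si = ...;       // e.g. from a lookup that does not reserve
+ *     AS_SINDEX_RESERVE(si);     // take a reference, recording file & line
+ *     // ... use si - e.g. as_sindex_put_rd(si, rd) ...
+ *     AS_SINDEX_RELEASE(si);     // drop the reference, recording file & line
+ */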
+
+/*
+ * SINDEX LOCKS
+ */
+// **************************************************************************************************
+extern pthread_rwlock_t g_sindex_rwlock;
+#define SINDEX_GRLOCK() \
+do { \
+	int ret = pthread_rwlock_rdlock(&g_sindex_rwlock); \
+	if (ret) cf_warning(AS_SINDEX, "GRLOCK(%d) %s:%d", ret, __FILE__, __LINE__); \
+} while (0);
+
+#define SINDEX_GWLOCK() \
+do { \
+	int ret = pthread_rwlock_wrlock(&g_sindex_rwlock); \
+	if (ret) cf_warning(AS_SINDEX, "GWLOCK(%d) %s:%d", ret, __FILE__, __LINE__); \
+} while (0);
+
+#define SINDEX_GRUNLOCK() \
+do { \
+	int ret = pthread_rwlock_unlock(&g_sindex_rwlock); \
+	if (ret) cf_warning(AS_SINDEX, "GRUNLOCK (%d) %s:%d", ret, __FILE__, __LINE__); \
+} while (0);
+
+#define SINDEX_GWUNLOCK() \
+do { \
+	int ret = pthread_rwlock_unlock(&g_sindex_rwlock); \
+	if (ret) cf_warning(AS_SINDEX, "GWUNLOCK (%d) %s:%d", ret, __FILE__, __LINE__); \
+} while (0);
+
+#define PIMD_RLOCK(l) \
+do { \
+	int ret = pthread_rwlock_rdlock((l)); \
+	if (ret) cf_warning(AS_SINDEX, "RLOCK_ONLY (%d) %s:%d", ret, __FILE__, __LINE__); \
+} while(0);
+
+#define PIMD_WLOCK(l) \
+do { \
+	int ret = pthread_rwlock_wrlock((l)); \
+	if (ret) cf_warning(AS_SINDEX, "WLOCK_ONLY (%d) %s:%d", ret, __FILE__, __LINE__); \
+} while(0);
+
+#define PIMD_RUNLOCK(l) \
+do { \
+	int ret = pthread_rwlock_unlock((l)); \
+	if (ret) cf_warning(AS_SINDEX, "RUNLOCK_ONLY (%d) %s:%d", ret, __FILE__, __LINE__); \
+} while(0);
+
+#define PIMD_WUNLOCK(l) \
+do { \
+	int ret = pthread_rwlock_unlock((l)); \
+	if (ret) cf_warning(AS_SINDEX, "WUNLOCK_ONLY (%d) %s:%d", ret, __FILE__, __LINE__); \
+} while(0);
+
+// **************************************************************************************************
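+/*
+ * A minimal locking sketch (not part of this patch), assuming the common
+ * read-path pattern: take the global sindex read lock for the lookup,
+ * reserve the index so it survives the unlock, then drop the lock before
+ * any long-running work.
+ *
+ *     SINDEX_GRLOCK();
+ *     as_sindex *si = ...;       // look up the sindex under the read lock
+ *     AS_SINDEX_RESERVE(si);     // pin it across the unlock
+ *     SINDEX_GRUNLOCK();
+ *     // ... long-running work against si ...
+ *     AS_SINDEX_RELEASE(si);
+ */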
+
+/*
+ * APIs for SMD
+ */
+// **************************************************************************************************
+extern void as_sindex_init_smd();
+extern void as_sindex_imd_to_smd_key(const as_sindex_metadata *imd, char *smd_key);
+extern bool as_sindex_delete_imd_to_smd_key(as_namespace *ns, as_sindex_metadata *imd, char *smd_key);
+extern int as_sindex_smd_accept_cb(char *module, as_smd_item_list_t *items, void *udata,
+		uint32_t accept_opt);
+// **************************************************************************************************
+
+/*
+ * QUERY MACROS
+ */
+// **************************************************************************************************
+#define AS_QUERY_OK AS_SINDEX_OK
+#define AS_QUERY_ERR AS_SINDEX_ERR
+#define AS_QUERY_CONTINUE AS_SINDEX_CONTINUE
+#define AS_QUERY_DONE AS_SINDEX_DONE
+// **************************************************************************************************
+
+/*
+ * QUERY APIs exposed to other modules
+ */
+// **************************************************************************************************
+extern void as_query_init();
+extern int as_query(as_transaction *tr, as_namespace *ns);
+extern int as_query_reinit(int set_size, int *actual_size);
+extern int as_query_worker_reinit(int set_size, int *actual_size);
+extern int as_query_list(char *name, cf_dyn_buf *db);
+extern int as_query_kill(uint64_t trid);
+extern void as_query_gconfig_default(struct as_config_s *c);
+extern as_mon_jobstat * as_query_get_jobstat(uint64_t trid);
+extern as_mon_jobstat * as_query_get_jobstat_all(int * size);
+extern int as_query_set_priority(uint64_t trid, uint32_t priority);
+extern void as_query_histogram_dumpall();
+extern as_index_keys_arr * as_index_get_keys_arr();
+extern void as_index_keys_release_arr_to_queue(as_index_keys_arr *v);
+extern int as_index_keys_ll_reduce_fn(cf_ll_element *ele, void *udata);
+extern void as_index_keys_ll_destroy_fn(cf_ll_element *ele);
+
+extern cf_atomic32 g_query_short_running;
+extern cf_atomic32 g_query_long_running;
+// **************************************************************************************************
diff --git a/as/include/base/security.h b/as/include/base/security.h
new file mode 100644
index 00000000..c34fe07b
--- /dev/null
+++ b/as/include/base/security.h
@@ -0,0 +1,106 @@
+/*
+ * security.h
+ *
+ * Copyright (C) 2014-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include <stdbool.h>
+#include <stdint.h>
+
+
+//==========================================================
+// Forward declarations.
+//
+
+struct as_file_handle_s;
+struct as_namespace_s;
+struct as_transaction_s;
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+// Security permissions.
+typedef enum {
+	PERM_NONE = 0,
+
+	// Data transactions.
+	PERM_READ = 0x0001,
+	PERM_SCAN = 0x0002,
+	PERM_QUERY = 0x0004,
+	PERM_WRITE = 0x0008,
+	PERM_DELETE = 0x0010,
+	PERM_UDF_APPLY = 0x0020,
+	PERM_UDF_SCAN = 0x0040,
+	PERM_UDF_QUERY = 0x0080,
+	// ... 8 unused bits ...
+
+	// Data transactions' system metadata management.
+	PERM_INDEX_MANAGE = 0x00010000,
+	PERM_UDF_MANAGE = 0x00020000,
+	PERM_SCAN_MANAGE = 0x00040000,
+	PERM_QUERY_MANAGE = 0x00080000,
+	PERM_JOB_MONITOR = 0x00100000,
+	PERM_TRUNCATE = 0x00200000,
+	// ... 2 unused bits ...
+
+	// Deployment operations management.
+	PERM_SET_CONFIG = 0x01000000,
+	PERM_LOGGING_CTRL = 0x02000000,
+	PERM_SERVICE_CTRL = 0x04000000,
+
+	// Database users and roles management.
+	PERM_USER_ADMIN = 0x100000000000
+} as_sec_perm;
+
+// Current security message version.
+#define AS_SEC_MSG_SCHEME 0
+
+// Security protocol message container.
+typedef struct as_sec_msg_s {
+	uint8_t scheme;   // security scheme/version
+	uint8_t result;   // result code (only for responses, except MORE)
+	uint8_t command;  // security command (only for requests)
+	uint8_t n_fields; // number of fields in this message
+
+	uint8_t unused[12]; // reserved bytes to round as_sec_msg size up to 16 bytes
+
+	uint8_t fields[]; // the fields (name/value pairs)
+} __attribute__ ((__packed__)) as_sec_msg;
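+/*
+ * Illustrative only (not part of this patch): the unused[12] padding is what
+ * makes the fixed header land on 16 bytes - four one-byte fields plus twelve
+ * reserved bytes, with the flexible fields[] array contributing nothing to
+ * sizeof. A build-time check, assuming C11 _Static_assert is available:
+ *
+ *     _Static_assert(sizeof(as_sec_msg) == 16, "as_sec_msg must be 16 bytes");
+ */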
+
+
+//==========================================================
+// Public API.
+//
+
+void as_security_init();
+uint8_t as_security_check(const struct as_file_handle_s* fd_h, as_sec_perm perm);
+bool as_security_check_data_op(struct as_transaction_s* tr, struct as_namespace_s* ns, as_sec_perm perm);
+void* as_security_filter_create();
+void as_security_filter_destroy(void* pv_filter);
+void as_security_log(const struct as_file_handle_s* fd_h, uint8_t result, as_sec_perm perm, const char* action, const char* detail);
+void as_security_refresh(struct as_file_handle_s* fd_h);
+void as_security_transact(struct as_transaction_s* tr);
diff --git a/as/include/base/security_config.h b/as/include/base/security_config.h
new file mode 100644
index 00000000..6a9bae65
--- /dev/null
+++ b/as/include/base/security_config.h
@@ -0,0 +1,78 @@
+/*
+ * security_config.h
+ *
+ * Copyright (C) 2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+// Syslog "local" facilities.
+typedef enum {
+	AS_SYSLOG_NONE = -1,
+	AS_SYSLOG_MIN = 0,
+	AS_SYSLOG_MAX = 7,
+
+	// May configure any facility from "local0" to "local7".
+	AS_SYSLOG_LOCAL0 = 0,
+	AS_SYSLOG_LOCAL1 = 1,
+	AS_SYSLOG_LOCAL2 = 2,
+	AS_SYSLOG_LOCAL3 = 3,
+	AS_SYSLOG_LOCAL4 = 4,
+	AS_SYSLOG_LOCAL5 = 5,
+	AS_SYSLOG_LOCAL6 = 6,
+	AS_SYSLOG_LOCAL7 = 7,
+} as_sec_syslog_local;
+
+// Security-related reporting sink bit-field flags.
+#define AS_SEC_SINK_LOG 0x1
+#define AS_SEC_SINK_SYSLOG 0x2
+
+// Security-related reporting sinks as bit-fields.
+typedef struct as_sec_report_s {
+	uint32_t authentication;
+	uint32_t data_op;
+	uint32_t sys_admin;
+	uint32_t user_admin;
+	uint32_t violation;
+} as_sec_report;
+
+// Security configuration.
+typedef struct as_sec_config_s {
+	bool security_enabled;
+	uint32_t privilege_refresh_period; // (seconds)
+	as_sec_report report;              // reporting sinks
+	as_sec_syslog_local syslog_local;  // syslog local facility
+} as_sec_config;
+
+
+//==========================================================
+// Public API.
+//
+
+void as_security_config_check();
+void as_security_config_log_scope(uint32_t sink, const char* ns_name,
+		const char* set_name);
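+/*
+ * A small configuration sketch (not part of this patch): each as_sec_report
+ * field is a bit-field of sinks, so an event class can be routed to the
+ * server log, to syslog, to both, or to neither. For example, to send
+ * violations to both sinks but user-admin events only to the log:
+ *
+ *     as_sec_config cfg = { .security_enabled = true };
+ *     cfg.report.violation = AS_SEC_SINK_LOG | AS_SEC_SINK_SYSLOG;
+ *     cfg.report.user_admin = AS_SEC_SINK_LOG;
+ *     cfg.syslog_local = AS_SYSLOG_LOCAL0;
+ */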
diff --git a/as/include/base/stats.h b/as/include/base/stats.h
new file mode 100644
index 00000000..5605e536
--- /dev/null
+++ b/as/include/base/stats.h
@@ -0,0 +1,129 @@
+/*
+ * stats.h
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include <stdint.h>
+
+#include "citrusleaf/cf_atomic.h"
+
+#include "hist.h"
+
+#include "fabric/fabric.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+typedef struct as_stats_s {
+
+	// Connection stats.
+	cf_atomic64 proto_connections_opened; // not just a statistic
+	cf_atomic64 proto_connections_closed; // not just a statistic
+	// In ticker but not collected via info:
+	cf_atomic64 heartbeat_connections_opened;
+	cf_atomic64 heartbeat_connections_closed;
+	cf_atomic64 fabric_connections_opened;
+	cf_atomic64 fabric_connections_closed;
+
+	// Heartbeat stats.
+	cf_atomic64 heartbeat_received_self;
+	cf_atomic64 heartbeat_received_foreign;
+
+	// Demarshal stats.
+	uint64_t reaper_count; // not in ticker - incremented only in reaper thread
+
+	// Info stats.
+	cf_atomic64 info_complete;
+
+	// Early transaction errors.
+	cf_atomic64 n_demarshal_error;
+	cf_atomic64 n_tsvc_client_error;
+	cf_atomic64 n_tsvc_batch_sub_error;
+	cf_atomic64 n_tsvc_udf_sub_error;
+
+	// Batch-index stats.
+	cf_atomic64 batch_index_initiate; // not in ticker - not just a statistic
+	cf_atomic64 batch_index_complete;
+	cf_atomic64 batch_index_errors;
+	cf_atomic64 batch_index_timeout;
+
+	// Batch-index stats.
+	cf_atomic64 batch_index_huge_buffers; // not in ticker
+	cf_atomic64 batch_index_created_buffers; // not in ticker
+	cf_atomic64 batch_index_destroyed_buffers; // not in ticker
+
+	// "Old" batch stats.
+	cf_atomic64 batch_initiate; // not in ticker
+	cf_atomic64 batch_errors; // not in ticker
+	cf_atomic64 batch_timeout; // not in ticker
+
+	// Query & secondary index stats.
+	cf_atomic64 query_false_positives;
+	cf_atomic64 sindex_gc_timedout; // number of times sindex gc iteration timed out waiting for partition lock
+	uint64_t sindex_gc_list_creation_time; // cumulative sum of list creation phase in sindex gc
+	uint64_t sindex_gc_list_deletion_time; // cumulative sum of list deletion phase in sindex gc
+	uint64_t sindex_gc_objects_validated; // cumulative sum of sindex objects validated
+	uint64_t sindex_gc_garbage_found; // amount of garbage found during list creation phase
+	uint64_t sindex_gc_garbage_cleaned; // amount of garbage deleted during list deletion phase
+
+	// Fabric stats.
+	uint64_t fabric_bulk_s_rate;
+	uint64_t fabric_bulk_r_rate;
+	uint64_t fabric_ctrl_s_rate;
+	uint64_t fabric_ctrl_r_rate;
+	uint64_t fabric_meta_s_rate;
+	uint64_t fabric_meta_r_rate;
+	uint64_t fabric_rw_s_rate;
+	uint64_t fabric_rw_r_rate;
+
+	//--------------------------------------------
+	// Histograms.
+	//
+
+	histogram* batch_index_hist;
+	bool batch_index_hist_active; // automatically activated
+
+	histogram* info_hist;
+
+	histogram* svc_demarshal_hist;
+	histogram* svc_queue_hist;
+
+	histogram* fabric_send_init_hists[AS_FABRIC_N_CHANNELS];
+	histogram* fabric_send_fragment_hists[AS_FABRIC_N_CHANNELS];
+	histogram* fabric_recv_fragment_hists[AS_FABRIC_N_CHANNELS];
+	histogram* fabric_recv_cb_hists[AS_FABRIC_N_CHANNELS];
+
+} as_stats;
+
+
+//==========================================================
+// Public API.
+//
+
+// For now this is in thr_info.c, until a separate .c file is worth it.
+extern as_stats g_stats;
diff --git a/as/include/base/system_metadata.h b/as/include/base/system_metadata.h
new file mode 100644
index 00000000..6b398cd9
--- /dev/null
+++ b/as/include/base/system_metadata.h
@@ -0,0 +1,236 @@
+/*
+ * system_metadata.h
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+/*
+ * SYNOPSIS
+ * The System Metadata module provides a mechanism for synchronizing
+ * module metadata cluster-wide. While each module is responsible
+ * for the interpretation of its own metadata, the System Metadata
+ * module provides persistence and automatic distribution of changes
+ * to that opaque metadata.
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "node.h"
+
+
+/* Declare Public System Metadata Types */
+
+
+/*
+ * Type for actions to perform upon metadata items.
+ */
+typedef enum as_smd_action_e {
+	AS_SMD_ACTION_SET,   // Add or modify this metadata item
+	AS_SMD_ACTION_DELETE // Delete this metadata item
+} as_smd_action_t;
+
+/*
+ * Type for an item of metadata.
+ */
+typedef struct as_smd_item_s {
+	cf_node node_id;        // Originating node ID
+	as_smd_action_t action; // Action to perform on this metadata item
+	char *module_name;      // Module name of the item
+	char *key;              // Key of the metadata item
+	char *value;            // Value of the metadata item
+	uint32_t generation;    // Metadata generation counter
+	uint64_t timestamp;     // Time metadata last modified
+} as_smd_item_t;
+
+/*
+ * Type for a list of metadata items for a particular node.
+ */
+typedef struct as_smd_item_list_s {
+	size_t num_items;      // Number of metadata items
+	as_smd_item_t *item[]; // Array of pointers to metadata items
+} as_smd_item_list_t;
+
+/*
+ * Opaque type representing the state of the System Metadata module.
+ */
+typedef struct as_smd_s as_smd_t;
+
+/*
+ * SMD is a singleton, though many class methods are passed an object pointer.
+ */
+extern as_smd_t *g_smd;
+
+/*
+ * Type for mutually-disjoint flag values passed by SMD to the module's accept callback
+ * via the "accept_opt" argument specifying the originator of the operation.
+ */
+typedef enum as_smd_accept_option_e {
+	AS_SMD_ACCEPT_OPT_CREATE = (1 << 0), // Module creation-time accept event
+	AS_SMD_ACCEPT_OPT_MERGE = (1 << 1),  // Post-cluster state change merge
+	AS_SMD_ACCEPT_OPT_API = (1 << 2)     // User-initiated set/delete metadata via SMD API
+} as_smd_accept_option_t;
+
+/*
+ * Size of the key to be used during a majority consensus merge operation.
+ * (Ideally this would be a module-supplied parameter rather than a constant.)
+ */
+#define AS_SMD_MAJORITY_CONSENSUS_KEYSIZE (1024)
+
+
+/* Callback Function Types. */
+
+
+/*
+ * Callback function type for getting metadata items.
+ */
+typedef int (*as_smd_get_cb)(char *module, as_smd_item_list_t *items, void *udata);
+
+/*
+ * Callback function type for metadata merge policy functions.
+ * Resolve action executed on Paxos principal node to determine the cluster-wide "truth."
+ * Default merge policy: union
+ * Alternative merge policies: highest generation, latest timestamp
+ * Configurable via registering a per-module callback function.
+ */
+typedef int (*as_smd_merge_cb)(const char *module, as_smd_item_list_t **item_list_out, as_smd_item_list_t **item_lists_in, size_t num_lists, void *udata);
+
+/*
+ * Callback function type for metadata merge item conflict resolution functions.
+ * Use only if not using a custom as_smd_merge_cb.
+ * Default item conflict resolution picks the greater SMD generation/timestamp.
+ * Configurable via registering a per-module callback function.
+ * Return true to choose existing_item, false to choose new_item.
+ */
+typedef bool (*as_smd_conflict_cb)(char *module, as_smd_item_t *existing_item, as_smd_item_t *new_item, void *udata);
+
+/*
+ * Callback function type for metadata acceptance policy functions.
+ * The accept callback is executed to commit a metadata change, with
+ * the accept option specifying the originator of the accept action as follows:
+ * 1). OPT_CREATE: When a module has been created and its persisted metadata has been restored.
+ * 2). OPT_MERGE: When all cluster nodes receive and accept the truth from the Paxos principal.
+ * 3). OPT_API: When metadata is set via the API or restored from persistence, handled locally
+ *     prior to cluster formation, otherwise proxied via the Paxos principal.
+ * Configurable via registering a per-module callback function.
+ */
+typedef int (*as_smd_accept_cb)(char *module, as_smd_item_list_t *items, void *udata, uint32_t accept_opt);
+
+/*
+ * Callback function type for metadata acceptance pre-check policy function.
+ * When a user-initiated metadata change operation is requested via the SMD API,
+ * the validity of operation and arguments is first checked on the Paxos principal
+ * to decide whether this operation should be sent to all cluster nodes.
+ * Configurable via registering a per-module callback function.
+ */
+typedef int (*as_smd_can_accept_cb)(char* module, as_smd_item_t *item, void *udata);
+
+
+/* Constructor and destructor functions for metadata item list objects passed to/from the callback functions. */
+
+
+/*
+ * Create an empty list of reference-counted metadata items.
+ */
+as_smd_item_list_t *as_smd_item_list_create(size_t num_items);
+
+/*
+ * Release a list of reference-counted metadata items.
+ */
+void as_smd_item_list_destroy(as_smd_item_list_t *items);
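+/*
+ * A minimal conflict-resolution callback sketch (not part of this patch),
+ * assuming the default-style policy described above: keep whichever item has
+ * the higher generation, breaking ties on timestamp. Return true to keep
+ * existing_item, false to take new_item.
+ *
+ *     static bool
+ *     my_conflict_cb(char *module, as_smd_item_t *existing_item,
+ *             as_smd_item_t *new_item, void *udata)
+ *     {
+ *         if (existing_item->generation != new_item->generation) {
+ *             return existing_item->generation > new_item->generation;
+ *         }
+ *         return existing_item->timestamp >= new_item->timestamp;
+ *     }
+ */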
+
+
+/* System Metadata Module Startup / Shutdown */
+
+
+/*
+ * Initialize the single global System Metadata module.
+ */
+as_smd_t *as_smd_init(void);
+
+/*
+ * Start the System Metadata module to begin receiving Paxos state change events.
+ */
+int as_smd_start(as_smd_t *smd);
+
+/*
+ * Terminate the System Metadata module.
+ */
+int as_smd_shutdown(as_smd_t *smd);
+
+
+/* Metadata Manipulation */
+
+
+/*
+ * Create a container for the named module's metadata and register the policy callback functions.
+ * (Pass a NULL callback function pointer to select the default policy.)
+ */
+int as_smd_create_module(char *module,
+		as_smd_merge_cb merge_cb, void *merge_udata,
+		as_smd_conflict_cb conflict_cb, void *conflict_udata,
+		as_smd_accept_cb accept_cb, void *accept_udata,
+		as_smd_can_accept_cb can_accept_cb, void *can_accept_udata);
+
+/*
+ * Destroy the container for the named module's metadata, releasing all of its metadata.
+ */
+int as_smd_destroy_module(char *module);
+
+/*
+ * Add a new, or modify an existing, metadata item in an existing module.
+ */
+int as_smd_set_metadata(char *module, char *key, char *value);
+
+/*
+ * Delete an existing metadata item from an existing module.
+ */
+int as_smd_delete_metadata(char *module, char *key);
+
+/*
+ * Retrieve metadata item(s). (Pass NULL for module and/or key for "all".)
+ */
+int as_smd_get_metadata(char *module, char *key, as_smd_get_cb cb, void *udata);
+
+
+/* Info Command Functions */
+
+
+/*
+ * Print info about the System Metadata state to the log.
+ * (Verbose true prints detailed info about the metadata values.)
+ */
+void as_smd_dump(bool verbose);
+
+/*
+ * Manipulate the System Metadata and log the result.
+ */
+void as_smd_info_cmd(char *cmd, cf_node node_id, char *module, char *key, char *value);
+
+
+/* Pre-Defined Callback Policy Functions. */
+
+
+/*
+ * Merge callback function implementing the majority consensus merge policy.
+ */
+int as_smd_majority_consensus_merge(const char *module, as_smd_item_list_t **item_list_out,
+		as_smd_item_list_t **item_lists_in, size_t num_lists, void *udata);
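+/*
+ * A minimal end-to-end usage sketch (not part of this patch) - register a
+ * module with default merge/accept policies plus the custom conflict callback
+ * sketched above, then publish one metadata item. The module name, key and
+ * value are illustrative, not from this codebase:
+ *
+ *     as_smd_create_module("my-module",
+ *             NULL, NULL,           // default merge policy (union)
+ *             my_conflict_cb, NULL, // custom conflict resolution
+ *             NULL, NULL,           // default accept policy
+ *             NULL, NULL);          // no acceptance pre-check
+ *     as_smd_set_metadata("my-module", "some-key", "some-value");
+ */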
diff --git a/as/include/base/thr_batch.h b/as/include/base/thr_batch.h
new file mode 100644
index 00000000..b80056a5
--- /dev/null
+++ b/as/include/base/thr_batch.h
@@ -0,0 +1,31 @@
+/*
+ * thr_batch.h
+ *
+ * Copyright (C) 2008-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include "base/datamodel.h"
+#include "base/transaction.h"
+
+int as_batch_direct_init();
+int as_batch_direct_queue_task(as_transaction* tr, as_namespace *ns);
+int as_batch_direct_queue_size();
+int as_batch_direct_threads_resize(uint32_t threads);
diff --git a/as/include/base/thr_demarshal.h b/as/include/base/thr_demarshal.h
new file mode 100644
index 00000000..a94dd879
--- /dev/null
+++ b/as/include/base/thr_demarshal.h
@@ -0,0 +1,46 @@
+/*
+ * thr_demarshal.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include "socket.h"
+#include "tls.h"
+#include "base/cfg.h"
+#include "base/transaction.h"
+
+typedef struct as_info_endpoint_s {
+	cf_addr_list addrs;
+	cf_ip_port port;
+} as_info_endpoint;
+
+typedef struct as_info_access_s {
+	as_info_endpoint service;
+	as_info_endpoint alt_service;
+	as_info_endpoint tls_service;
+	as_info_endpoint alt_tls_service;
+} as_info_access;
+
+extern as_info_access g_access;
+extern cf_serv_cfg g_service_bind;
+extern cf_tls_info *g_service_tls;
+
+void thr_demarshal_rearm(as_file_handle *fd_h);
diff --git a/as/include/base/thr_info.h b/as/include/base/thr_info.h
new file mode 100644
index 00000000..cfe23370
--- /dev/null
+++ b/as/include/base/thr_info.h
@@ -0,0 +1,88 @@
+/*
+ * thr_info.h
+ *
+ * Copyright (C) 2008-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "dynbuf.h"
+
+#include "base/proto.h"
+#include "base/security.h"
+#include "base/transaction.h"
+
+typedef int (*as_info_get_tree_fn) (char *name, char *subtree, cf_dyn_buf *db);
+typedef int (*as_info_get_value_fn) (char *name, cf_dyn_buf *db);
+typedef int (*as_info_command_fn) (char *name, char *parameters, cf_dyn_buf *db);
+
+// Sets a static value - set to 0 to remove a previous value.
+extern int as_info_set_buf(const char *name, const uint8_t *value, size_t value_sz, bool def);
+extern int as_info_set(const char *name, const char *value, bool def);
+
+// For dynamic items - you will get called when the name is requested. The
+// dynbuf will be fully set up for you - just add the information you want to
+// return.
+extern int as_info_set_dynamic(char *name, as_info_get_value_fn gv_fn, bool def);
+
+// For tree items - you will get called when the name is requested, and it will
+// have the name you registered (name) and the subtree portion (value). The
+// dynbuf will be fully set up for you - just add the information you want to
+// return.
+extern int as_info_set_tree(char *name, as_info_get_tree_fn gv_fn);
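+/*
+ * A minimal dynamic-item sketch (not part of this patch) - the handler name
+ * and info item name are illustrative. The registered function just appends
+ * its value to the prepared dynbuf:
+ *
+ *     static int
+ *     my_uptime_value(char *name, cf_dyn_buf *db)
+ *     {
+ *         cf_dyn_buf_append_uint64(db, cf_getms() - g_start_ms);
+ *         return 0;
+ *     }
+ *
+ *     // at init: as_info_set_dynamic("my-uptime", my_uptime_value, false);
+ */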
+
+// For commands - you will be called with the parameters.
+extern int as_info_set_command(char *name, as_info_command_fn command_fn, as_sec_perm required_perm);
+
+int as_info_parameter_get(char *param_str, char *param, char *value, int *value_len);
+
+typedef struct as_info_transaction_s {
+	as_file_handle *fd_h;
+	as_proto *proto;
+	uint64_t start_time;
+} as_info_transaction;
+
+// Processes an info request that comes in from the network, sends the response.
+extern void as_info(as_info_transaction *it);
+
+// Processes a pure buffer request without any info header stuff.
+extern int as_info_buffer(uint8_t *req_buf, size_t req_buf_len, cf_dyn_buf *rsp);
+
+// The info unit uses the fabric to communicate with the other members of the
+// cluster so it needs to register for different messages and create listener
+// threads, etc.
+extern int as_info_init();
+
+// Needed by heartbeat:
+
+char *as_info_bind_to_string(const cf_serv_cfg *cfg, cf_sock_owner owner);
+
+// Needed by ticker:
+
+int as_info_queue_get_size();
+void info_log_with_datestamp(void (*log_fn)(void));
+
+extern bool g_mstats_enabled;
+
+// Needed by main():
+extern uint64_t g_start_ms;
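+/*
+ * A minimal command sketch (not part of this patch) - the command name,
+ * handler, and required permission are illustrative. A command handler
+ * receives its raw parameter string, writes a response into the dynbuf,
+ * and is gated by the as_sec_perm passed at registration:
+ *
+ *     static int
+ *     my_ping_command(char *name, char *params, cf_dyn_buf *db)
+ *     {
+ *         cf_dyn_buf_append_string(db, "ok");
+ *         return 0;
+ *     }
+ *
+ *     // at init: as_info_set_command("my-ping", my_ping_command, PERM_NONE);
+ */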
diff --git a/as/include/base/thr_info_port.h b/as/include/base/thr_info_port.h
new file mode 100644
index 00000000..97a40235
--- /dev/null
+++ b/as/include/base/thr_info_port.h
@@ -0,0 +1,30 @@
+/*
+ * thr_info_port.h
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include "socket.h"
+
+void as_info_port_start();
+
+extern cf_serv_cfg g_info_bind;
+extern cf_ip_port g_info_port;
diff --git a/as/include/base/thr_query.h b/as/include/base/thr_query.h
new file mode 100644
index 00000000..8c21114c
--- /dev/null
+++ b/as/include/base/thr_query.h
@@ -0,0 +1,42 @@
+/*
+ * thr_query.h
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * QUERY Engine Defaults
+ */
+// **************************************************************************************************
+#define QUERY_BATCH_SIZE 100
+#define AS_MAX_NUM_SCRIPT_PARAMS 10
+#define AS_QUERY_BUF_SIZE (1024 * 1024 * 2) // at least 2 MB
+#define AS_QUERY_MAX_BUFS 256 // that makes it 512 MB max in steady state
+#define AS_QUERY_MAX_QREQ 1024 // this is 4 KB
+#define AS_QUERY_MAX_QTR_POOL 128 // they are 4 MB+ each ...
+#define AS_QUERY_MAX_THREADS 32
+#define AS_QUERY_MAX_WORKER_THREADS (15 * AS_QUERY_MAX_THREADS)
+#define AS_QUERY_MAX_QREQ_INFLIGHT 100 // worker queue capping per query
+#define AS_QUERY_MAX_QUERY 500 // 32 MB - a little generous for now
+#define AS_QUERY_MAX_SHORT_QUEUE_SZ 500 // maximum 500 outstanding short-running queries
+#define AS_QUERY_MAX_LONG_QUEUE_SZ 500 // maximum 500 outstanding long-running queries
+#define AS_QUERY_MAX_UDF_TRANSACTIONS 20 // the higher the value, the more aggressive it will be
+#define AS_QUERY_UNTRACKED_TIME 1000 // (milliseconds) 1 sec
+#define AS_QUERY_WAIT_MAX_TRAN_US 1000
+// **************************************************************************************************
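+/*
+ * Worked numbers (not part of this patch), from the defaults above: each
+ * query buffer is AS_QUERY_BUF_SIZE = 2 MB, and up to AS_QUERY_MAX_BUFS = 256
+ * of them may be pooled, so the steady-state buffer pool tops out at
+ * 2 MB * 256 = 512 MB - which is what the AS_QUERY_MAX_BUFS comment means.
+ */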
diff --git a/as/include/base/thr_sindex.h b/as/include/base/thr_sindex.h
new file mode 100644
index 00000000..c95d0819
--- /dev/null
+++ b/as/include/base/thr_sindex.h
@@ -0,0 +1,78 @@
+/*
+ * thr_sindex.h
+ *
+ * Copyright (C) 2013-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * secondary index function declarations
+ */
+
+#pragma once
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "ai_obj.h"
+#include "dynbuf.h"
+#include "hist.h"
+
+#include "base/datamodel.h"
+#include "base/monitor.h"
+
+#define SINDEX_GC_QUEUE_HIGHWATER 10
+#define SINDEX_GC_NUM_OBJS_PER_ARR 20
+
+typedef struct acol_digest_t {
+	cf_digest dig;
+	ai_obj acol;
+} acol_digest;
+
+typedef struct objs_to_defrag_arr_t {
+	acol_digest acol_digs[SINDEX_GC_NUM_OBJS_PER_ARR];
+	uint32_t num;
+} objs_to_defrag_arr;
+
+typedef struct ll_sindex_gc_element_s {
+	cf_ll_element ele;
+	objs_to_defrag_arr * objs_to_defrag;
+} ll_sindex_gc_element;
+
+extern pthread_rwlock_t sindex_rwlock;
+extern cf_queue *g_sindex_populate_q;
+extern cf_queue *g_sindex_destroy_q;
+extern cf_queue *g_sindex_populateall_done_q;
+extern bool g_sindex_boot_done;
+
+void as_sindex_thr_init();
+objs_to_defrag_arr * as_sindex_gc_get_defrag_arr(void);
+
+#define MAX_SINDEX_BUILDER_THREADS 32
+
+void as_sbld_init();
+void as_sbld_build_all(as_namespace* ns);
+void as_sbld_resize_thread_pool(uint32_t n_threads);
+int as_sbld_list(char* name, cf_dyn_buf* db);
+as_mon_jobstat* as_sbld_get_jobstat(uint64_t trid);
+as_mon_jobstat* as_sbld_get_jobstat_all(int* size);
+int as_sbld_abort(uint64_t trid);
diff --git a/as/include/base/thr_tsvc.h b/as/include/base/thr_tsvc.h
new file mode 100644
index 00000000..12e0d5d5
--- /dev/null
+++ b/as/include/base/thr_tsvc.h
@@ -0,0 +1,55 @@
+/*
+ * thr_tsvc.h
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include <stdint.h>
+
+
+//==========================================================
+// Forward declarations.
+//
+
+struct as_transaction_s;
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+#define MAX_TRANSACTION_QUEUES 128
+#define MAX_TRANSACTION_THREADS_PER_QUEUE 256
+
+
+//==========================================================
+// Public API.
+//
+
+void as_tsvc_init();
+void as_tsvc_enqueue(struct as_transaction_s *tr);
+void as_tsvc_set_threads_per_queue(uint32_t n_threads);
+int as_tsvc_queue_get_size();
+void as_tsvc_process_transaction(struct as_transaction_s *tr);
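+/*
+ * A minimal flow sketch (not part of this patch): demarshal (or fabric)
+ * builds an as_transaction and hands it to the transaction service, which
+ * queues it to one of up to MAX_TRANSACTION_QUEUES queues; a service thread
+ * later pops it and runs as_tsvc_process_transaction(). Details of filling
+ * in the transaction are elided.
+ *
+ *     as_transaction tr;
+ *     as_transaction_init_head(&tr, NULL, msgp); // msgp from the wire
+ *     // ... fill in tr.from, tr.start_time, etc. ...
+ *     as_tsvc_enqueue(&tr); // copies the transaction head onto a queue
+ */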
diff --git a/as/include/base/ticker.h b/as/include/base/ticker.h
new file mode 100644
index 00000000..a8063944
--- /dev/null
+++ b/as/include/base/ticker.h
@@ -0,0 +1,29 @@
+/*
+ * ticker.h
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Public API.
+//
+
+void as_ticker_start();
diff --git a/as/include/base/transaction.h b/as/include/base/transaction.h
new file mode 100644
index 00000000..aa7803b2
--- /dev/null
+++ b/as/include/base/transaction.h
@@ -0,0 +1,378 @@
+/*
+ * transaction.h
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_byte_order.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "msg.h"
+#include "node.h"
+#include "socket.h"
+
+#include "base/cfg.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/stats.h"
+#include "fabric/partition.h"
+#include "storage/storage.h"
+
+struct as_namespace_s;
+
+
+//==========================================================
+// Histogram macros.
+//
+
+#define G_HIST_INSERT_DATA_POINT(name, start_time) \
+{ \
+	if (g_config.name##_enabled) { \
+		histogram_insert_data_point(g_stats.name, start_time); \
+	} \
+}
+
+#define G_HIST_ACTIVATE_INSERT_DATA_POINT(name, start_time) \
+{ \
+	g_stats.name##_active = true; \
+	histogram_insert_data_point(g_stats.name, start_time); \
+}
+
+#define HIST_TRACK_ACTIVATE_INSERT_DATA_POINT(trw, name) \
+{ \
+	trw->rsv.ns->name##_active = true; \
+	cf_hist_track_insert_data_point(trw->rsv.ns->name, trw->start_time); \
+}
+
+#define HIST_ACTIVATE_INSERT_DATA_POINT(trw, name) \
+{ \
+	trw->rsv.ns->name##_active = true; \
+	histogram_insert_data_point(trw->rsv.ns->name, trw->start_time); \
+}
+
+#define BENCHMARK_START(tr, name, orig) \
+{ \
+	if (tr->rsv.ns->name##_benchmarks_enabled && tr->origin == orig) { \
+		if (tr->benchmark_time == 0) { \
+			tr->benchmark_time = histogram_insert_data_point(tr->rsv.ns->name##_start_hist, tr->start_time); \
+		} \
+		else { \
+			tr->benchmark_time = histogram_insert_data_point(tr->rsv.ns->name##_restart_hist, tr->benchmark_time); \
+		} \
+	} \
+}
+
+#define BENCHMARK_NEXT_DATA_POINT(trw, name, tok) \
+{ \
+	if (trw->rsv.ns->name##_benchmarks_enabled && trw->benchmark_time != 0) { \
+		trw->benchmark_time = histogram_insert_data_point(trw->rsv.ns->name##_##tok##_hist, trw->benchmark_time); \
+	} \
+}
+
+
+//==========================================================
+// Client socket information - as_file_handle.
+//
+
+typedef struct as_file_handle_s {
+	char client[64];    // client identifier (currently ip-addr:port)
+	uint64_t last_used; // last ms we read or wrote
+	cf_socket sock;     // our socket
+	cf_poll poll;       // our epoll instance
+	bool reap_me;       // tells the reaper to come and get us
+	uint32_t fh_info;   // bitmap containing status info of this file handle
+	as_proto proto_hdr;
+	as_proto *proto;
+	uint64_t proto_unread;
+	void *security_filter;
+} as_file_handle;
+
+#define FH_INFO_DONOT_REAP 0x00000001 // this bit indicates that this file handle should not be reaped
+#define FH_INFO_XDR 0x00000002 // the file handle belongs to an XDR connection
+
+// Helpers to release transaction file handles.
+void as_release_file_handle(as_file_handle *proto_fd_h);
+void as_end_of_transaction(as_file_handle *proto_fd_h, bool force_close);
+void as_end_of_transaction_ok(as_file_handle *proto_fd_h);
+void as_end_of_transaction_force_close(as_file_handle *proto_fd_h);
+
+
+//==========================================================
+// Transaction.
+//
+
+typedef enum {
+	TRANS_DONE_ERROR = -1,  // tsvc frees msgp & reservation, response was sent to origin
+	TRANS_DONE_SUCCESS = 0, // tsvc frees msgp & reservation, response was sent to origin
+	TRANS_IN_PROGRESS = 1,  // tsvc leaves msgp & reservation alone, rw_request now owns them
+	TRANS_WAITING = 2       // tsvc leaves msgp alone but frees reservation
+} transaction_status;
+
+// How to interpret the 'from' union.
+//
+// This is NOT a generic transaction type flag - e.g. batch sub-transactions
+// that proxy are FROM_PROXY on the proxyee node, hence we still need a
+// separate FROM_FLAG_BATCH_SUB.
+//
+typedef enum {
+	// External, comes through demarshal or fabric:
+	FROM_CLIENT = 1,
+	FROM_PROXY,
+
+	// Internal, generated on local node:
+	FROM_BATCH,
+	FROM_IUDF,
+	FROM_NSUP,
+	FROM_RE_REPL, // enterprise-only
+
+	FROM_UNDEF = 0
+} transaction_origin;
+
+struct as_batch_shared_s;
+struct iudf_origin_s;
+
+typedef struct as_transaction_s {
+
+	//------------------------------------------------------
+	// transaction 'head' - copied onto queue.
+	//
+
+	cl_msg* msgp;
+	uint32_t msg_fields;
+
+	uint8_t origin;
+	uint8_t from_flags;
+
+	// 2 spare bytes.
+
+	union {
+		void* any;
+		as_file_handle* proto_fd_h;
+		cf_node proxy_node;
+		struct as_batch_shared_s* batch_shared;
+		struct iudf_origin_s* iudf_orig;
+		void (*re_repl_orig_cb) (struct as_transaction_s* tr);
+	} from;
+
+	union {
+		uint32_t any;
+		uint32_t proxy_tid;
+		uint32_t batch_index;
+	} from_data;
+
+	cf_digest keyd; // only batch sub-transactions require this on queue
+
+	uint64_t start_time;
+	uint64_t benchmark_time;
+
+	//<><><><><><><><><><><> 64 bytes <><><><><><><><><><><>
+
+	//------------------------------------------------------
+	// transaction 'body' - NOT copied onto queue.
+	//
+
+	as_partition_reservation rsv;
+
+	uint64_t end_time;
+	uint8_t result_code;
+	uint8_t flags;
+	uint16_t generation;
+	uint32_t void_time;
+	uint64_t last_update_time;
+
+} as_transaction;
+
+#define AS_TRANSACTION_HEAD_SIZE (offsetof(as_transaction, rsv))
+
+// 'from_flags' bits - set before queuing transaction head:
+#define FROM_FLAG_BATCH_SUB 0x0001
+#define FROM_FLAG_RESTART 0x0002
+
+// 'flags' bits - set in transaction body after queuing:
+#define AS_TRANSACTION_FLAG_SINDEX_TOUCHED 0x01
+#define AS_TRANSACTION_FLAG_IS_DELETE 0x02
+#define AS_TRANSACTION_FLAG_MUST_PING 0x04 // enterprise-only
+
+
+void as_transaction_init_head(as_transaction *tr, cf_digest *, cl_msg *);
+void as_transaction_init_body(as_transaction *tr);
+
+void as_transaction_copy_head(as_transaction *to, const as_transaction *from);
+
+struct rw_request_s;
+
+void as_transaction_init_from_rw(as_transaction *tr, struct rw_request_s *rw);
+void as_transaction_init_head_from_rw(as_transaction *tr, struct rw_request_s *rw);
+
+bool as_transaction_set_msg_field_flag(as_transaction *tr, uint8_t type);
+bool as_transaction_prepare(as_transaction *tr, bool swap);
+
+static inline bool
+as_transaction_is_restart(const as_transaction *tr)
+{
+	return (tr->from_flags & FROM_FLAG_RESTART) != 0;
+}
+
+static inline bool
+as_transaction_is_batch_sub(const as_transaction *tr)
+{
+	return (tr->from_flags & FROM_FLAG_BATCH_SUB) != 0;
+}
+
+static inline bool
+as_transaction_has_set(const as_transaction *tr)
+{
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_SET) != 0;
+}
+
+static inline bool
+as_transaction_has_key(const as_transaction *tr)
+{
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_KEY) != 0;
+}
+
+static inline bool
+as_transaction_has_digest(const as_transaction *tr)
+{
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_DIGEST_RIPE) != 0;
+}
+
+static inline bool
+as_transaction_has_no_key_or_digest(const as_transaction *tr)
+{
+	return (tr->msg_fields & (AS_MSG_FIELD_BIT_KEY | AS_MSG_FIELD_BIT_DIGEST_RIPE)) == 0;
+}
+
+static inline bool
+as_transaction_is_multi_record(const as_transaction *tr)
+{
+	return (tr->msg_fields & (AS_MSG_FIELD_BIT_KEY | AS_MSG_FIELD_BIT_DIGEST_RIPE)) == 0 &&
+			(tr->from_flags & FROM_FLAG_BATCH_SUB) == 0;
+}
+
+static inline bool
+as_transaction_is_batch_direct(const as_transaction *tr)
+{
+	// Assumes we're already multi-record.
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_DIGEST_RIPE_ARRAY) != 0;
+}
+
+static inline bool
+as_transaction_is_query(const as_transaction *tr)
+{
+	// Assumes we're already multi-record.
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_INDEX_RANGE) != 0;
+}
+
+static inline bool
+as_transaction_is_udf(const as_transaction *tr)
+{
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_UDF_FILENAME) != 0;
+}
+
+static inline bool
+as_transaction_has_udf_op(const as_transaction *tr)
+{
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_UDF_OP) != 0;
+}
+
+static inline bool
+as_transaction_has_scan_options(const as_transaction *tr)
+{
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_SCAN_OPTIONS) != 0;
+}
+
+static inline bool
+as_transaction_has_socket_timeout(const as_transaction *tr)
+{
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_SOCKET_TIMEOUT) != 0;
+}
+
+static inline bool
+as_transaction_has_predexp(const as_transaction *tr)
+{
+	return (tr->msg_fields & AS_MSG_FIELD_BIT_PREDEXP) != 0;
+}
+
+// For now it's not worth storing the trid in the as_transaction struct since
+// we only parse it from the msg once per transaction anyway.
+static inline uint64_t
+as_transaction_trid(const as_transaction *tr)
+{
+	if ((tr->msg_fields & AS_MSG_FIELD_BIT_TRID) == 0) {
+		return 0;
+	}
+
+	as_msg_field *f = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_TRID);
+
+	return cf_swap_from_be64(*(uint64_t*)f->data);
+}
+
+static inline bool
+as_transaction_is_delete(const as_transaction *tr)
+{
+	return (tr->msgp->msg.info2 & AS_MSG_INFO2_DELETE) != 0;
+}
+
+static inline bool
+as_transaction_is_durable_delete(const as_transaction *tr)
+{
+	return (tr->msgp->msg.info2 & AS_MSG_INFO2_DURABLE_DELETE) != 0;
+}
+
+// TODO - where should this go?
+static inline bool
+as_msg_is_xdr(const as_msg *m)
+{
+	return (m->info1 & AS_MSG_INFO1_XDR) != 0;
+}
+
+static inline bool
+as_transaction_is_xdr(const as_transaction *tr)
+{
+	return (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR) != 0;
+}
+
+static inline bool
+as_transaction_is_nsup_delete(const as_transaction *tr)
+{
+	return tr->origin == FROM_NSUP;
+}
+
+static inline bool
+as_transaction_is_linearized_read(const as_transaction *tr)
+{
+	return (tr->msgp->msg.info3 & AS_MSG_INFO3_LINEARIZE_READ) != 0;
+}
+
+void as_transaction_init_iudf(as_transaction *tr, struct as_namespace_s *ns, cf_digest *keyd, struct iudf_origin_s *iudf_orig, bool is_durable_delete);
+
+void as_transaction_demarshal_error(as_transaction *tr, uint32_t error_code);
+void as_transaction_error(as_transaction *tr, struct as_namespace_s *ns, uint32_t error_code);
+void as_multi_rec_transaction_error(as_transaction *tr, uint32_t error_code);
diff --git a/as/include/base/transaction_policy.h b/as/include/base/transaction_policy.h
new file mode 100644
index 00000000..dcc4b66f
--- /dev/null
+++ b/as/include/base/transaction_policy.h
@@ -0,0 +1,114 @@
+/*
+ * transaction_policy.h
+ *
+ * Copyright (C) 2014-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.
+ * See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Typedefs & constants.
+//
+
+typedef enum {
+	// Server config override value only - means use policy sent by client.
+	AS_READ_CONSISTENCY_LEVEL_PROTO = -1,
+
+	// Must match AS_POLICY_CONSISTENCY_LEVEL_ONE in C Client v3 as_policy.h.
+	// Ignore duplicates - i.e. don't duplicate resolve.
+	AS_READ_CONSISTENCY_LEVEL_ONE,
+
+	// Must match AS_POLICY_CONSISTENCY_LEVEL_ALL in C Client v3 as_policy.h.
+	// Involve all duplicates in the operation - i.e. duplicate resolve.
+	AS_READ_CONSISTENCY_LEVEL_ALL,
+} as_read_consistency_level;
+
+typedef enum {
+	// Server config override value only - means use policy sent by client.
+	AS_WRITE_COMMIT_LEVEL_PROTO = -1,
+
+	// Must match AS_POLICY_COMMIT_LEVEL_ALL in C Client v3 as_policy.h.
+	// Respond to client only after successfully committing all replicas.
+	AS_WRITE_COMMIT_LEVEL_ALL,
+
+	// Must match AS_POLICY_COMMIT_LEVEL_MASTER in C Client v3 as_policy.h.
+	// Respond to client after successfully committing the master replica.
+	AS_WRITE_COMMIT_LEVEL_MASTER,
+} as_write_commit_level;
+
+
+//==========================================================
+// Public API - macros.
+//
+
+//------------------------------------------------
+// Extract levels from an as_msg.
+//
+
+// Not a strict check: both bits == 0 means ONE, anything else means ALL.
+#define PROTO_CONSISTENCY_LEVEL(asmsg) \
+	((((asmsg).info1 & AS_MSG_INFO1_CONSISTENCY_LEVEL_B0) == 0 && \
+	  ((asmsg).info1 & AS_MSG_INFO1_CONSISTENCY_LEVEL_B1) == 0) ? \
+			AS_READ_CONSISTENCY_LEVEL_ONE : AS_READ_CONSISTENCY_LEVEL_ALL)
+
+// Not a strict check: both bits == 0 means ALL, anything else means MASTER.
+#define PROTO_COMMIT_LEVEL(asmsg) \
+	((((asmsg).info3 & AS_MSG_INFO3_COMMIT_LEVEL_B0) == 0 && \
+	  ((asmsg).info3 & AS_MSG_INFO3_COMMIT_LEVEL_B1) == 0) ? \
+			AS_WRITE_COMMIT_LEVEL_ALL : AS_WRITE_COMMIT_LEVEL_MASTER)
+
+//------------------------------------------------
+// Get levels for a transaction with reservation.
+//
+
+// Determine read consistency level for a transaction based on everything.
+#define TR_READ_CONSISTENCY_LEVEL(tr) \
+	(tr->rsv.ns->read_consistency_level == AS_READ_CONSISTENCY_LEVEL_PROTO ? \
+			PROTO_CONSISTENCY_LEVEL(tr->msgp->msg) : \
+			tr->rsv.ns->read_consistency_level)
+
+// Determine write commit level for a transaction based on everything.
+#define TR_WRITE_COMMIT_LEVEL(tr) \
+	(tr->rsv.ns->write_commit_level == AS_WRITE_COMMIT_LEVEL_PROTO ? \
+			PROTO_COMMIT_LEVEL(tr->msgp->msg) : \
+			tr->rsv.ns->write_commit_level)
+
+//------------------------------------------------
+// Get levels without need of reservation.
+//
+
+// Same as above, for use before tr->rsv has been made.
+#define READ_CONSISTENCY_LEVEL(ns, asmsg) \
+	(ns->read_consistency_level == AS_READ_CONSISTENCY_LEVEL_PROTO ? \
+			PROTO_CONSISTENCY_LEVEL(asmsg) : \
+			ns->read_consistency_level)
+
+//------------------------------------------------
+// Get config override values' names.
+//
+
+#define NS_READ_CONSISTENCY_LEVEL_NAME() \
+	(ns->read_consistency_level == AS_READ_CONSISTENCY_LEVEL_PROTO ? \
+			"off" : (ns->read_consistency_level == AS_READ_CONSISTENCY_LEVEL_ONE ? \
+					"one" : "all"))
+
+#define NS_WRITE_COMMIT_LEVEL_NAME() \
+	(ns->write_commit_level == AS_WRITE_COMMIT_LEVEL_PROTO ? \
\ + "off" : (ns->write_commit_level == AS_WRITE_COMMIT_LEVEL_ALL ? \ + "all" : "master")) diff --git a/as/include/base/truncate.h b/as/include/base/truncate.h new file mode 100644 index 00000000..130b2f10 --- /dev/null +++ b/as/include/base/truncate.h @@ -0,0 +1,94 @@ +/* + * truncate.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" + +#include "shash.h" + + +//========================================================== +// Forward declarations. +// + +struct as_index_s; +struct as_namespace_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef enum { + TRUNCATE_IDLE, + TRUNCATE_RUNNING, + TRUNCATE_RESTART +} truncate_state; + +typedef struct as_truncate_s { + uint64_t lut; + cf_shash* startup_set_hash; // relevant only for enterprise edition + truncate_state state; + pthread_mutex_t state_lock; + cf_atomic32 n_threads_running; + cf_atomic32 pid; + cf_atomic64 n_records_this_run; + uint64_t n_records; +} as_truncate; + + +//========================================================== +// Public API. +// + +void as_truncate_init(struct as_namespace_s* ns); +void as_truncate_init_smd(); +void as_truncate_list_cenotaphs(struct as_namespace_s* ns); +void as_truncate_done_startup(struct as_namespace_s* ns); +bool as_truncate_cmd(const char* ns_name, const char* set_name, const char* lut_str); +void as_truncate_undo_cmd(const char* ns_name, const char* set_name); +bool as_truncate_now_is_truncated(struct as_namespace_s* ns, uint16_t set_id); +bool as_truncate_record_is_truncated(const struct as_index_s* r, struct as_namespace_s* ns); + + +//========================================================== +// For enterprise separation only. +// + +typedef struct truncate_hval_s { + uint64_t cenotaph:1; + uint64_t unused:23; + uint64_t lut:40; +} truncate_hval; + +void truncate_startup_hash_init(struct as_namespace_s* ns); +void truncate_action_startup(struct as_namespace_s* ns, const char* set_name, uint64_t lut); diff --git a/as/include/base/udf_aerospike.h b/as/include/base/udf_aerospike.h new file mode 100644 index 00000000..76510ae9 --- /dev/null +++ b/as/include/base/udf_aerospike.h @@ -0,0 +1,27 @@ +/* + * udf_aerospike.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "aerospike/as_aerospike.h" + +extern const as_aerospike_hooks udf_aerospike_hooks; diff --git a/as/include/base/udf_arglist.h b/as/include/base/udf_arglist.h new file mode 100644 index 00000000..42fccf59 --- /dev/null +++ b/as/include/base/udf_arglist.h @@ -0,0 +1,31 @@ +/* + * udf_arglist.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "aerospike/as_list.h" + +/****************************************************************************** + * VARIABLES + ******************************************************************************/ + +extern const as_list_hooks udf_arglist_hooks; diff --git a/as/include/base/udf_cask.h b/as/include/base/udf_cask.h new file mode 100644 index 00000000..42cec76c --- /dev/null +++ b/as/include/base/udf_cask.h @@ -0,0 +1,70 @@ +/* + * udf_cask.h + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + +#include "dynbuf.h" + +#include "base/thr_info.h" + + +// UDF Types +#define AS_UDF_TYPE_LUA 0 +#define MAX_UDF_CONTENT_LENGTH (1024 * 1024) //(1MB) + +extern char *as_udf_type_name[]; + +//------------------------------------------------ +// Register function +void udf_cask_init(); + +//------------------------------------------------ +// these functions are "as_info_command" format +// and called directly from there. +// therefore they have the same calling convention + +int udf_cask_info_clear_cache(char * name, char * params, cf_dyn_buf * out); + +int udf_cask_info_get(char * name, char * params, cf_dyn_buf * out); + +int udf_cask_info_put(char * name, char * params, cf_dyn_buf * out); + +int udf_cask_info_remove(char * name, char * params, cf_dyn_buf * out); + +int udf_cask_info_reconfigure(char * name, char * params, cf_dyn_buf * buf); + +int udf_cask_info_list(char *name, cf_dyn_buf * out); + +//------------------------------------------------ +// these are called by the modules that need to run UDFs + +// called by a module to get the data associated with a udf (the file contents) +// this will be a reference count (rc_alloc) pointer and must be dereferenced by the caller +int udf_cask_get_udf(char *module, char *udf_type, uint8_t **buf , size_t *buf_len ); + +// called by a module to get the data associated with a udf (the fully qualified file name) +// caller passes in a max-size string buffer that gets filled out (null terminated) +int udf_cask_get_udf_filename(char *module, char *udf_type, char *filename ); + diff --git a/as/include/base/udf_memtracker.h b/as/include/base/udf_memtracker.h new file mode 100644 index 00000000..619edd45 --- /dev/null +++ b/as/include/base/udf_memtracker.h @@ -0,0 +1,51 @@ +/* + * udf_memtracker.h + * + * Copyright (C) 2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/** + * An as_memtracker for tests. 
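 + * + * Illustrative setup sketch (the my_cb callback and its always-accept behavior are hypothetical): + * @code + * static bool my_cb(mem_tracker *mt, uint32_t num_bytes, memtracker_op op) + * { + *     return true; // accept every reserve/release/reset request + * } + * + * mem_tracker mt = { .udata = NULL, .cb = my_cb }; + * udf_memtracker_setup(&mt); + * // ... run the UDF ... + * udf_memtracker_cleanup(); + * @endcode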
+ */ + +#pragma once + +#include +#include "aerospike/as_memtracker.h" + +typedef enum { + MEM_RESERVE = 0, + MEM_RELEASE = 1, + MEM_RESET = 2 +} memtracker_op; + +typedef struct mem_tracker_s mem_tracker; +typedef bool (*as_memtracker_op_cb)(mem_tracker *mt, uint32_t, memtracker_op); + +struct mem_tracker_s { + void *udata; + as_memtracker_op_cb cb; +}; + +/***************************************************************************** + * STATIC FUNCTIONS + *****************************************************************************/ +as_memtracker * udf_memtracker_init(); +void udf_memtracker_setup(mem_tracker *mt); +void udf_memtracker_cleanup(); diff --git a/as/include/base/udf_record.h b/as/include/base/udf_record.h new file mode 100644 index 00000000..e6973ef4 --- /dev/null +++ b/as/include/base/udf_record.h @@ -0,0 +1,110 @@ +/* + * udf_record.h + * + * Copyright (C) 2013-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include + +#include "aerospike/as_rec.h" +#include "aerospike/as_hashmap.h" +#include "aerospike/as_val.h" +#include "citrusleaf/cf_atomic.h" + +#include "base/datamodel.h" +#include "base/rec_props.h" +#include "base/transaction.h" +#include "base/xdr_serverside.h" +#include "storage/storage.h" + + +// Maximum number of bins that can be updated in a single UDF. +#define UDF_RECORD_BIN_ULIMIT 512 + +typedef struct udf_record_bin_s { + char name[AS_ID_BIN_SZ]; + as_val * value; + as_val * oldvalue; // keeps track of old value in case rollback is required + bool dirty; + void *particle_buf; +} udf_record_bin; + +typedef struct udf_record_s { + + // STORAGE + as_index_ref *r_ref; + as_transaction *tr; + as_storage_rd *rd; + xdr_dirty_bins *dirty; + cf_digest keyd; + as_bin stack_bins[UDF_RECORD_BIN_ULIMIT]; // TODO increase bin limit? 
+ + // UDF CHANGE CACHE + udf_record_bin updates[UDF_RECORD_BIN_ULIMIT]; // stores cache bin value + // if dirty flag is set the bin is being modified + uint32_t nupdates; // reset after every cache free, incremented in every cache set + + // RUNTIME ACCOUNTING + uint8_t *particle_data; // non-null for data-on-ssd, and lazy allocated on first bin write + uint8_t *cur_particle_data; // where the pointer is + uint8_t *end_particle_data; + uint32_t starting_memory_bytes; + cf_atomic_int udf_runtime_memory_used; + + // INTERNAL UTILITY + uint16_t flag; +} udf_record; + +#define UDF_RECORD_FLAG_ALLOW_UPDATES 0x0001 // Write/Updates Allowed +#define UDF_RECORD_FLAG_TOO_MANY_BINS 0x0002 // UDF exceeds the bin limit +#define UDF_RECORD_FLAG_UNUSED_4 0x0004 // was - sub-record +#define UDF_RECORD_FLAG_OPEN 0x0008 // as_record_open done +#define UDF_RECORD_FLAG_STORAGE_OPEN 0x0010 // as_storage_record_open done +#define UDF_RECORD_FLAG_HAS_UPDATES 0x0020 // Write/Update done +#define UDF_RECORD_FLAG_PREEXISTS 0x0040 // Record preexisted not created +#define UDF_RECORD_FLAG_ISVALID 0x0080 // Udf is setup and in use +#define UDF_RECORD_FLAG_METADATA_UPDATED 0x0100 // Write/Update metadata done + +extern const as_rec_hooks udf_record_hooks; + +//------------------------------------------------ +// Utility functions for all the wrapper as_record implementation +// which use udf_record under the hood +extern void udf_record_cache_free (udf_record *); +extern int udf_record_open (udf_record *); +extern int udf_storage_record_open (udf_record *); +extern void udf_record_close (udf_record *); +extern int udf_storage_record_close(udf_record *); +extern void udf_record_init (udf_record *, bool); +extern as_val * udf_record_storage_get (const udf_record *, const char *); + +#define UDF_ERR_INTERNAL_PARAMETER 2 +#define UDF_ERR_RECORD_NOT_VALID 3 +#define UDF_ERR_PARAMETER 4 +extern int udf_record_param_check(const as_rec *rec, char *fname, int lineno); +extern bool udf_record_destroy(as_rec *rec); + +//------------------------------------------------ +// Note that the main interface routines do NOT get declared here. +// extern int udf_record_set_flags(const as_rec *, const char *, uint8_t); +// extern int udf_record_set_type(const as_rec *, int8_t); diff --git a/as/include/base/udf_timer.h b/as/include/base/udf_timer.h new file mode 100644 index 00000000..da71320b --- /dev/null +++ b/as/include/base/udf_timer.h @@ -0,0 +1,47 @@ +/* + * udf_timer.h + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * An as_timer for tests. 
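 + * + * Illustrative setup sketch (the constant end-time callback is hypothetical): + * @code + * static uint64_t my_end_time(time_tracker *tt) + * { + *     return 0; // hypothetical: no deadline + * } + * + * time_tracker tt = { .udata = NULL, .end_time = my_end_time }; + * udf_timer_setup(&tt); + * // ... run the UDF ... + * udf_timer_cleanup(); + * @endcode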
+ */ + +#pragma once + +#include +#include "aerospike/as_timer.h" + +typedef struct time_tracker_s time_tracker; +typedef uint64_t (* as_timer_end_time_cb)(time_tracker *tt); +typedef uint64_t (* as_timer_timeslice_cb)(time_tracker *tt); + +struct time_tracker_s { + void * udata; + as_timer_end_time_cb end_time; +}; + +/***************************************************************************** + * STATIC FUNCTIONS + *****************************************************************************/ +void udf_timer_setup(time_tracker *tt); +void udf_timer_cleanup(); +extern const as_timer_hooks udf_timer_hooks; + diff --git a/as/include/base/xdr_config.h b/as/include/base/xdr_config.h new file mode 100644 index 00000000..c5faf457 --- /dev/null +++ b/as/include/base/xdr_config.h @@ -0,0 +1,128 @@ +/* + * xdr_config.h + * + * Copyright (C) 2011-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "citrusleaf/cf_vector.h" + +#include "node.h" +#include "tls.h" + +//========================================================== +// Forward declarations. +// + +//========================================================== +// Constants & typedefs. +// + +// Length definitions. This should be in sync with the server definitions. +// It is bad that we are not using a common header file for all this. 
+#define CLUSTER_MAX_SZ 128 +#define NAMESPACE_MAX_NUM 32 +#define DC_MAX_NUM 32 + +typedef struct xdr_node_lst_s { + cf_node node; + uint64_t time[DC_MAX_NUM]; +} xdr_node_lst; + +typedef struct node_addr_port_s { + char *addr; + char *tls_name; + int port; +} node_addr_port; + +// Holds new config values, for when a configuration value is changed at runtime +typedef struct xdr_new_config_s { + bool skip_outstanding; +} xdr_new_config; + +// Config options maintained by both the server and the XDR module +typedef struct xdr_config_s { + + bool xdr_section_configured; + bool xdr_global_enabled; + + // Ring buffer configuration + char *xdr_digestlog_path; + uint64_t xdr_digestlog_file_size; + + uint32_t xdr_info_port; + uint32_t xdr_max_ship_throughput; + uint32_t xdr_max_ship_bandwidth; + uint32_t xdr_min_dlog_free_pct; + uint32_t xdr_hotkey_time_ms; + uint32_t xdr_read_threads; + uint32_t xdr_write_timeout; + uint32_t xdr_client_threads; + uint32_t xdr_forward_xdrwrites; + uint32_t xdr_internal_shipping_delay; + uint32_t xdr_info_request_timeout_ms; + uint32_t xdr_compression_threshold; + uint32_t xdr_digestlog_iowait_ms; + + bool xdr_shipping_enabled; + bool xdr_delete_shipping_enabled; + bool xdr_nsup_deletes_enabled; + bool xdr_ship_bins; + bool xdr_handle_failednode; + bool xdr_handle_linkdown; + + // Internal + bool xdr_conf_change_flag; + xdr_new_config xdr_new_cfg; +} xdr_config; + +typedef struct xdr_security_config_s { + char *sec_config_file; + char *username; + char *password; +} xdr_security_config; + +typedef struct dc_config_opt_s { + char *dc_name; + int dc_id; + cf_vector dc_node_v; + cf_vector dc_addr_map_v; + uint32_t dc_connections; + uint32_t dc_connections_idle_ms; + xdr_security_config dc_security_cfg; + bool dc_use_alternate_services; + char *tls_our_name; + cf_tls_spec *tls_spec; +} dc_config_opt; + +//========================================================== +// Public API. +// + +void xdr_config_defaults(); +bool xdr_read_security_configfile(xdr_security_config* sc); + +extern xdr_config g_xcfg; +extern int g_dc_count; +extern dc_config_opt g_dc_xcfg_opt[DC_MAX_NUM]; diff --git a/as/include/base/xdr_serverside.h b/as/include/base/xdr_serverside.h new file mode 100644 index 00000000..e325e809 --- /dev/null +++ b/as/include/base/xdr_serverside.h @@ -0,0 +1,87 @@ +/* + * xdr_serverside.h + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include + +#include "citrusleaf/cf_digest.h" + +#include "dynbuf.h" +#include "node.h" +#include "socket.h" + +#include "base/datamodel.h" +#include "base/transaction.h" + +//========================================================== +// Forward declarations.
 +// + +//========================================================== +// Constants & typedefs. +// + +typedef enum { + XDR_OP_TYPE_WRITE, + XDR_OP_TYPE_DROP, + XDR_OP_TYPE_DURABLE_DELETE +} xdr_op_type; + +typedef uint64_t xdr_dirty_bins[2]; + +//========================================================== +// Public API. +// + +int as_xdr_init(); +void xdr_config_post_process(); +void as_xdr_start(); +int as_xdr_shutdown(); +void xdr_sig_handler(int signum); + +void xdr_clear_dirty_bins(xdr_dirty_bins *dirty); +void xdr_fill_dirty_bins(xdr_dirty_bins *dirty); +void xdr_copy_dirty_bins(xdr_dirty_bins *from, xdr_dirty_bins *to); +void xdr_add_dirty_bin(as_namespace *ns, xdr_dirty_bins *dirty, const char *name, size_t name_len); +void xdr_write(as_namespace *ns, cf_digest *keyd, uint16_t generation, cf_node masternode, xdr_op_type op_type, uint16_t set_id, xdr_dirty_bins *dirty); +void as_xdr_read_txn(as_transaction *txn); + +void as_xdr_info_init(void); +void as_xdr_info_port(cf_serv_cfg *serv_cfg); +int as_info_command_xdr(char *name, char *params, cf_dyn_buf *db); +void as_xdr_get_stats(cf_dyn_buf *db); +void as_xdr_get_config(cf_dyn_buf *db); +bool as_xdr_set_config(char *params); +bool as_xdr_set_config_ns(char *ns_name, char *params); + +bool is_xdr_delete_shipping_enabled(); +bool is_xdr_digestlog_low(as_namespace *ns); +bool is_xdr_forwarding_enabled(); +bool is_xdr_nsup_deletes_enabled(); + +void xdr_cfg_add_int_ext_mapping(dc_config_opt *dc_cfg, char* orig, char* alt); diff --git a/as/include/fabric/clustering.h b/as/include/fabric/clustering.h new file mode 100644 index 00000000..9ac9163a --- /dev/null +++ b/as/include/fabric/clustering.h @@ -0,0 +1,296 @@ +/* + * clustering.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * Aerospike cluster formation v5 based on paxos. + * A complete discussion of the algorithm can be found at + * https://docs.google.com/document/d/1u-27aeZD9no9wiWgt1_BsTSg_6ewG9VBI2sYA0g01BE/edit# + */ +#pragma once + +#include +#include + +#include "citrusleaf/cf_vector.h" + +#include "fault.h" + +#include "fabric/hlc.h" + +/* + * ---------------------------------------------------------------------------- + * Public data structures. + * ---------------------------------------------------------------------------- + */ +/** + * Aerospike cluster key. + */ +typedef uint64_t as_cluster_key; + +/** + * Aerospike clustering protocol identifier. + */ +typedef uint32_t as_cluster_proto_identifier; + +/** + * Configuration for the clustering algorithm. + */ +typedef struct as_clustering_config_s +{ + /** + * The smallest allowed cluster size. + */ + uint32_t cluster_size_min; + + /** + * Indicates if clique based eviction is enabled.
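 + * + * Roughly: when enabled, nodes that fall outside the maximal fully-connected + * adjacency clique become eviction candidates during a network split (a + * best-effort description; see as_hb_maximal_clique_evict() in hb.h).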
 + */ + bool clique_based_eviction_enabled; + + /** + * Current protocol identifier. + */ + as_cluster_proto_identifier protocol_identifier; + +} as_clustering_config; + +/** + * The clustering protocol versions. + */ +typedef enum as_clustering_protocol_version +{ + AS_CLUSTERING_PROTOCOL_UNDEF, + AS_CLUSTERING_PROTOCOL_NONE, + AS_CLUSTERING_PROTOCOL_V1, + AS_CLUSTERING_PROTOCOL_V2, + AS_CLUSTERING_PROTOCOL_V3, + AS_CLUSTERING_PROTOCOL_V4, + AS_CLUSTERING_PROTOCOL_V5 +} as_clustering_protocol_version; + +/** + * Clustering event type. + */ +typedef enum as_clustering_event_type_e +{ + /** + * Cluster membership for this node changed. + */ + AS_CLUSTERING_CLUSTER_CHANGED, + + /** + * This node became an orphan node. + */ + AS_CLUSTERING_ORPHANED +} as_clustering_event_type; + +/** + * Clustering event qualifier. + */ +typedef enum as_clustering_event_qualifier_e +{ + /** + * The default qualifier for cases where a qualifier is not applicable. + */ + AS_CLUSTERING_QUALIFIER_NA, + + /** + * Cluster membership was lost because the principal evicted this node, the + * principal is no longer reachable, or the cluster is invalid. Relevant + * only for the orphaned event. + */ + AS_CLUSTERING_MEMBERSHIP_LOST, + + /** + * This node became an orphan node in order to attempt a merge. Relevant + * only for the orphaned event. + */ + AS_CLUSTERING_ATTEMPTING_MERGE, +} as_clustering_event_qualifier; + +/** + * Clustering event. + */ +typedef struct as_clustering_event_s +{ + /** + * The clustering event type. + */ + as_clustering_event_type type; + + /** + * The clustering event qualifier. + */ + as_clustering_event_qualifier qualifier; + + /** + * The cluster key. Will be non-zero if this is a cluster change event. + */ + as_cluster_key cluster_key; + + /** + * The new succession list. It will not be empty if this is a cluster change + * event. + * + * The allocated space will be freed once the event processing is complete. + * Listeners should always create a copy of this list if it needs to be + * used later on by the listener. + */ + cf_vector* succession_list; +} as_clustering_event; + +/* + * ---------------------------------------------------------------------------- + * Public API. + * ---------------------------------------------------------------------------- + */ +/** + * Initialize clustering subsystem. + */ +void +as_clustering_init(); + +/** + * Start clustering subsystem. + */ +void +as_clustering_start(); + +/** + * Stop clustering subsystem. + */ +void +as_clustering_stop(); + +/** + * Reform the cluster with the same succession list. This would trigger the + * generation of new partition info and the cluster would get a new cluster key. + * + * @return 0 if a new clustering round started, -1 otherwise. + */ +int +as_clustering_cluster_reform(); + +/** + * Return the quantum interval, i.e., the interval at which cluster change + * decisions are taken. The unit is milliseconds. + */ +uint64_t +as_clustering_quantum_interval(); + +/** + * Log a vector of node-ids at input severity, splitting long vectors over + * multiple lines. The call might not work if the vector is not protected + * against multi-threaded access. + * + * @param severity the log severity. + * @param context the logging context. + * @param file_name the source file name for the log line. + * @param line the source file line number for the log line. + * @param message the message prefix for each log line. Message and node list + * will be separated with a space. Can be NULL for no prefix. + * @param nodes the vector of nodes.
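 + * + * Illustrative call (a sketch; assumes the CF_INFO severity and AS_CLUSTERING + * context constants from fault.h, and a populated cf_vector named succession): + * @code + * as_clustering_log_cf_node_vector(CF_INFO, AS_CLUSTERING, + *     "succession list:", &succession); + * @endcode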
 + */ +void +as_clustering_cf_node_vector_event(cf_fault_severity severity, + cf_fault_context context, char* file_name, int line, char* message, + cf_vector* nodes); + +/** + * Log an array of node-ids at input severity, splitting long arrays over + * multiple lines. The call might not work if the array is not protected against + * multi-threaded access. + * + * @param severity the log severity. + * @param context the logging context. + * @param file_name the source file name for the log line. + * @param line the source file line number for the log line. + * @param message the message prefix for each log line. Message and node list + * will be separated with a space. Can be NULL for no prefix. + * @param nodes the array of nodes. + * @param node_count the count of nodes in the array. + */ +void +as_clustering_cf_node_array_event(cf_fault_severity severity, + cf_fault_context context, char* file_name, int line, char* message, + cf_node* nodes, int node_count); + +/** + * Log a vector of node-ids at input severity, splitting long vectors over + * multiple lines. The call might not work if the vector is not protected + * against multi-threaded access. + * + * @param severity the log severity. + * @param context the logging context. + * @param message the message prefix for each log line. Message and node list + * will be separated with a space. Can be NULL for no prefix. + * @param nodes the vector of nodes. + */ +#define as_clustering_log_cf_node_vector(severity, context, message, nodes) \ + as_clustering_cf_node_vector_event(severity, context, __FILENAME__, \ + __LINE__, message, nodes) + +/** + * Log an array of node-ids at input severity, splitting long arrays over + * multiple lines. The call might not work if the array is not protected against + * multi-threaded access. + * + * @param severity the log severity. + * @param context the logging context. + * @param message the message prefix for each log line. Message and node list + * will be separated with a space. Can be NULL for no prefix. + * @param nodes the array of nodes. + * @param node_count the count of nodes in the array. + */ +#define as_clustering_log_cf_node_array(severity, context, message, nodes, \ + node_count) \ +as_clustering_cf_node_array_event(severity, context, __FILENAME__, \ + __LINE__, message, nodes, node_count); + + +/* + * ---- Clustering info command functions. ---- + */ +/** + * False means that either this node is orphaned, or it is undergoing a cluster + * change. + */ +bool +as_clustering_has_integrity(); + +/** + * Indicates if self node is orphaned. + */ +bool +as_clustering_is_orphan(); + +/** + * Dump clustering state to the log. + */ +void +as_clustering_dump(bool verbose); + +/** + * Set the min cluster size. + */ +int +as_clustering_cluster_size_min_set(uint32_t new_cluster_size_min); diff --git a/as/include/fabric/endpoint.h b/as/include/fabric/endpoint.h new file mode 100644 index 00000000..f10b0d0b --- /dev/null +++ b/as/include/fabric/endpoint.h @@ -0,0 +1,324 @@ +/* + * endpoint.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version.
 + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * Overview + * ======== + * + * An endpoint captures all information needed by a peer node to establish a + * connection to a service (e.g. fabric or heartbeat). The key difference + * between an endpoint and the socket API's cf_sock_cfg is that cf_sock_cfg + * captures all information needed by a service to start a server socket and + * accept connections on it, whereas an endpoint captures all information a + * peer needs to connect to the service. These two complementary structures + * overlap in information content; however, cf_sock_cfg carries server side + * configuration values (e.g. TLS configuration), which are irrelevant for the + * client using this service. Also, an endpoint structure is designed to be + * advertised over the wire. + */ + +#pragma once + +#include +#include +#include + +#include "socket.h" + +/** + * Indicates if this endpoint supports TLS. + */ +#define AS_ENDPOINT_TLS_MASK 0x01 + +/** + * Endpoint address type. + */ +typedef enum +{ + /** + * Undefined address type. + */ + AS_ENDPOINT_ADDR_TYPE_UNDEF, + /** + * IPv4 address. + */ + AS_ENDPOINT_ADDR_TYPE_IPv4, + /** + * IPv6 address. + */ + AS_ENDPOINT_ADDR_TYPE_IPv6, + /** + * Sentinel value. + */ + AS_ENDPOINT_ADDR_TYPE_SENTINEL +} as_endpoint_addr_type; + +/** + * An endpoint definition. + */ +typedef struct as_endpoint_s +{ + /** + * Bit field of capabilities. Currently carries only the TLS enabled flag. + */ + uint8_t capabilities; + + /** + * The type of the address. + */ + uint8_t addr_type; + + /** + * The endpoint port. + */ + uint16_t port; + + /** + * The network formatted and ordered IPv4 / IPv6 address (or string if + * we decide to support DNS names). The size of this field depends on + * the address type. + */ + uint8_t addr[]; +}__attribute__((__packed__)) as_endpoint; + +/** + * A list of endpoints. + */ +typedef struct as_endpoint_list_s +{ + /** + * The number of endpoints contained in the list. Max of 255. + */ + uint8_t n_endpoints; + + /** + * The list of endpoints. + */ + as_endpoint endpoints[]; +}__attribute__((__packed__)) as_endpoint_list; + +/** + * Iterate function for iterating over endpoints in an endpoint list. + * @param endpoint current endpoint in the iteration. + * @param udata udata passed through from the invoker of the iterate function. + */ +typedef void + (*as_endpoint_iterate_fn)(const as_endpoint* endpoint, void* udata); + +/** + * Filter function for endpoints in an endpoint list. + * @param endpoint current endpoint in the iteration. + * @param udata udata passed through from the invoker of the filter function. + * @return should return true if this endpoint passes the filter, false if it + * fails the filter. + */ +typedef bool + (*as_endpoint_filter_fn)(const as_endpoint* endpoint, void* udata); + +/** + * Get the size of an endpoint. Accounts for variable size of the address field. + * @return the size of the endpoint. Zero if the endpoint address is + * invalid. + */ +size_t +as_endpoint_sizeof(const as_endpoint* endpoint); + +/** + * Enable a capability on an endpoint given its mask. + * @param endpoint the endpoint.
 + * @param capability_mask the capability mask. + */ +void +as_endpoint_capability_enable(as_endpoint* endpoint, uint8_t capability_mask); + +/** + * Disable a capability on an endpoint given its mask. + * @param endpoint the endpoint. + * @param capability_mask the capability mask. + */ +void +as_endpoint_capability_disable(as_endpoint* endpoint, uint8_t capability_mask); + +/** + * Connect to an endpoint. + * + * @param endpoint the peer endpoint to connect to. + * @param timeout the overall connect timeout. + * @param sock (output) will be populated if the connection is successful. + * @return 0 on success, -1 on failure. + */ +int +as_endpoint_connect(const as_endpoint* endpoint, int32_t timeout, cf_socket* sock); + +/** + * Connect to the best matching endpoint in the endpoint list. + * + * @param endpoint_list the list of endpoints. + * @param filter_fn filter function to discard incompatible endpoints. Can be + * NULL. + * @param filter_udata udata passed on as is to the filter function. + * @param timeout the overall connect timeout. + * @param sock (output) will be populated if the connection is successful. + * @return the connected endpoint on success, NULL if no endpoint could be + * connected. + */ +const as_endpoint* +as_endpoint_connect_any(const as_endpoint_list* endpoint_list, + as_endpoint_filter_fn filter_fn, void* filter_udata, int32_t timeout, cf_socket* sock); +/** + * Convert a socket configuration to an endpoint, in place. + * @param src the source socket configuration. + * @param endpoint the destination endpoint to fill. + */ +void +as_endpoint_from_sock_cfg_fill(const cf_sock_cfg* src, as_endpoint* endpoint); + +/** + * Convert a socket configuration to an endpoint. + * @return a heap allocated, converted endpoint. Should be freed using cf_free + * once the endpoint is no longer needed. + */ +as_endpoint* +as_endpoint_from_sock_cfg(const cf_sock_cfg* src); + +/** + * Convert an endpoint to a cf_sock_addr. + * @param endpoint the source endpoint. + * @param sock_addr the target socket address. + */ +int +as_endpoint_to_sock_addr(const as_endpoint* endpoint, cf_sock_addr* sock_addr); + +/** + * Indicates if an endpoint supports listed capabilities. + * @return true if the endpoint supports the input capability. + */ +bool +as_endpoint_capability_is_supported(const as_endpoint* endpoint, uint8_t capability_mask); + +/** + * Iterate over endpoints in an endpoint list and invoke the iterate function + * for each endpoint. + * @param endpoint_list the endpoint list. + * @param iterate_fn the iterate function invoked for each endpoint in the list. + * @param udata passed as is to the iterate function. Useful for getting results + * out of the iteration. + */ +void +as_endpoint_list_iterate(const as_endpoint_list* endpoint_list, as_endpoint_iterate_fn iterate_fn, + void* udata); + +/** + * Return the in memory size in bytes of the endpoint list. + * @param endpoint_list the endpoint list. + * @param size (output) the size of the list on success. + * @return 0 on successful size calculation, -1 otherwise. + */ +int +as_endpoint_list_sizeof(const as_endpoint_list* endpoint_list, size_t* size); + +/** + * Return the in memory size in bytes of the endpoint list, but abort if the + * size of the read exceeds the input size. + * @param endpoint_list the endpoint list. + * @param size (output) the size of the list on success.
 + * @param size_max the maximum size until which parsing will be attempted. + * @return 0 on successful size calculation, -1 otherwise. + */ +int +as_endpoint_list_nsizeof(const as_endpoint_list* endpoint_list, size_t* size, size_t size_max); + +/** + * Convert a server configuration to an endpoint list in place into the + * destination endpoint list. + * @param serv_cfg source server configuration. + * @param endpoint_list destination endpoint list. + */ +void +as_endpoint_list_from_serv_cfg_fill(const cf_serv_cfg* serv_cfg, as_endpoint_list* endpoint_list); + +/** + * Convert a server configuration to an endpoint list. + * @param serv_cfg server configuration. + * @return a heap allocated endpoint list. Should be freed using cf_free + * once the endpoint is no longer needed. + */ +as_endpoint_list* +as_endpoint_list_from_serv_cfg(const cf_serv_cfg* serv_cfg); + +/** + * Compare two endpoint lists for equality. + * @param list1 the first list. NULL allowed. + * @param list2 the second list. NULL allowed. + * @return true iff the lists are equal, false otherwise. + */ +bool +as_endpoint_lists_are_equal(const as_endpoint_list* list1, const as_endpoint_list* list2); + +/** + * Check if two lists overlap in at least one endpoint. + * @param list1 the first list. NULL allowed. + * @param list2 the second list. NULL allowed. + * @param ignore_capabilities set to true if the overlap match should ignore + * node capabilities, false if capabilities should also be matched. + * @return true iff the lists overlap, false otherwise. + */ +bool +as_endpoint_lists_are_overlapping(const as_endpoint_list* list1, const as_endpoint_list* list2, + bool ignore_capabilities); + +/** + * Convert an endpoint list to a string. + * @param endpoint_list the input list. NULL allowed. + * @param buffer the output buffer. + * @param buffer_capacity the capacity of the output buffer. + * @return the number of characters printed (excluding the null byte used to + * end output to strings) + */ +int +as_endpoint_list_to_string(const as_endpoint_list* endpoint_list, char* buffer, + size_t buffer_capacity); + +/** + * Convert an endpoint list to a string matching capabilities. + * @param endpoint_list the input list. NULL allowed. + * @param buffer the output buffer. + * @param buffer_capacity the capacity of the output buffer. + * @param capability_mask specifies which bit to match. + * @param capabilities specifies capabilities to be matched for. + * @return the number of characters printed (excluding the null byte used to + * end output to strings) + */ +int +as_endpoint_list_to_string_match_capabilities( + const as_endpoint_list* endpoint_list, char* buffer, + size_t buffer_capacity, uint8_t capability_mask, uint8_t capabilities); + +/** + * Populate dyn buf with endpoints info. + * @param endpoint_list the input list. NULL allowed. + * @param db the dynamic buffer. + */ +void +as_endpoint_list_info(const as_endpoint_list* endpoint_list, cf_dyn_buf* db); diff --git a/as/include/fabric/exchange.h b/as/include/fabric/exchange.h new file mode 100644 index 00000000..c24cd34b --- /dev/null +++ b/as/include/fabric/exchange.h @@ -0,0 +1,158 @@ +/* + * exchange.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements.
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_vector.h" + +#include "dynbuf.h" +#include "node.h" + +/* + * ---------------------------------------------------------------------------- + * Constants + * ---------------------------------------------------------------------------- + */ + +/** + * Number of quantum intervals in orphan state after which client transactions + * will be blocked. + */ +#define AS_EXCHANGE_REVERT_ORPHAN_INTERVALS 5 + +/* + * ---------------------------------------------------------------------------- + * Typedefs. + * ---------------------------------------------------------------------------- + */ + +/** + * Exchange event raised for every well-formed cluster change, after exchange + * concludes successfully. + */ +typedef struct as_exchange_cluster_changed_event_s +{ + /** + * The new cluster key. + */ + uint64_t cluster_key; + + /** + * The new cluster size. + */ + uint32_t cluster_size; + + /** + * The new succession list. + */ + cf_node* succession; +} as_exchange_cluster_changed_event; + +/** + * Cluster change event call back function for cluster changed event listeners. + */ +typedef void +(*as_exchange_cluster_changed_cb)( + const as_exchange_cluster_changed_event* event, void* udata); + +/* + * ---------------------------------------------------------------------------- + * Public API. + * ---------------------------------------------------------------------------- + */ +/** + * Initialize exchange subsystem. + */ +void +as_exchange_init(); + +/** + * Start exchange subsystem. + */ +void +as_exchange_start(); + +/** + * Stop exchange subsystem. + */ +void +as_exchange_stop(); + +/** + * Register to receive cluster-changed events. + * TODO - may replace with simple static list someday. + */ +void +as_exchange_register_listener(as_exchange_cluster_changed_cb cb, void* udata); + +/** + * Dump exchange state to log. + */ +void +as_exchange_dump(bool verbose); + +/** + * Member-access method. + */ +uint64_t +as_exchange_cluster_key(); + +/** + * Member-access method. + */ +uint32_t +as_exchange_cluster_size(); + +/** + * Copy over the committed succession list. + * Ensure the input vector has enough capacity. + */ +void +as_exchange_succession(cf_vector* succession); + +/** + * Return the committed succession list as a string in a dyn-buf. + */ +void +as_exchange_info_get_succession(cf_dyn_buf* db); + +/** + * Member-access method. + */ +cf_node +as_exchange_principal(); + +/** + * Lock before setting or getting exchanged info from non-exchange thread. + */ +void +as_exchange_info_lock(); + +/** + * Unlock after setting or getting exchanged info from non-exchange thread. 
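 + * + * Typical read pattern (an illustrative sketch only): + * @code + * as_exchange_info_lock(); + * uint64_t cluster_key = as_exchange_cluster_key(); + * uint32_t cluster_size = as_exchange_cluster_size(); + * as_exchange_info_unlock(); + * @endcode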
+ */ +void +as_exchange_info_unlock(); diff --git a/as/include/fabric/fabric.h b/as/include/fabric/fabric.h new file mode 100644 index 00000000..20734fe5 --- /dev/null +++ b/as/include/fabric/fabric.h @@ -0,0 +1,129 @@ +/* + * fabric.h + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include + +#include "msg.h" +#include "node.h" +#include "socket.h" +#include "tls.h" + + +//========================================================== +// Forward declarations. +// + +struct as_endpoint_list_s; +struct as_hb_plugin_node_data_s; + + +//========================================================== +// Typedefs & constants. +// + +#define AS_FABRIC_SUCCESS (0) +#define AS_FABRIC_ERR_UNKNOWN (-1) // used by transact +#define AS_FABRIC_ERR_NO_NODE (-3) +#define AS_FABRIC_ERR_TIMEOUT (-6) // used by transact + +typedef enum { + AS_FABRIC_CHANNEL_RW = 0, // duplicate resolution and replica writes + AS_FABRIC_CHANNEL_CTRL = 1, // clustering, migration ctrl and services info + AS_FABRIC_CHANNEL_BULK = 2, // migrate records + AS_FABRIC_CHANNEL_META = 3, // smd + + AS_FABRIC_N_CHANNELS +} as_fabric_channel; + +#define MAX_FABRIC_CHANNEL_THREADS 128 +#define MAX_FABRIC_CHANNEL_SOCKETS 128 + +typedef struct fabric_rate_s { + uint64_t s_bytes[AS_FABRIC_N_CHANNELS]; + uint64_t r_bytes[AS_FABRIC_N_CHANNELS]; +} fabric_rate; + +typedef int (*as_fabric_msg_fn) (cf_node node_id, msg *m, void *udata); +typedef int (*as_fabric_transact_recv_fn) (cf_node node_id, msg *m, void *transact_data, void *udata); +typedef int (*as_fabric_transact_complete_fn) (msg *rsp, void *udata, int err); + + +//========================================================== +// Globals. +// + +extern cf_serv_cfg g_fabric_bind; +extern cf_tls_info *g_fabric_tls; + + +//========================================================== +// Public API. 
+// + +//------------------------------------------------ +// msg +// + +msg *as_fabric_msg_get(msg_type type); +void as_fabric_msg_put(msg *m); +void as_fabric_msg_queue_dump(void); + +//------------------------------------------------ +// as_fabric +// + +int as_fabric_init(void); +int as_fabric_start(void); +void as_fabric_set_recv_threads(as_fabric_channel channel, uint32_t count); +int as_fabric_send(cf_node node_id, msg *m, as_fabric_channel channel); +int as_fabric_send_list(const cf_node *nodes, uint32_t node_count, msg *m, as_fabric_channel channel); +void as_fabric_register_msg_fn(msg_type type, const msg_template *mt, size_t mt_sz, size_t scratch_sz, as_fabric_msg_fn msg_cb, void *msg_udata); +void as_fabric_info_peer_endpoints_get(cf_dyn_buf *db); +bool as_fabric_is_published_endpoint_list(const struct as_endpoint_list_s *list); +struct as_endpoint_list_s *as_fabric_hb_plugin_get_endpoint_list(struct as_hb_plugin_node_data_s *plugin_data); +void as_fabric_rate_capture(fabric_rate *rate); +void as_fabric_dump(bool verbose); + + +//============================================================================== +// Fabric transact. +// + +// Used to send a request, and receive a response, reliably. This is guaranteed +// to NEVER return an error directly, but might call the callback function +// saying that we ran out of time or had some other error. +// +// Requires field 0 be a uint64_t which will be used by the fabric system - an +// unknown error will be thrown if this is not true. + +void as_fabric_transact_init(void); +void as_fabric_transact_start(cf_node node_id, msg *m, int timeout_ms, as_fabric_transact_complete_fn cb, void *userdata); +int as_fabric_transact_register(msg_type type, const msg_template *mt, size_t mt_sz, size_t scratch_sz, as_fabric_transact_recv_fn cb, void *udata); +int as_fabric_transact_reply(msg *reply_msg, void *transact_data); diff --git a/as/include/fabric/hb.h b/as/include/fabric/hb.h new file mode 100644 index 00000000..3462ef5b --- /dev/null +++ b/as/include/fabric/hb.h @@ -0,0 +1,473 @@ +/* + * hb.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_vector.h" + +#include "msg.h" +#include "socket.h" +#include "tls.h" + +#include "fabric/hlc.h" + +/** + * Maximum number of nodes in a cluster. + */ +#ifndef AS_CLUSTER_SZ +#define AS_CLUSTER_SZ 8 +#endif + +/** + * Minimum heartbeat interval. + */ +#define AS_HB_TX_INTERVAL_MS_MIN 50 + +/** + * Maximum heartbeat interval. (10 mins) + */ +#define AS_HB_TX_INTERVAL_MS_MAX 600000 + +/** + * Minimum max-intervals-missed. + */ +#define AS_HB_MAX_INTERVALS_MISSED_MIN 3 + +/** + * Heartbeat modes. 
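 + * + * (Mesh typically runs over point-to-point TCP connections bootstrapped from + * configured seed nodes; multicast uses UDP multicast groups.)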
 + */ +typedef enum as_hb_mode_enum +{ + AS_HB_MODE_UNDEF, + AS_HB_MODE_MULTICAST, + AS_HB_MODE_MESH +} as_hb_mode; + +/** + * Heartbeat protocol versions. + */ +typedef enum as_hb_protocol_enum +{ + AS_HB_PROTOCOL_UNDEF, + AS_HB_PROTOCOL_NONE, + AS_HB_PROTOCOL_RESET, + AS_HB_PROTOCOL_V3 +} as_hb_protocol; + +/** + * Events published by the heartbeat subsystem. + */ +typedef enum +{ + AS_HB_NODE_ARRIVE, + AS_HB_NODE_DEPART, + AS_HB_NODE_ADJACENCY_CHANGED, + AS_HB_NODE_EVENT_SENTINEL +} as_hb_event_type; + +/** + * A plugin that is publishing and receiving data via the heartbeat subsystem. + * The heartbeat outgoing message buffer will be populated and parsed in the + * order of this enum. + */ +typedef enum +{ + /** + * The heartbeat subsystem itself. + */ + AS_HB_PLUGIN_HB, + /** + * The older clustering subsystem. + * TODO: Use only one plugin id and register differently based on the + * clustering version. + */ + AS_HB_PLUGIN_FABRIC, + /** + * The clustering subsystem. + */ + AS_HB_PLUGIN_CLUSTERING, + /** + * The skew monitor. + */ + AS_HB_PLUGIN_SKEW_MONITOR, + /** + * Dummy sentinel enum value. Should be the last. + */ + AS_HB_PLUGIN_SENTINEL +} as_hb_plugin_id; + +/** + * The fields in the heartbeat message. + * New field additions only at the end. + */ +typedef enum +{ + /** + * HB protocol identifier. + */ + AS_HB_MSG_ID, + + /** + * HB subsystem message type. + */ + AS_HB_MSG_TYPE, + + /** + * HB message source. + */ + AS_HB_MSG_NODE, + + /** + * Cluster Name. + */ + AS_HB_MSG_CLUSTER_NAME, + + /** + * HLC timestamp. + */ + AS_HB_MSG_HLC_TIMESTAMP, + + /** + * Heartbeat endpoints advertised by this node. + */ + AS_HB_MSG_ENDPOINTS, + + /** + * Payload for compressed messages. + */ + AS_HB_MSG_COMPRESSED_PAYLOAD, + + /** + * Mesh info request. + */ + AS_HB_MSG_INFO_REQUEST, + + /** + * Mesh info reply. + */ + AS_HB_MSG_INFO_REPLY, + + /* + * ---- Plugin data fields. Potentially extensible ---- + */ + /** + * Fabric data advertised by this node. Placed close to hb endpoints to + * help compression, because it would most likely match with hb endpoints. + */ + AS_HB_MSG_FABRIC_DATA, + + /** + * Valid only for pulse messages, has adjacency list and clusterid. + */ + AS_HB_MSG_HB_DATA, + + /** + * Contains the cluster key and succession list. + */ + AS_HB_MSG_PAXOS_DATA, + + /** + * Local physical clock monotonic timestamp for when the message was sent. + */ + AS_HB_MSG_SKEW_MONITOR_DATA +} as_hb_msg_fields; + +/** + * Heartbeat subsystem configuration. + */ +typedef struct as_hb_config_s +{ + /** + * Mode of operation. Mesh or Multicast for now. + */ + as_hb_mode mode; + + /** + * Binding interface config. + */ + cf_serv_cfg bind_cfg; + + /** + * Global TLS configuration. + */ + + cf_tls_info *tls; + + /** + * Multicast mode only config for multicast groups. + */ + cf_mserv_cfg multicast_group_cfg; + + /** + * The interval at which heartbeat pulse messages are sent in milliseconds. + */ + uint32_t tx_interval; + + /** + * Max number of missed heartbeat intervals after which a node is considered + * expired. + */ + uint32_t max_intervals_missed; + + /** + * The ttl for multicast packets. Set to zero for default TTL. + */ + uint8_t multicast_ttl; + + /** + * HB protocol to use. + */ + as_hb_protocol protocol; + + /** + * Set to a value > 0 to override the MTU read from the network interface. + */ + uint32_t override_mtu; + + /** + * Mesh seeds from config file. + * Only used during config parsing and initialization.
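 + * + * Illustrative config sketch (heartbeat stanza; the address and port shown + * are hypothetical): + * @code + * heartbeat { + *     mode mesh + *     mesh-seed-address-port 10.0.0.1 3002 + * } + * @endcode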
 + */ + char* mesh_seed_addrs[AS_CLUSTER_SZ]; + int mesh_seed_ports[AS_CLUSTER_SZ]; + bool mesh_seed_tls[AS_CLUSTER_SZ]; + +} as_hb_config; + +/** + * Heartbeat published event structure. + */ +typedef struct as_hb_event_node_s +{ + /** + * The type of the event. + */ + as_hb_event_type evt; + + /** + * The event nodeid. + */ + cf_node nodeid; + + /** + * The monotonic timestamp when this event happened. + */ + cf_clock event_time; + + /** + * The monotonic timestamp when this event was detected. Will differ from + * event_time for node depart events. + */ + cf_clock event_detected_time; +} as_hb_event_node; + +/** + * A hook to allow a plugin to publish its data as part of the heartbeat + * message. + */ +typedef void (*as_hb_plugin_set_data_fn)(msg* hb_message); + +/** + * Data stored for an adjacent node for a plugin. + */ +typedef struct as_hb_plugin_node_data_s +{ + /** + * Heap allocated node specific data blob for this plugin. + */ + void* data; + + /** + * The size of the stored data. + */ + size_t data_size; + + /** + * The capacity of the allocated data structure. + */ + size_t data_capacity; +} as_hb_plugin_node_data; + +/** + * A function to parse plugin data for a node into an in memory object. Should + * be fast and never acquire locks. + * + * The parameter plugin_data->data will always be a pointer to a previously + * allocated memory location. plugin_data->data_capacity will indicate the + * capacity of this memory. Implementations should reuse this previously + * allocated data blob to avoid the overhead of heap allocations. If the + * current data capacity is less than the new data size, invoke cf_realloc to + * get a new block for current data and update plugin_data->data and + * plugin_data->data_capacity accordingly. + * + * This function should always set data_size correctly before returning. Set + * plugin_data->data_size = 0 for no plugin data. + * + * @param hb_message the heartbeat message. + * @param source the source node. + * @param plugin_data (output) plugin data structure to output parsed data. + */ +typedef void (*as_hb_plugin_parse_data_fn)(msg* hb_message, cf_node source, as_hb_plugin_node_data* plugin_data); + +/** + * A listener for detecting changes to this plugin's data for a particular node. + * Does not supply old and new values of the data, because that does not + * currently seem to be required, and to keep the implementation simple. + * + * @param nodeid the node whose plugin data changed. + */ +typedef void (*as_hb_plugin_data_changed_fn)(cf_node nodeid); + +/** + * A plugin allows a module to publish and read data via heartbeat pulse + * messages. + */ +typedef struct as_hb_plugin_s +{ + /** + * The plugin id. + */ + as_hb_plugin_id id; + + /** + * Fixed plugin data size on wire. + */ + size_t wire_size_fixed; + + /** + * Additional plugin data size on wire per node in the adjacency list. + */ + size_t wire_size_per_node; + + /** + * The function which adds this plugin's data to the pulse message. Can be + * NULL. This function can hold the plugin module's locks. + */ + as_hb_plugin_set_data_fn set_fn; + + /** + * A function that parses and reads this plugin's data from an incoming + * message. Can be NULL. This function SHOULD NOT hold the plugin module's + * locks to prevent deadlocks. + */ + as_hb_plugin_parse_data_fn parse_fn; + + /** + * A function invoked when plugin data for a particular node changed. + * Can be NULL. This function can hold the plugin module's locks.
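 + * + * Plugin registration sketch (illustrative; the my_* callbacks are + * hypothetical): + * @code + * as_hb_plugin p = { 0 }; + * p.id = AS_HB_PLUGIN_CLUSTERING; + * p.set_fn = my_set_fn; + * p.parse_fn = my_parse_fn; + * p.change_listener = my_change_fn; + * as_hb_plugin_register(&p); + * @endcode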
 + */ + as_hb_plugin_data_changed_fn change_listener; +} as_hb_plugin; + +/* + * ----------------------------------------------------------------- + * HB subsystem public API + * ----------------------------------------------------------------- + */ + +void as_hb_init(); + +void as_hb_start(); + +void as_hb_shutdown(); + +bool as_hb_self_is_duplicate(); + +bool as_hb_node_is_adjacent(cf_node nodeid); + +typedef void (*as_hb_event_fn)(int nevents, as_hb_event_node* events, void* udata); + +void as_hb_register_listener(as_hb_event_fn event_callback, void* udata); + +void as_hb_dump(bool verbose); + +as_hb_protocol as_hb_protocol_get(); + +int as_hb_protocol_set(as_hb_protocol protocol); + +uint32_t as_hb_node_timeout_get(); + +void as_hb_override_mtu_set(int mtu); + +uint32_t as_hb_tx_interval_get(); + +int as_hb_tx_interval_set(uint32_t new_interval); + +int as_hb_max_intervals_missed_set(uint32_t new_max); + +bool as_hb_max_cluster_size_isvalid(uint32_t max_cluster_size); + +/* + * ----------------------------------------------------------------- + * HB plugin subsystem public API. + * ----------------------------------------------------------------- + */ + +void as_hb_plugin_register(as_hb_plugin* plugin); + +bool as_hb_is_alive(cf_node nodeid); + +void as_hb_config_validate(); + +void as_hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict); + +int as_hb_plugin_data_get(cf_node nodeid, as_hb_plugin_id plugin, as_hb_plugin_node_data* plugin_data, as_hlc_msg_timestamp* msg_hlc_ts, cf_clock* recv_monotonic_ts); + +typedef void (*as_hb_plugin_data_iterate_fn)(cf_node nodeid, void* plugin_data, size_t plugin_data_size, cf_clock recv_monotonic_ts, as_hlc_msg_timestamp* msg_hlc_ts, void* udata); + +void as_hb_plugin_data_iterate(cf_vector* nodes, as_hb_plugin_id plugin, as_hb_plugin_data_iterate_fn iterate_fn, void* udata); + +void as_hb_plugin_data_iterate_all(as_hb_plugin_id plugin, as_hb_plugin_data_iterate_fn iterate_fn, void* udata); + +/* + * ----------------------------------------------------------------- + * Info public API + * ----------------------------------------------------------------- + */ + +void as_hb_info_config_get(cf_dyn_buf* db); + +void as_hb_info_endpoints_get(cf_dyn_buf* db); + +void as_hb_info_listen_addr_get(as_hb_mode* mode, char* addr_port, size_t addr_port_capacity); + +void as_hb_info_duplicates_get(cf_dyn_buf* db); + +/* + * ----------------------------------------------------------------- + * Mesh mode public API + * ----------------------------------------------------------------- + */ + +int as_hb_mesh_tip(char* host, int port, bool tls); + +int as_hb_mesh_tip_clear(char* host, int port); + +int as_hb_mesh_tip_clear_all(uint32_t* cleared); diff --git a/as/include/fabric/hlc.h b/as/include/fabric/hlc.h new file mode 100644 index 00000000..4bb7fbdf --- /dev/null +++ b/as/include/fabric/hlc.h @@ -0,0 +1,160 @@ +/* + * hlc.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version.
diff --git a/as/include/fabric/hlc.h b/as/include/fabric/hlc.h
new file mode 100644
index 00000000..4bb7fbdf
--- /dev/null
+++ b/as/include/fabric/hlc.h
@@ -0,0 +1,160 @@
+/*
+ * hlc.h
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * Hybrid logical clock as described in
+ * http://www.cse.buffalo.edu/tech-reports/2014-04.pdf.
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "citrusleaf/cf_clock.h"
+
+#include "node.h"
+
+/**
+ * A hybrid logical clock timestamp.
+ *
+ * The most significant 48 bits represent the physical component of the hlc and
+ * the least significant 16 bits represent the logical component.
+ */
+typedef uint64_t as_hlc_timestamp;
+
+/**
+ * Timestamp for a message receive event.
+ */
+typedef struct as_hlc_msg_timestamp_s
+{
+	/**
+	 * The sender's HLC timestamp at the time the message was sent.
+	 */
+	as_hlc_timestamp send_ts;
+	/**
+	 * Local HLC timestamp at message receipt.
+	 */
+	as_hlc_timestamp recv_ts;
+} as_hlc_msg_timestamp;
+
+/**
+ * Result of ordering two hlc timestamps.
+ */
+typedef enum as_hlc_timestamp_order_e {
+	/**
+	 * The event with the first timestamp happened before.
+	 */
+	AS_HLC_HAPPENS_BEFORE,
+	/**
+	 * The event with the first timestamp happened after.
+	 */
+	AS_HLC_HAPPENS_AFTER,
+	/**
+	 * The order of the timestamps is indeterminate.
+	 */
+	AS_HLC_ORDER_INDETERMINATE
+} as_hlc_timestamp_order;
+
+/*----------------------------------------------------------------------------
+ * Public API.
+ *----------------------------------------------------------------------------*/
+/**
+ * Initialize the hybrid logical clock.
+ */
+void as_hlc_init();
+
+/**
+ * Return an hlc timestamp representing the hlc time "now".
+ */
+as_hlc_timestamp as_hlc_timestamp_now();
+
+/**
+ * Return the physical component of an hlc timestamp.
+ * @param hlc_ts the hybrid logical clock timestamp.
+ */
+cf_clock as_hlc_physical_ts_get(as_hlc_timestamp hlc_ts);
+
+/**
+ * Update the HLC on receipt of a remote message. The notion is to adjust this
+ * node's hlc to ensure the receive hlc ts > the send hlc ts.
+ *
+ * @param source for debugging and tracking only.
+ * @param send_ts the hlc timestamp when this message was sent.
+ * @param msg_ts (output) the message receive timestamp which will be
+ * populated. Can be NULL, in which case it will be ignored.
+ */
+void as_hlc_timestamp_update(cf_node source, as_hlc_timestamp send_ts,
+		as_hlc_msg_timestamp* msg_ts);
+
+/**
+ * Return the difference in milliseconds between two hlc timestamps. Note this
+ * difference may be greater than or equal to the physical wall clock
+ * difference, because the HLC can have non-linear jumps whenever the clock is
+ * adjusted. The difference should be used as an estimate rather than an
+ * absolute difference. For example, use the difference to check that the time
+ * difference is at least some number of milliseconds. However, do not use this
+ * for interval statistics or to check that the difference in time is at most
+ * some number of milliseconds.
+ *
+ * @param ts1 the first timestamp.
+ * @param ts2 the second timestamp.
+ * @return ts1 - ts2 in milliseconds.
+ */
+int64_t as_hlc_timestamp_diff_ms(as_hlc_timestamp ts1, as_hlc_timestamp ts2);
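Since the typedef above fixes a 48-bit physical / 16-bit logical split, the two components separate with plain shifts and masks. An illustrative sketch, not part of the header:

// Illustrative decomposition of an as_hlc_timestamp.
static inline uint64_t
hlc_physical_component(as_hlc_timestamp hlc_ts)
{
	return hlc_ts >> 16; // most significant 48 bits
}

static inline uint16_t
hlc_logical_component(as_hlc_timestamp hlc_ts)
{
	return (uint16_t)(hlc_ts & 0xFFFF); // least significant 16 bits
}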
+
+/**
+ * Orders a local timestamp and a remote message send timestamp.
+ *
+ * @param local_ts the local timestamp.
+ * @param msg_ts message receive timestamp containing the remote send and the
+ * local receive timestamp.
+ * @return the order between the local and the message timestamp.
+ */
+as_hlc_timestamp_order as_hlc_send_timestamp_order(
+		as_hlc_timestamp local_ts, as_hlc_msg_timestamp* msg_ts);
+
+/**
+ * Orders two timestamps generated by the same node / process.
+ *
+ * @param ts1 the first timestamp.
+ * @param ts2 the second timestamp.
+ * @return AS_HLC_HAPPENS_BEFORE if ts1 happens before ts2 else
+ * AS_HLC_HAPPENS_AFTER if ts1 happens after ts2 else
+ * AS_HLC_ORDER_INDETERMINATE.
+ */
+as_hlc_timestamp_order as_hlc_timestamp_order_get(as_hlc_timestamp ts1,
+		as_hlc_timestamp ts2);
+
+/**
+ * Subtract milliseconds worth of time from the timestamp.
+ * @param timestamp the input timestamp.
+ * @param ms the number of milliseconds to subtract.
+ */
+as_hlc_timestamp as_hlc_timestamp_subtract_ms(as_hlc_timestamp timestamp,
+		int ms);
+
+/**
+ * Dump some debugging information to the logs.
+ */
+void as_hlc_dump(bool verbose);
diff --git a/as/include/fabric/meta_batch.h b/as/include/fabric/meta_batch.h
new file mode 100644
index 00000000..1a895f5c
--- /dev/null
+++ b/as/include/fabric/meta_batch.h
@@ -0,0 +1,42 @@
+/*
+ * meta_batch.h
+ *
+ * Copyright (C) 2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Forward declarations.
+//
+
+struct meta_in_q_s;
+struct meta_out_q_s;
+
+
+//==========================================================
+// Public API.
+//
+
+struct meta_in_q_s *meta_in_q_create();
+void meta_in_q_destroy(struct meta_in_q_s *iq);
+void meta_in_q_rejected(struct meta_in_q_s *iq);
+
+struct meta_out_q_s *meta_out_q_create();
+void meta_out_q_destroy(struct meta_out_q_s *oq);
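The meta queues are deliberately opaque here - only the migration code sees their internals. A hedged sketch of the create / reject / destroy lifecycle on the incoming side; the wrapper function and start_refused flag are illustrative:

// Illustrative lifecycle only - real ownership lives in the migration code.
static void
meta_in_q_lifecycle_example(bool start_refused)
{
	struct meta_in_q_s *iq = meta_in_q_create();

	if (start_refused) {
		// The immigrating node refused the start - flag the queue.
		meta_in_q_rejected(iq);
	}

	meta_in_q_destroy(iq);
}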
diff --git a/as/include/fabric/migrate.h b/as/include/fabric/migrate.h
new file mode 100644
index 00000000..80caa0d2
--- /dev/null
+++ b/as/include/fabric/migrate.h
@@ -0,0 +1,215 @@
+/*
+ * migrate.h
+ *
+ * Copyright (C) 2008-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+#include "citrusleaf/cf_rchash.h"
+
+#include "msg.h"
+#include "node.h"
+#include "shash.h"
+
+#include "fabric/hb.h"
+#include "fabric/partition.h"
+#include "fabric/partition_balance.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+struct as_index_s;
+struct as_index_ref_s;
+struct as_namespace_s;
+struct as_remote_record_s;
+struct meta_in_q_s;
+struct meta_out_q_s;
+struct pb_task_s;
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+// For receiver-side migration flow-control.
+// TODO - move to namespace? Go even lower than 4?
+#define AS_MIGRATE_DEFAULT_MAX_NUM_INCOMING 4
+#define AS_MIGRATE_LIMIT_MAX_NUM_INCOMING 256
+
+// Maximum permissible number of migrate xmit threads.
+#define MAX_NUM_MIGRATE_XMIT_THREADS 100
+
+#define TX_FLAGS_NONE ((uint32_t) 0x0)
+#define TX_FLAGS_ACTING_MASTER ((uint32_t) 0x1)
+
+
+//==========================================================
+// Public API.
+//
+
+void as_migrate_init();
+void as_migrate_emigrate(const struct pb_task_s *task);
+void as_migrate_set_num_xmit_threads(uint32_t n_threads);
+void as_migrate_dump(bool verbose);
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+//
+
+typedef enum {
+	// These values go on the wire, so mind backward compatibility if changing.
+ MIG_FIELD_OP, + MIG_FIELD_UNUSED_1, + MIG_FIELD_EMIG_ID, + MIG_FIELD_NAMESPACE, + MIG_FIELD_PARTITION, + MIG_FIELD_DIGEST, + MIG_FIELD_GENERATION, + MIG_FIELD_RECORD, + MIG_FIELD_CLUSTER_KEY, + MIG_FIELD_UNUSED_9, + MIG_FIELD_VOID_TIME, + MIG_FIELD_UNUSED_11, + MIG_FIELD_UNUSED_12, + MIG_FIELD_INFO, + MIG_FIELD_UNUSED_14, + MIG_FIELD_UNUSED_15, + MIG_FIELD_UNUSED_16, + MIG_FIELD_UNUSED_17, + MIG_FIELD_UNUSED_18, + MIG_FIELD_LAST_UPDATE_TIME, + MIG_FIELD_FEATURES, + MIG_FIELD_UNUSED_21, + MIG_FIELD_META_RECORDS, + MIG_FIELD_META_SEQUENCE, + MIG_FIELD_META_SEQUENCE_FINAL, + MIG_FIELD_PARTITION_SIZE, + MIG_FIELD_SET_NAME, + MIG_FIELD_KEY, + MIG_FIELD_UNUSED_28, + MIG_FIELD_EMIG_INSERT_ID, + + NUM_MIG_FIELDS +} migrate_msg_fields; + +#define OPERATION_UNDEF 0 +#define OPERATION_INSERT 1 +#define OPERATION_INSERT_ACK 2 +#define OPERATION_START 3 +#define OPERATION_START_ACK_OK 4 +#define OPERATION_START_ACK_EAGAIN 5 +#define OPERATION_START_ACK_FAIL 6 +#define OPERATION_UNUSED_7 7 // deprecated +#define OPERATION_DONE 8 +#define OPERATION_DONE_ACK 9 +#define OPERATION_UNUSED_10 10 // deprecated +#define OPERATION_MERGE_META 11 +#define OPERATION_MERGE_META_ACK 12 +#define OPERATION_ALL_DONE 13 +#define OPERATION_ALL_DONE_ACK 14 + +#define MIG_INFO_UNUSED_1 0x0001 +#define MIG_INFO_UNUSED_2 0x0002 +#define MIG_INFO_UNREPLICATED 0x0004 // enterprise only +#define MIG_INFO_TOMBSTONE 0x0008 // enterprise only + +#define MIG_FEATURE_MERGE 0x00000001U +#define MIG_FEATURES_SEEN 0x80000000U // needed for backward compatibility +extern const uint32_t MY_MIG_FEATURES; + +typedef struct emigration_s { + cf_node dest; + uint64_t cluster_key; + uint32_t id; + pb_task_type type; + uint32_t tx_flags; + cf_atomic32 state; + bool aborted; + bool from_replica; + uint64_t wait_until_ms; + + cf_atomic32 bytes_emigrating; + cf_shash *reinsert_hash; + uint64_t insert_id; + cf_queue *ctrl_q; + struct meta_in_q_s *meta_q; + + as_partition_reservation rsv; +} emigration; + +typedef struct immigration_s { + cf_node src; + uint64_t cluster_key; + uint32_t pid; + + cf_atomic32 done_recv; // flag - 0 if not yet received, atomic counter for receives + uint64_t start_recv_ms; // time the first START event was received + uint64_t done_recv_ms; // time the first DONE event was received + + uint32_t emig_id; + struct meta_out_q_s *meta_q; + + as_migrate_result start_result; + uint32_t features; + struct as_namespace_s *ns; // for statistics only + + as_partition_reservation rsv; +} immigration; + +typedef struct immigration_hkey_s { + cf_node src; + uint32_t emig_id; +} __attribute__((__packed__)) immigration_hkey; + + +// Globals. +extern cf_rchash *g_emigration_hash; +extern cf_rchash *g_immigration_hash; + + +// Emigration, immigration, & pickled record destructors. +void emigration_release(emigration *emig); +void immigration_release(immigration *immig); + +// Emigration. +bool should_emigrate_record(emigration *emig, struct as_index_ref_s *r_ref); +uint32_t emigration_pack_info(const emigration *emig, const struct as_index_s *r); + +// Migrate fabric message handling. +void emigration_handle_meta_batch_request(cf_node src, msg *m); +bool immigration_ignore_pickle(const uint8_t *buf, uint32_t info); +void immigration_init_repl_state(struct as_remote_record_s* rr, uint32_t info); +void immigration_handle_meta_batch_ack(cf_node src, msg *m); + +// Meta sender. 
+bool immigration_start_meta_sender(immigration *immig, uint32_t emig_features, uint64_t emig_n_recs); diff --git a/as/include/fabric/partition.h b/as/include/fabric/partition.h new file mode 100644 index 00000000..f8e59189 --- /dev/null +++ b/as/include/fabric/partition.h @@ -0,0 +1,285 @@ +/* + * partition.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_digest.h" + +#include "dynbuf.h" +#include "node.h" + +#include "base/cfg.h" +#include "fabric/hb.h" + + +//========================================================== +// Forward declarations. +// + +struct as_index_tree_s; +struct as_namespace_s; + + +//========================================================== +// Typedefs & constants. +// + +#define AS_PARTITIONS 4096 +#define AS_PARTITION_MASK (AS_PARTITIONS - 1) + +#define VERSION_FAMILY_BITS 4 +#define VERSION_FAMILY_UNIQUE ((1 << VERSION_FAMILY_BITS) - 1) +#define AS_PARTITION_N_FAMILIES VERSION_FAMILY_UNIQUE + +typedef struct as_partition_version_s { + uint64_t ckey:48; + uint64_t family:VERSION_FAMILY_BITS; + uint64_t unused:8; + uint64_t revived:1; // enterprise only + uint64_t master:1; + uint64_t subset:1; + uint64_t evade:1; +} as_partition_version; + +COMPILER_ASSERT(sizeof(as_partition_version) == sizeof(uint64_t)); + +typedef struct as_partition_version_string_s { + char s[19 + 1]; // format CCCCccccCCCC.F.mse - F may someday be 2 characters +} as_partition_version_string; + +typedef struct as_partition_s { + pthread_mutex_t lock; + + uint32_t id; + + struct as_index_tree_s* vp; + + cf_atomic64 n_tombstones; // relevant only for enterprise edition + cf_atomic64 max_void_time; // TODO - convert to 32-bit ... + + // Replica information. 
+ uint32_t n_nodes; // relevant only for enterprise edition + uint32_t n_replicas; + cf_node replicas[AS_CLUSTER_SZ]; + + // Rebalance & migration related: + + as_partition_version final_version; + as_partition_version version; + int pending_emigrations; + int pending_immigrations; + bool immigrators[AS_CLUSTER_SZ]; + + cf_node working_master; + + uint32_t n_dupl; + cf_node dupls[AS_CLUSTER_SZ]; + + uint32_t n_witnesses; + cf_node witnesses[AS_CLUSTER_SZ]; + + bool must_appeal; // relevant only for enterprise edition + + uint32_t regime; // relevant only for enterprise edition +} as_partition; + +typedef struct as_partition_reservation_s { + struct as_namespace_s* ns; + as_partition* p; + struct as_index_tree_s* tree; + uint32_t regime; + uint32_t n_dupl; + cf_node dupl_nodes[AS_CLUSTER_SZ]; +} as_partition_reservation; + +typedef struct repl_stats_s { + uint64_t n_master_objects; + uint64_t n_prole_objects; + uint64_t n_non_replica_objects; + uint64_t n_master_tombstones; + uint64_t n_prole_tombstones; + uint64_t n_non_replica_tombstones; +} repl_stats; + +#define CLIENT_BITMAP_BYTES ((AS_PARTITIONS + 7) / 8) +#define CLIENT_B64MAP_BYTES (((CLIENT_BITMAP_BYTES + 2) / 3) * 4) + +typedef struct client_replica_map_s { + pthread_mutex_t write_lock; + + volatile uint8_t bitmap[CLIENT_BITMAP_BYTES]; + volatile char b64map[CLIENT_B64MAP_BYTES]; +} client_replica_map; + +typedef enum { + AS_MIGRATE_OK, + AS_MIGRATE_FAIL, + AS_MIGRATE_AGAIN +} as_migrate_result; + + +//========================================================== +// Public API. +// + +void as_partition_init(struct as_namespace_s* ns, uint32_t pid); +void as_partition_shutdown(struct as_namespace_s* ns, uint32_t pid); + +void as_partition_isolate_version(const struct as_namespace_s* ns, as_partition* p); +int as_partition_check_source(const struct as_namespace_s* ns, as_partition* p, cf_node src, bool* from_replica); +void as_partition_freeze(as_partition* p); + +uint32_t as_partition_get_other_replicas(as_partition* p, cf_node* nv); + +cf_node as_partition_writable_node(struct as_namespace_s* ns, uint32_t pid); +cf_node as_partition_proxyee_redirect(struct as_namespace_s* ns, uint32_t pid); + +void as_partition_get_replicas_prole_str(cf_dyn_buf* db); // deprecate in "six months" +void as_partition_get_replicas_master_str(cf_dyn_buf* db); +void as_partition_get_replicas_all_str(cf_dyn_buf* db, bool include_regime); + +void as_partition_get_replica_stats(struct as_namespace_s* ns, repl_stats* p_stats); + +void as_partition_reserve(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv); +int as_partition_reserve_timeout(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv, int timeout_ms); +int as_partition_reserve_replica(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv); +int as_partition_reserve_write(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv, cf_node* node); +int as_partition_reserve_read(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv, bool would_dup_res, cf_node* node); +int as_partition_prereserve_query(struct as_namespace_s* ns, bool can_partition_query[], as_partition_reservation rsv[]); +int as_partition_reserve_query(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv); +int as_partition_reserve_xdr_read(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv); +void as_partition_reservation_copy(as_partition_reservation* dst, as_partition_reservation* src); + +void 
as_partition_release(as_partition_reservation* rsv); + +void as_partition_getinfo_str(cf_dyn_buf* db); + +// Use VERSION_AS_STRING() - see below. +static inline as_partition_version_string +as_partition_version_as_string(const as_partition_version* version) +{ + as_partition_version_string str; + + if (version->family == VERSION_FAMILY_UNIQUE) { + sprintf(str.s, "%012lx.U.%c%c%c", (uint64_t)version->ckey, + version->master == 0 ? '-' : 'm', + version->subset == 0 ? 'p' : 's', + version->evade == 0 ? '-' : 'e'); + } + else { + sprintf(str.s, "%012lx.%X.%c%c%c", (uint64_t)version->ckey, + (uint32_t)version->family, + version->master == 0 ? '-' : 'm', + version->subset == 0 ? 'p' : 's', + version->evade == 0 ? + (version->revived == 0 ? '-' : 'r') : 'e'); + } + + return str; +} + +static inline bool +as_partition_version_is_null(const as_partition_version* version) +{ + return *(uint64_t*)version == 0; +} + +static inline bool +as_partition_version_has_data(const as_partition_version* version) +{ + return version->ckey != 0; +} + +static inline bool +as_partition_version_same(const as_partition_version* v1, const as_partition_version* v2) +{ + return v1->ckey == v2->ckey && + v1->family == v2->family && + // Note - master flag not included in definition of "same". + v1->subset == v2->subset && + // Note - could probably exclude these too... + v1->evade == v2->evade && + v1->revived == v2->revived; +} + +static inline uint32_t +as_partition_getid(const cf_digest* d) +{ + return *(uint32_t*)d & AS_PARTITION_MASK; +} + +static inline int +find_self_in_replicas(const as_partition* p) +{ + return index_of_node(p->replicas, p->n_replicas, g_config.self_node); +} + +static inline bool +is_self_replica(const as_partition* p) +{ + return contains_node(p->replicas, p->n_replicas, g_config.self_node); +} + +static inline bool +contains_self(const cf_node* nodes, uint32_t n_nodes) +{ + return contains_node(nodes, n_nodes, g_config.self_node); +} + +#define AS_PARTITION_ID_UNDEF ((uint16_t)0xFFFF) + +#define AS_PARTITION_RESERVATION_INIT(__rsv) \ + __rsv.ns = NULL; \ + __rsv.p = NULL; \ + __rsv.tree = NULL; \ + __rsv.regime = 0; \ + __rsv.n_dupl = 0; + +#define VERSION_AS_STRING(v_ptr) (as_partition_version_as_string(v_ptr).s) + + +//========================================================== +// Public API - client view replica maps. +// + +void client_replica_maps_create(struct as_namespace_s* ns); +void client_replica_maps_clear(struct as_namespace_s* ns); +bool client_replica_maps_update(struct as_namespace_s* ns, uint32_t pid); +bool client_replica_maps_is_partition_queryable(const struct as_namespace_s* ns, uint32_t pid); + + +//========================================================== +// Private API - for enterprise separation only. +// + +bool partition_reserve_promote(const struct as_namespace_s* ns, const as_partition* p, bool would_dup_res); diff --git a/as/include/fabric/partition_balance.h b/as/include/fabric/partition_balance.h new file mode 100644 index 00000000..e01fa76d --- /dev/null +++ b/as/include/fabric/partition_balance.h @@ -0,0 +1,197 @@ +/* + * partition_balance.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_queue.h" + +#include "node.h" + +#include "fabric/hb.h" +#include "fabric/partition.h" + + +//========================================================== +// Forward declarations. +// + +struct as_namespace_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef enum { + PB_TASK_EMIG_TRANSFER, + PB_TASK_EMIG_SIGNAL_ALL_DONE, + PB_TASK_APPEAL +} pb_task_type; + +typedef struct pb_task_s { + cf_node dest; + struct as_namespace_s* ns; + uint32_t pid; + uint64_t cluster_key; + pb_task_type type; + uint32_t tx_flags; +} pb_task; + +#define MAX_RACK_ID 1000000 +#define MAX_RACK_ID_LEN 7 // number of decimal characters + + +//========================================================== +// Public API - regulate migrations. +// + +void as_partition_balance_disallow_migrations(); +bool as_partition_balance_are_migrations_allowed(); +void as_partition_balance_synchronize_migrations(); +void as_partition_balance_emigration_yield(); + + +//========================================================== +// Public API - balance partitions. +// + +void as_partition_balance_init(); +bool as_partition_balance_is_init_resolved(); +void as_partition_balance_revert_to_orphan(); +void as_partition_balance(); + +uint64_t as_partition_balance_remaining_migrations(); +bool as_partition_balance_revive(struct as_namespace_s* ns); + + +//========================================================== +// Public API - migration-related as_partition methods. +// + +bool as_partition_pending_migrations(as_partition* p); + +bool as_partition_pre_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, uint32_t tx_flags); +void as_partition_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, uint32_t tx_flags); +as_migrate_result as_partition_immigrate_start(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node); +as_migrate_result as_partition_immigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node); +as_migrate_result as_partition_migrations_all_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key); + +// Counter that tells clients partition ownership has changed. +extern cf_atomic32 g_partition_generation; + + +//========================================================== +// Private API - for enterprise separation only. +// + +//------------------------------------------------ +// Typedefs & constants. 
+//
+
+COMPILER_ASSERT((AS_CLUSTER_SZ & (AS_CLUSTER_SZ - 1)) == 0);
+
+#define AS_CLUSTER_SZ_MASKP (-(uint64_t)AS_CLUSTER_SZ)
+#define AS_CLUSTER_SZ_MASKN ((uint64_t)AS_CLUSTER_SZ - 1)
+
+typedef uint8_t sl_ix_t;
+
+COMPILER_ASSERT(AS_CLUSTER_SZ_MASKN >> (sizeof(sl_ix_t) * 8) == 0);
+
+typedef struct inter_hash_s {
+	uint64_t hashed_node;
+	uint64_t hashed_pid;
+} inter_hash;
+
+extern const as_partition_version ZERO_VERSION;
+
+
+//------------------------------------------------
+// Globals.
+//
+
+extern volatile int g_allow_migrations;
+
+extern uint64_t g_hashed_pids[AS_PARTITIONS];
+
+// Shortcuts to values set by as_exchange, for use in partition balance only.
+extern uint32_t g_cluster_size;
+extern cf_node* g_succession;
+
+extern cf_node g_full_node_seq_table[AS_CLUSTER_SZ * AS_PARTITIONS];
+extern sl_ix_t g_full_sl_ix_table[AS_CLUSTER_SZ * AS_PARTITIONS];
+
+
+//------------------------------------------------
+// Forward declarations.
+//
+
+void partition_balance_init();
+
+void pb_task_init(pb_task* task, cf_node dest, struct as_namespace_s* ns, uint32_t pid, uint64_t cluster_key, pb_task_type type, uint32_t tx_flags);
+void drop_trees(as_partition* p, struct as_namespace_s* ns);
+
+void balance_namespace(struct as_namespace_s* ns, cf_queue* mq);
+void prepare_for_appeals();
+void process_pb_tasks(cf_queue* tq);
+void balance_namespace_ap(struct as_namespace_s* ns, cf_queue* mq);
+void fill_translation(int translation[], const struct as_namespace_s* ns);
+void fill_namespace_rows(const cf_node* full_node_seq, const sl_ix_t* full_sl_ix, cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, const int translation[]);
+void rack_aware_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, uint32_t replication_factor, const uint32_t* rack_ids, uint32_t n_ids, uint32_t n_racks, uint32_t start_n);
+uint32_t find_self(const cf_node* ns_node_seq, const struct as_namespace_s* ns);
+uint32_t fill_immigrators(as_partition* p, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns, uint32_t working_master_n, uint32_t n_dupl);
+void queue_namespace_migrations(as_partition* p, struct as_namespace_s* ns, uint32_t self_n, cf_node working_master, uint32_t n_dupl, cf_node dupls[], cf_queue* mq);
+void fill_witnesses(as_partition* p, const cf_node* ns_node_seq, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns);
+
+void emigrate_done_advance_non_master_version(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags);
+void emigrate_done_advance_non_master_version_ap(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags);
+void immigrate_start_advance_non_master_version(struct as_namespace_s* ns, as_partition* p);
+void immigrate_start_advance_non_master_version_ap(as_partition* p);
+void immigrate_done_advance_final_master_version(struct as_namespace_s* ns, as_partition* p);
+void immigrate_done_advance_final_master_version_ap(struct as_namespace_s* ns, as_partition* p);
+bool immigrate_yield();
+
+
+//------------------------------------------------
+// Inlines and macros.
+//
+
+static inline bool
+is_family_same(const as_partition_version* v1, const as_partition_version* v2)
+{
+	return v1->ckey == v2->ckey && v1->family == v2->family &&
+			v1->family != VERSION_FAMILY_UNIQUE;
+}
+
+// Define macros for accessing the full node-seq and sl-ix arrays.
+#define FULL_NODE_SEQ(x, y) g_full_node_seq_table[((x) * g_cluster_size) + (y)]
+#define FULL_SL_IX(x, y) g_full_sl_ix_table[((x) * g_cluster_size) + (y)]
+
+// Get the partition version that was input by exchange.
+#define INPUT_VERSION(_n) (&ns->cluster_versions[ns_sl_ix[_n]][p->id])
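The FULL_NODE_SEQ / FULL_SL_IX macros above flatten a conceptual [partition][node] table into row-major arrays with g_cluster_size columns. A hedged sketch of walking one partition's row, purely to illustrate the indexing (the function and its log line are not part of the header):

// Illustrative only.
static void
log_node_seq_row(uint32_t pid)
{
	for (uint32_t n = 0; n < g_cluster_size; n++) {
		// Expands to g_full_node_seq_table[(pid * g_cluster_size) + n].
		cf_node node = FULL_NODE_SEQ(pid, n);
		sl_ix_t sl_ix = FULL_SL_IX(pid, n);

		cf_detail(AS_PARTITION, "pid %u slot %u: node %lx sl-ix %u",
				pid, n, node, (uint32_t)sl_ix);
	}
}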
diff --git a/as/include/fabric/roster.h b/as/include/fabric/roster.h
new file mode 100644
index 00000000..4d0d11f1
--- /dev/null
+++ b/as/include/fabric/roster.h
@@ -0,0 +1,52 @@
+/*
+ * roster.h
+ *
+ * Copyright (C) 2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include <stdbool.h>
+
+#include "node.h"
+
+#include "fabric/partition_balance.h"
+
+
+//==========================================================
+// Public API.
+//
+
+void as_roster_init_smd();
+bool as_roster_set_nodes_cmd(const char* ns_name, const char* nodes);
+
+
+//==========================================================
+// Inlines and macros.
+//
+
+// Format is: <node-id>@<rack-id>,<node-id>@<rack-id>...
+#define ROSTER_STRING_ELE_LEN ((sizeof(cf_node) * 2) + 1 + MAX_RACK_ID_LEN + 1)
+
+// In string lists, separate node-id and rack-id with this character.
+#define ROSTER_ID_PAIR_SEPARATOR '@'
diff --git a/as/include/fabric/skew_monitor.h b/as/include/fabric/skew_monitor.h
new file mode 100644
index 00000000..194ac7a5
--- /dev/null
+++ b/as/include/fabric/skew_monitor.h
@@ -0,0 +1,67 @@
+/*
+ * skew_monitor.h
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "citrusleaf/cf_vector.h"
+
+#include "dynbuf.h"
+
+/**
+ * Initialize the skew monitor.
+ */
+void
+as_skew_monitor_init();
+
+/**
+ * Return the current estimate of the clock skew in the cluster.
+ */
+uint64_t
+as_skew_monitor_skew();
+
+/**
+ * Return the currently estimated outliers from our cluster.
+ * Outliers should have space to hold at least AS_CLUSTER_SZ nodes.
+ */
+uint32_t
+as_skew_monitor_outliers(cf_vector* outliers);
+
+/**
+ * Print skew outliers to a dynamic buffer.
+ */
+uint32_t
+as_skew_monitor_outliers_append(cf_dyn_buf* db);
+
+/**
+ * Print skew monitor info to a dynamic buffer.
+ */ +void +as_skew_monitor_info(cf_dyn_buf* db); + +/** + * Dump some debugging information to the logs. + */ +void +as_skew_monitor_dump(); diff --git a/as/include/geospatial/geojson.h b/as/include/geospatial/geojson.h new file mode 100644 index 00000000..69eee955 --- /dev/null +++ b/as/include/geospatial/geojson.h @@ -0,0 +1,56 @@ +/* + * Copyright 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more + * contributor license agreements. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#ifndef __geojson_h +#define __geojson_h 1 + +#include + +#include + +#include +#include + +namespace GeoJSON { + +class GeometryHandler +{ +public: + virtual ~GeometryHandler() {} + + virtual void handle_point(S2CellId const & cellid); + + virtual bool handle_region(S2Region * regionp); + + virtual double earth_radius_meters() { + return 6371000.0; // Wikipedia, mean radius. + } + + void set_json(json_t * i_jsonp) { m_jsonp = i_jsonp; } + + json_t * get_json() { return m_jsonp; } + +private: + json_t * m_jsonp; +}; + +void parse(GeometryHandler & geohand, std::string const & geostr); + +} // end namespace GeoJSON + +#endif // __geojson_h diff --git a/as/include/geospatial/geospatial.h b/as/include/geospatial/geospatial.h new file mode 100644 index 00000000..dde168d3 --- /dev/null +++ b/as/include/geospatial/geospatial.h @@ -0,0 +1,61 @@ +/* + * geospatial.h + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "base/datamodel.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern bool geo_parse(as_namespace * ns,
+		const char * buf,
+		size_t bufsz,
+		uint64_t * cellidp,
+		geo_region_t * regionp);
+
+extern bool geo_region_cover(as_namespace * ns,
+		geo_region_t region,
+		int maxnumcells,
+		uint64_t * cellctrp,
+		uint64_t * cellminp,
+		uint64_t * cellmaxp,
+		int * numcellsp);
+
+extern bool geo_point_centers(as_namespace * ns,
+		uint64_t cellidval,
+		int maxnumcenters,
+		uint64_t * center,
+		int * numcentersp);
+
+extern bool geo_point_within(uint64_t cellidval, geo_region_t region);
+
+extern void geo_region_destroy(geo_region_t region);
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
diff --git a/as/include/geospatial/scoped.h b/as/include/geospatial/scoped.h
new file mode 100644
index 00000000..de857dda
--- /dev/null
+++ b/as/include/geospatial/scoped.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more
+ * contributor license agreements.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You
+ * may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#ifndef scoped_h__
+#define scoped_h__
+
+template <typename T>
+class Scoped
+{
+public:
+	/// A deletion function.
+	typedef void (*Del)(T p);
+
+	/// Default constructor.
+	///
+	/// Note - the deletion function will not be called on the nil
+	/// value.
+	///
+	/// @param[in] i_nil Nil value.
+	/// @param[in] i_del Deletion functor.
+	///
+	Scoped(T const & i_nil, Del i_del)
+		: m_val(i_nil)
+		, m_nil(i_nil)
+		, m_del(i_del)
+	{}
+
+	/// Constructor from value.
+	///
+	/// Note - the deletion function will not be called on the nil
+	/// value.
+	///
+	/// @param[in] i_val The value to assign.
+	/// @param[in] i_nil Nil value.
+	/// @param[in] i_del Deletion functor.
+	///
+	Scoped(T const & i_val, T const & i_nil, Del i_del)
+		: m_val(i_val)
+		, m_nil(i_nil)
+		, m_del(i_del)
+	{}
+
+	/// Destructor, calls deletion function on non-nil values.
+	///
+	~Scoped()
+	{
+		if (m_val != m_nil)
+			m_del(m_val);
+	}
+
+	/// Assignment operator.
+	///
+	/// Calls deletion on the existing non-nil value and assigns the new
+	/// value.
+	///
+	/// @param[in] i_val The right-hand-side is the new value.
+	///
+	inline Scoped & operator=(T const & i_val)
+	{
+		// Delete any pre-existing value.
+		if (m_val != m_nil)
+			m_del(m_val);
+
+		m_val = i_val;
+		return *this;
+	}
+
+	/// Pointer dereference.
+	///
+	inline T const operator->() const { return m_val; }
+
+	/// Reference.
+	///
+	inline operator T&() { return m_val; }
+
+	/// Takes value, will not be deleted.
+ /// + T const take() + { + T tmp = m_val; + m_val = m_nil; + return tmp; + } + +private: + T m_val; + T m_nil; + Del m_del; +}; + +#endif // scoped_h__ diff --git a/as/include/geospatial/throwstream.h b/as/include/geospatial/throwstream.h new file mode 100644 index 00000000..e5548595 --- /dev/null +++ b/as/include/geospatial/throwstream.h @@ -0,0 +1,35 @@ +/* + * Copyright 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more + * contributor license agreements. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#ifndef __throwstream_h +#define __throwstream_h 1 + +#include +#include + +// The throwstream macro assembles the string argument to the +// exception constructor from an iostream. +// +#define throwstream(__except, __msg) \ + do { \ + std::ostringstream __ostrm; \ + __ostrm << __msg; \ + throw __except(__ostrm.str().c_str()); \ + } while (false) + +#endif // __throwstream_h diff --git a/as/include/storage/drv_ssd.h b/as/include/storage/drv_ssd.h new file mode 100644 index 00000000..fe450811 --- /dev/null +++ b/as/include/storage/drv_ssd.h @@ -0,0 +1,463 @@ +/* + * drv_ssd.h + * + * Copyright (C) 2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * Common header for drv_ssd.c, drv_ssd_cold.c, drv_ssd_warm.c. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_queue.h" + +#include "cf_mutex.h" +#include "hist.h" + +#include "base/datamodel.h" +#include "fabric/partition.h" + + +//========================================================== +// Forward declarations. +// + +struct as_index_s; +struct as_namespace_s; +struct as_rec_props_s; +struct as_storage_rd_s; +struct drv_ssd_s; + + +//========================================================== +// Typedefs & constants. +// + +// Linux has removed O_DIRECT, but not its functionality. +#ifndef O_DIRECT +#define O_DIRECT 00040000 +#endif + +#define SSD_HEADER_MAGIC (0x4349747275730707L) +#define SSD_VERSION 2 +// Must update conversion code when bumping version. +// +// SSD_VERSION history: +// 1 - original +// 2 - minimum storage increment (RBLOCK_SIZE) from 512 to 128 bytes + +// Device header flags. 
+#define SSD_HEADER_FLAG_ENCRYPTED 0x01 +#define SSD_HEADER_FLAG_CP 0x02 +#define SSD_HEADER_FLAG_TRUSTED 0x04 + +#define MAX_SSD_THREADS 20 + + +//------------------------------------------------ +// Device header. +// +typedef struct { + uint64_t magic; // shows we've got the right stuff + uint64_t random; // a random value - good for telling all disks are of the same state + uint32_t write_block_size; + uint32_t last_evict_void_time; + uint8_t version; + uint8_t flags; + uint16_t devices_n; // number of devices + uint32_t header_length; + char namespace[32]; // ascii representation of the namespace name, null-terminated + uint32_t info_n; // number of info slices (should be > a reasonable partition count) + uint32_t info_stride; // currently 128 bytes + uint8_t info_data[]; +} __attribute__((__packed__)) ssd_device_header; + + +//------------------------------------------------ +// A defragged wblock waiting to be freed. +// +typedef struct vacated_wblock_s { + uint32_t file_id; + uint32_t wblock_id; +} vacated_wblock; + + +//------------------------------------------------ +// Write buffer - where records accumulate until +// (the full buffer is) flushed to a device. +// +typedef struct { + cf_atomic32 rc; + cf_atomic32 n_writers; // number of concurrent writers + bool skip_post_write_q; + uint32_t n_vacated; + uint32_t vacated_capacity; + vacated_wblock *vacated_wblocks; + struct drv_ssd_s *ssd; + uint32_t wblock_id; + uint32_t pos; + uint8_t *buf; +} ssd_write_buf; + + +//------------------------------------------------ +// Per-wblock information. +// +typedef struct ssd_wblock_state_s { + cf_atomic32 inuse_sz; // number of bytes currently used in the wblock + cf_mutex LOCK; // transactions, write_worker, and defrag all are interested in wblock_state + ssd_write_buf *swb; // pending writes for the wblock, also treated as a cache for reads + uint32_t state; // for now just a defrag flag + cf_atomic32 n_vac_dests; // number of wblocks into which this wblock defragged +} ssd_wblock_state; + +// wblock state +// +// Ultimately this may become a full-blown state, but for now it's effectively +// just a defrag flag. +#define WBLOCK_STATE_NONE 0 +#define WBLOCK_STATE_DEFRAG 1 + + +//------------------------------------------------ +// Per-device information about its wblocks. +// +typedef struct ssd_alloc_table_s { + uint32_t n_wblocks; // number allocated below + ssd_wblock_state wblock_state[]; +} ssd_alloc_table; + + +//------------------------------------------------ +// Where on free_wblock_q freed wblocks go. +// +typedef enum { + FREE_TO_HEAD, + FREE_TO_TAIL +} e_free_to; + + +//------------------------------------------------ +// Per-device information. 
+// +typedef struct drv_ssd_s +{ + struct as_namespace_s *ns; + + char *name; // this device's name + char *shadow_name; // this device's shadow's name, if any + + uint32_t running; + + pthread_mutex_t write_lock; // lock protects writes to current swb + ssd_write_buf *current_swb; // swb currently being filled by writes + + int commit_fd; // relevant for enterprise edition only + int shadow_commit_fd; // relevant for enterprise edition only + + pthread_mutex_t defrag_lock; // lock protects writes to defrag swb + ssd_write_buf *defrag_swb; // swb currently being filled by defrag + + cf_queue *fd_q; // queue of open fds + cf_queue *shadow_fd_q; // queue of open fds on shadow, if any + + cf_queue *free_wblock_q; // IDs of free wblocks + cf_queue *defrag_wblock_q; // IDs of wblocks to defrag + + cf_queue *swb_write_q; // pointers to swbs ready to write + cf_queue *swb_shadow_q; // pointers to swbs ready to write to shadow, if any + cf_queue *swb_free_q; // pointers to swbs free and waiting + cf_queue *post_write_q; // pointers to swbs that have been written but are cached + + cf_atomic64 n_defrag_wblock_reads; // total number of wblocks added to the defrag_wblock_q + cf_atomic64 n_defrag_wblock_writes; // total number of swbs added to the swb_write_q by defrag + cf_atomic64 n_wblock_writes; // total number of swbs added to the swb_write_q by writes + + volatile uint64_t n_tomb_raider_reads; // relevant for enterprise edition only + + cf_atomic32 defrag_sweep; // defrag sweep flag + + uint64_t file_size; + int file_id; + + uint32_t open_flag; + bool data_in_memory; + bool started_fresh; // relevant only for warm or cool restart + + uint64_t io_min_size; // device IO operations are aligned and sized in multiples of this + uint64_t commit_min_size; // commit (write) operations are aligned and sized in multiples of this + + cf_atomic64 inuse_size; // number of bytes in actual use on this device + + uint32_t write_block_size; // number of bytes to write at a time + + uint32_t sweep_wblock_id; // wblocks read at startup + uint64_t record_add_older_counter; // records not inserted due to better existing one + uint64_t record_add_expired_counter; // records not inserted due to expiration + uint64_t record_add_max_ttl_counter; // records not inserted due to max-ttl + uint64_t record_add_replace_counter; // records reinserted + uint64_t record_add_unique_counter; // records inserted + + ssd_alloc_table *alloc_table; + + pthread_t maintenance_thread; + pthread_t write_worker_thread[MAX_SSD_THREADS]; + pthread_t shadow_worker_thread; + pthread_t defrag_thread; + + histogram *hist_read; + histogram *hist_large_block_read; + histogram *hist_write; + histogram *hist_shadow_write; + histogram *hist_fsync; +} drv_ssd; + + +//------------------------------------------------ +// Per-namespace storage information. +// +typedef struct drv_ssds_s +{ + ssd_device_header *header; + struct as_namespace_s *ns; + + // Not a great place for this - used only at startup to determine whether to + // load a record. + bool get_state_from_storage[AS_PARTITIONS]; + + int n_ssds; + drv_ssd ssds[]; +} drv_ssds; + + +//========================================================== +// Private API - for enterprise separation only +// + +// SSD_HEADER_SIZE must be a power of 2 and >= MAX_WRITE_BLOCK_SIZE. +// Do NOT change SSD_HEADER_SIZE! +#define SSD_HEADER_SIZE (1024 * 1024) + +// Artificial limit on write-block-size, in case we ever move to an +// SSD_HEADER_SIZE that's too big to be a write-block size limit. 
+// MAX_WRITE_BLOCK_SIZE must be power of 2 and <= SSD_HEADER_SIZE. +#define MAX_WRITE_BLOCK_SIZE (1024 * 1024) + +// Artificial limit on write-block-size, must be power of 2 and >= RBLOCK_SIZE. +#define MIN_WRITE_BLOCK_SIZE (1024 * 1) + +#define SSD_BLOCK_MAGIC 0x037AF200 +#define LENGTH_BASE offsetof(struct drv_ssd_block_s, keyd) + +#define SSD_HEADER_INFO_STRIDE 128 + +typedef struct ssd_load_records_info_s { + drv_ssds *ssds; + drv_ssd *ssd; + cf_queue *complete_q; + void *complete_udata; + void *complete_rc; +} ssd_load_records_info; + +// Per-record metadata on device. +typedef struct drv_ssd_block_s { + uint64_t sig; // deprecated + uint32_t magic; + uint32_t length; // total after this field - this struct's pointer + 16 + cf_digest keyd; + uint32_t generation; + cf_clock void_time; + uint32_t bins_offset; // offset to bins from data + uint32_t n_bins; + uint64_t last_update_time; + uint8_t data[]; +} __attribute__ ((__packed__)) drv_ssd_block; + +// Per-bin metadata on device. +typedef struct drv_ssd_bin_s { + char name[AS_ID_BIN_SZ]; // 15 aligns well + uint8_t version; // now unused + uint32_t offset; // offset of bin data within block + uint32_t len; // size of bin data + uint32_t next; // location of next bin: block offset +} __attribute__ ((__packed__)) drv_ssd_bin; + +// Info slice in device header block. +typedef struct info_buf_s { + uint32_t regime; // used to be len, but was never read + as_partition_version version; +} __attribute__ ((__packed__)) info_buf; + +// Warm and cool restart. +void ssd_resume_devices(drv_ssds *ssds); +void *run_ssd_cool_start(void *udata); +void ssd_load_wblock_queues(drv_ssds *ssds); +void ssd_start_maintenance_threads(drv_ssds *ssds); +void ssd_start_write_worker_threads(drv_ssds *ssds); +void ssd_start_defrag_threads(drv_ssds *ssds); +bool is_valid_record(const drv_ssd_block *block, const char *ns_name); +void apply_rec_props(struct as_index_s *r, struct as_namespace_s *ns, const struct as_rec_props_s *p_props); + +// Tomb raider. +void ssd_cold_start_adjust_cenotaph(struct as_namespace_s *ns, const drv_ssd_block *block, struct as_index_s *r); +void ssd_cold_start_transition_record(struct as_namespace_s *ns, const drv_ssd_block *block, struct as_index_s *r, bool is_create); +void ssd_cold_start_drop_cenotaphs(struct as_namespace_s *ns); + +// Record encryption. +void ssd_init_encryption_key(struct as_namespace_s *ns); +void ssd_do_encrypt(const uint8_t *key, uint64_t off, drv_ssd_block *block); +void ssd_do_decrypt(const uint8_t *key, uint64_t off, drv_ssd_block *block); + +// CP. +void ssd_adjust_versions(struct as_namespace_s *ns, ssd_device_header *header); +conflict_resolution_pol ssd_cold_start_policy(struct as_namespace_s *ns); +void ssd_cold_start_init_repl_state(struct as_namespace_s *ns, struct as_index_s* r); + +// Miscellaneous. +void ssd_header_init_cfg(const struct as_namespace_s *ns, ssd_device_header *header); +bool ssd_header_is_valid_cfg(const struct as_namespace_s *ns, const ssd_device_header *header); +bool ssd_cold_start_is_valid_n_bins(uint32_t n_bins); +bool ssd_cold_start_is_record_truncated(struct as_namespace_s *ns, const drv_ssd_block *block, const struct as_rec_props_s *p_props); +void ssd_write_header(drv_ssd *ssd, ssd_device_header *header, off_t offset, size_t size); // TODO - change name! + +// Durability. 
+void ssd_init_commit(drv_ssd *ssd);
+uint64_t ssd_flush_max_us(const struct as_namespace_s *ns);
+void ssd_post_write(drv_ssd *ssd, ssd_write_buf *swb);
+int ssd_write_bins(struct as_storage_rd_s *rd);
+int ssd_buffer_bins(struct as_storage_rd_s *rd);
+uint32_t ssd_record_size(struct as_storage_rd_s *rd);
+ssd_write_buf *swb_get(drv_ssd *ssd);
+void ssd_init_trusted(struct as_namespace_s *ns);
+bool ssd_is_untrusted(struct as_namespace_s *ns, uint8_t header_flags);
+void ssd_set_trusted(struct as_namespace_s *ns);
+
+// Called in (enterprise-split) storage table function.
+int ssd_write(struct as_storage_rd_s *rd);
+
+
+//
+// Conversions between bytes and rblocks.
+//
+
+// TODO - make checks stricter (exclude drive header, consider drive size) ???
+#define STORAGE_RBLOCK_IS_VALID(__x) ((__x) != 0)
+#define STORAGE_RBLOCK_IS_INVALID(__x) ((__x) == 0)
+
+#define RBLOCK_SIZE 128 // 2^7
+#define LOG_2_RBLOCK_SIZE 7 // must be in sync with RBLOCK_SIZE
+
+// Round bytes up to a multiple of rblock size.
+static inline uint32_t BYTES_TO_RBLOCK_BYTES(uint32_t bytes) {
+	return (bytes + (RBLOCK_SIZE - 1)) & -RBLOCK_SIZE;
+}
+
+// Convert byte offset to rblock_id, or bytes to rblocks as long as 'bytes' is
+// already a multiple of rblock size.
+static inline uint64_t BYTES_TO_RBLOCKS(uint64_t bytes) {
+	return bytes >> LOG_2_RBLOCK_SIZE;
+}
+
+// Convert rblock_id to byte offset, or rblocks to bytes.
+static inline uint64_t RBLOCKS_TO_BYTES(uint64_t rblocks) {
+	return rblocks << LOG_2_RBLOCK_SIZE;
+}
+
+
+//
+// Conversions between bytes/rblocks and wblocks.
+//
+
+#define STORAGE_INVALID_WBLOCK 0xFFFFffff
+
+// Convert byte offset to wblock_id.
+static inline uint32_t BYTES_TO_WBLOCK_ID(drv_ssd *ssd, uint64_t bytes) {
+	return (uint32_t)(bytes / ssd->write_block_size);
+}
+
+// Convert wblock_id to byte offset.
+static inline uint64_t WBLOCK_ID_TO_BYTES(drv_ssd *ssd, uint32_t wblock_id) {
+	return (uint64_t)wblock_id * (uint64_t)ssd->write_block_size;
+}
+
+// Convert rblock_id to wblock_id.
+static inline uint32_t RBLOCK_ID_TO_WBLOCK_ID(drv_ssd *ssd, uint64_t rblock_id) {
+	return (uint32_t)((rblock_id << LOG_2_RBLOCK_SIZE) / ssd->write_block_size);
+}
+
+
+//
+// Size rounding needed for direct IO.
+//
+
+// Used when determining a device's io_min_size.
+#define LO_IO_MIN_SIZE 512
+#define HI_IO_MIN_SIZE 4096
+
+// Round bytes down to a multiple of device's minimum IO operation size.
+static inline uint64_t BYTES_DOWN_TO_IO_MIN(drv_ssd *ssd, uint64_t bytes) {
+	return bytes & -ssd->io_min_size;
+}
+
+// Round bytes up to a multiple of device's minimum IO operation size.
+static inline uint64_t BYTES_UP_TO_IO_MIN(drv_ssd *ssd, uint64_t bytes) {
+	return (bytes + (ssd->io_min_size - 1)) & -ssd->io_min_size;
+}
+
+
+//
+// Device header parsing utilities.
+//
+
+static inline bool
+can_convert_storage_version(uint8_t version)
+{
+	return version == 1
+			// In case I bump version 2 and forget to tweak conversion code:
+			&& SSD_VERSION == 2;
+}
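As a worked example of the rblock conversions defined earlier in this header, with RBLOCK_SIZE at 128 bytes (2^7) - a hypothetical check function, not part of the server:

#include <assert.h>

// Illustrative arithmetic only.
static void
rblock_math_example(void)
{
	// A 300-byte record rounds up to the 128-byte rblock grid: 384 bytes.
	assert(BYTES_TO_RBLOCK_BYTES(300) == 384);

	// 384 bytes is exactly 3 rblocks (384 >> 7)...
	assert(BYTES_TO_RBLOCKS(384) == 3);

	// ... and converting back recovers the byte count (3 << 7).
	assert(RBLOCKS_TO_BYTES(3) == 384);
}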
+//
+// Record encryption.
+//
+
+static inline void
+ssd_encrypt(drv_ssd *ssd, uint64_t off, drv_ssd_block *block)
+{
+	if (ssd->ns->storage_encryption_key_file != NULL) {
+		ssd_do_encrypt(ssd->ns->storage_encryption_key, off, block);
+	}
+}
+
+static inline void
+ssd_decrypt(drv_ssd *ssd, uint64_t off, drv_ssd_block *block)
+{
+	if (ssd->ns->storage_encryption_key_file != NULL) {
+		ssd_do_decrypt(ssd->ns->storage_encryption_key, off, block);
+	}
+}
diff --git a/as/include/storage/storage.h b/as/include/storage/storage.h
new file mode 100644
index 00000000..eedff374
--- /dev/null
+++ b/as/include/storage/storage.h
@@ -0,0 +1,183 @@
+/*
+ * storage.h
+ *
+ * Copyright (C) 2009-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "base/rec_props.h"
+
+
+// Forward declarations.
+struct as_bin_s;
+struct as_index_s;
+struct as_partition_s;
+struct as_namespace_s;
+struct drv_ssd_s;
+struct drv_ssd_block_s;
+
+
+typedef enum {
+	AS_STORAGE_ENGINE_MEMORY = 0,
+	AS_STORAGE_ENGINE_SSD = 1,
+
+	AS_NUM_STORAGE_ENGINES
+} as_storage_type;
+
+typedef struct as_storage_rd_s {
+	struct as_index_s *r;
+	struct as_namespace_s *ns;
+
+	as_rec_props rec_props;
+
+	struct as_bin_s *bins;
+	uint16_t n_bins;
+
+	bool record_on_device;
+	bool ignore_record_on_device;
+
+	// Parameters used when handling key storage:
+	uint32_t key_size;
+	uint8_t *key;
+
+	bool is_durable_delete; // enterprise only
+
+	// Specific to storage type AS_STORAGE_ENGINE_SSD:
+	struct drv_ssd_block_s *block;
+	uint8_t *must_free_block;
+	struct drv_ssd_s *ssd;
+} as_storage_rd;
+
+
+//------------------------------------------------
+// Generic "base class" functions that call
+// through storage-engine "v-tables".
+//
+
+extern void as_storage_init();
+extern void as_storage_start_tomb_raider();
+extern int as_storage_namespace_destroy(struct as_namespace_s *ns);
+
+extern int as_storage_record_destroy(struct as_namespace_s *ns, struct as_index_s *r); // not the counterpart of as_storage_record_create()
+
+// Start and finish an as_storage_rd usage cycle.
+extern int as_storage_record_create(struct as_namespace_s *ns, struct as_index_s *r, as_storage_rd *rd);
+extern int as_storage_record_open(struct as_namespace_s *ns, struct as_index_s *r, as_storage_rd *rd);
+extern int as_storage_record_close(as_storage_rd *rd);
+
+// Called within as_storage_rd usage cycle.
+extern int as_storage_record_load_n_bins(as_storage_rd *rd);
+extern int as_storage_record_load_bins(as_storage_rd *rd);
+extern bool as_storage_record_size_and_check(as_storage_rd *rd);
+extern int as_storage_record_write(as_storage_rd *rd);
+
+// Storage capacity monitoring.
+extern void as_storage_wait_for_defrag(); +extern bool as_storage_overloaded(struct as_namespace_s *ns); // returns true if write queue is too backed up +extern bool as_storage_has_space(struct as_namespace_s *ns); +extern void as_storage_defrag_sweep(struct as_namespace_s *ns); + +// Storage of generic data into device headers. +extern void as_storage_info_set(struct as_namespace_s *ns, const struct as_partition_s *p, bool flush); +extern void as_storage_info_get(struct as_namespace_s *ns, struct as_partition_s *p); +extern int as_storage_info_flush(struct as_namespace_s *ns); +extern void as_storage_save_evict_void_time(struct as_namespace_s *ns, uint32_t evict_void_time); + +// Statistics. +extern int as_storage_stats(struct as_namespace_s *ns, int *available_pct, uint64_t *inuse_disk_bytes); // available percent is that of worst device +extern int as_storage_ticker_stats(struct as_namespace_s *ns); // prints SSD histograms to the info ticker +extern int as_storage_histogram_clear_all(struct as_namespace_s *ns); // clears all SSD histograms + + +//------------------------------------------------ +// Generic functions that don't use "v-tables". +// + +// Called within as_storage_rd usage cycle. +extern uint64_t as_storage_record_get_n_bytes_memory(as_storage_rd *rd); +extern void as_storage_record_adjust_mem_stats(as_storage_rd *rd, uint64_t start_bytes); +extern void as_storage_record_drop_from_mem_stats(as_storage_rd *rd); +extern bool as_storage_record_get_key(as_storage_rd *rd); +extern size_t as_storage_record_rec_props_size(as_storage_rd *rd); +extern void as_storage_record_set_rec_props(as_storage_rd *rd, uint8_t* rec_props_data); + +// Called only at shutdown to flush all device write-queues. +extern void as_storage_shutdown(); + + +//------------------------------------------------ +// AS_STORAGE_ENGINE_MEMORY functions. +// + +extern int as_storage_namespace_init_memory(struct as_namespace_s *ns, cf_queue *complete_q, void *udata); +extern void as_storage_start_tomb_raider_memory(struct as_namespace_s *ns); +extern int as_storage_namespace_destroy_memory(struct as_namespace_s *ns); + +extern int as_storage_record_write_memory(as_storage_rd *rd); + +extern void as_storage_info_get_memory(struct as_namespace_s *ns, struct as_partition_s *p); + +extern int as_storage_stats_memory(struct as_namespace_s *ns, int *available_pct, uint64_t *used_disk_bytes); + + +//------------------------------------------------ +// AS_STORAGE_ENGINE_SSD functions. 
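+// These are the SSD implementations behind the generic v-table calls above.
+// Dispatch sketch (the table name and typedef are illustrative, not the
+// actual storage.c symbols):
+//
+//   static const as_storage_record_write_fn write_table[AS_NUM_STORAGE_ENGINES] = {
+//       [AS_STORAGE_ENGINE_MEMORY] = as_storage_record_write_memory,
+//       [AS_STORAGE_ENGINE_SSD]    = as_storage_record_write_ssd
+//   };
+//   // as_storage_record_write(rd) then calls write_table[rd->ns->storage_type](rd).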
+// + +extern int as_storage_namespace_init_ssd(struct as_namespace_s *ns, cf_queue *complete_q, void *udata); +extern void as_storage_start_tomb_raider_ssd(struct as_namespace_s *ns); +extern void as_storage_loading_records_ticker_ssd(); // called directly by as_storage_init() +extern int as_storage_namespace_destroy_ssd(struct as_namespace_s *ns); + +extern int as_storage_record_destroy_ssd(struct as_namespace_s *ns, struct as_index_s *r); + +extern int as_storage_record_create_ssd(as_storage_rd *rd); +extern int as_storage_record_open_ssd(as_storage_rd *rd); +extern int as_storage_record_close_ssd(as_storage_rd *rd); + +extern int as_storage_record_load_n_bins_ssd(as_storage_rd *rd); +extern int as_storage_record_load_bins_ssd(as_storage_rd *rd); +extern bool as_storage_record_size_and_check_ssd(as_storage_rd *rd); +extern int as_storage_record_write_ssd(as_storage_rd *rd); + +extern void as_storage_wait_for_defrag_ssd(struct as_namespace_s *ns); +extern bool as_storage_overloaded_ssd(struct as_namespace_s *ns); +extern bool as_storage_has_space_ssd(struct as_namespace_s *ns); +extern void as_storage_defrag_sweep_ssd(struct as_namespace_s *ns); + +extern void as_storage_info_set_ssd(struct as_namespace_s *ns, const struct as_partition_s *p, bool flush); +extern void as_storage_info_get_ssd(struct as_namespace_s *ns, struct as_partition_s *p); +extern int as_storage_info_flush_ssd(struct as_namespace_s *ns); +extern void as_storage_save_evict_void_time_ssd(struct as_namespace_s *ns, uint32_t evict_void_time); + +extern int as_storage_stats_ssd(struct as_namespace_s *ns, int *available_pct, uint64_t *used_disk_bytes); +extern int as_storage_ticker_stats_ssd(struct as_namespace_s *ns); +extern int as_storage_histogram_clear_ssd(struct as_namespace_s *ns); + +// Called by "base class" functions but not via table. +extern bool as_storage_record_get_key_ssd(as_storage_rd *rd); +extern void as_storage_shutdown_ssd(struct as_namespace_s *ns); diff --git a/as/include/transaction/delete.h b/as/include/transaction/delete.h new file mode 100644 index 00000000..97cdc9de --- /dev/null +++ b/as/include/transaction/delete.h @@ -0,0 +1,56 @@ +/* + * delete.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_index_ref_s; +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Public API. 
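+// as_delete_start() is the transaction service's entry point for deletes.
+// Caller sketch (status handling is illustrative - see transaction.h for
+// the actual transaction_status values):
+//
+//   transaction_status status = as_delete_start(&tr);
+//   // If still in progress, the rw_request machinery now owns the
+//   // transaction and responds to the client when replication completes.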
+// + +transaction_status as_delete_start(struct as_transaction_s* tr); + + +//========================================================== +// Private API - for enterprise separation only. +// + +bool delete_storage_overloaded(struct as_transaction_s* tr); +transaction_status delete_master(struct as_transaction_s* tr, struct rw_request_s* rw); +transaction_status drop_master(struct as_transaction_s* tr, struct as_index_ref_s* r_ref, struct rw_request_s* rw); diff --git a/as/include/transaction/duplicate_resolve.h b/as/include/transaction/duplicate_resolve.h new file mode 100644 index 00000000..72fa98a5 --- /dev/null +++ b/as/include/transaction/duplicate_resolve.h @@ -0,0 +1,50 @@ +/* + * duplicate_resolve.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "msg.h" +#include "node.h" + +#include "transaction/rw_request.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Public API. +// + +void dup_res_make_message(struct rw_request_s* rw, struct as_transaction_s* tr); +void dup_res_setup_rw(struct rw_request_s* rw, struct as_transaction_s* tr, dup_res_done_cb dup_res_cb, timeout_done_cb timeout_cb); +void dup_res_handle_request(cf_node node, msg* m); +void dup_res_handle_ack(cf_node node, msg* m); diff --git a/as/include/transaction/proxy.h b/as/include/transaction/proxy.h new file mode 100644 index 00000000..42291df8 --- /dev/null +++ b/as/include/transaction/proxy.h @@ -0,0 +1,60 @@ +/* + * proxy.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. 
+// + +#include + +#include "dynbuf.h" +#include "node.h" + + +//========================================================== +// Forward declarations. +// + +struct as_bin_s; +struct as_msg_op_s; +struct as_namespace_s; +struct as_transaction_s; + + +//========================================================== +// Public API. +// + +void as_proxy_init(); + +uint32_t as_proxy_hash_count(); + +void as_proxy_divert(cf_node dst, struct as_transaction_s* tr, struct as_namespace_s* ns); +void as_proxy_return_to_sender(const struct as_transaction_s* tr, struct as_namespace_s* ns); + +void as_proxy_send_response(cf_node dst, uint32_t proxy_tid, + uint32_t result_code, uint32_t generation, uint32_t void_time, + struct as_msg_op_s** ops, struct as_bin_s** bins, uint16_t bin_count, + struct as_namespace_s* ns, uint64_t trid); +void as_proxy_send_ops_response(cf_node dst, uint32_t proxy_tid, cf_dyn_buf* db); diff --git a/as/include/transaction/re_replicate.h b/as/include/transaction/re_replicate.h new file mode 100644 index 00000000..6adfef4e --- /dev/null +++ b/as/include/transaction/re_replicate.h @@ -0,0 +1,43 @@ +/* + * re_replicate.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; + + +//========================================================== +// Public API. +// + +transaction_status as_re_replicate_start(struct as_transaction_s* tr); diff --git a/as/include/transaction/read.h b/as/include/transaction/read.h new file mode 100644 index 00000000..dabc8270 --- /dev/null +++ b/as/include/transaction/read.h @@ -0,0 +1,36 @@ +/* + * read.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. 
+// + +#include "base/transaction.h" + + +//========================================================== +// Public API. +// + +transaction_status as_read_start(as_transaction* tr); diff --git a/as/include/transaction/replica_ping.h b/as/include/transaction/replica_ping.h new file mode 100644 index 00000000..5d5e231e --- /dev/null +++ b/as/include/transaction/replica_ping.h @@ -0,0 +1,54 @@ +/* + * replica_ping.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "msg.h" +#include "node.h" + +#include "transaction/rw_request.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Public API. +// + +bool repl_ping_check(struct as_transaction_s* tr); +void repl_ping_make_message(struct rw_request_s* rw, struct as_transaction_s* tr); +void repl_ping_setup_rw(struct rw_request_s* rw, struct as_transaction_s* tr, repl_ping_done_cb repl_ping_cb, timeout_done_cb timeout_cb); +void repl_ping_reset_rw(struct rw_request_s* rw, struct as_transaction_s* tr, repl_ping_done_cb cb); +void repl_ping_handle_op(cf_node node, msg* m); +void repl_ping_handle_ack(cf_node node, msg* m); diff --git a/as/include/transaction/replica_write.h b/as/include/transaction/replica_write.h new file mode 100644 index 00000000..5af68fde --- /dev/null +++ b/as/include/transaction/replica_write.h @@ -0,0 +1,51 @@ +/* + * replica_write.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "msg.h" +#include "node.h" + +#include "transaction/rw_request.h" + + +//========================================================== +// Forward declarations. 
+// + +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Public API. +// + +void repl_write_make_message(struct rw_request_s* rw, struct as_transaction_s* tr); +void repl_write_setup_rw(struct rw_request_s* rw, struct as_transaction_s* tr, repl_write_done_cb repl_write_cb, timeout_done_cb timeout_cb); +void repl_write_reset_rw(struct rw_request_s* rw, struct as_transaction_s* tr, repl_write_done_cb cb); +void repl_write_handle_op(cf_node node, msg* m); +void repl_write_handle_ack(cf_node node, msg* m); diff --git a/as/include/transaction/rw_request.h b/as/include/transaction/rw_request.h new file mode 100644 index 00000000..69d9fb65 --- /dev/null +++ b/as/include/transaction/rw_request.h @@ -0,0 +1,209 @@ +/* + * rw_request.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_byte_order.h" +#include "citrusleaf/cf_digest.h" + +#include "dynbuf.h" +#include "msg.h" +#include "node.h" + +#include "base/proto.h" +#include "base/rec_props.h" +#include "base/transaction.h" +#include "fabric/hb.h" +#include "fabric/partition.h" + + +//========================================================== +// Forward declarations. +// + +struct as_batch_shared_s; +struct as_file_handle_s; +struct cl_msg_s; +struct iudf_origin_s; +struct rw_request_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef bool (*dup_res_done_cb) (struct rw_request_s* rw); +typedef void (*repl_write_done_cb) (struct rw_request_s* rw); +typedef void (*repl_ping_done_cb) (struct rw_request_s* rw); +typedef void (*timeout_done_cb) (struct rw_request_s* rw); + +typedef struct rw_wait_ele_s { + as_transaction tr; // TODO - only needs to be transaction head + struct rw_wait_ele_s* next; +} rw_wait_ele; + + +typedef struct rw_request_s { + + //------------------------------------------------------ + // Matches as_transaction. 
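+	// The fields from msgp through last_update_time deliberately mirror
+	// as_transaction, so the transaction head can be moved between the two
+	// structs with plain copies. Illustrative sketch only (not the actual
+	// helper code):
+	//
+	//   memcpy(rw, tr, offsetof(rw_request, lock));  // hypothetical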
+ // + + struct cl_msg_s* msgp; + uint32_t msg_fields; + + uint8_t origin; + uint8_t from_flags; + + union { + void* any; + struct as_file_handle_s* proto_fd_h; + cf_node proxy_node; + struct iudf_origin_s* iudf_orig; + struct as_batch_shared_s* batch_shared; + } from; + + union { + uint32_t any; + uint32_t batch_index; + uint32_t proxy_tid; + } from_data; + + cf_digest keyd; + + uint64_t start_time; + uint64_t benchmark_time; + + as_partition_reservation rsv; + + uint64_t end_time; + uint8_t result_code; + uint8_t flags; + uint16_t generation; + uint32_t void_time; + uint64_t last_update_time; + + // + // End of as_transaction look-alike. + //------------------------------------------------------ + + pthread_mutex_t lock; + + rw_wait_ele* wait_queue_head; + rw_wait_ele* wait_queue_tail; + uint32_t wait_queue_depth; + + bool is_set_up; // TODO - redundant with timeout_cb + + // Store pickled data, for use in replica write. + uint8_t* pickled_buf; + size_t pickled_sz; + as_rec_props pickled_rec_props; + + // Store ops' responses here. + cf_dyn_buf response_db; + + // Manage responses for duplicate resolution and replica write requests, or + // alternatively, timeouts. + uint32_t tid; + bool dup_res_complete; + bool repl_write_complete; + bool repl_ping_complete; + dup_res_done_cb dup_res_cb; + repl_write_done_cb repl_write_cb; + repl_ping_done_cb repl_ping_cb; + timeout_done_cb timeout_cb; + + // Message being sent to dest_nodes. May be duplicate resolution or replica + // write request. Message is kept in case it needs to be retransmitted. + msg* dest_msg; + + uint64_t xmit_ms; // time of next retransmit + uint32_t retry_interval_ms; // interval to add for next retransmit + + // Destination info for duplicate resolution and replica write requests. + uint32_t n_dest_nodes; + cf_node dest_nodes[AS_CLUSTER_SZ]; + bool dest_complete[AS_CLUSTER_SZ]; + + // Duplicate resolution response messages from nodes with duplicates. + msg* best_dup_msg; + // TODO - could store best dup node-id - worth it? + uint8_t best_dup_result_code; + uint16_t best_dup_gen; + uint64_t best_dup_lut; + + bool tie_was_replicated; // enterprise only + +} rw_request; + + +//========================================================== +// Public API. +// + +rw_request* rw_request_create(); +void rw_request_destroy(rw_request* rw); +void rw_request_wait_q_push(rw_request* rw, as_transaction* tr); +void rw_request_wait_q_push_head(rw_request* rw, as_transaction* tr); + + +static inline void +rw_request_hdestroy(void* pv) +{ + rw_request_destroy((rw_request*)pv); +} + + +static inline void +rw_request_release(rw_request* rw) +{ + if (cf_rc_release(rw) == 0) { + rw_request_destroy(rw); + cf_rc_free(rw); + } +} + + +// See as_transaction_trid(). +static inline uint64_t +rw_request_trid(const rw_request* rw) +{ + if ((rw->msg_fields & AS_MSG_FIELD_BIT_TRID) == 0) { + return 0; + } + + as_msg_field *f = as_msg_field_get(&rw->msgp->msg, AS_MSG_FIELD_TYPE_TRID); + + return cf_swap_from_be64(*(uint64_t*)f->data); +} diff --git a/as/include/transaction/rw_request_hash.h b/as/include/transaction/rw_request_hash.h new file mode 100644 index 00000000..1bee799b --- /dev/null +++ b/as/include/transaction/rw_request_hash.h @@ -0,0 +1,111 @@ +/* + * rw_request_hash.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "citrusleaf/cf_digest.h" + +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef enum { + // These values go on the wire, so mind backward compatibility if changing. + RW_FIELD_OP, + RW_FIELD_RESULT, + RW_FIELD_NAMESPACE, + RW_FIELD_NS_ID, + RW_FIELD_GENERATION, + RW_FIELD_DIGEST, + RW_FIELD_UNUSED_6, + RW_FIELD_UNUSED_7, + RW_FIELD_CLUSTER_KEY, + RW_FIELD_RECORD, + RW_FIELD_TID, + RW_FIELD_VOID_TIME, + RW_FIELD_INFO, + RW_FIELD_UNUSED_13, + RW_FIELD_UNUSED_14, + RW_FIELD_UNUSED_15, + RW_FIELD_LAST_UPDATE_TIME, + RW_FIELD_SET_NAME, + RW_FIELD_KEY, + RW_FIELD_REGIME, + + NUM_RW_FIELDS +} rw_msg_field; + +#define RW_OP_WRITE 1 +#define RW_OP_WRITE_ACK 2 +#define RW_OP_DUP 3 +#define RW_OP_DUP_ACK 4 +#define RW_OP_REPL_CONFIRM 5 +#define RW_OP_REPL_PING 6 +#define RW_OP_REPL_PING_ACK 7 + +#define RW_INFO_XDR 0x0001 +#define RW_INFO_NO_REPL_ACK 0x0002 +#define RW_INFO_NSUP_DELETE 0x0004 +#define RW_INFO_UNUSED_8 0x0008 // was LDT dummy (no data) +#define RW_INFO_UNUSED_10 0x0010 // was LDT parent record +#define RW_INFO_UNUSED_20 0x0020 // was LDT subrecord +#define RW_INFO_UNUSED_40 0x0040 // was LDT ESR +#define RW_INFO_SINDEX_TOUCHED 0x0080 // sindex was touched +#define RW_INFO_UNUSED_100 0x0100 // was LDT multi-op message +#define RW_INFO_UNREPLICATED 0x0200 // enterprise only +#define RW_INFO_TOMBSTONE 0x0400 // enterprise only + +typedef struct rw_request_hkey_s { + uint32_t ns_id; + cf_digest keyd; +} __attribute__((__packed__)) rw_request_hkey; + + +//========================================================== +// Public API. +// + +void as_rw_init(); + +uint32_t rw_request_hash_count(); +transaction_status rw_request_hash_insert(rw_request_hkey* hkey, struct rw_request_s* rw, struct as_transaction_s* tr); +void rw_request_hash_delete(rw_request_hkey* hkey, struct rw_request_s* rw); +struct rw_request_s* rw_request_hash_get(rw_request_hkey* hkey); + +void rw_request_hash_dump(); diff --git a/as/include/transaction/rw_utils.h b/as/include/transaction/rw_utils.h new file mode 100644 index 00000000..d6324bbe --- /dev/null +++ b/as/include/transaction/rw_utils.h @@ -0,0 +1,201 @@ +/* + * rw_utils.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include + +#include "citrusleaf/cf_digest.h" + +#include "msg.h" +#include "node.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/secondary_index.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "transaction/rw_request.h" +#include "transaction/udf.h" + + +//========================================================== +// Forward declarations. +// + +struct as_bin_s; +struct as_index_s; +struct as_index_tree_s; +struct as_msg_s; +struct as_namespace_s; +struct as_remote_record_s; +struct as_storage_rd_s; +struct as_transaction_s; +struct rw_request_s; +struct udf_record_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef struct index_metadata_s { + uint32_t void_time; + uint64_t last_update_time; + uint16_t generation; +} index_metadata; + +typedef struct now_times_s { + uint64_t now_ns; + uint64_t now_ms; +} now_times; + +// For now, use only for as_msg record_ttl special values. +#define TTL_NAMESPACE_DEFAULT 0 +#define TTL_NEVER_EXPIRE ((uint32_t)-1) +#define TTL_DONT_UPDATE ((uint32_t)-2) + + +//========================================================== +// Public API. 
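+// Write transactions validate the client-supplied record_ttl against the
+// special values defined above. Sketch mirroring the is_valid_ttl() helper
+// below (result code from proto.h, used illustratively):
+//
+//   if (! is_valid_ttl(ns, m->record_ttl)) {
+//       cf_warning(AS_RW, "invalid ttl %u", m->record_ttl);
+//       tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER;
+//   }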
+// + +bool validate_delete_durability(struct as_transaction_s* tr); +bool xdr_allows_write(struct as_transaction_s* tr); +void send_rw_messages(struct rw_request_s* rw); +void send_rw_messages_forget(struct rw_request_s* rw); +int repl_state_check(struct as_index_s* r, struct as_transaction_s* tr); +void will_replicate(struct as_index_s* r, struct as_namespace_s* ns); +bool insufficient_replica_destinations(const struct as_namespace_s* ns, uint32_t n_dests); +void finished_replicated(struct as_transaction_s* tr); +void finished_not_replicated(struct rw_request_s* rw); +bool generation_check(const struct as_index_s* r, const struct as_msg_s* m, const struct as_namespace_s* ns); +int set_set_from_msg(struct as_index_s* r, struct as_namespace_s* ns, struct as_msg_s* m); +int set_delete_durablility(const struct as_transaction_s* tr, struct as_storage_rd_s* rd); +bool check_msg_key(struct as_msg_s* m, struct as_storage_rd_s* rd); +bool get_msg_key(struct as_transaction_s* tr, struct as_storage_rd_s* rd); +int handle_msg_key(struct as_transaction_s* tr, struct as_storage_rd_s* rd); +void update_metadata_in_index(struct as_transaction_s* tr, bool increment_generation, struct as_index_s* r); +void pickle_all(struct as_storage_rd_s* rd, struct rw_request_s* rw); +bool write_sindex_update(struct as_namespace_s* ns, const char* set_name, cf_digest* keyd, struct as_bin_s* old_bins, uint32_t n_old_bins, struct as_bin_s* new_bins, uint32_t n_new_bins); +void record_delete_adjust_sindex(struct as_index_s* r, struct as_namespace_s* ns); +void delete_adjust_sindex(struct as_storage_rd_s* rd); +void remove_from_sindex(struct as_namespace_s* ns, const char* set_name, cf_digest* keyd, struct as_bin_s* bins, uint32_t n_bins); +bool xdr_must_ship_delete(struct as_namespace_s* ns, bool is_nsup_delete, bool is_xdr_op); + + +// TODO - rename as as_record_... and move to record.c? +static inline bool +record_has_sindex(const as_record* r, as_namespace* ns) +{ + if (! as_sindex_ns_has_sindex(ns)) { + return false; + } + + as_set* set = as_namespace_get_record_set(ns, r); + + return set ? set->n_sindexes != 0 : ns->n_setless_sindexes != 0; +} + + +static inline bool +respond_on_master_complete(as_transaction* tr) +{ + return tr->origin == FROM_CLIENT && + TR_WRITE_COMMIT_LEVEL(tr) == AS_WRITE_COMMIT_LEVEL_MASTER; +} + + +static inline void +destroy_stack_bins(as_bin* stack_bins, uint32_t n_bins) +{ + for (uint32_t i = 0; i < n_bins; i++) { + as_bin_particle_destroy(&stack_bins[i], true); + } +} + + +// Not a nice way to specify a read-all op - dictated by backward compatibility. +// Note - must check this before checking for normal read op! +static inline bool +op_is_read_all(as_msg_op* op, as_msg* m) +{ + return op->name_sz == 0 && op->op == AS_MSG_OP_READ && + (m->info1 & AS_MSG_INFO1_GET_ALL) != 0; +} + + +static inline bool +is_valid_ttl(as_namespace* ns, uint32_t ttl) +{ + // Note - for now, ttl must be as_msg record_ttl. + // Note - ttl <= ns->max_ttl includes ttl == TTL_NAMESPACE_DEFAULT. + return ttl <= ns->max_ttl || + ttl == TTL_NEVER_EXPIRE || ttl == TTL_DONT_UPDATE; +} + + +static inline void +clear_delete_response_metadata(as_transaction* tr) +{ + // If write became delete, respond to origin with no metadata. + if ((tr->flags & AS_TRANSACTION_FLAG_IS_DELETE) != 0) { + tr->generation = 0; + tr->void_time = 0; + tr->last_update_time = 0; + } +} + + +//========================================================== +// Private API - for enterprise separation only. 
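+// The declarations below get community-edition stubs in rw_utils_ce.c (see
+// the TRANSACTION_SOURCES list in as/src/Makefile) and full implementations
+// in the enterprise repo.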
+// + +bool create_only_check(const struct as_index_s* r, const struct as_msg_s* m); +void write_delete_record(struct as_index_s* r, struct as_index_tree_s* tree); + +udf_optype udf_finish_delete(struct udf_record_s* urecord); + +uint32_t dup_res_pack_repl_state_info(const struct as_index_s* r, struct as_namespace_s* ns); +uint32_t dup_res_pack_info(const struct as_index_s* r, struct as_namespace_s* ns); +bool dup_res_should_retry_transaction(struct rw_request_s* rw, uint32_t result_code); +void dup_res_handle_tie(struct rw_request_s* rw, const msg* m, uint32_t result_code); +void apply_if_tie(struct rw_request_s* rw); +void dup_res_translate_result_code(struct rw_request_s* rw); +bool dup_res_ignore_pickle(const uint8_t* buf, uint32_t info); +void dup_res_init_repl_state(struct as_remote_record_s* rr, uint32_t info); + +void repl_write_flag_pickle(const struct as_transaction_s* tr, const uint8_t* buf, uint32_t* info); +bool repl_write_pickle_is_drop(const uint8_t* buf, uint32_t info); +void repl_write_init_repl_state(struct as_remote_record_s* rr, bool from_replica); +conflict_resolution_pol repl_write_conflict_resolution_policy(const struct as_namespace_s* ns); +bool repl_write_should_retransmit_replicas(struct rw_request_s* rw, uint32_t result_code); +void repl_write_send_confirmation(struct rw_request_s* rw); +void repl_write_handle_confirmation(msg* m); + +int record_replace_check(struct as_index_s* r, struct as_namespace_s* ns); +void record_replaced(struct as_index_s* r, struct as_remote_record_s* rr); diff --git a/as/include/transaction/udf.h b/as/include/transaction/udf.h new file mode 100644 index 00000000..cb8a1668 --- /dev/null +++ b/as/include/transaction/udf.h @@ -0,0 +1,98 @@ +/* + * udf.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "aerospike/as_aerospike.h" +#include "aerospike/as_list.h" + +#include "base/predexp.h" +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; +struct predexp_eval_base_s; + + +//========================================================== +// Typedefs & constants. 
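+// iudf_origin (below) carries everything an internally-originated UDF
+// transaction needs - the UDF definition, an optional predexp filter, and a
+// completion callback. Origination sketch (callback and udata names are
+// hypothetical):
+//
+//   iudf_origin origin = { .cb = job_done_cb, .udata = job };
+//   udf_def_init_from_msg(&origin.def, &tr);  // declared below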
+// + +typedef enum { + UDF_OPTYPE_NONE, + UDF_OPTYPE_WAITING, + UDF_OPTYPE_READ, + UDF_OPTYPE_WRITE, + UDF_OPTYPE_DELETE +} udf_optype; + +#define UDF_MAX_STRING_SZ 128 + +typedef struct udf_def_s { + char filename[UDF_MAX_STRING_SZ]; + char function[UDF_MAX_STRING_SZ]; + as_list* arglist; + uint8_t type; +} udf_def; + +typedef int (*iudf_cb)(void* udata, int retcode); + +typedef struct iudf_origin_s { + udf_def def; + struct predexp_eval_base_s* predexp; + iudf_cb cb; + void* udata; +} iudf_origin; + + +//========================================================== +// Public API. +// + +static inline void +iudf_origin_destroy(iudf_origin* origin) +{ + if (origin->def.arglist) { + as_list_destroy(origin->def.arglist); + } + + if (origin->predexp) { + predexp_destroy(origin->predexp); + } +} + +void as_udf_init(); +udf_def* udf_def_init_from_msg(udf_def* def, const struct as_transaction_s* tr); + +transaction_status as_udf_start(struct as_transaction_s* tr); + +extern as_aerospike g_as_aerospike; diff --git a/as/include/transaction/write.h b/as/include/transaction/write.h new file mode 100644 index 00000000..dfb5f210 --- /dev/null +++ b/as/include/transaction/write.h @@ -0,0 +1,43 @@ +/* + * write.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; + + +//========================================================== +// Public API. +// + +transaction_status as_write_start(struct as_transaction_s* tr); diff --git a/as/src/Makefile b/as/src/Makefile new file mode 100644 index 00000000..0a775f91 --- /dev/null +++ b/as/src/Makefile @@ -0,0 +1,192 @@ +# Aerospike Server +# Makefile + +DEPTH = ../.. +include $(DEPTH)/make_in/Makefile.in + +# Use SystemTap? [By default, no.] 
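+# (Override on the command line - e.g. "make USE_SYSTEMTAP=1" - to compile in
+# the probes generated from base/probes.d by the dtrace rules at the bottom
+# of this file.)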
+USE_SYSTEMTAP = 0 + +ifeq ($(USE_SYSTEMTAP),1) +CFLAGS += -DUSE_SYSTEMTAP +endif + +ifeq ($(USE_SYSTEMTAP),1) +SYSTEMTAP_PROBES_D = base/probes.d +SYSTEMTAP_PROBES_H = $(GEN_DIR)/probes.h +SYSTEMTAP_PROBES_O = $(OBJECT_DIR)/probes.o +endif + +ifeq ($(USE_EE),1) + include $(EEREPO)/as/make_in/Makefile.vars + include $(EEREPO)/xdr/make_in/Makefile.vars +endif + +BASE_HEADERS += aggr.h batch.h cdt.h cfg.h datamodel.h features.h index.h job_manager.h json_init.h +BASE_HEADERS += monitor.h packet_compression.h +BASE_HEADERS += particle.h particle_blob.h particle_integer.h predexp.h +BASE_HEADERS += proto.h rec_props.h scan.h secondary_index.h security.h security_config.h stats.h system_metadata.h +BASE_HEADERS += thr_batch.h thr_info.h thr_query.h thr_sindex.h +BASE_HEADERS += thr_tsvc.h ticker.h transaction.h transaction_policy.h truncate.h +BASE_HEADERS += udf_aerospike.h udf_arglist.h udf_cask.h +BASE_HEADERS += udf_memtracker.h udf_record.h udf_timer.h +BASE_HEADERS += xdr_serverside.h xdr_config.h + +BASE_SOURCES += aggr.c as.c batch.c bin.c cdt.c cfg.c index.c job_manager.c json_init.c +BASE_SOURCES += monitor.c namespace.c packet_compression.c +BASE_SOURCES += particle.c particle_blob.c particle_float.c particle_geojson.c particle_integer.c +BASE_SOURCES += particle_list.c particle_map.c particle_string.c predexp.c +BASE_SOURCES += proto.c rec_props.c record.c scan.c signal.c secondary_index.c system_metadata.c +BASE_SOURCES += thr_batch.c thr_demarshal.c thr_info.c thr_info_port.c thr_nsup.c +BASE_SOURCES += thr_query.c thr_sindex.c thr_tsvc.c ticker.c transaction.c truncate.c +BASE_SOURCES += udf_aerospike.c udf_arglist.c udf_cask.c +BASE_SOURCES += udf_memtracker.c udf_record.c udf_timer.c +BASE_SOURCES += xdr_config.c + +ifneq ($(USE_EE),1) + BASE_SOURCES += cfg_ce.c + BASE_SOURCES += features_ce.c + BASE_SOURCES += index_ce.c + BASE_SOURCES += namespace_ce.c + BASE_SOURCES += record_ce.c + BASE_SOURCES += security_ce.c + BASE_SOURCES += truncate_ce.c + BASE_SOURCES += xdr_serverside_stubs.c +endif + +FABRIC_HEADERS += clustering.h endpoint.h exchange.h fabric.h hb.h hlc.h meta_batch.h migrate.h partition.h partition_balance.h roster.h skew_monitor.h +FABRIC_SOURCES += clustering.c endpoint.c exchange.c fabric.c hb.c hlc.c migrate.c partition.c partition_balance.c skew_monitor.c +ifneq ($(USE_EE),1) + FABRIC_SOURCES += meta_batch_ce.c + FABRIC_SOURCES += migrate_ce.c + FABRIC_SOURCES += partition_balance_ce.c + FABRIC_SOURCES += partition_ce.c + FABRIC_SOURCES += roster_ce.c +endif + +GEOSPATIAL_HEADERS += geospatial.h +GEOSPATIAL_SOURCES += geospatial.cc geojson.cc + +STORAGE_HEADERS += storage.h drv_ssd.h +STORAGE_SOURCES += storage.c drv_memory.c drv_ssd.c +ifneq ($(USE_EE),1) + STORAGE_SOURCES += drv_memory_ce.c + STORAGE_SOURCES += drv_ssd_ce.c +endif + +TRANSACTION_HEADERS += delete.h duplicate_resolve.h proxy.h re_replicate.h read.h replica_ping.h replica_write.h rw_request_hash.h rw_request.h rw_utils.h udf.h write.h +TRANSACTION_SOURCES += delete.c duplicate_resolve.c proxy.c read.c replica_write.c rw_request_hash.c rw_request.c rw_utils.c udf.c write.c +ifneq ($(USE_EE),1) + TRANSACTION_SOURCES += delete_ce.c + TRANSACTION_SOURCES += re_replicate_ce.c + TRANSACTION_SOURCES += replica_ping_ce.c + TRANSACTION_SOURCES += rw_utils_ce.c +endif + +HEADERS = $(BASE_HEADERS:%=base/%) $(FABRIC_HEADERS:%=fabric/%) $(STORAGE_HEADERS:%=storage/%) $(GEOSPATIAL_HEADERS:%=geospatial/%) $(TRANSACTION_HEADERS:%=transaction/%) +SOURCES = $(BASE_SOURCES:%=base/%) $(FABRIC_SOURCES:%=fabric/%) 
$(STORAGE_SOURCES:%=storage/%) $(GEOSPATIAL_SOURCES:%=geospatial/%) $(TRANSACTION_SOURCES:%=transaction/%) + +SERVER = $(BIN_DIR)/asd + +INCLUDES += $(INCLUDE_DIR:%=-I%) +INCLUDES += -I$(CF)/include +INCLUDES += -I$(AI)/include +INCLUDES += -I$(COMMON)/target/$(PLATFORM)/include +INCLUDES += -I$(MOD_LUA)/target/$(PLATFORM)/include +INCLUDES += -I$(JANSSON)/src +INCLUDES += -I$(S2) +INCLUDES += -I$(XDR_INCLUDES) + +ifeq ($(USE_LUAJIT),1) + INCLUDES += -I$(LUAJIT)/src +else + INCLUDE_LUA_5_1 = /usr/include/lua5.1 + ifneq ($(wildcard $(INCLUDE_LUA_5_1)),) + INCLUDES += -I$(INCLUDE_LUA_5_1) + LUA_SUFFIX = 5.1 + endif +endif + +AS_LIBRARIES += $(LIBRARY_DIR)/libcf.a +AS_LIBRARIES += $(LIBRARY_DIR)/libai.a +AS_LIBRARIES += $(MOD_LUA)/target/$(PLATFORM)/lib/libmod_lua.a +AS_LIBRARIES += $(COMMON)/target/$(PLATFORM)/lib/libaerospike-common.a + +ifeq ($(DOPROFILE),1) + LIBRARIES += -pg -fprofile-arcs -lgcov +endif + +# Add either the LuaJIT or Lua library +ifeq ($(USE_LUAJIT),1) + ifeq ($(LD_LUAJIT),static) + AS_LIBRARIES += $(LUAJIT)/src/libluajit.a + else + LIBRARIES += -L$(LUAJIT)/src -lluajit + endif +else + ifeq ($(LD_LUA),static) + # Find and add the static Lua library. + AS_LIBRARIES += $(or \ + $(wildcard /usr/local/lib/liblua.a), \ + $(wildcard /usr/lib64/liblua$(LUA_SUFFIX).a), \ + $(wildcard /usr/lib/x86_64-linux-gnu/liblua$(LUA_SUFFIX).a), \ + $(wildcard /usr/lib/liblua.a), \ + $(wildcard /usr/lib/powerpc64le-linux-gnu/liblua.a), \ + $(error Cannot find "liblua.a")) + else + LIBRARIES += -llua$(LUA_SUFFIX) + endif +endif + +ifeq ($(LD_JANSSON),static) + AS_LIBRARIES += $(JANSSON)/src/.libs/libjansson.a +else + LIBRARIES += -L$(JANSSON)/src/.libs -ljansson +endif + +LIBRARIES += -L$(S2) -ls2 -ls2cellid -lgoogle-strings -lgoogle-base \ + -lgoogle-util-coding -lgoogle-util-math -lstdc++ + +LIBRARIES := $(AS_LIBRARIES) $(LIBRARIES) + +AS_LIB_DEPS = $(AS_LIBRARIES) + +OBJECTS.c = $(SOURCES:%.c=$(OBJECT_DIR)/%.o) $(VERSION_OBJ) $(SYSTEMTAP_PROBES_O) +OBJECTS = $(OBJECTS.c:%.cc=$(OBJECT_DIR)/%.o) +DEPENDENCIES = $(OBJECTS:%.o=%.d) +DEPENDENCIES += $(XDR_DEPENDENCIES) + +.PHONY: all +all: $(SYSTEMTAP_PROBES_H) $(SERVER) + +.PHONY: clean +clean: + $(RM) $(OBJECTS) $(SERVER){,.stripped} + $(RM) $(DEPENDENCIES) + +# Emacs syntax check target.CHK_SOURCES is set by emacs to the files being edited. +.PHONY: check-syntax +check-syntax: + $(CC) -Wall -Wextra -pedantic -fsyntax-only $(CHK_SOURCES) + +$(SERVER): $(OBJECTS) $(AS_LIB_DEPS) $(XDR_LIBRARY) $(XDR_ALL_OBJECTS) + $(LINK.c) -o $(SERVER) $(OBJECTS) $(XDR_ALL_OBJECTS) $(LIBRARIES) + +ifeq ($(USE_EE),1) + include $(XDR)/make_in/Makefile.targets +endif + +include $(DEPTH)/make_in/Makefile.targets + +# Ignore S2 induced warnings +S2_WNO = -Wno-unused-local-typedefs -Wno-deprecated -Wno-sign-compare +$(OBJECT_DIR)/geospatial/%.o: CXXFLAGS += $(S2_WNO) +$(OBJECT_DIR)/geospatial/%.o: CFLAGS := $(filter-out -std=gnu99,$(CFLAGS)) + +ifeq ($(USE_SYSTEMTAP),1) +$(SYSTEMTAP_PROBES_H): $(SYSTEMTAP_PROBES_D) + dtrace -h -s $< -o $@ + +$(SYSTEMTAP_PROBES_O): $(SYSTEMTAP_PROBES_D) + dtrace -G -s $< -o $@ +endif diff --git a/as/src/base/aggr.c b/as/src/base/aggr.c new file mode 100644 index 00000000..88735ce9 --- /dev/null +++ b/as/src/base/aggr.c @@ -0,0 +1,337 @@ +/* + * aggr.c + * + * Copyright (C) 2014-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "base/aggr.h" + +#include +#include +#include +#include + + +#include "aerospike/as_val.h" +#include "aerospike/mod_lua.h" +#include "citrusleaf/cf_ll.h" + +#include "fault.h" + +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/transaction.h" +#include "base/udf_arglist.h" +#include "base/udf_memtracker.h" +#include "base/udf_record.h" +#include "fabric/partition.h" + + +#define AS_AGGR_ERR -1 +#define AS_AGGR_OK 0 + +/* + * Aggregation Stream Object + */ +// ************************************************************************************************** +typedef struct { + // Iteration + cf_ll_iterator * iter; + as_index_keys_arr * keys_arr; + int keys_arr_offset; + + // Record + bool rec_open; // Record in stream open + as_rec * urec; // UDF record cloak + as_namespace * ns; + as_partition_reservation * rsv; // Reservation Object + + // Module Data + as_aggr_call * call; // Aggregation info + void * udata; // Execution context +} aggr_state; + +static as_partition_reservation * +ptn_reserve(aggr_state *astate, uint32_t pid, as_partition_reservation *rsv) +{ + as_aggr_call *call = astate->call; + if (call && call->aggr_hooks && call->aggr_hooks->ptn_reserve) { + return call->aggr_hooks->ptn_reserve(astate->udata, astate->ns, pid, rsv); + } + return NULL; +} + +static void +ptn_release(aggr_state *astate) +{ + as_aggr_call *call = astate->call; + if (call && call->aggr_hooks && call->aggr_hooks->ptn_release) { + call->aggr_hooks->ptn_release(astate->udata, astate->rsv); + } +} + +#if 0 +// In case we ever need this hook... +static void +set_error(aggr_state *astate, int err) +{ + as_aggr_call *call = astate->call; + if (call && call->aggr_hooks && call->aggr_hooks->set_error) { + call->aggr_hooks->set_error(astate->udata, err); + } +} +#endif // 0 + +static bool +pre_check(aggr_state *astate, void *skey) +{ + as_aggr_call *call = astate->call; + if (call && call->aggr_hooks && call->aggr_hooks->pre_check) { + return call->aggr_hooks->pre_check(astate->udata, as_rec_source(astate->urec), skey); + } + return true; // if not defined pre_check succeeds +} + +static int +aopen(aggr_state *astate, const cf_digest *digest) +{ + udf_record * urecord = as_rec_source(astate->urec); + as_index_ref * r_ref = urecord->r_ref; + as_transaction * tr = urecord->tr; + + int pid = as_partition_getid(digest); + urecord->keyd = *digest; + + astate->rsv = ptn_reserve(astate, pid, &tr->rsv); + if (!astate->rsv) { + cf_debug(AS_AGGR, "Reservation not done for partition %d", pid); + return -1; + } + + // NB: Partial Initialization due to heaviness. 
Not everything is needed.
+	// TODO - make such initialization a common utility.
+	tr->rsv.ns = astate->rsv->ns;
+	tr->rsv.p = astate->rsv->p;
+	tr->rsv.tree = astate->rsv->tree;
+	tr->keyd = urecord->keyd;
+
+	r_ref->skip_lock = false;
+	if (udf_record_open(urecord) == 0) {
+		astate->rec_open = true;
+		return 0;
+	}
+	ptn_release(astate);
+	return -1;
+}
+
+void
+aclose(aggr_state *astate)
+{
+	// Bypass the direct destroy here because we must avoid reducing the
+	// ref count. This rec (the query_record implementation of as_rec) is
+	// ref-counted when passed from here to Lua. If Lua accesses it after
+	// moving to the next element in the stream, it does so at its own
+	// risk - the record may have changed under the hood.
+	if (astate->rec_open) {
+		udf_record_close(as_rec_source(astate->urec));
+		ptn_release(astate);
+		astate->rec_open = false;
+	}
+}
+
+void
+acleanup(aggr_state *astate)
+{
+	if (astate->iter) {
+		cf_ll_releaseIterator(astate->iter);
+		astate->iter = NULL;
+	}
+	aclose(astate);
+
+	as_rec_destroy(astate->urec);
+}
+
+// **************************************************************************************************
+
+/*
+ * Aggregation Input Stream
+ */
+// **************************************************************************************************
+cf_digest *
+get_next(aggr_state *astate)
+{
+	astate->keys_arr_offset++;
+	if (!astate->keys_arr || (astate->keys_arr_offset == astate->keys_arr->num)) {
+
+		cf_ll_element * ele = cf_ll_getNext(astate->iter);
+
+		// If NULL, or the number of elements is 0 - no holes expected.
+		if (!ele) {
+			return NULL;
+		}
+
+		astate->keys_arr = ((as_index_keys_ll_element*)ele)->keys_arr;
+		if (!astate->keys_arr || (astate->keys_arr->num < 1)) {
+			astate->keys_arr = NULL;
+			return NULL;
+		}
+
+		astate->keys_arr_offset = 0;
+	}
+	return &astate->keys_arr->pindex_digs[astate->keys_arr_offset];
+}
+
+// Operates only on the record the stream's as_val points to, and updates
+// the references. This function has to acquire the partition reservation
+// and also the object lock, so if the UDF misbehaves, the object lock can
+// be held for a long time - there probably needs to be a timeout mechanism
+// here.
+static as_val *
+istream_read(const as_stream *s)
+{
+	aggr_state *astate = as_stream_source(s);
+
+	aclose(astate);
+
+	// Iterate through the stream to get the next digest, and populate the
+	// record with it.
+	while (!astate->rec_open) {
+
+		if (get_next(astate) == NULL) {
+			return NULL;
+		}
+
+		// aopen() returns 0 on success - if the record opens but fails the
+		// pre-check, close it and keep scanning.
+		if (!aopen(astate, &astate->keys_arr->pindex_digs[astate->keys_arr_offset])) {
+			if (!pre_check(astate, &astate->keys_arr->sindex_keys[astate->keys_arr_offset])) {
+				aclose(astate);
+			}
+		}
+	}
+	return (as_val *)astate->urec;
+}
+
+const as_stream_hooks istream_hooks = {
+	.destroy = NULL,
+	.read = istream_read,
+	.write = NULL
+};
+// **************************************************************************************************
+
+
+
+/*
+ * Aggregation Output Stream
+ */
+// **************************************************************************************************
+as_stream_status
+ostream_write(const as_stream *s, as_val *val)
+{
+	aggr_state *astate = (aggr_state *)as_stream_source(s);
+	return astate->call->aggr_hooks->ostream_write(astate->udata, val);
+}
+
+const as_stream_hooks ostream_hooks = {
+	.destroy = NULL,
+	.read = NULL,
+	.write = ostream_write
+};
+// **************************************************************************************************
+
+
+/*
+ * Aggregation AS_AEROSPIKE interface for LUA
+ */
+// **************************************************************************************************
+static int
+as_aggr_aerospike_log(const as_aerospike * a, const char * file, const int line, const int lvl, const char * msg)
+{
+	cf_fault_event(AS_AGGR, lvl, file, line, "%s", (char *) msg);
+	return 0;
+}
+
+static const as_aerospike_hooks as_aggr_aerospike_hooks = {
+	.rec_update = NULL,
+	.rec_remove = NULL,
+	.rec_exists = NULL,
+	.log = as_aggr_aerospike_log,
+	.get_current_time = NULL,
+	.destroy = NULL
+};
+// **************************************************************************************************
+
+
+
+int
+as_aggr_process(as_namespace *ns, as_aggr_call * ag_call, cf_ll * ap_recl, void * udata, as_result * ap_res)
+{
+	as_index_ref r_ref;
+	r_ref.skip_lock = false;
+	as_storage_rd rd;
+	bzero(&rd, sizeof(as_storage_rd));
+	as_transaction tr;
+
+
+	udf_record urecord;
+	udf_record_init(&urecord, false);
+	urecord.tr = &tr;
+	urecord.r_ref = &r_ref;
+	urecord.rd = &rd;
+	as_rec * urec = as_rec_new(&urecord, &udf_record_hooks);
+
+	aggr_state astate = {
+		.iter = cf_ll_getIterator(ap_recl, true /*forward*/),
+		.urec = urec,
+		.keys_arr = NULL,
+		.keys_arr_offset = 0,
+		.call = ag_call,
+		.udata = udata,
+		.rec_open = false,
+		.rsv = &tr.rsv,
+		.ns = ns
+	};
+
+	if (!astate.iter) {
+		cf_warning(AS_AGGR, "Could not set up iterator .. possibly out of memory .. 
Aborting Query !!"); + as_rec_destroy(urec); + return AS_AGGR_ERR; + } + + as_aerospike as; + as_aerospike_init(&as, NULL, &as_aggr_aerospike_hooks); + + // Input Stream + as_stream istream; + as_stream_init(&istream, &astate, &istream_hooks); + + // Output stream + as_stream ostream; + as_stream_init(&ostream, &astate, &ostream_hooks); + + as_udf_context ctx = { + .as = &as, + .timer = NULL, + .memtracker = NULL + }; + int ret = as_module_apply_stream(&mod_lua, &ctx, ag_call->def.filename, ag_call->def.function, &istream, ag_call->def.arglist, &ostream, ap_res); + + acleanup(&astate); + return ret; +} diff --git a/as/src/base/as.c b/as/src/base/as.c new file mode 100644 index 00000000..88fe431b --- /dev/null +++ b/as/src/base/as.c @@ -0,0 +1,520 @@ +/* + * as.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "daemon.h" +#include "fault.h" +#include "hardware.h" +#include "tls.h" + +#include "base/batch.h" +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/json_init.h" +#include "base/monitor.h" +#include "base/scan.h" +#include "base/secondary_index.h" +#include "base/security.h" +#include "base/system_metadata.h" +#include "base/stats.h" +#include "base/thr_batch.h" +#include "base/thr_info.h" +#include "base/thr_info_port.h" +#include "base/thr_sindex.h" +#include "base/thr_tsvc.h" +#include "base/ticker.h" +#include "base/xdr_serverside.h" +#include "fabric/clustering.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/hb.h" +#include "fabric/migrate.h" +#include "fabric/skew_monitor.h" +#include "storage/storage.h" +#include "transaction/proxy.h" +#include "transaction/rw_request_hash.h" +#include "transaction/udf.h" + + +//========================================================== +// Constants. +// + +// String constants in version.c, generated by make. +extern const char aerospike_build_type[]; +extern const char aerospike_build_id[]; + +// Command line options for the Aerospike server. +static const struct option CMD_OPTS[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'v' }, + { "config-file", required_argument, NULL, 'f' }, + { "foreground", no_argument, NULL, 'd' }, + { "fgdaemon", no_argument, NULL, 'F' }, + { "cold-start", no_argument, NULL, 'c' }, + { "instance", required_argument, NULL, 'n' }, + { NULL, 0, NULL, 0 } +}; + +static const char HELP[] = + "\n" + "Aerospike server installation installs the script /etc/init.d/aerospike which\n" + "is normally used to start and stop the server. 
The script is also found as\n" + "as/etc/init-script in the source tree.\n" + "\n" + "asd informative command-line options:\n" + "\n" + "--help" + "\n" + "Print this message and exit.\n" + "\n" + "--version" + "\n" + "Print edition and build version information and exit.\n" + "\n" + "asd runtime command-line options:\n" + "\n" + "--config-file " + "\n" + "Specify the location of the Aerospike server config file. If this option is not\n" + "specified, the default location /etc/aerospike/aerospike.conf is used.\n" + "\n" + "--foreground" + "\n" + "Specify that Aerospike not be daemonized. This is useful for running Aerospike\n" + "in gdb. Alternatively, add 'run-as-daemon false' in the service context of the\n" + "Aerospike config file.\n" + "\n" + "--fgdaemon" + "\n" + "Specify that Aerospike is to be run as a \"new-style\" (foreground) daemon. This\n" + "is useful for running Aerospike under systemd or Docker.\n" + "\n" + "--cold-start" + "\n" + "(Enterprise edition only.) At startup, force the Aerospike server to read all\n" + "records from storage devices to rebuild the index.\n" + "\n" + "--instance <0-15>" + "\n" + "(Enterprise edition only.) If running multiple instances of Aerospike on one\n" + "machine (not recommended), each instance must be uniquely designated via this\n" + "option.\n" + ; + +static const char USAGE[] = + "\n" + "asd informative command-line options:\n" + "[--help]\n" + "[--version]\n" + "\n" + "asd runtime command-line options:\n" + "[--config-file ] " + "[--foreground] " + "[--fgdaemon] " + "[--cold-start] " + "[--instance <0-15>]\n" + ; + +static const char DEFAULT_CONFIG_FILE[] = "/etc/aerospike/aerospike.conf"; + +static const char SMD_DIR_NAME[] = "/smd"; + + +//========================================================== +// Globals. +// + +pthread_mutex_t g_main_deadlock = PTHREAD_MUTEX_INITIALIZER; +bool g_startup_complete = false; +bool g_shutdown_started = false; + + +//========================================================== +// Forward declarations. +// + +// signal.c, thr_demarshal.c and thr_nsup.c don't have header files. +extern void as_signal_setup(); +extern void as_demarshal_start(); +extern void as_nsup_start(); + +static void write_pidfile(char *pidfile); +static void validate_directory(const char *path, const char *log_tag); +static void validate_smd_directory(); + + +//========================================================== +// Aerospike server entry point. +// + +int +main(int argc, char **argv) +{ + g_start_ms = cf_getms(); + + // Initialize memory allocation. + cf_alloc_init(); + + // Initialize fault management framework. + cf_fault_init(); + + // Setup signal handlers. + as_signal_setup(); + + // Initialize TLS library. + tls_check_init(); + + int opt; + int opt_i; + const char *config_file = DEFAULT_CONFIG_FILE; + bool run_in_foreground = false; + bool new_style_daemon = false; + bool cold_start_cmd = false; + uint32_t instance = 0; + + // Parse command line options. + while ((opt = getopt_long(argc, argv, "", CMD_OPTS, &opt_i)) != -1) { + switch (opt) { + case 'h': + // printf() since we want stdout and don't want cf_fault's prefix. + printf("%s\n", HELP); + return 0; + case 'v': + // printf() since we want stdout and don't want cf_fault's prefix. 
+ printf("%s build %s\n", aerospike_build_type, aerospike_build_id); + return 0; + case 'f': + config_file = cf_strdup(optarg); + break; + case 'F': + // As a "new-style" daemon(*), asd runs in the foreground and + // ignores the following configuration items: + // - user ('user') + // - group ('group') + // - PID file ('pidfile') + // + // If ignoring configuration items, or if the 'console' sink is not + // specified, warnings will appear in stderr. + // + // (*) http://0pointer.de/public/systemd-man/daemon.html#New-Style%20Daemons + run_in_foreground = true; + new_style_daemon = true; + break; + case 'd': + run_in_foreground = true; + break; + case 'c': + cold_start_cmd = true; + break; + case 'n': + instance = (uint32_t)strtol(optarg, NULL, 0); + break; + default: + // fprintf() since we don't want cf_fault's prefix. + fprintf(stderr, "%s\n", USAGE); + return 1; + } + } + + // Set all fields in the global runtime configuration instance. This parses + // the configuration file, and creates as_namespace objects. (Return value + // is a shortcut pointer to the global runtime configuration instance.) + as_config *c = as_config_init(config_file); + + // Detect NUMA topology and, if requested, prepare for CPU and NUMA pinning. + cf_topo_config(c->auto_pin, (cf_topo_numa_node_index)instance, + &c->service.bind); + + // Perform privilege separation as necessary. If configured user & group + // don't have root privileges, all resources created or reopened past this + // point must be set up so that they are accessible without root privileges. + // If not, the process will self-terminate with (hopefully!) a log message + // indicating which resource is not set up properly. + if (0 != c->uid && 0 == geteuid()) { + if (! new_style_daemon) { + // To see this log, change NO_SINKS_LIMIT in fault.c: + cf_info(AS_AS, "privsep to %d %d", c->uid, c->gid); + cf_process_privsep(c->uid, c->gid); + } + else { + cf_warning(AS_AS, "will not do privsep in new-style daemon mode"); + } + } + + // + // All resources such as files, devices, and shared memory must be created + // or reopened below this line! (The configuration file is the only thing + // that must be opened above, in order to parse the user & group.) + //========================================================================== + + // A "new-style" daemon expects console logging to be configured. (If not, + // log messages won't be seen via the standard path.) + if (new_style_daemon) { + if (! cf_fault_console_is_held()) { + cf_warning(AS_AS, "in new-style daemon mode, console logging is not configured"); + } + } + + // Activate log sinks. Up to this point, 'cf_' log output goes to stderr, + // filtered according to NO_SINKS_LIMIT in fault.c. After this point, 'cf_' + // log output will appear in all log file sinks specified in configuration, + // with specified filtering. If console sink is specified in configuration, + // 'cf_' log output will continue going to stderr, but filtering will switch + // from NO_SINKS_LIMIT to that specified in console sink configuration. + if (0 != cf_fault_sink_activate_all_held()) { + // Specifics of failure are logged in cf_fault_sink_activate_all_held(). + cf_crash_nostack(AS_AS, "can't open log sink(s)"); + } + + // Daemonize asd if specified. After daemonization, output to stderr will no + // longer appear in terminal. Instead, check /tmp/aerospike-console. + // for console output. + if (! run_in_foreground && c->run_as_daemon) { + // Don't close any open files when daemonizing. 
At this point only log + // sink files are open - instruct cf_process_daemonize() to ignore them. + int open_fds[CF_FAULT_SINKS_MAX]; + int num_open_fds = cf_fault_sink_get_fd_list(open_fds); + + cf_process_daemonize(open_fds, num_open_fds); + } + + // Log which build this is - should be the first line in the log file. + cf_info(AS_AS, "<><><><><><><><><><> %s build %s <><><><><><><><><><>", + aerospike_build_type, aerospike_build_id); + + // Includes echoing the configuration file to log. + as_config_post_process(c, config_file); + + xdr_config_post_process(); + + // If we allocated a non-default config file name, free it. + if (config_file != DEFAULT_CONFIG_FILE) { + cf_free((void*)config_file); + } + + // Write the pid file, if specified. + if (! new_style_daemon) { + write_pidfile(c->pidfile); + } + else { + if (c->pidfile) { + cf_warning(AS_AS, "will not write PID file in new-style daemon mode"); + } + } + + // Check that required directories are set up properly. + validate_directory(c->work_directory, "work"); + validate_directory(c->mod_lua.system_path, "Lua system"); + validate_directory(c->mod_lua.user_path, "Lua user"); + validate_smd_directory(); + + // Initialize subsystems. At this point we're allocating local resources, + // starting worker threads, etc. (But no communication with other server + // nodes or clients yet.) + + as_json_init(); // Jansson JSON API used by System Metadata + as_smd_init(); // System Metadata first - others depend on it + as_index_tree_gc_init(); // thread to purge dropped index trees + as_sindex_thr_init(); // defrag secondary index (ok during population) + + // Initialize namespaces. Each namespace decides here whether it will do a + // warm or cold start. Index arenas, partition structures and index tree + // structures are initialized. Secondary index system metadata is restored. + as_namespaces_init(cold_start_cmd, instance); + + // Initialize the storage system. For cold starts, this includes reading + // all the objects off the drives. This may block for a long time. The + // defrag subsystem starts operating at the end of this call. + as_storage_init(); + + // Migrate memory to correct NUMA node (includes restored index arenas). + cf_topo_migrate_memory(); + + // Populate all secondary indexes. This may block for a long time. + as_sindex_boot_populateall(); + + cf_info(AS_AS, "initializing services..."); + + as_netio_init(); + as_security_init(); // security features + as_tsvc_init(); // all transaction handling + as_hb_init(); // inter-node heartbeat + as_skew_monitor_init(); // clock skew monitor + as_fabric_init(); // inter-node communications + as_exchange_init(); // initialize the cluster exchange subsystem + as_clustering_init(); // clustering-v5 start + as_info_init(); // info transaction handling + as_migrate_init(); // move data between nodes + as_proxy_init(); // do work on behalf of others + as_rw_init(); // read & write service + as_query_init(); // query transaction handling + as_udf_init(); // user-defined functions + as_scan_init(); // scan a namespace or set + as_batch_init(); // batch transaction handling + as_batch_direct_init(); // low priority transaction handling + as_xdr_init(); // cross data-center replication + as_mon_init(); // monitor + + // Wait for enough available storage. We've been defragging all along, but + // here we wait until it's enough. This may block for a long time. + as_storage_wait_for_defrag(); + + // Start subsystems. 
At this point we may begin communicating with other + // cluster nodes, and ultimately with clients. + + as_smd_start(g_smd); // enables receiving cluster state change events + as_fabric_start(); // may send & receive fabric messages + as_xdr_start(); // XDR should start before it joins other nodes + as_hb_start(); // start inter-node heartbeat + as_exchange_start(); // start the cluster exchange subsystem + as_clustering_start(); // clustering-v5 start + as_nsup_start(); // may send delete transactions to other nodes + as_demarshal_start(); // server will now receive client transactions + as_info_port_start(); // server will now receive info transactions + as_ticker_start(); // only after everything else is started + + // Relevant for enterprise edition only. + as_storage_start_tomb_raider(); + + // Log a service-ready message. + cf_info(AS_AS, "service ready: soon there will be cake!"); + + //-------------------------------------------- + // Startup is done. This thread will now wait + // quietly for a shutdown signal. + // + + // Stop this thread from finishing. Intentionally deadlocking on a mutex is + // a remarkably efficient way to do this. + pthread_mutex_lock(&g_main_deadlock); + g_startup_complete = true; + pthread_mutex_lock(&g_main_deadlock); + + // When the service is running, you are here (deadlocked) - the signals that + // stop the service (yes, these signals always occur in this thread) will + // unlock the mutex, allowing us to continue. + + g_shutdown_started = true; + pthread_mutex_unlock(&g_main_deadlock); + pthread_mutex_destroy(&g_main_deadlock); + + //-------------------------------------------- + // Received a shutdown signal. + // + + as_storage_shutdown(); + as_xdr_shutdown(); + as_smd_shutdown(g_smd); + + cf_info(AS_AS, "finished clean shutdown - exiting"); + + // If shutdown was totally clean (all threads joined) we could just return, + // but for now we exit to make sure all threads die. +#ifdef DOPROFILE + exit(0); // exit(0) so profile build actually dumps gmon.out +#else + _exit(0); +#endif + + return 0; +} + + +//========================================================== +// Local helpers. +// + +static void +write_pidfile(char *pidfile) +{ + if (! pidfile) { + // If there's no pid file specified in the config file, just move on. + return; + } + + // Note - the directory the pid file is in must already exist. + + remove(pidfile); + + int pid_fd = open(pidfile, O_CREAT | O_RDWR, + S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH); + + if (pid_fd < 0) { + cf_crash_nostack(AS_AS, "failed to open pid file %s: %s", pidfile, + cf_strerror(errno)); + } + + char pidstr[16]; + sprintf(pidstr, "%u\n", (uint32_t)getpid()); + + // If we can't access this resource, just log a warning and continue - + // it is not critical to the process. + if (write(pid_fd, pidstr, strlen(pidstr)) == -1) { + cf_warning(AS_AS, "failed write to pid file %s: %s", pidfile, + cf_strerror(errno)); + } + + close(pid_fd); +} + +static void +validate_directory(const char *path, const char *log_tag) +{ + struct stat buf; + + if (stat(path, &buf) != 0) { + cf_crash_nostack(AS_AS, "%s directory '%s' is not set up properly: %s", + log_tag, path, cf_strerror(errno)); + } + else if (! 
S_ISDIR(buf.st_mode)) {
+		cf_crash_nostack(AS_AS, "%s directory '%s' is not set up properly: Not a directory",
+				log_tag, path);
+	}
+}
+
+static void
+validate_smd_directory()
+{
+	size_t len = strlen(g_config.work_directory);
+	char smd_path[len + sizeof(SMD_DIR_NAME)];
+
+	strcpy(smd_path, g_config.work_directory);
+	strcpy(smd_path + len, SMD_DIR_NAME);
+	validate_directory(smd_path, "system metadata");
+}
diff --git a/as/src/base/batch.c b/as/src/base/batch.c
new file mode 100644
index 00000000..e33f3cae
--- /dev/null
+++ b/as/src/base/batch.c
@@ -0,0 +1,1155 @@
+/*
+ * batch.c
+ *
+ * Copyright (C) 2012-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+#include "base/batch.h"
+#include "aerospike/as_buffer_pool.h"
+#include "aerospike/as_thread_pool.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_byte_order.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/security.h"
+#include "base/stats.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction.h"
+#include "hardware.h"
+#include "socket.h"
+#include
+#include
+
+//---------------------------------------------------------
+// MACROS
+//---------------------------------------------------------
+
+#define BATCH_BLOCK_SIZE (1024 * 128) // 128K
+#define BATCH_MAX_TRANSACTION_SIZE (1024 * 1024 * 10) // 10MB
+#define BATCH_REPEAT_SIZE 25 // index(4),digest(20) and repeat(1)
+
+//---------------------------------------------------------
+// TYPES
+//---------------------------------------------------------
+
+// Pad batch input header to 30 bytes which is also the size of a transaction header.
+// This allows the input memory to be used as transaction cl_msg memory.
+// This saves a large number of memory allocations while allowing different
+// namespaces/bin name filters to be in the same batch.
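For reference, the 30-byte claim can be checked at compile time. A standalone sketch, not part of the patch: the struct below mirrors as_batch_input, assuming cf_digest is the usual 20-byte digest (the same accounting BATCH_REPEAT_SIZE's index(4) + digest(20) + repeat(1) = 25 relies on).

#include <assert.h>
#include <stdint.h>

typedef struct {
	uint32_t index;     // 4 bytes - position of this key in the batch
	uint8_t  keyd[20];  // 20 bytes - record digest (stand-in for cf_digest)
	uint8_t  repeat;    // 1 byte - reuse previous row's namespace/bin names
	uint8_t  info1;     // 1 byte - read attributes
	uint16_t n_fields;  // 2 bytes
	uint16_t n_ops;     // 2 bytes
} __attribute__((__packed__)) batch_input_sketch;

// 4 + 20 + 1 + 1 + 2 + 2 = 30 - the same size as a transaction (cl_msg)
// header, so each row can be rewritten in place as a cl_msg.
static_assert(sizeof(batch_input_sketch) == 30,
		"batch input header must match transaction header size");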
+typedef struct { + uint32_t index; + cf_digest keyd; + uint8_t repeat; + uint8_t info1; + uint16_t n_fields; + uint16_t n_ops; +} __attribute__((__packed__)) as_batch_input; + +typedef struct { + uint32_t capacity; + uint32_t size; + uint32_t tran_count; + cf_atomic32 writers; + as_proto proto; + uint8_t data[]; +} __attribute__((__packed__)) as_batch_buffer; + +struct as_batch_shared_s { + pthread_mutex_t lock; + cf_queue* response_queue; + as_file_handle* fd_h; + cl_msg* msgp; + as_batch_buffer* buffer; + uint64_t start; + uint32_t tran_count_response; + uint32_t tran_count; + uint32_t tran_max; + int result_code; + bool bad_response_fd; +}; + +typedef struct { + as_batch_shared* shared; + as_batch_buffer* buffer; +} as_batch_response; + +typedef struct { + cf_queue* response_queue; + cf_queue* complete_queue; + cf_atomic32 count; + volatile bool active; +} as_batch_queue; + +typedef struct { + as_batch_queue* batch_queue; + bool complete; +} as_batch_work; + +//--------------------------------------------------------- +// STATIC DATA +//--------------------------------------------------------- + +static as_thread_pool batch_thread_pool; +static as_buffer_pool batch_buffer_pool; + +static as_batch_queue batch_queues[MAX_BATCH_THREADS]; +static pthread_mutex_t batch_resize_lock; + +//--------------------------------------------------------- +// STATIC FUNCTIONS +//--------------------------------------------------------- + +static int +as_batch_send(cf_socket *sock, uint8_t* buf, size_t len, int flags) +{ + if (cf_socket_send_all(sock, buf, len, flags, CF_SOCKET_TIMEOUT) < 0) { + // Common when a client aborts. + cf_debug(AS_BATCH, "Batch send response error, errno %d fd %d", errno, CSFD(sock)); + return -1; + } + + return 0; +} + +static int +as_batch_send_error(as_transaction* btr, int result_code) +{ + cl_msg m; + m.proto.version = PROTO_VERSION; + m.proto.type = PROTO_TYPE_AS_MSG; + m.proto.sz = sizeof(as_msg); + as_proto_swap(&m.proto); + m.msg.header_sz = sizeof(as_msg); + m.msg.info1 = 0; + m.msg.info2 = 0; + m.msg.info3 = AS_MSG_INFO3_LAST; + m.msg.unused = 0; + m.msg.result_code = result_code; + m.msg.generation = 0; + m.msg.record_ttl = 0; + m.msg.transaction_ttl = 0; + m.msg.n_fields = 0; + m.msg.n_ops = 0; + as_msg_swap_header(&m.msg); + + int status = as_batch_send(&btr->from.proto_fd_h->sock, (uint8_t*)&m, sizeof(m), MSG_NOSIGNAL); + + as_end_of_transaction(btr->from.proto_fd_h, status != 0); + btr->from.proto_fd_h = NULL; + + cf_free(btr->msgp); + btr->msgp = 0; + + if (result_code == AS_PROTO_RESULT_FAIL_TIMEOUT) { + cf_atomic64_incr(&g_stats.batch_index_timeout); + } + else { + cf_atomic64_incr(&g_stats.batch_index_errors); + } + return status; +} + +static void +as_batch_send_buffer(as_batch_shared* shared, as_batch_buffer* buffer) +{ + // Don't send buffer if an error has already occurred. + if (shared->bad_response_fd || shared->result_code) { + return; + } + + // Send buffer block to client socket. + buffer->proto.version = PROTO_VERSION; + buffer->proto.type = PROTO_TYPE_AS_MSG; + buffer->proto.sz = buffer->size; + as_proto_swap(&buffer->proto); + + int status = as_batch_send(&shared->fd_h->sock, (uint8_t*)&buffer->proto, sizeof(as_proto) + buffer->size, MSG_NOSIGNAL | MSG_MORE); + + if (status) { + // Socket error. Release shared->fd_h after all sub-transactions are + // complete - shared->fd_h needed for security filter. 
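A note on the flags used just above: intermediate buffer blocks go out with MSG_MORE so the kernel may hold and coalesce them, and the protocol trailer (sent later without MSG_MORE) flushes the stream. A minimal standalone sketch of that pattern, using plain send() rather than the cf_socket wrappers:

#include <sys/socket.h>
#include <sys/types.h>
#include <stddef.h>
#include <stdint.h>

static int
send_all(int fd, const uint8_t* buf, size_t len, int flags)
{
	while (len != 0) {
		ssize_t n = send(fd, buf, len, flags | MSG_NOSIGNAL);

		if (n < 0) {
			return -1; // common when the client aborts mid-batch
		}

		buf += (size_t)n;
		len -= (size_t)n;
	}

	return 0;
}

// Per-buffer blocks: send_all(fd, block, block_len, MSG_MORE);
// Final trailer:     send_all(fd, trailer, trailer_len, 0);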
+ shared->bad_response_fd = true; + cf_atomic64_incr(&g_stats.batch_index_errors); + } +} + +static void +as_batch_send_final(as_batch_shared* shared) +{ + // Send protocol trailer to client socket. + if (shared->bad_response_fd) { + as_end_of_transaction_force_close(shared->fd_h); + shared->fd_h = NULL; + return; + } + + cl_msg m; + m.proto.version = PROTO_VERSION; + m.proto.type = PROTO_TYPE_AS_MSG; + m.proto.sz = sizeof(as_msg); + as_proto_swap(&m.proto); + m.msg.header_sz = sizeof(as_msg); + m.msg.info1 = 0; + m.msg.info2 = 0; + m.msg.info3 = AS_MSG_INFO3_LAST; + m.msg.unused = 0; + m.msg.result_code = shared->result_code; + m.msg.generation = 0; + m.msg.record_ttl = 0; + m.msg.transaction_ttl = 0; + m.msg.n_fields = 0; + m.msg.n_ops = 0; + as_msg_swap_header(&m.msg); + + int status = as_batch_send(&shared->fd_h->sock, (uint8_t*) &m, sizeof(m), MSG_NOSIGNAL); + + as_end_of_transaction(shared->fd_h, status != 0); + shared->fd_h = NULL; + + // For now the model is timeouts don't appear in histograms. + if (shared->result_code != AS_PROTO_RESULT_FAIL_TIMEOUT) { + G_HIST_ACTIVATE_INSERT_DATA_POINT(batch_index_hist, shared->start); + } + + // Check final return code in order to update statistics. + if (status == 0 && shared->result_code == 0) { + cf_atomic64_incr(&g_stats.batch_index_complete); + } + else { + if (shared->result_code == AS_PROTO_RESULT_FAIL_TIMEOUT) { + cf_atomic64_incr(&g_stats.batch_index_timeout); + } + else { + cf_atomic64_incr(&g_stats.batch_index_errors); + } + } +} + +static inline void +as_batch_free(as_batch_shared* shared, as_batch_queue* batch_queue) +{ + // Destroy lock + pthread_mutex_destroy(&shared->lock); + + // Release memory + cf_free(shared->msgp); + cf_free(shared); + + // It's critical that this count is decremented after the transaction is + // completely finished with the queue because "shutdown threads" relies + // on this information when performing graceful shutdown. + cf_atomic32_decr(&batch_queue->count); +} + +static void +as_batch_worker(void* udata) +{ + // Send batch data to client, one buffer block at a time. + as_batch_work* work = (as_batch_work*)udata; + as_batch_queue* batch_queue = work->batch_queue; + cf_queue* response_queue = batch_queue->response_queue; + as_batch_response response; + as_batch_shared* shared; + as_batch_buffer* buffer; + + while (cf_queue_pop(response_queue, &response, CF_QUEUE_FOREVER) == CF_QUEUE_OK) { + // Check if this thread task should end. + shared = response.shared; + if (! shared) { + break; + } + + buffer = response.buffer; + shared->tran_count_response += buffer->tran_count; + + if (buffer->capacity) { + // Send buffer block to client. + as_batch_send_buffer(shared, buffer); + + if (as_buffer_pool_push_limit(&batch_buffer_pool, buffer, buffer->capacity, g_config.batch_max_unused_buffers) != 0) { + cf_atomic64_incr(&g_stats.batch_index_destroyed_buffers); + } + } + else { + // Server error buffers should not be put into buffer pool. + cf_free(buffer); + cf_atomic64_incr(&g_stats.batch_index_destroyed_buffers); + } + + // Wait till all transactions have been received before sending + // final batch entry and releasing memory. + if (shared->tran_count_response == shared->tran_max) { + as_batch_send_final(shared); + as_batch_free(shared, batch_queue); + } + } + + // Send back completion notification. 
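This notification is the worker's half of the shutdown handshake used by as_batch_shutdown_thread_queues() below: a response whose shared pointer is NULL is a sentinel that pops the worker out of its loop, and the push onto complete_queue lets the resizer join it. The shape of the pattern, sketched with hypothetical queue_pop()/queue_push() standing in for the cf_queue calls:

#include <stdint.h>

typedef struct queue_s queue_t;                // opaque stand-in for cf_queue
extern int queue_pop(queue_t* q, void* out);   // hypothetical: 0 on success
extern int queue_push(queue_t* q, const void* in);

typedef struct {
	void* shared;  // NULL here means "stop"
	void* buffer;
} response_t;

static void
worker_loop(queue_t* responses, queue_t* complete_q)
{
	response_t r;

	while (queue_pop(responses, &r) == 0) {
		if (r.shared == NULL) {
			break; // sentinel - no more work will arrive
		}
		// ... send r.buffer to the client, then recycle it ...
	}

	uint32_t done = 1;
	queue_push(complete_q, &done); // ack - shutdown can now reclaim the queue
}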
+ uint32_t complete = 1; + cf_queue_push(work->batch_queue->complete_queue, &complete); +} + +static int +as_batch_create_thread_queues(uint32_t begin, uint32_t end) +{ + // Allocate one queue per batch response worker thread. + int status = 0; + + as_batch_work work; + work.complete = false; + + for (uint32_t i = begin; i < end; i++) { + work.batch_queue = &batch_queues[i]; + work.batch_queue->response_queue = cf_queue_create(sizeof(as_batch_response), true); + work.batch_queue->complete_queue = cf_queue_create(sizeof(uint32_t), true); + work.batch_queue->count = 0; + work.batch_queue->active = true; + + int rc = as_thread_pool_queue_task_fixed(&batch_thread_pool, &work); + + if (rc) { + cf_warning(AS_BATCH, "Failed to create batch thread %u: %d", i, rc); + status = rc; + } + } + return status; +} + +static bool +as_batch_wait(uint32_t begin, uint32_t end) +{ + for (uint32_t i = begin; i < end; i++) { + if (batch_queues[i].count > 0) { + return false; + } + } + return true; +} + +static int +as_batch_shutdown_thread_queues(uint32_t begin, uint32_t end) +{ + // Set excess queues to inactive. + // Existing batch transactions will be allowed to complete. + for (uint32_t i = begin; i < end; i++) { + batch_queues[i].active = false; + } + + // Wait till there are no more active batch transactions on the queues. + // Timeout after 30 seconds. + uint64_t limitus = cf_getus() + (1000 * 1000 * 30); + usleep(50 * 1000); // Sleep 50ms + do { + if (as_batch_wait(begin, end)) { + break; + } + usleep(500 * 1000); // Sleep 500ms + + if (cf_getus() > limitus) { + cf_warning(AS_BATCH, "Batch shutdown threads failed on timeout. Transactions remain on queue."); + // Reactivate queues. + for (uint32_t i = begin; i < end; i++) { + batch_queues[i].active = true; + } + return -1; + } + } while (true); + + // Send stop command to excess queues. + as_batch_response response; + memset(&response, 0, sizeof(as_batch_response)); + + for (uint32_t i = begin; i < end; i++) { + cf_queue_push(batch_queues[i].response_queue, &response); + } + + // Wait for completion events. + uint32_t complete; + for (uint32_t i = begin; i < end; i++) { + as_batch_queue* bq = &batch_queues[i]; + cf_queue_pop(bq->complete_queue, &complete, CF_QUEUE_FOREVER); + cf_queue_destroy(bq->complete_queue); + bq->complete_queue = 0; + cf_queue_destroy(bq->response_queue); + bq->response_queue = 0; + } + return 0; +} + +static as_batch_queue* +as_batch_find_queue(int queue_index) +{ + // Search backwards for an active queue. + for (int index = queue_index - 1; index >= 0; index--) { + as_batch_queue* bq = &batch_queues[index]; + + if (bq->active && cf_queue_sz(bq->response_queue) < g_config.batch_max_buffers_per_queue) { + return bq; + } + } + + // Search forwards. + for (int index = queue_index + 1; index < MAX_BATCH_THREADS; index++) { + as_batch_queue* bq = &batch_queues[index]; + + // If current queue is not active, future queues will not be active either. + if (! 
bq->active) { + break; + } + + if (cf_queue_sz(bq->response_queue) < g_config.batch_max_buffers_per_queue) { + return bq; + } + } + return 0; +} + +static as_batch_buffer* +as_batch_buffer_create(uint32_t size) +{ + as_batch_buffer* buffer = cf_malloc(size); + buffer->capacity = size - batch_buffer_pool.header_size; + cf_atomic64_incr(&g_stats.batch_index_created_buffers); + return buffer; +} + +static uint8_t* +as_batch_buffer_pop(as_batch_shared* shared, uint32_t size) +{ + as_batch_buffer* buffer; + uint32_t mem_size = size + batch_buffer_pool.header_size; + + if (mem_size > batch_buffer_pool.buffer_size) { + // Requested size is greater than fixed buffer size. + // Allocate new buffer, but don't put back into pool. + buffer = as_batch_buffer_create(mem_size); + cf_atomic64_incr(&g_stats.batch_index_huge_buffers); + } + else { + // Pop existing buffer from queue. + // The extra lock here is unavoidable. + int status = cf_queue_pop(batch_buffer_pool.queue, &buffer, CF_QUEUE_NOWAIT); + + if (status == CF_QUEUE_OK) { + buffer->capacity = batch_buffer_pool.buffer_size - batch_buffer_pool.header_size; + } + else if (status == CF_QUEUE_EMPTY) { + // Queue is empty. Create new buffer. + buffer = as_batch_buffer_create(batch_buffer_pool.buffer_size); + } + else { + cf_warning(AS_BATCH, "Failed to pop new batch buffer: %d", status); + // Try to allocate small buffer with just header. + as_batch_buffer* buffer = cf_malloc(sizeof(as_batch_buffer)); + buffer->capacity = 0; + buffer->size = 0; + buffer->tran_count = 1; + buffer->writers = 2; + shared->buffer = buffer; + shared->result_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + return 0; + } + } + + // Reserve a slot in new buffer. + buffer->size = size; + buffer->tran_count = 1; + buffer->writers = 2; + shared->buffer = buffer; + return buffer->data; +} + +static inline void +as_batch_buffer_complete(as_batch_shared* shared, as_batch_buffer* buffer) +{ + // Flush when all writers have finished writing into the buffer. + if (cf_atomic32_decr(&buffer->writers) == 0) { + as_batch_response response = {.shared = shared, .buffer = buffer}; + cf_queue_push(shared->response_queue, &response); + } +} + +static uint8_t* +as_batch_reserve(as_batch_shared* shared, uint32_t size, int result_code, as_batch_buffer** buffer_out, bool* complete) +{ + as_batch_buffer* buffer; + uint8_t* data; + + pthread_mutex_lock(&shared->lock); + *complete = (++shared->tran_count == shared->tran_max); + buffer = shared->buffer; + + if (! buffer) { + // No previous buffer. Get new buffer. + data = as_batch_buffer_pop(shared, size); + *buffer_out = shared->buffer; + pthread_mutex_unlock(&shared->lock); + } + else if (buffer->size + size <= buffer->capacity) { + // Result fits into existing block. Reserve a slot. + data = buffer->data + buffer->size; + buffer->size += size; + buffer->tran_count++; + cf_atomic32_incr(&buffer->writers); + *buffer_out = buffer; + pthread_mutex_unlock(&shared->lock); + } + else { + // Result does not fit into existing block. + // Make copy of existing buffer. + as_batch_buffer* prev_buffer = buffer; + + // Get new buffer. + data = as_batch_buffer_pop(shared, size); + *buffer_out = shared->buffer; + pthread_mutex_unlock(&shared->lock); + + as_batch_buffer_complete(shared, prev_buffer); + } + + if (! (result_code == AS_PROTO_RESULT_OK || result_code == AS_PROTO_RESULT_FAIL_NOT_FOUND)) { + // Result code can be set outside of lock because it doesn't matter which transaction's + // result code is used as long as it's an error. 
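The reserve/complete pair above also encodes each buffer's lifetime as a small reference count: as_batch_buffer_pop() starts writers at 2, one reference for the writer that just reserved a slot and one for the block being the currently-filling shared->buffer; every finished writer, and the block's retirement (replacement by a new block, or batch completion), drops one, and the block is queued for sending only on the last release. The flush-on-last-release shape, as a C11 sketch:

#include <stdatomic.h>

typedef struct block_s {
	atomic_uint refs; // starts at 2: current-block hold + first writer
	// ... payload ...
} block_t;

static void
block_release(block_t* b, void (*flush)(block_t*))
{
	// Previous value 1 means this was the last reference: every writer has
	// finished and the block has been retired, so it is safe to flush.
	if (atomic_fetch_sub(&b->refs, 1) == 1) {
		flush(b);
	}
}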
+ shared->result_code = result_code; + } + return data; +} + +static inline void +as_batch_transaction_end(as_batch_shared* shared, as_batch_buffer* buffer, bool complete) +{ + // This flush can only be triggered when the buffer is full. + as_batch_buffer_complete(shared, buffer); + + if (complete) { + // This flush only occurs when all transactions in batch have been processed. + as_batch_buffer_complete(shared, buffer); + } +} + +static void +as_batch_terminate(as_batch_shared* shared, uint32_t tran_count, int result_code) +{ + // Terminate batch by adding phantom transactions to shared and buffer tran counts. + // This is done so the memory is released at the end only once. + as_batch_buffer* buffer; + bool complete; + + pthread_mutex_lock(&shared->lock); + buffer = shared->buffer; + shared->result_code = result_code; + shared->tran_count += tran_count; + complete = (shared->tran_count == shared->tran_max); + + if (! buffer) { + // No previous buffer. Get new buffer. + as_batch_buffer_pop(shared, 0); + buffer = shared->buffer; + buffer->tran_count = tran_count; // Override tran_count. + } + else { + // Buffer exists. Add phantom transactions. + buffer->tran_count += tran_count; + cf_atomic32_incr(&buffer->writers); + } + pthread_mutex_unlock(&shared->lock); + as_batch_transaction_end(shared, buffer, complete); +} + +//--------------------------------------------------------- +// FUNCTIONS +//--------------------------------------------------------- + +int +as_batch_init() +{ + if (pthread_mutex_init(&batch_resize_lock, NULL)) { + cf_warning(AS_BATCH, "Failed to initialize batch resize lock"); + return -1; + } + + // Default 'batch-index-threads' can't be set before call to cf_topo_init(). + if (g_config.n_batch_index_threads == 0) { + g_config.n_batch_index_threads = cf_topo_count_cpus(); + } + + cf_info(AS_BATCH, "starting %u batch-index-threads", g_config.n_batch_index_threads); + + int rc = as_thread_pool_init_fixed(&batch_thread_pool, g_config.n_batch_index_threads, as_batch_worker, + sizeof(as_batch_work), offsetof(as_batch_work,complete)); + + if (rc) { + cf_warning(AS_BATCH, "Failed to initialize batch-index-threads to %u: %d", g_config.n_batch_index_threads, rc); + return rc; + } + + rc = as_buffer_pool_init(&batch_buffer_pool, sizeof(as_batch_buffer), BATCH_BLOCK_SIZE); + + if (rc) { + cf_warning(AS_BATCH, "Failed to initialize batch buffer pool: %d", rc); + return rc; + } + + rc = as_batch_create_thread_queues(0, g_config.n_batch_index_threads); + + if (rc) { + return rc; + } + + return 0; +} + +int +as_batch_queue_task(as_transaction* btr) +{ + uint64_t counter = cf_atomic64_incr(&g_stats.batch_index_initiate); + uint32_t thread_size = batch_thread_pool.thread_size; + + if (thread_size == 0 || thread_size > MAX_BATCH_THREADS) { + cf_warning(AS_BATCH, "batch-index-threads has been disabled: %d", thread_size); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_BATCH_DISABLED); + } + uint32_t queue_index = counter % thread_size; + + // Validate batch transaction + as_proto* bproto = &btr->msgp->proto; + + if (bproto->sz > PROTO_SIZE_MAX) { + cf_warning(AS_BATCH, "can't process message: invalid size %lu should be %d or less", + (uint64_t)bproto->sz, PROTO_SIZE_MAX); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + if (bproto->type != PROTO_TYPE_AS_MSG) { + cf_warning(AS_BATCH, "Invalid proto type. 
Expected %d Received %d", PROTO_TYPE_AS_MSG, bproto->type); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + // Check that the socket is authenticated. + uint8_t result = as_security_check(btr->from.proto_fd_h, PERM_NONE); + + if (result != AS_PROTO_RESULT_OK) { + as_security_log(btr->from.proto_fd_h, result, PERM_NONE, NULL, NULL); + return as_batch_send_error(btr, result); + } + + // Parse header + as_msg* bmsg = &btr->msgp->msg; + as_msg_swap_header(bmsg); + + // Parse fields + uint8_t* limit = (uint8_t*)bmsg + bproto->sz; + as_msg_field* mf = (as_msg_field*)bmsg->data; + as_msg_field* end; + as_msg_field* bf = 0; + + for (int i = 0; i < bmsg->n_fields; i++) { + if ((uint8_t*)mf >= limit) { + cf_warning(AS_BATCH, "Batch field limit reached"); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + as_msg_swap_field(mf); + end = as_msg_field_get_next(mf); + + if (mf->type == AS_MSG_FIELD_TYPE_BATCH || mf->type == AS_MSG_FIELD_TYPE_BATCH_WITH_SET) { + bf = mf; + } + mf = end; + } + + if (! bf) { + cf_warning(AS_BATCH, "Batch index field not found"); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + // Parse batch field + uint8_t* data = bf->data; + uint32_t tran_count = cf_swap_from_be32(*(uint32_t*)data); + data += sizeof(uint32_t); + + if (tran_count == 0) { + cf_warning(AS_BATCH, "Batch request size is zero"); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + if (tran_count > g_config.batch_max_requests) { + cf_warning(AS_BATCH, "Batch request size %u exceeds max %u", tran_count, g_config.batch_max_requests); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_BATCH_MAX_REQUESTS); + } + + // Initialize shared data + as_batch_shared* shared = cf_malloc(sizeof(as_batch_shared)); + + memset(shared, 0, sizeof(as_batch_shared)); + + if (pthread_mutex_init(&shared->lock, NULL)) { + cf_warning(AS_BATCH, "Failed to initialize batch lock"); + cf_free(shared); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_UNKNOWN); + } + + shared->start = btr->start_time; + shared->fd_h = btr->from.proto_fd_h; + shared->msgp = btr->msgp; + shared->tran_max = tran_count; + + // Find batch queue to send transaction responses. + as_batch_queue* batch_queue = &batch_queues[queue_index]; + + // batch_max_buffers_per_queue is a soft limit, but still must be checked under lock. + if (! (batch_queue->active && cf_queue_sz(batch_queue->response_queue) < g_config.batch_max_buffers_per_queue)) { + // Queue buffer limit has been exceeded or thread has been shutdown (probably due to + // downwards thread resize). Search for an available queue. + // cf_warning(AS_BATCH, "Queue %u full %d", queue_index, cf_queue_sz(batch_queue->response_queue)); + batch_queue = as_batch_find_queue(queue_index); + + if (! batch_queue) { + cf_warning(AS_BATCH, "Failed to find active batch queue that is not full"); + cf_free(shared); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_BATCH_QUEUES_FULL); + } + } + // Increment batch queue transaction count. + cf_atomic32_incr(&batch_queue->count); + shared->response_queue = batch_queue->response_queue; + + // Initialize generic transaction. + as_transaction tr; + as_transaction_init_head(&tr, 0, 0); + + tr.origin = FROM_BATCH; + tr.from_flags |= FROM_FLAG_BATCH_SUB; + tr.start_time = btr->start_time; + + // Read batch keys and initialize generic transactions. 
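Everything multi-byte in this request arrives big-endian; that is why tran_count above was read via cf_swap_from_be32() and why the loop below swaps each field and op in place before reusing the memory. For reference, an equivalent alignment-safe big-endian read in portable C:

#include <stdint.h>

static uint32_t
read_be32(const uint8_t* p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
			((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

// e.g. uint32_t tran_count = read_be32(data); data += 4;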
+ as_batch_input* in; + cl_msg* out = NULL; + cl_msg* prev_msgp = NULL; + as_msg_op* op; + uint32_t tran_row = 0; + uint8_t info = *data++; // allow transaction inline. + + bool allow_inline = (g_config.n_namespaces_inlined != 0 && info); + bool check_inline = (allow_inline && g_config.n_namespaces_not_inlined != 0); + bool should_inline = (allow_inline && g_config.n_namespaces_not_inlined == 0); + + // Split batch rows into separate single record read transactions. + // The read transactions are located in the same memory block as + // the original batch transactions. This allows us to avoid performing + // an extra malloc for each transaction. + while (tran_row < tran_count && data + BATCH_REPEAT_SIZE <= limit) { + // Copy transaction data before memory gets overwritten. + in = (as_batch_input*)data; + + tr.from.batch_shared = shared; // is set NULL after sub-transaction + tr.from_data.batch_index = cf_swap_from_be32(in->index); + tr.keyd = in->keyd; + tr.benchmark_time = btr->benchmark_time; // must reset for each usage + + if (in->repeat) { + if (! prev_msgp) { + break; // bad bytes from client - repeat set on first item + } + + // Row should use previous namespace and bin names. + data += BATCH_REPEAT_SIZE; + tr.msgp = prev_msgp; + } + else { + tr.msg_fields = 0; // erase previous AS_MSG_FIELD_BIT_SET flag, if any + as_transaction_set_msg_field_flag(&tr, AS_MSG_FIELD_TYPE_NAMESPACE); + + // Row contains full namespace/bin names. + out = (cl_msg*)data; + + if (data + sizeof(cl_msg) + sizeof(as_msg_field) > limit) { + break; + } + + out->msg.header_sz = sizeof(as_msg); + out->msg.info1 = in->info1; + out->msg.info2 = 0; + out->msg.info3 = bmsg->info3 & AS_MSG_INFO3_LINEARIZE_READ; + out->msg.unused = 0; + out->msg.result_code = 0; + out->msg.generation = 0; + out->msg.record_ttl = 0; + out->msg.transaction_ttl = bmsg->transaction_ttl; // already swapped + // n_fields/n_ops is in exact same place on both input/output, but the value still + // needs to be swapped. + out->msg.n_fields = cf_swap_from_be16(in->n_fields); + + // Older clients sent zero, but always sent namespace. Adjust this. + if (out->msg.n_fields == 0) { + out->msg.n_fields = 1; + } + + out->msg.n_ops = cf_swap_from_be16(in->n_ops); + + // Namespace input is same as namespace field, so just leave in place and swap. + data += sizeof(cl_msg); + mf = (as_msg_field*)data; + as_msg_swap_field(mf); + if (check_inline) { + as_namespace* ns = as_namespace_get_bymsgfield(mf); + should_inline = ns && ns->storage_data_in_memory; + } + mf = as_msg_field_get_next(mf); + data = (uint8_t*)mf; + + // Swap remaining fields. + for (uint16_t j = 1; j < out->msg.n_fields; j++) { + if (data + sizeof(as_msg_field) > limit) { + goto TranEnd; + } + + if (mf->type == AS_MSG_FIELD_TYPE_SET) { + as_transaction_set_msg_field_flag(&tr, AS_MSG_FIELD_TYPE_SET); + } + + as_msg_swap_field(mf); + mf = as_msg_field_get_next(mf); + data = (uint8_t*)mf; + } + + if (out->msg.n_ops) { + // Bin names input is same as transaction ops, so just leave in place and swap. + uint16_t n_ops = out->msg.n_ops; + for (uint16_t j = 0; j < n_ops; j++) { + if (data + sizeof(as_msg_op) > limit) { + goto TranEnd; + } + op = (as_msg_op*)data; + as_msg_swap_op(op); + op = as_msg_op_get_next(op); + data = (uint8_t*)op; + } + } + + // Initialize msg header. + out->proto.version = PROTO_VERSION; + out->proto.type = PROTO_TYPE_AS_MSG; + out->proto.sz = (data - (uint8_t*)&out->msg); + tr.msgp = out; + prev_msgp = out; + } + + if (data > limit) { + break; + } + + // Submit transaction. 
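The should_inline flag tested below collapses three configuration cases. Restated as a standalone helper (the names are mine, not the server's):

#include <stdbool.h>
#include <stdint.h>

static bool
row_should_inline(bool client_allows_inline, uint32_t n_inline_namespaces,
		uint32_t n_not_inline_namespaces, bool row_ns_data_in_memory)
{
	if (! client_allows_inline || n_inline_namespaces == 0) {
		return false; // inline execution is off entirely
	}

	if (n_not_inline_namespaces == 0) {
		return true; // every namespace is data-in-memory - always inline
	}

	// Mixed configuration - decide per row, by the row's namespace.
	return row_ns_data_in_memory;
}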
+ if (should_inline) { + as_tsvc_process_transaction(&tr); + } + else { + // Queue transaction to be processed by a transaction thread. + as_tsvc_enqueue(&tr); + } + tran_row++; + } + +TranEnd: + if (tran_row < tran_count) { + // Mismatch between tran_count and actual data. Terminate transaction. + cf_warning(AS_BATCH, "Batch keys mismatch. Expected %u Received %u", tran_count, tran_row); + as_batch_terminate(shared, tran_count - tran_row, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + // Reset original socket because socket now owned by batch shared. + btr->from.proto_fd_h = NULL; + return 0; +} + +void +as_batch_add_result(as_transaction* tr, uint16_t n_bins, as_bin** bins, + as_msg_op** ops) +{ + as_namespace* ns = tr->rsv.ns; + + // Calculate size. + size_t size = sizeof(as_msg); + size += sizeof(as_msg_field) + sizeof(cf_digest); + + uint16_t n_fields = 1; + + for (uint16_t i = 0; i < n_bins; i++) { + as_bin* bin = bins[i]; + size += sizeof(as_msg_op); + + if (ops) { + size += ops[i]->name_sz; + } + else if (bin) { + size += ns->single_bin ? 0 : strlen(as_bin_get_name_from_id(ns, bin->id)); + } + else { + cf_crash(AS_BATCH, "making response message with null bin and op"); + } + + if (bin) { + size += as_bin_particle_client_value_size(bin); + } + } + + as_batch_shared* shared = tr->from.batch_shared; + + if (size > BATCH_MAX_TRANSACTION_SIZE) { + cf_warning(AS_BATCH, "Record size %zu exceeds max %d", size, BATCH_MAX_TRANSACTION_SIZE); + as_batch_add_error(shared, tr->from_data.batch_index, AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG); + return; + } + + as_batch_buffer* buffer; + bool complete; + uint8_t* data = as_batch_reserve(shared, size, tr->result_code, &buffer, &complete); + + if (data) { + // Write header. + uint8_t* p = data; + as_msg* m = (as_msg*)p; + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = tr->result_code; + m->generation = plain_generation(tr->generation, ns); + m->record_ttl = tr->void_time; + + // Overload transaction_ttl to store batch index. 
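Results from many sub-transactions interleave across shared buffers, so each record must carry its slot number back to the client, and transaction_ttl is otherwise unused in a response header. A sketch of the client-side demux this implies (an assumption about the client, not its actual code):

#include <arpa/inet.h>
#include <stdint.h>

// Results arrive in arbitrary order - slot each one by the batch index
// recovered from the big-endian header field.
static void
on_batch_record(uint32_t transaction_ttl_be, void* record, void** results)
{
	uint32_t batch_index = ntohl(transaction_ttl_be);

	results[batch_index] = record;
}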
+		m->transaction_ttl = tr->from_data.batch_index;
+
+		m->n_fields = n_fields;
+		m->n_ops = n_bins;
+		as_msg_swap_header(m);
+		p += sizeof(as_msg);
+
+		as_msg_field* field = (as_msg_field*)p;
+		field->field_sz = sizeof(cf_digest) + 1;
+		field->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE;
+		memcpy(field->data, &tr->keyd, sizeof(cf_digest));
+		as_msg_swap_field(field);
+		p += sizeof(as_msg_field) + sizeof(cf_digest);
+
+		for (uint16_t i = 0; i < n_bins; i++) {
+			as_bin* bin = bins[i];
+			as_msg_op* op = (as_msg_op*)p;
+			op->op = AS_MSG_OP_READ;
+			op->version = 0;
+
+			if (ops) {
+				as_msg_op* src = ops[i];
+				memcpy(op->name, src->name, src->name_sz);
+				op->name_sz = src->name_sz;
+			}
+			else {
+				op->name_sz = as_bin_memcpy_name(ns, op->name, bin);
+			}
+
+			op->op_sz = 4 + op->name_sz;
+			p += sizeof(as_msg_op) + op->name_sz;
+			p += as_bin_particle_to_client(bin, op);
+			as_msg_swap_op(op);
+		}
+	}
+	as_batch_transaction_end(shared, buffer, complete);
+}
+
+void
+as_batch_add_proxy_result(as_batch_shared* shared, uint32_t index, cf_digest* digest, cl_msg* cmsg, size_t proxy_size)
+{
+	as_msg* msg = &cmsg->msg;
+	size_t size = proxy_size + sizeof(as_msg_field) + sizeof(cf_digest) - sizeof(as_proto);
+
+	if (size > BATCH_MAX_TRANSACTION_SIZE) {
+		cf_warning(AS_BATCH, "Record size %zu exceeds max %d", size, BATCH_MAX_TRANSACTION_SIZE);
+		as_batch_add_error(shared, index, AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG);
+		return;
+	}
+
+	as_batch_buffer* buffer;
+	bool complete;
+	uint8_t* data = as_batch_reserve(shared, size, msg->result_code, &buffer, &complete);
+
+	if (data) {
+		// Overload transaction_ttl to store batch index.
+		msg->transaction_ttl = htonl(index);
+
+		// Write header
+		uint16_t n_fields = ntohs(msg->n_fields);
+		msg->n_fields = htons(n_fields + 1);
+		memcpy(data, msg, sizeof(as_msg));
+		uint8_t* trg = data + sizeof(as_msg);
+
+		// Write digest field
+		as_msg_field* field = (as_msg_field*)trg;
+		field->field_sz = sizeof(cf_digest) + 1;
+		field->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE;
+		memcpy(field->data, digest, sizeof(cf_digest));
+		as_msg_swap_field(field);
+		trg += sizeof(as_msg_field) + sizeof(cf_digest);
+
+		// Copy other fields and ops.
+		size = ((uint8_t*)cmsg + proxy_size) - msg->data;
+		memcpy(trg, msg->data, size);
+	}
+	as_batch_transaction_end(shared, buffer, complete);
+}
+
+void
+as_batch_add_error(as_batch_shared* shared, uint32_t index, int result_code)
+{
+	as_batch_buffer* buffer;
+	bool complete;
+	uint8_t* data = as_batch_reserve(shared, sizeof(as_msg), result_code, &buffer, &complete);
+
+	if (data) {
+		// Write error.
+		as_msg* m = (as_msg*)data;
+		m->header_sz = sizeof(as_msg);
+		m->info1 = 0;
+		m->info2 = 0;
+		m->info3 = 0;
+		m->unused = 0;
+		m->result_code = result_code;
+		m->generation = 0;
+		m->record_ttl = 0;
+		// Overload transaction_ttl to store batch index.
+		m->transaction_ttl = index;
+		m->n_fields = 0;
+		m->n_ops = 0;
+		as_msg_swap_header(m);
+	}
+	as_batch_transaction_end(shared, buffer, complete);
+}
+
+int
+as_batch_threads_resize(uint32_t threads)
+{
+	if (threads > MAX_BATCH_THREADS) {
+		cf_warning(AS_BATCH, "batch-index-threads %u exceeds max %u", threads, MAX_BATCH_THREADS);
+		return -1;
+	}
+
+	if (pthread_mutex_lock(&batch_resize_lock)) {
+		cf_warning(AS_BATCH, "Batch resize lock failed");
+		return -2;
+	}
+
+	// Resize thread pool. The threads will wait for graceful shutdown on downwards resize.
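The branch ordering below preserves one invariant: a queue is never live without a worker draining it. Schematically, with hypothetical helpers standing in for as_thread_pool_resize() and the queue create/shutdown calls above:

#include <stdint.h>

extern int set_thread_count(uint32_t n);             // stand-in for as_thread_pool_resize()
extern int create_queues(uint32_t begin, uint32_t end);
extern int drain_queues(uint32_t begin, uint32_t end);

static int
resize_ordered(uint32_t have, uint32_t want)
{
	if (want > have) {
		// Grow: workers first, so every new queue has a consumer.
		int rc = set_thread_count(want);
		return rc != 0 ? rc : create_queues(have, want);
	}

	if (want < have) {
		// Shrink: drain excess queues first (waits for in-flight batches),
		// then retire the now-idle workers.
		int rc = drain_queues(want, have);
		return rc != 0 ? rc : set_thread_count(want);
	}

	return 0; // no change
}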
+ uint32_t threads_orig = batch_thread_pool.thread_size; + cf_info(AS_BATCH, "Resize batch-index-threads from %u to %u", threads_orig, threads); + int status = 0; + + if (threads != threads_orig) { + if (threads > threads_orig) { + // Increase threads before initializing queues. + status = as_thread_pool_resize(&batch_thread_pool, threads); + + if (status == 0) { + g_config.n_batch_index_threads = threads; + // Adjust queues to match new thread size. + status = as_batch_create_thread_queues(threads_orig, threads); + } + else { + // Show warning, but keep going as some threads may have been successfully added/removed. + cf_warning(AS_BATCH, "Failed to resize batch-index-threads. status=%d, batch-index-threads=%u", + status, g_config.n_batch_index_threads); + threads = batch_thread_pool.thread_size; + + if (threads > threads_orig) { + g_config.n_batch_index_threads = threads; + // Adjust queues to match new thread size. + status = as_batch_create_thread_queues(threads_orig, threads); + } + } + } + else { + // Shutdown queues before shutting down threads. + status = as_batch_shutdown_thread_queues(threads, threads_orig); + + if (status == 0) { + // Adjust threads to match new queue size. + status = as_thread_pool_resize(&batch_thread_pool, threads); + g_config.n_batch_index_threads = batch_thread_pool.thread_size; + + if (status) { + cf_warning(AS_BATCH, "Failed to resize batch-index-threads. status=%d, batch-index-threads=%u", + status, g_config.n_batch_index_threads); + } + } + } + } + pthread_mutex_unlock(&batch_resize_lock); + return status; +} + +void +as_batch_queues_info(cf_dyn_buf* db) +{ + if (pthread_mutex_lock(&batch_resize_lock)) { + cf_warning(AS_BATCH, "Batch info resize lock failed"); + return; + } + + uint32_t max = batch_thread_pool.thread_size; + + for (uint32_t i = 0; i < max; i++) { + if (i > 0) { + cf_dyn_buf_append_char(db, ','); + } + as_batch_queue* bq = &batch_queues[i]; + cf_dyn_buf_append_uint32(db, bq->count); // Batch count + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_int(db, cf_queue_sz(bq->response_queue)); // Buffer count + } + pthread_mutex_unlock(&batch_resize_lock); +} + +int +as_batch_unused_buffers() +{ + return cf_queue_sz(batch_buffer_pool.queue); +} + +// Not currently called. Put in this place holder in case server decides to +// implement clean shutdowns in the future. +void +as_batch_destroy() +{ + as_thread_pool_destroy(&batch_thread_pool); + as_buffer_pool_destroy(&batch_buffer_pool); + + pthread_mutex_lock(&batch_resize_lock); + as_batch_shutdown_thread_queues(0, batch_thread_pool.thread_size); + pthread_mutex_unlock(&batch_resize_lock); + pthread_mutex_destroy(&batch_resize_lock); +} + +as_file_handle* +as_batch_get_fd_h(as_batch_shared* shared) +{ + return shared->fd_h; +} diff --git a/as/src/base/bin.c b/as/src/base/bin.c new file mode 100644 index 00000000..a04c9d96 --- /dev/null +++ b/as/src/base/bin.c @@ -0,0 +1,685 @@ +/* + * bin.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "fault.h" +#include "vmapx.h" + +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "storage/storage.h" + + +//========================================================== +// Inlines & macros. +// + +// Never called if single-bin. +static inline bool +as_bin_get_id_w_len(as_namespace *ns, const uint8_t *name, size_t len, + uint32_t *p_id) +{ + return cf_vmapx_get_index_w_len(ns->p_bin_name_vmap, (const char *)name, + len, p_id) == CF_VMAPX_OK; +} + +static inline void +as_bin_init_nameless(as_bin *b) +{ + as_bin_state_set(b, AS_BIN_STATE_UNUSED); + b->particle = NULL; +} + +static inline as_bin_space * +safe_bin_space(const as_record *r) +{ + return r->dim ? as_index_get_bin_space(r) : NULL; +} + +static inline uint16_t +safe_n_bins(const as_record *r) +{ + as_bin_space* bin_space = safe_bin_space(r); + + return bin_space ? bin_space->n_bins : 0; +} + +static inline as_bin * +safe_bins(const as_record *r) +{ + as_bin_space* bin_space = safe_bin_space(r); + + return bin_space ? bin_space->bins : NULL; +} + +static inline void +as_bin_init_w_len(as_namespace *ns, as_bin *b, const uint8_t *name, size_t len) +{ + as_bin_init_nameless(b); + as_bin_set_id_from_name_buf(ns, b, name, len); + // Don't touch b->unused - like b->id, it's past the end of its enclosing + // as_index if single-bin, data-in-memory. +} + + +//========================================================== +// Public API. +// + +// Caller-beware, name cannot be null, must be null-terminated. +int16_t +as_bin_get_id(as_namespace *ns, const char *name) +{ + cf_assert(! ns->single_bin, AS_BIN, "unexpected single-bin call"); + + uint32_t idx; + + if (cf_vmapx_get_index(ns->p_bin_name_vmap, name, &idx) == CF_VMAPX_OK) { + return (uint16_t)idx; + } + + return -1; +} + + +uint16_t +as_bin_get_or_assign_id(as_namespace *ns, const char *name) +{ + cf_assert(! ns->single_bin, AS_BIN, "unexpected single-bin call"); + + uint32_t idx; + + if (cf_vmapx_get_index(ns->p_bin_name_vmap, name, &idx) == CF_VMAPX_OK) { + return (uint16_t)idx; + } + + cf_vmapx_err result = cf_vmapx_put_unique(ns->p_bin_name_vmap, name, &idx); + + if (! (result == CF_VMAPX_OK || result == CF_VMAPX_ERR_NAME_EXISTS)) { + // Tedious to handle safely for all usage paths, so for now... + cf_crash(AS_BIN, "couldn't add bin name %s, vmap err %d", name, result); + } + + return (uint16_t)idx; +} + + +uint16_t +as_bin_get_or_assign_id_w_len(as_namespace *ns, const char *name, size_t len) +{ + cf_assert(! ns->single_bin, AS_BIN, "unexpected single-bin call"); + + uint32_t idx; + + if (cf_vmapx_get_index_w_len(ns->p_bin_name_vmap, name, len, &idx) == + CF_VMAPX_OK) { + return (uint16_t)idx; + } + + cf_vmapx_err result = cf_vmapx_put_unique_w_len(ns->p_bin_name_vmap, name, + len, &idx); + + if (! (result == CF_VMAPX_OK || result == CF_VMAPX_ERR_NAME_EXISTS)) { + // Tedious to handle safely for all usage paths, so for now... 
+ cf_crash(AS_BIN, "couldn't add bin name %s, vmap err %d", name, result); + } + + return (uint16_t)idx; +} + + +const char * +as_bin_get_name_from_id(as_namespace *ns, uint16_t id) +{ + cf_assert(! ns->single_bin, AS_BIN, "unexpected single-bin call"); + + const char* name = NULL; + + if (cf_vmapx_get_by_index(ns->p_bin_name_vmap, id, (void**)&name) != + CF_VMAPX_OK) { + // Should be impossible since id originates from vmap. + cf_crash(AS_BIN, "no bin name for id %u", id); + } + + return name; +} + + +bool +as_bin_name_within_quota(as_namespace *ns, const char *name) +{ + // Won't exceed quota if single-bin or currently below quota. + if (ns->single_bin || + cf_vmapx_count(ns->p_bin_name_vmap) < BIN_NAMES_QUOTA) { + return true; + } + + // Won't exceed quota if name is found (and so would NOT be added to vmap). + if (cf_vmapx_get_index(ns->p_bin_name_vmap, name, NULL) == CF_VMAPX_OK) { + return true; + } + + cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", + ns->name, name); + + return false; +} + + +void +as_bin_init(as_namespace *ns, as_bin *b, const char *name) +{ + as_bin_init_nameless(b); + as_bin_set_id_from_name(ns, b, name); + // Don't touch b->unused - like b->id, it's past the end of its enclosing + // as_index if single-bin, data-in-memory. +} + + +void +as_bin_copy(as_namespace *ns, as_bin *to, const as_bin *from) +{ + if (ns->single_bin) { + as_single_bin_copy(to, from); + } + else { + *to = *from; + } +} + + +// - Seems like an as_storage_record method, but leaving it here for now. +// - sets rd->n_bins! +int +as_storage_rd_load_n_bins(as_storage_rd *rd) +{ + if (rd->ns->single_bin) { + rd->n_bins = 1; + return 0; + } + + if (rd->ns->storage_data_in_memory) { + rd->n_bins = safe_n_bins(rd->r); + return 0; + } + + rd->n_bins = 0; + + if (rd->record_on_device && ! rd->ignore_record_on_device) { + return as_storage_record_load_n_bins(rd); // sets rd->n_bins + } + + return 0; +} + + +// - Seems like an as_storage_record method, but leaving it here for now. +// - sets rd->bins! +int +as_storage_rd_load_bins(as_storage_rd *rd, as_bin *stack_bins) +{ + if (rd->ns->storage_data_in_memory) { + rd->bins = rd->ns->single_bin ? as_index_get_single_bin(rd->r) : + safe_bins(rd->r); + return 0; + } + + // Data NOT in-memory. + + rd->bins = stack_bins; + as_bin_set_all_empty(rd); + + if (rd->record_on_device && ! rd->ignore_record_on_device) { + return as_storage_record_load_bins(rd); + } + + return 0; +} + + +uint16_t +as_bin_inuse_count(as_storage_rd *rd) +{ + for (uint16_t i = 0; i < rd->n_bins; i++) { + if (! as_bin_inuse(&rd->bins[i])) { + return i; + } + } + + return rd->n_bins; +} + + +void +as_bin_get_all_p(as_storage_rd *rd, as_bin **bin_ptrs) +{ + for (uint16_t i = 0; i < rd->n_bins; i++) { + bin_ptrs[i] = &rd->bins[i]; + } +} + + +as_bin * +as_bin_get_by_id(as_storage_rd *rd, uint32_t id) +{ + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return b; + } + } + + return NULL; +} + + +as_bin * +as_bin_get(as_storage_rd *rd, const char *name) +{ + if (rd->ns->single_bin) { + return as_bin_inuse_has(rd) ? rd->bins : NULL; + } + + uint32_t id; + + if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &id) != CF_VMAPX_OK) { + return NULL; + } + + return as_bin_get_by_id(rd, id); +} + + +as_bin * +as_bin_get_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len) +{ + if (rd->ns->single_bin) { + return as_bin_inuse_has(rd) ? 
rd->bins : NULL; + } + + uint32_t id; + + if (! as_bin_get_id_w_len(rd->ns, name, len, &id)) { + return NULL; + } + + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return b; + } + } + + return NULL; +} + + +// Does not check bin name length or quota. +as_bin * +as_bin_create(as_storage_rd *rd, const char *name) +{ + if (rd->ns->single_bin) { + if (as_bin_inuse(rd->bins)) { + cf_crash(AS_BIN, "single bin create found bin in use"); + } + + as_bin_init_nameless(rd->bins); + + return rd->bins; + } + + as_bin *b = NULL; + + for (uint16_t i = 0; i < rd->n_bins; i++) { + if (! as_bin_inuse(&rd->bins[i])) { + b = &rd->bins[i]; + break; + } + } + + if (b) { + as_bin_init(rd->ns, b, name); + } + + return b; +} + + +as_bin * +as_bin_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, + int *result) +{ + as_namespace *ns = rd->ns; + + if (ns->single_bin) { + if (as_bin_inuse(rd->bins)) { + cf_crash(AS_BIN, "single bin create found bin in use"); + } + + as_bin_init_nameless(rd->bins); + + return rd->bins; + } + + if (len >= AS_ID_BIN_SZ) { + cf_warning(AS_BIN, "bin name too long (%lu)", len); + *result = AS_PROTO_RESULT_FAIL_BIN_NAME; + return NULL; + } + + uint32_t id = (uint32_t)-1; + + if (cf_vmapx_get_index_w_len(ns->p_bin_name_vmap, (const char *)name, len, + &id) != CF_VMAPX_OK && + cf_vmapx_count(ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) { + CF_ZSTR_DEFINE(zname, AS_ID_BIN_SZ, name, len); + + cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", + ns->name, zname); + + *result = AS_PROTO_RESULT_FAIL_BIN_NAME; + return NULL; + } + + as_bin *b = NULL; + + for (uint16_t i = 0; i < rd->n_bins; i++) { + if (! as_bin_inuse(&rd->bins[i])) { + b = &rd->bins[i]; + break; + } + } + + cf_assert(b, AS_BIN, "ran out of allocated bins in rd"); + + if (id == (uint32_t)-1) { + as_bin_init_w_len(ns, b, name, len); + } + else { + as_bin_init_nameless(b); + b->id = (uint16_t)id; + } + + return b; +} + + +// Does not check bin name length. +// Checks bin name quota - use appropriately. +as_bin * +as_bin_get_or_create(as_storage_rd *rd, const char *name) +{ + as_namespace *ns = rd->ns; + + if (ns->single_bin) { + if (! as_bin_inuse_has(rd)) { + as_bin_init_nameless(rd->bins); + } + + return rd->bins; + } + + uint32_t id = (uint32_t)-1; + uint16_t i; + as_bin *b; + + if (cf_vmapx_get_index(ns->p_bin_name_vmap, name, &id) == CF_VMAPX_OK) { + for (i = 0; i < rd->n_bins; i++) { + b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return b; + } + } + } + else { + if (cf_vmapx_count(ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) { + cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", + ns->name, name); + return NULL; + } + + i = as_bin_inuse_count(rd); + } + + cf_assert(i < rd->n_bins, AS_BIN, "ran out of allocated bins in rd"); + + b = &rd->bins[i]; + + if (id == (uint32_t)-1) { + as_bin_init(ns, b, name); + } + else { + as_bin_init_nameless(b); + b->id = (uint16_t)id; + } + + return b; +} + + +// Does not check bin name length. +// Checks bin name quota - use appropriately. +as_bin * +as_bin_get_or_create_from_buf(as_storage_rd *rd, const uint8_t *name, + size_t len, int *result) +{ + as_namespace *ns = rd->ns; + + if (ns->single_bin) { + if (! 
as_bin_inuse_has(rd)) { + as_bin_init_nameless(rd->bins); + } + + return rd->bins; + } + + uint32_t id = (uint32_t)-1; + uint16_t i; + as_bin *b; + + if (cf_vmapx_get_index_w_len(ns->p_bin_name_vmap, (const char *)name, len, + &id) == CF_VMAPX_OK) { + for (i = 0; i < rd->n_bins; i++) { + b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return b; + } + } + } + else { + if (cf_vmapx_count(ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) { + CF_ZSTR_DEFINE(zname, AS_ID_BIN_SZ, name, len); + + cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", + ns->name, zname); + + *result = AS_PROTO_RESULT_FAIL_BIN_NAME; + return NULL; + } + + i = as_bin_inuse_count(rd); + } + + cf_assert(i < rd->n_bins, AS_BIN, "ran out of allocated bins in rd"); + + b = &rd->bins[i]; + + if (id == (uint32_t)-1) { + as_bin_init_w_len(ns, b, name, len); + } + else { + as_bin_init_nameless(b); + b->id = (uint16_t)id; + } + + return b; +} + + +int32_t +as_bin_get_index(as_storage_rd *rd, const char *name) +{ + if (rd->ns->single_bin) { + return as_bin_inuse_has(rd) ? 0 : -1; + } + + uint32_t id; + + if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &id) != CF_VMAPX_OK) { + return -1; + } + + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return (int32_t)i; + } + } + + return -1; +} + + +int32_t +as_bin_get_index_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len) +{ + if (rd->ns->single_bin) { + return as_bin_inuse_has(rd) ? 0 : -1; + } + + uint32_t id; + + if (! as_bin_get_id_w_len(rd->ns, name, len, &id)) { + return -1; + } + + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return (int32_t)i; + } + } + + return -1; +} + + +void +as_bin_destroy(as_storage_rd *rd, uint16_t i) +{ + as_bin_particle_destroy(&rd->bins[i], rd->ns->storage_data_in_memory); + as_bin_set_empty_shift(rd, i); +} + + +void +as_bin_allocate_bin_space(as_storage_rd *rd, int32_t delta) +{ + as_record *r = rd->r; + + if (rd->n_bins == 0) { + rd->n_bins = (uint16_t)delta; + + size_t size = sizeof(as_bin_space) + (rd->n_bins * sizeof(as_bin)); + as_bin_space* bin_space = (as_bin_space*)cf_malloc_ns(size); + + rd->bins = bin_space->bins; + as_bin_set_all_empty(rd); + + bin_space->n_bins = rd->n_bins; + as_index_set_bin_space(r, bin_space); + + return; + } + // else - there were bins before. + + uint16_t new_n_bins = (uint16_t)((int32_t)rd->n_bins + delta); + + if (delta < 0) { + as_record_destroy_bins_from(rd, new_n_bins); + } + + uint16_t old_n_bins = rd->n_bins; + + rd->n_bins = new_n_bins; + + if (new_n_bins != 0) { + size_t size = sizeof(as_bin_space) + (rd->n_bins * sizeof(as_bin)); + as_bin_space* bin_space = (as_bin_space*) + cf_realloc_ns((void*)as_index_get_bin_space(r), size); + + rd->bins = bin_space->bins; + + if (delta > 0) { + as_bin_set_empty_from(rd, old_n_bins); + } + + bin_space->n_bins = rd->n_bins; + as_index_set_bin_space(r, bin_space); + } + else { + cf_free((void*)as_index_get_bin_space(r)); + as_index_set_bin_space(r, NULL); + rd->bins = NULL; + } +} diff --git a/as/src/base/cdt.c b/as/src/base/cdt.c new file mode 100644 index 00000000..02cbd9d9 --- /dev/null +++ b/as/src/base/cdt.c @@ -0,0 +1,2607 @@ +/* + * cdt.c + * + * Copyright (C) 2015-2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. 
under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "base/cdt.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "citrusleaf/cf_byte_order.h" + +#include "bits.h" +#include "dynbuf.h" +#include "fault.h" + +#include "base/cfg.h" +#include "base/particle.h" + + +//========================================================== +// Typedefs & constants. +// + +#define VA_FIRST(first, ...) first +#define VA_REST(first, ...) __VA_ARGS__ + +#define CDT_OP_ENTRY(op, type, ...) [op].name = # op, [op].args = (const as_cdt_paramtype[]){VA_REST(__VA_ARGS__, 0)}, [op].count = VA_NARGS(__VA_ARGS__) - 1, [op].opt_args = VA_FIRST(__VA_ARGS__) + +const cdt_op_table_entry cdt_op_table[] = { + + //============================================ + // LIST + + //-------------------------------------------- + // Modify OPs + + CDT_OP_ENTRY(AS_CDT_OP_LIST_SET_TYPE, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS), + + // Adds + CDT_OP_ENTRY(AS_CDT_OP_LIST_APPEND, AS_OPERATOR_CDT_MODIFY, 2, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_LIST_APPEND_ITEMS, AS_OPERATOR_CDT_MODIFY, 2, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_LIST_INSERT, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_LIST_INSERT_ITEMS, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + + // Removes + CDT_OP_ENTRY(AS_CDT_OP_LIST_POP, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_POP_RANGE, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_RANGE, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + // Modifies + CDT_OP_ENTRY(AS_CDT_OP_LIST_SET, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_LIST_TRIM, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_LIST_CLEAR, AS_OPERATOR_CDT_MODIFY, 0), + CDT_OP_ENTRY(AS_CDT_OP_LIST_INCREMENT, AS_OPERATOR_CDT_MODIFY, 3, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_FLAGS), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_SORT, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_FLAGS), + + //-------------------------------------------- + // Read OPs + + CDT_OP_ENTRY(AS_CDT_OP_LIST_SIZE, AS_OPERATOR_CDT_READ, 0), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_RANGE, AS_OPERATOR_CDT_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + //-------------------------------------------- + // GET/REMOVE + + // GET_BYs + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_INDEX, AS_OPERATOR_CDT_READ, 
0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_VALUE, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_RANK, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_ALL_BY_VALUE, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_ALL_BY_VALUE_LIST, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_INDEX_RANGE, AS_OPERATOR_CDT_READ, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_VALUE_INTERVAL, AS_OPERATOR_CDT_READ, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_RANK_RANGE, AS_OPERATOR_CDT_READ, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + // REMOVE_BYs + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_INDEX, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_VALUE, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_RANK, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE_LIST, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_INDEX_RANGE, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_VALUE_INTERVAL, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_RANK_RANGE, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + //============================================ + // MAP + + //-------------------------------------------- + // Create and flags + + CDT_OP_ENTRY(AS_CDT_OP_MAP_SET_TYPE, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_FLAGS), + + //-------------------------------------------- + // Modify OPs + + CDT_OP_ENTRY(AS_CDT_OP_MAP_ADD, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_ADD_ITEMS, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_PUT, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_PUT_ITEMS, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REPLACE, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REPLACE_ITEMS, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_INCREMENT, AS_OPERATOR_MAP_MODIFY, 2, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_DECREMENT, AS_OPERATOR_MAP_MODIFY, 2, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_CLEAR, AS_OPERATOR_MAP_MODIFY, 0), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_KEY, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_VALUE, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_INDEX, 
AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_RANK, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_KEY_LIST, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_ALL_BY_VALUE, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_VALUE_LIST, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_KEY_INTERVAL, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_INDEX_RANGE, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_VALUE_INTERVAL, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_RANK_RANGE, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + //-------------------------------------------- + // Read OPs + + CDT_OP_ENTRY(AS_CDT_OP_MAP_SIZE, AS_OPERATOR_MAP_READ, 0), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_KEY, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_INDEX, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_VALUE, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_RANK, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_ALL_BY_VALUE, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_KEY_INTERVAL, AS_OPERATOR_MAP_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_INDEX_RANGE, AS_OPERATOR_MAP_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_VALUE_INTERVAL, AS_OPERATOR_MAP_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_RANK_RANGE, AS_OPERATOR_MAP_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_KEY_LIST, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_VALUE_LIST, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + +}; + +static const size_t cdt_op_table_size = sizeof(cdt_op_table) / sizeof(cdt_op_table_entry); + +extern const as_particle_vtable *particle_vtable[]; + +typedef struct index_pack24_s { + uint32_t value:24; +} __attribute__ ((__packed__)) index_pack24; + +typedef struct { + const order_index *ordidx; + bool error; +} index_sort_userdata; + + +//========================================================== +// Forward declares. +// + +static bool unpack_list_value(as_unpacker *pk, cdt_payload *payload_r); +static bool unpack_map_key(as_unpacker *pk, cdt_payload *payload_r); +static bool unpack_map_value(as_unpacker *pk, cdt_payload *payload_r); + +inline static void cdt_payload_pack_val(cdt_payload *value, const as_val *val); + +static inline uint32_t order_index_ele_sz(uint32_t max_idx); + + +//========================================================== +// CDT helpers. +// + +// Calculate count given index and max_index. +// Assumes index < ele_count. 
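+// e.g. index = 3, in_count = 10, max_index = 5 -> count = 2, i.e. in_count is
+// clamped to the number of elements remaining after index.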
+static uint32_t +calc_count(uint32_t index, uint64_t in_count, uint32_t max_index) +{ + // Since we assume index < ele_count, (max - index) will never overflow. + if (in_count >= (uint64_t)max_index - index) { + return max_index - index; + } + + return (uint32_t)in_count; +} + +static void +calc_index_count_multi(int64_t in_index, uint64_t in_count, uint32_t ele_count, + uint32_t *out_index, uint32_t *out_count) +{ + if (in_index >= ele_count) { + *out_index = ele_count; + *out_count = 0; + } + else if ((in_index = calc_index(in_index, ele_count)) < 0) { + if ((uint64_t)(-in_index) < in_count) { + uint64_t out64 = in_count + in_index; + + if (out64 > (uint64_t)ele_count) { + out64 = ele_count; + } + + *out_count = (uint32_t)out64; + } + else { + *out_count = 0; + } + + *out_index = 0; + } + else { + *out_index = (uint32_t)in_index; + *out_count = calc_count((uint32_t)in_index, in_count, ele_count); + } +} + +// Transform to absolute (uint32_t) index/count bounded by ele_count. +bool +calc_index_count(int64_t in_index, uint64_t in_count, uint32_t ele_count, + uint32_t *out_index, uint32_t *out_count, bool is_multi) +{ + if (is_multi) { + calc_index_count_multi(in_index, in_count, ele_count, out_index, + out_count); + return true; + } + + if (in_index >= (int64_t)ele_count || + (in_index = calc_index(in_index, ele_count)) < 0) { + return false; + } + + *out_index = (uint32_t)in_index; + *out_count = calc_count((uint32_t)in_index, in_count, ele_count); + + return true; +} + +static bool +unpack_list_value(as_unpacker *pk, cdt_payload *payload_r) +{ + payload_r->ptr = pk->buffer + pk->offset; + + int64_t sz = as_unpack_size(pk); + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "unpack_list_value() invalid msgpack"); + return false; + } + + payload_r->sz = (uint32_t)sz; + + return true; +} + +static bool +unpack_map_key(as_unpacker *pk, cdt_payload *payload_r) +{ + payload_r->ptr = pk->buffer + pk->offset; + + int64_t sz = as_unpack_size(pk); + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "unpack_map_key() invalid msgpack"); + return false; + } + + payload_r->sz = (uint32_t)sz; + + if (as_unpack_size(pk) <= 0) { // skip value + cf_warning(AS_PARTICLE, "unpack_map_key() invalid msgpack"); + return false; + } + + return true; +} + +static bool +unpack_map_value(as_unpacker *pk, cdt_payload *payload_r) +{ + if (as_unpack_size(pk) <= 0) { // skip key + cf_warning(AS_PARTICLE, "unpack_map_value() invalid msgpack"); + return false; + } + + payload_r->ptr = pk->buffer + pk->offset; + + int64_t sz = as_unpack_size(pk); + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "unpack_map_value() invalid msgpack"); + return false; + } + + payload_r->sz = (uint32_t)sz; + + return true; +} + + +//========================================================== +// cdt_result_data +// + +bool +result_data_set_not_found(cdt_result_data *rd, int64_t index) +{ + switch (rd->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + result_data_set_list_int2x(rd, index, 0); + break; + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + if (rd->is_multi) { + as_bin_set_unordered_empty_list(rd->result, rd->alloc); + break; + } + + as_bin_set_int(rd->result, -1); + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(rd->result, 0); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + if (rd->is_multi) { + as_bin_set_unordered_empty_list(rd->result, rd->alloc); 
+ } + break; + case RESULT_TYPE_MAP: + as_bin_set_empty_packed_map(rd->result, rd->alloc, + AS_PACKED_MAP_FLAG_PRESERVE_ORDER); + break; + default: + cf_warning(AS_PARTICLE, "result_data_set_not_found() invalid result type %d", rd->type); + return false; + } + + return true; +} + +void +result_data_set_list_int2x(cdt_result_data *rd, int64_t i1, int64_t i2) +{ + define_int_list_builder(builder, rd->alloc, 2); + + cdt_container_builder_add_int64(&builder, i1); + cdt_container_builder_add_int64(&builder, i2); + cdt_container_builder_set_result(&builder, rd); +} + +int +result_data_set_index_rank_count(cdt_result_data *rd, uint32_t start, + uint32_t count, uint32_t ele_count) +{ + bool is_rev = false; + bool inverted = result_data_is_inverted(rd); + + switch (rd->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(rd->result, inverted ? ele_count - count : count); + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_REVRANK: + is_rev = true; + /* no break */ + case RESULT_TYPE_INDEX: + case RESULT_TYPE_RANK: { + if (! rd->is_multi) { + if (count == 0) { + as_bin_set_int(rd->result, -1); + break; + } + + if (is_rev) { + start = ele_count - start - 1; + } + + as_bin_set_int(rd->result, start); + break; + } + + cdt_container_builder builder; + + if (inverted) { + uint32_t inv_count = ele_count - count; + + cdt_int_list_builder_start(&builder, rd->alloc, inv_count); + cdt_container_builder_add_int_range(&builder, 0, start, ele_count, + is_rev); + cdt_container_builder_add_int_range(&builder, start + count, + ele_count - start - count, ele_count, is_rev); + } + else { + cdt_int_list_builder_start(&builder, rd->alloc, count); + cdt_container_builder_add_int_range(&builder, start, count, + ele_count, is_rev); + } + + cdt_container_builder_set_result(&builder, rd); + break; + } + default: + cf_warning(AS_PARTICLE, "result_data_set_index_rank_count() invalid return type %d", rd->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +int +result_data_set_range(cdt_result_data *rd, uint32_t start, uint32_t count, + uint32_t ele_count) +{ + switch (rd->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_COUNT: + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_INDEX: + case RESULT_TYPE_RANK: + return result_data_set_index_rank_count(rd, start, count, ele_count); + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + start = ele_count - start - count; + /* no break */ + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: { + if (result_data_is_inverted(rd)) { + cf_warning(AS_PARTICLE, "result_data_set_range() result_type %d not supported with INVERTED flag", rd->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + result_data_set_list_int2x(rd, start, count); + break; + } + default: + cf_warning(AS_PARTICLE, "result_data_set_range() invalid return type %d", rd->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +// Does not respect inverted flag. 
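+// irc is a list of interleaved (rank, count) pairs, so irc->_.ele_count is
+// twice the number of pairs; when idx_map is non-NULL it maps each rank back
+// to an element index.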
+void +result_data_set_by_irc(cdt_result_data *rd, + const order_index *irc, const order_index *idx_map, + uint32_t total_count) +{ + bool is_rev = rd->type == RESULT_TYPE_REVINDEX || + rd->type == RESULT_TYPE_REVRANK; + uint32_t items_count = irc->_.ele_count / 2; + define_int_list_builder(builder, rd->alloc, total_count); + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t count = order_index_get(irc, (2 * i) + 1); + + if (count == 0) { + continue; + } + + uint32_t rank = order_index_get(irc, 2 * i); + + if (idx_map) { + for (uint32_t j = rank; j < rank + count; j++) { + cdt_container_builder_add_int_range(&builder, + order_index_get(idx_map, j), 1, irc->max_idx, is_rev); + } + } + else { + cdt_container_builder_add_int_range(&builder, rank, count, + irc->max_idx, is_rev); + } + } + + cdt_container_builder_set_result(&builder, rd); +} + +void +result_data_set_by_itemlist_irc(cdt_result_data *rd, + const order_index *items_ord, order_index *irc, + uint32_t total_count) +{ + cdt_container_builder builder; + bool inverted = result_data_is_inverted(rd); + uint32_t items_count = items_ord->_.ele_count; + uint32_t ele_count = irc->max_idx; + bool is_rev = rd->type == RESULT_TYPE_REVINDEX || + rd->type == RESULT_TYPE_REVRANK; + + if (! inverted) { + cdt_int_list_builder_start(&builder, rd->alloc, total_count); + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t count = order_index_get(irc, (i * 2) + 1); + + if (count == 0) { + continue; + } + + uint32_t rank = order_index_get(irc, i * 2); + + for (uint32_t j = 0; j < count; j++) { + cdt_container_builder_add_int_range(&builder, + rank + j, 1, ele_count, is_rev); + } + } + } + else { + cdt_int_list_builder_start(&builder, rd->alloc, total_count); + + uint32_t prev = 0; + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t kl_idx = order_index_get(items_ord, i); + uint32_t count = order_index_get(irc, (kl_idx * 2) + 1); + + if (count == 0) { + continue; + } + + uint32_t index = order_index_get(irc, kl_idx * 2); + + cdt_container_builder_add_int_range(&builder, prev, + index - prev, ele_count, is_rev); + prev = index + count; + } + + cdt_container_builder_add_int_range(&builder, prev, + ele_count - prev, ele_count, is_rev); + } + + cdt_container_builder_set_result(&builder, rd); +} + +// Does not respect inverted flag. +void +result_data_set_int_list_by_mask(cdt_result_data *rd, const uint64_t *mask, + uint32_t count, uint32_t ele_count) +{ + bool is_rev = rd->type == RESULT_TYPE_REVINDEX || + rd->type == RESULT_TYPE_REVRANK; + + if (! rd->is_multi) { + uint32_t idx = cdt_idx_mask_find(mask, 0, ele_count, false); + + if (is_rev) { + idx = ele_count - idx - 1; + } + + as_bin_set_int(rd->result, (int64_t)idx); + return; + } + + define_int_list_builder(builder, rd->alloc, count); + uint32_t idx = 0; + + for (uint32_t i = 0; i < count; i++) { + idx = cdt_idx_mask_find(mask, idx, ele_count, false); + + int64_t val = (is_rev ? ele_count - idx - 1 : idx); + + cdt_container_builder_add_int64(&builder, val); + idx++; + } + + cdt_container_builder_set_result(&builder, rd); +} + + +//========================================================== +// as_bin functions. 
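+// Small setters that store an int/double result directly in the bin's
+// particle pointer (no allocation).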
+// + +void +as_bin_set_int(as_bin *b, int64_t value) +{ + b->particle = (as_particle *)value; + as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_INTEGER); +} + +void +as_bin_set_double(as_bin *b, double value) +{ + *((double *)(&b->particle)) = value; + as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_FLOAT); +} + + +//========================================================== +//cdt_calc_delta +// + +bool +cdt_calc_delta_init(cdt_calc_delta *cdv, const cdt_payload *delta_value, + bool is_decrement) +{ + if (delta_value && delta_value->ptr) { + as_unpacker pk_delta_value = { + .buffer = delta_value->ptr, + .length = delta_value->sz + }; + + cdv->type = as_unpack_peek_type(&pk_delta_value); + + if (cdv->type == AS_INTEGER) { + if (as_unpack_int64(&pk_delta_value, &cdv->incr_int) != 0) { + cf_warning(AS_PARTICLE, "cdt_delta_value_init() invalid packed delta value"); + return false; + } + } + else if (cdv->type == AS_DOUBLE) { + if (as_unpack_double(&pk_delta_value, &cdv->incr_double) != 0) { + cf_warning(AS_PARTICLE, "cdt_delta_value_init() invalid packed delta value"); + return false; + } + } + else { + cf_warning(AS_PARTICLE, "cdt_delta_value_init() delta is not int/double"); + return false; + } + } + else { + cdv->type = AS_UNDEF; + cdv->incr_int = 1; + cdv->incr_double = 1; + } + + if (is_decrement) { + cdv->incr_int = -cdv->incr_int; + cdv->incr_double = -cdv->incr_double; + } + + cdv->value_int = 0; + cdv->value_double = 0; + + return true; +} + +bool +cdt_calc_delta_add(cdt_calc_delta *cdv, as_unpacker *pk_value) +{ + if (pk_value) { + as_val_t packed_value_type = as_unpack_peek_type(pk_value); + + if (packed_value_type == AS_INTEGER) { + if (as_unpack_int64(pk_value, &cdv->value_int) != 0) { + cf_warning(AS_PARTICLE, "cdt_delta_value_add() invalid packed int"); + return false; + } + + if (cdv->type == AS_DOUBLE) { + cdv->value_int += (int64_t)cdv->incr_double; + } + else { + cdv->value_int += cdv->incr_int; + } + } + else if (packed_value_type == AS_DOUBLE) { + if (as_unpack_double(pk_value, &cdv->value_double) != 0) { + cf_warning(AS_PARTICLE, "cdt_delta_value_add() invalid packed double"); + return false; + } + + if (cdv->type == AS_DOUBLE) { + cdv->value_double += cdv->incr_double; + } + else { + cdv->value_double += (double)cdv->incr_int; + } + } + else { + cf_warning(AS_PARTICLE, "cdt_delta_value_add() only valid for int/double"); + return false; + } + + cdv->type = packed_value_type; + } + else if (cdv->type == AS_DOUBLE) { + cdv->value_double += cdv->incr_double; + } + else { + cdv->type = AS_INTEGER; // default to AS_INTEGER if UNDEF + cdv->value_int += cdv->incr_int; + } + + return true; +} + +void +cdt_calc_delta_pack_and_result(cdt_calc_delta *cdv, cdt_payload *value, + as_bin *result) +{ + if (cdv->type == AS_DOUBLE) { + cdt_payload_pack_double(value, cdv->value_double); + as_bin_set_double(result, cdv->value_double); + } + else { + cdt_payload_pack_int(value, cdv->value_int); + as_bin_set_int(result, cdv->value_int); + } +} + + +//========================================================== +// cdt_payload functions. 
+// + +bool +cdt_payload_is_int(const cdt_payload *payload) +{ + return as_unpack_buf_peek_type(payload->ptr, payload->sz) == AS_INTEGER; +} + +int64_t +cdt_payload_get_int64(const cdt_payload *payload) +{ + int64_t ret = 0; + as_unpacker pk = { + .buffer = payload->ptr, + .offset = 0, + .length = payload->sz + }; + + as_unpack_int64(&pk, &ret); + + return ret; +} + +inline static void +cdt_payload_pack_val(cdt_payload *value, const as_val *val) +{ + as_serializer ser; + as_msgpack_init(&ser); + + value->sz = as_serializer_serialize_presized(&ser, val, + (uint8_t *)value->ptr); + + as_serializer_destroy(&ser); +} + +void +cdt_payload_pack_int(cdt_payload *packed, int64_t value) +{ + as_integer val; + as_integer_init(&val, value); + + cdt_payload_pack_val(packed, (as_val *)&val); +} + +void +cdt_payload_pack_double(cdt_payload *packed, double value) +{ + as_double val; + as_double_init(&val, value); + + return cdt_payload_pack_val(packed, (as_val *)&val); +} + + +//========================================================== +// cdt_container_builder functions. +// + +void +cdt_container_builder_add(cdt_container_builder *builder, const uint8_t *buf, + uint32_t sz) +{ + memcpy(builder->write_ptr, buf, sz); + builder->write_ptr += sz; + *builder->sz += sz; + builder->ele_count++; +} + +void +cdt_container_builder_add_n(cdt_container_builder *builder, const uint8_t *buf, + uint32_t count, uint32_t sz) +{ + if (buf) { + memcpy(builder->write_ptr, buf, sz); + } + + builder->write_ptr += sz; + *builder->sz += sz; + builder->ele_count += count; +} + +void +cdt_container_builder_add_int64(cdt_container_builder *builder, int64_t value) +{ + as_integer val64; + + as_packer pk = { + .buffer = builder->write_ptr, + .capacity = INT_MAX + }; + + as_integer_init(&val64, value); + as_pack_val(&pk, (const as_val *)&val64); + builder->write_ptr += pk.offset; + *builder->sz += (uint32_t)pk.offset; + builder->ele_count++; +} + +void +cdt_container_builder_add_int_range(cdt_container_builder *builder, + uint32_t start, uint32_t count, uint32_t ele_count, bool is_rev) +{ + if (is_rev) { + start = ele_count - start - count; + } + + for (uint32_t i = 0; i < count; i++) { + cdt_container_builder_add_int64(builder, (int64_t)(start + i)); + } +} + +void +cdt_container_builder_set_result(cdt_container_builder *builder, + cdt_result_data *result) +{ + result->result->particle = builder->particle; + as_bin_state_set_from_type(result->result, (as_particle_type)((uint8_t *)builder->particle)[0]); +} + + +//========================================================== +// cdt_process_state functions. +// + +bool +cdt_process_state_init(cdt_process_state *cdt_state, const as_msg_op *op) +{ + const uint8_t *data = op->name + op->name_sz; + uint32_t sz = op->op_sz - 4 - op->name_sz; + + if (data[0] == 0) { // TODO - deprecate this in "6 months" + if (sz < sizeof(uint16_t)) { + cf_warning(AS_PARTICLE, "cdt_parse_state_init() as_msg_op data too small to be valid: size=%u", sz); + return false; + } + + const uint16_t *type_ptr = (const uint16_t *)data; + + cdt_state->type = cf_swap_from_be16(*type_ptr); + cdt_state->pk.buffer = data + sizeof(uint16_t); + cdt_state->pk.length = sz - sizeof(uint16_t); + cdt_state->pk.offset = 0; + + int64_t ele_count = (cdt_state->pk.length == 0) ? 
+ 0 : as_unpack_list_header_element_count(&cdt_state->pk); + + if (ele_count < 0) { + cf_warning(AS_PARTICLE, "cdt_parse_state_init() unpack list header failed: size=%u type=%u ele_count=%ld", sz, cdt_state->type, ele_count); + return false; + } + + cdt_state->ele_count = (uint32_t)ele_count; + + return true; + } + + cdt_state->pk.buffer = data; + cdt_state->pk.length = sz; + cdt_state->pk.offset = 0; + + int64_t ele_count = as_unpack_list_header_element_count(&cdt_state->pk); + uint64_t type64; + + if (ele_count < 1 || as_unpack_uint64(&cdt_state->pk, &type64) != 0) { + cf_warning(AS_PARTICLE, "cdt_parse_state_init() unpack parameters failed: size=%u ele_count=%ld", sz, ele_count); + return false; + } + + cdt_state->type = (as_cdt_optype)type64; + cdt_state->ele_count = (uint32_t)ele_count; + + return true; +} + +bool +cdt_process_state_get_params(cdt_process_state *state, size_t n, ...) +{ + as_cdt_optype op = state->type; + + if (op >= cdt_op_table_size) { + return false; + } + + const cdt_op_table_entry *entry = &cdt_op_table[op]; + uint32_t required_count = entry->count - entry->opt_args; + + cf_assert(n >= (size_t)required_count, AS_PARTICLE, "cdt_process_state_get_params() called with %zu params, require at least %u - %u = %u params", n, entry->count, entry->opt_args, required_count); + + if (n == 0 || entry->args[0] == 0) { + return true; + } + + if (state->ele_count < required_count) { + cf_warning(AS_PARTICLE, "cdt_process_state_get_params() count mismatch: got %u from client < expected %u", state->ele_count, required_count); + return false; + } + + if (state->ele_count > (uint32_t)entry->count) { + cf_warning(AS_PARTICLE, "cdt_process_state_get_params() count mismatch: got %u from client > expected %u", state->ele_count, entry->count); + return false; + } + + va_list vl; + va_start(vl, n); + + for (uint32_t i = 0; i < state->ele_count; i++) { + switch (entry->args[i]) { + case AS_CDT_PARAM_PAYLOAD: { + cdt_payload *arg = va_arg(vl, cdt_payload *); + + arg->ptr = state->pk.buffer + state->pk.offset; + + int64_t sz = as_unpack_size(&state->pk); + + if (sz <= 0) { + va_end(vl); + return false; + } + + arg->sz = (uint32_t)sz; + + break; + } + case AS_CDT_PARAM_FLAGS: + case AS_CDT_PARAM_COUNT: { + uint64_t *arg = va_arg(vl, uint64_t *); + + if (as_unpack_uint64(&state->pk, arg) != 0) { + va_end(vl); + return false; + } + + break; + } + case AS_CDT_PARAM_INDEX: { + int64_t *arg = va_arg(vl, int64_t *); + + if (as_unpack_int64(&state->pk, arg) != 0) { + va_end(vl); + return false; + } + + break; + } + default: + va_end(vl); + return false; + } + } + + va_end(vl); + + return true; +} + +const char * +cdt_process_state_get_op_name(const cdt_process_state *state) +{ + as_cdt_optype op = state->type; + + if (op >= cdt_op_table_size) { + return NULL; + } + + const cdt_op_table_entry *entry = &cdt_op_table[op]; + + return entry->name; +} + + +//========================================================== +// rollback_alloc functions. 
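+// rollback_alloc tracks cf_malloc'd blocks for a multi-step operation so a
+// failure can free them all; when backed by an ll_buf, rollback is a no-op.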
+// + +void +rollback_alloc_push(rollback_alloc *packed_alloc, void *ptr) +{ + if (packed_alloc->malloc_list_sz >= packed_alloc->malloc_list_cap) { + cf_crash(AS_PARTICLE, "rollback_alloc_push() need to make rollback list larger: cap=%zu", packed_alloc->malloc_list_cap); + } + + packed_alloc->malloc_list[packed_alloc->malloc_list_sz++] = ptr; +} + +uint8_t * +rollback_alloc_reserve(rollback_alloc *alloc_buf, size_t size) +{ + cf_assert(alloc_buf, AS_PARTICLE, "alloc_buf NULL"); + + uint8_t *ptr; + + if (alloc_buf->ll_buf) { + cf_ll_buf_reserve(alloc_buf->ll_buf, size, &ptr); + } + else { + ptr = alloc_buf->malloc_ns ? cf_malloc_ns(size) : cf_malloc(size); + rollback_alloc_push(alloc_buf, ptr); + } + + return ptr; +} + +void +rollback_alloc_rollback(rollback_alloc *alloc_buf) +{ + if (alloc_buf->ll_buf) { + return; + } + + for (size_t i = 0; i < alloc_buf->malloc_list_sz; i++) { + cf_free(alloc_buf->malloc_list[i]); + } + + alloc_buf->malloc_list_sz = 0; +} + +bool +rollback_alloc_from_msgpack(rollback_alloc *alloc_buf, as_bin *b, + const cdt_payload *seg) +{ + // We assume the bin is empty. + + as_particle_type type = as_particle_type_from_msgpack(seg->ptr, seg->sz); + + if (type == AS_PARTICLE_TYPE_BAD) { + return false; + } + + if (type == AS_PARTICLE_TYPE_NULL) { + return true; + } + + uint32_t sz = + particle_vtable[type]->size_from_msgpack_fn(seg->ptr, seg->sz); + + if (sz != 0) { + b->particle = (as_particle *)rollback_alloc_reserve(alloc_buf, sz); + + if (! b->particle) { + return false; + } + } + + particle_vtable[type]->from_msgpack_fn(seg->ptr, seg->sz, &b->particle); + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, type); + + return true; +} + + +//========================================================== +// as_bin_cdt_packed functions. +// + +int +as_bin_cdt_packed_modify(as_bin *b, const as_msg_op *op, as_bin *result, + cf_ll_buf *particles_llb) +{ + cdt_process_state state; + + if (! cdt_process_state_init(&state, op)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_modify_data udata = { + .b = b, + .result = result, + .alloc_buf = particles_llb, + .ret_code = AS_PROTO_RESULT_OK, + }; + + bool success; + + if ((int)state.type <= (int)AS_CDT_OP_LIST_LAST) { + success = cdt_process_state_packed_list_modify_optype(&state, &udata); + } + else { + success = cdt_process_state_packed_map_modify_optype(&state, &udata); + } + + if (! success) { + as_bin_set_empty(b); + as_bin_set_empty(result); + } + + return udata.ret_code; +} + +int +as_bin_cdt_packed_read(const as_bin *b, const as_msg_op *op, as_bin *result) +{ + cdt_process_state state; + + if (! cdt_process_state_init(&state, op)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_read_data udata = { + .b = b, + .result = result, + .ret_code = AS_PROTO_RESULT_OK, + }; + + bool success; + + if ((int)state.type <= AS_CDT_OP_LIST_LAST) { + success = cdt_process_state_packed_list_read_optype(&state, &udata); + } + else { + success = cdt_process_state_packed_map_read_optype(&state, &udata); + } + + if (! 
success) { + as_bin_set_empty(result); + } + + return udata.ret_code; +} + + +//========================================================== +// msgpacked_index +// + +void +msgpacked_index_set(msgpacked_index *idxs, uint32_t index, uint32_t value) +{ + switch (idxs->ele_sz) { + case 1: + idxs->ptr[index] = (uint8_t)value; + break; + case 2: + ((uint16_t *)idxs->ptr)[index] = (uint16_t)value; + break; + case 3: + ((index_pack24 *)idxs->ptr)[index].value = value; + break; + default: + ((uint32_t *)idxs->ptr)[index] = value; + break; + } +} + +void +msgpacked_index_incr(msgpacked_index *idxs, uint32_t index) +{ + switch (idxs->ele_sz) { + case 1: + idxs->ptr[index]++; + break; + case 2: + ((uint16_t *)idxs->ptr)[index]++; + break; + case 3: + ((index_pack24 *)idxs->ptr)[index].value++; + break; + default: + ((uint32_t *)idxs->ptr)[index]++; + break; + } +} + +void +msgpacked_index_set_ptr(msgpacked_index *idxs, uint8_t *ptr) +{ + idxs->ptr = ptr; +} + +// Get pointer at index. +void * +msgpacked_index_get_mem(const msgpacked_index *idxs, uint32_t index) +{ + return (void *)(idxs->ptr + idxs->ele_sz * index); +} + +uint32_t +msgpacked_index_size(const msgpacked_index *idxs) +{ + return idxs->ele_sz * idxs->ele_count; +} + +uint32_t +msgpacked_index_ptr2value(const msgpacked_index *idxs, const void *ptr) +{ + switch (idxs->ele_sz) { + case 1: + return *((const uint8_t *)ptr); + case 2: + return *((const uint16_t *)ptr); + case 3: + return ((const index_pack24 *)ptr)->value; + default: + break; + } + + return *((const uint32_t *)ptr); +} + +uint32_t +msgpacked_index_get(const msgpacked_index *idxs, uint32_t index) +{ + switch (idxs->ele_sz) { + case 1: + return idxs->ptr[index]; + case 2: + return ((const uint16_t *)idxs->ptr)[index]; + case 3: + return ((const index_pack24 *)idxs->ptr)[index].value; + default: + break; + } + + return ((const uint32_t *)idxs->ptr)[index]; +} + +// Find find_index in a list of sorted_indexes. +// *where will be the location where find_index is (if it exists) or is +// supposed to be (if it does not exist). +// Return true if find_index is in sorted_indexes. 
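+// e.g. with sorted_indexes = {2, 5, 7} and count = 3: find_index = 5 returns
+// true with *where = 1; find_index = 6 returns false with *where = 2.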
+bool +msgpacked_index_find_index_sorted(const msgpacked_index *sorted_indexes, + uint32_t find_index, uint32_t count, uint32_t *where) +{ + if (count == 0) { + *where = 0; + return false; + } + + uint32_t upper = count; + uint32_t lower = 0; + uint32_t i = count / 2; + + while (true) { + uint32_t index = msgpacked_index_get(sorted_indexes, i); + + if (find_index == index) { + *where = i; + return true; + } + + if (find_index > index) { + if (i >= upper - 1) { + *where = i + 1; + break; + } + + lower = i + 1; + i += upper; + i /= 2; + } + else { + if (i <= lower) { + *where = i; + break; + } + + upper = i; + i += lower; + i /= 2; + } + } + + return false; +} + +void +msgpacked_index_print(const msgpacked_index *idxs, const char *name) +{ + size_t ele_count = idxs->ele_count; + char buf[1024]; + char *ptr = buf; + + if (idxs->ptr) { + for (size_t i = 0; i < ele_count; i++) { + if (buf + 1024 - ptr < 12) { + break; + } + + ptr += sprintf(ptr, "%u, ", msgpacked_index_get(idxs, i)); + } + + if (ele_count > 0) { + ptr -= 2; + } + + *ptr = '\0'; + } + else { + strcpy(buf, "(null)"); + } + + cf_warning(AS_PARTICLE, "%s: index[%zu]={%s}", name, ele_count, buf); +} + + +//========================================================== +// offset_index +// + +void +offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, + uint32_t ele_count, const uint8_t *contents, uint32_t content_sz) +{ + offidx->_.ele_count = ele_count; + offidx->content_sz = content_sz; + + if (content_sz < (1 << 8)) { + offidx->_.ele_sz = 1; + } + else if (content_sz < (1 << 16)) { + offidx->_.ele_sz = 2; + } + else if (content_sz < (1 << 24)) { + offidx->_.ele_sz = 3; + } + else { + offidx->_.ele_sz = 4; + } + + offidx->_.ptr = idx_mem_ptr; + offidx->contents = contents; + offidx->is_partial = false; +} + +void +offset_index_set(offset_index *offidx, uint32_t index, uint32_t value) +{ + if (index == 0 || index == offidx->_.ele_count) { + return; + } + + msgpacked_index_set((msgpacked_index *)offidx, index, value); +} + +bool +offset_index_set_next(offset_index *offidx, uint32_t index, uint32_t value) +{ + if (index >= offidx->_.ele_count) { + return true; + } + + uint32_t filled = offset_index_get_filled(offidx); + + if (index == filled) { + offset_index_set(offidx, index, value); + offset_index_set_filled(offidx, filled + 1); + + return true; + } + + if (index < filled) { + return value == offset_index_get_const(offidx, index); + } + + return false; +} + +void +offset_index_set_filled(offset_index *offidx, uint32_t ele_filled) +{ + if (offidx->_.ele_count == 0) { + return; + } + + cf_assert(ele_filled <= offidx->_.ele_count, AS_PARTICLE, "ele_filled(%u) > ele_count(%u)", ele_filled, offidx->_.ele_count); + msgpacked_index_set((msgpacked_index *)offidx, 0, ele_filled); +} + +void +offset_index_set_ptr(offset_index *offidx, uint8_t *idx_mem, + const uint8_t *packed_mem) +{ + msgpacked_index_set_ptr((msgpacked_index *)offidx, idx_mem); + offidx->contents = packed_mem; +} + +void +offset_index_copy(offset_index *dest, const offset_index *src, uint32_t d_start, + uint32_t s_start, uint32_t count, int delta) +{ + cf_assert(d_start + count <= dest->_.ele_count, AS_PARTICLE, "d_start(%u) + count(%u) > dest.ele_count(%u)", d_start, count, dest->_.ele_count); + cf_assert(s_start + count <= src->_.ele_count, AS_PARTICLE, "s_start(%u) + count(%u) > src.ele_count(%u)", s_start, count, src->_.ele_count); + + if (dest->_.ele_sz == src->_.ele_sz && delta == 0) { + memcpy(offset_index_get_mem(dest, d_start), + offset_index_get_mem(src, 
s_start), + dest->_.ele_sz * count); + } + else { + for (size_t i = 0; i < count; i++) { + uint32_t value = offset_index_get_const(src, s_start + i); + + value += delta; + offset_index_set(dest, d_start + i, value); + } + } +} + +void +offset_index_append_size(offset_index *offidx, uint32_t delta) +{ + uint32_t filled = offset_index_get_filled(offidx); + + if (filled == offidx->_.ele_count) { + return; + } + + uint32_t last = offset_index_get_const(offidx, filled - 1); + + offset_index_set_filled(offidx, filled + 1); + offset_index_set(offidx, filled, last + delta); +} + +bool +offset_index_find_items(offset_index *full_offidx, + cdt_find_items_idxs_type find_type, as_unpacker *items_pk, + order_index *items_ordidx_r, bool inverted, uint64_t *rm_mask, + uint32_t *rm_count_r, order_index *rm_ranks_r) +{ + bool (*unpack_fn)(as_unpacker *pk, cdt_payload *payload_r); + uint32_t items_count = items_ordidx_r->_.ele_count; + define_offset_index(items_offidx, items_pk->buffer + items_pk->offset, + items_pk->length - items_pk->offset, items_count); + + switch (find_type) { + case CDT_FIND_ITEMS_IDXS_FOR_LIST_VALUE: + unpack_fn = unpack_list_value; + break; + case CDT_FIND_ITEMS_IDXS_FOR_MAP_KEY: + unpack_fn = unpack_map_key; + break; + case CDT_FIND_ITEMS_IDXS_FOR_MAP_VALUE: + unpack_fn = unpack_map_value; + break; + default: + cf_crash(AS_PARTICLE, "bad input"); + return false; // dummy return to quash warning + } + + if (! list_full_offset_index_fill_all(&items_offidx)) { + cf_warning(AS_PARTICLE, "offset_index_find_items() invalid parameter key list"); + return false; + } + + bool success = list_order_index_sort(items_ordidx_r, &items_offidx, + AS_CDT_SORT_ASCENDING); + + cf_assert(success, AS_PARTICLE, "offset_index_find_items() sort failed after index filled"); + + uint32_t rm_count = 0; + + as_unpacker pk = { + .buffer = full_offidx->contents, + .length = full_offidx->content_sz + }; + + if (rm_ranks_r) { + order_index_clear(rm_ranks_r); + } + + for (uint32_t i = 0; i < full_offidx->_.ele_count; i++) { + cdt_payload value; + + if (! unpack_fn(&pk, &value)) { + cf_warning(AS_PARTICLE, "offset_index_find_items() invalid msgpack in unpack_fn()"); + return false; + } + + if (! offset_index_set_next(full_offidx, i + 1, (uint32_t)pk.offset)) { + cf_warning(AS_PARTICLE, "offset_index_find_items() invalid msgpack in offset_index_set_next()"); + return false; + } + + order_index_find find = { + .count = items_count, + .target = items_count + (rm_ranks_r != NULL ? 0 : 1) + }; + + if (! order_index_find_rank_by_value(items_ordidx_r, &value, + &items_offidx, &find)) { + cf_warning(AS_PARTICLE, "offset_index_find_items() invalid items list"); + return false; + } + + if (rm_ranks_r) { + uint32_t vl_rank = find.result; + + if (find.found) { + uint32_t idx = order_index_get(items_ordidx_r, find.result); + + order_index_incr(rm_ranks_r, (idx * 2) + 1); + vl_rank++; + } + + if (vl_rank != items_count) { + uint32_t idx = order_index_get(items_ordidx_r, vl_rank); + + order_index_incr(rm_ranks_r, idx * 2); + } + } + + if (! inverted) { + if (find.found) { + cdt_idx_mask_set(rm_mask, i); + rm_count++; + } + } + else if (! 
find.found) { + cdt_idx_mask_set(rm_mask, i); + rm_count++; + } + } + + if (rm_ranks_r) { + for (uint32_t i = 1; i < items_count; i++) { + uint32_t idx0 = order_index_get(items_ordidx_r, i - 1); + uint32_t idx1 = order_index_get(items_ordidx_r, i); + uint32_t rank0 = order_index_get(rm_ranks_r, idx0 * 2); + uint32_t rank1 = order_index_get(rm_ranks_r, idx1 * 2); + + order_index_set(rm_ranks_r, idx1 * 2, rank0 + rank1); + } + } + + *rm_count_r = rm_count; + + return true; +} + +void * +offset_index_get_mem(const offset_index *offidx, uint32_t index) +{ + return msgpacked_index_get_mem((msgpacked_index *)offidx, index); +} + +uint32_t +offset_index_size(const offset_index *offidx) +{ + return msgpacked_index_size((const msgpacked_index *)offidx); +} + +bool +offset_index_is_null(const offset_index *offidx) +{ + return offidx->_.ptr == NULL; +} + +bool +offset_index_is_valid(const offset_index *offidx) +{ + return offidx->_.ptr != NULL; +} + +bool +offset_index_is_full(const offset_index *offidx) +{ + if (offset_index_is_null(offidx)) { + return false; + } + + if (offidx->_.ele_count == 0) { + return true; + } + + uint32_t filled = offset_index_get_filled(offidx); + + cf_assert(filled <= offidx->_.ele_count, AS_PARTICLE, "filled(%u) > ele_count(%u)", filled, offidx->_.ele_count); + + if (filled == offidx->_.ele_count) { + return true; + } + + return false; +} + +uint32_t +offset_index_get_const(const offset_index *offidx, uint32_t idx) +{ + if (idx == 0) { + return 0; + } + + if (idx == offidx->_.ele_count) { + return offidx->content_sz; + } + + if (idx >= offset_index_get_filled(offidx)) { + offset_index_print(offidx, "offset_index_get_const() offidx"); + print_packed(offidx->contents, offidx->content_sz, "offset_index_get_const() offidx->ele_start"); + cf_crash(AS_PARTICLE, "offset_index_get_const() idx=%u >= filled=%u ele_count=%u", idx, offset_index_get_filled(offidx), offidx->_.ele_count); + } + + return msgpacked_index_get((const msgpacked_index *)offidx, idx); +} + +uint32_t +offset_index_get_delta_const(const offset_index *offidx, uint32_t index) +{ + uint32_t offset = offset_index_get_const(offidx, index); + + if (index == offidx->_.ele_count - 1) { + return offidx->content_sz - offset; + } + + return offset_index_get_const(offidx, index + 1) - offset; +} + +uint32_t +offset_index_get_filled(const offset_index *offidx) +{ + if (offidx->_.ele_count == 0) { + return 1; + } + + return msgpacked_index_get((const msgpacked_index *)offidx, 0); +} + +void +offset_index_print(const offset_index *offidx, const char *name) +{ + if (! name) { + name = "offset"; + } + + msgpacked_index_print((msgpacked_index *)offidx, name); +} + +void +offset_index_delta_print(const offset_index *offidx, const char *name) +{ + size_t ele_count = offidx->_.ele_count; + char buf[1024]; + char *ptr = buf; + + if (offidx->_.ptr) { + for (size_t i = 0; i < ele_count; i++) { + if (buf + 1024 - ptr < 12) { + break; + } + + ptr += sprintf(ptr, "%u, ", offset_index_get_delta_const(offidx, i)); + } + + if (ele_count > 0) { + ptr -= 2; + } + + *ptr = '\0'; + } + else { + strcpy(buf, "(null)"); + } + + cf_warning(AS_PARTICLE, "%s: delta_off[%zu]={%s} %u", name, ele_count, buf, offidx->content_sz); +} + + +//========================================================== +// order_index +// + +static inline uint32_t +order_index_ele_sz(uint32_t max_idx) +{ + // Allow for values [0, ele_count] for ele_count to indicate invalid values. 
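+	// e.g. max_idx = 255 -> 1-byte entries (values 0..255, so the max_idx
+	// sentinel still fits); max_idx = 256 -> 2-byte entries.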
+ if (max_idx < (1 << 8)) { + return 1; + } + else if (max_idx < (1 << 16)) { + return 2; + } + else if (max_idx < (1 << 24)) { + return 3; + } + + return 4; +} + +void +order_index_init(order_index *ordidx, uint8_t *ptr, uint32_t ele_count) +{ + ordidx->_.ele_count = ele_count; + ordidx->_.ele_sz = order_index_ele_sz(ele_count); + ordidx->_.ptr = ptr; + ordidx->max_idx = ele_count; +} + +void +order_index_init2(order_index *ordidx, uint8_t *ptr, uint32_t max_idx, + uint32_t ele_count) +{ + ordidx->_.ele_count = ele_count; + ordidx->_.ele_sz = order_index_ele_sz(max_idx); + ordidx->_.ptr = ptr; + ordidx->max_idx = max_idx; +} + +void +order_index_init_ref(order_index *dst, const order_index *src, uint32_t start, + uint32_t count) +{ + order_index_init2(dst, order_index_get_mem(src, start), src->max_idx, + count); +} + +void +order_index_set(order_index *ordidx, uint32_t idx, uint32_t value) +{ + msgpacked_index_set((msgpacked_index *)ordidx, idx, value); +} + +void +order_index_set_ptr(order_index *ordidx, uint8_t *ptr) +{ + msgpacked_index_set_ptr((msgpacked_index *)ordidx, ptr); +} + +void +order_index_incr(order_index *ordidx, uint32_t idx) +{ + msgpacked_index_incr((msgpacked_index *)ordidx, idx); +} + +void +order_index_clear(order_index *ordidx) +{ + memset(ordidx->_.ptr, 0, order_index_size(ordidx)); +} + +bool +order_index_sorted_mark_dup_eles(order_index *ordidx, + const offset_index *full_offidx, uint32_t *count_r, uint32_t *sz_r) +{ + cf_assert(count_r, AS_PARTICLE, "count_r NULL"); + cf_assert(sz_r, AS_PARTICLE, "sz_r NULL"); + + as_unpacker pk = { + .buffer = full_offidx->contents, + .length = full_offidx->content_sz + }; + + as_unpacker prev = pk; + uint32_t prev_idx = order_index_get(ordidx, 0); + uint32_t ele_count = full_offidx->_.ele_count; + + prev.offset = offset_index_get_const(full_offidx, prev_idx); + *count_r = 0; + *sz_r = 0; + + for (uint32_t i = 1; i < ele_count; i++) { + uint32_t idx = order_index_get(ordidx, i); + uint32_t off = offset_index_get_const(full_offidx, idx); + + pk.offset = off; + + msgpack_compare_t cmp = as_unpack_compare(&prev, &pk); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + (*sz_r) += pk.offset - off; + (*count_r)++; + order_index_set(ordidx, i, ele_count); + } + else if (cmp == MSGPACK_COMPARE_LESS) { + // no-op + } + else { + return false; + } + + prev.offset = off; + } + + return true; +} + +uint32_t +order_index_size(const order_index *ordidx) +{ + return msgpacked_index_size((const msgpacked_index *)ordidx); +} + +bool +order_index_is_null(const order_index *ordidx) +{ + return ordidx->_.ptr == NULL; +} + +bool +order_index_is_valid(const order_index *ordidx) +{ + return ordidx->_.ptr != NULL; +} + +bool +order_index_is_filled(const order_index *ordidx) +{ + if (! order_index_is_valid(ordidx)) { + return false; + } + + if (ordidx->_.ele_count > 0 && + order_index_get(ordidx, 0) >= ordidx->_.ele_count) { + return false; + } + + return true; +} + +// Get pointer at index. +void * +order_index_get_mem(const order_index *ordidx, uint32_t index) +{ + return msgpacked_index_get_mem((const msgpacked_index *)ordidx, index); +} + +uint32_t +order_index_ptr2value(const order_index *ordidx, const void *ptr) +{ + return msgpacked_index_ptr2value((const msgpacked_index *)ordidx, ptr); +} + +uint32_t +order_index_get(const order_index *ordidx, uint32_t index) +{ + return msgpacked_index_get((const msgpacked_index *)ordidx, index); +} + +// Find the rank of (or the rank closest to) value. +// find->target selects which instance to return when value repeats: +// target == 0 means find the first instance of value. +// target == ele_count means find the last instance of value. +// target > ele_count means don't check the element index at all. +// Return true on success. +bool +order_index_find_rank_by_value(const order_index *ordidx, + const cdt_payload *value, const offset_index *full_offidx, + order_index_find *find) +{ + uint32_t ele_count = full_offidx->_.ele_count; + + find->found = false; + + if (ele_count == 0 || find->count == 0) { + find->result = ele_count; + return true; + } + + uint32_t lower = find->start; + uint32_t upper = find->start + find->count; + uint32_t rank = find->start + find->count / 2; + + as_unpacker pk_value = { + .buffer = value->ptr, + .length = value->sz + }; + + as_unpacker pk_buf = { + .buffer = full_offidx->contents, + .length = full_offidx->content_sz + }; + + while (true) { + uint32_t idx = ordidx ? order_index_get(ordidx, rank) : rank; + + pk_value.offset = 0; // reset + pk_buf.offset = offset_index_get_const(full_offidx, idx); + + msgpack_compare_t cmp = as_unpack_compare(&pk_value, &pk_buf); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + find->found = true; + + if (find->target > ele_count) { // means don't check + break; + } + + if (find->target < idx) { + cmp = MSGPACK_COMPARE_LESS; + } + else if (find->target > idx) { + if (rank == upper - 1) { + break; + } + + cmp = MSGPACK_COMPARE_GREATER; + } + else { + break; + } + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + if (rank >= upper - 1) { + rank++; + break; + } + + lower = rank + (find->found ? 0 : 1); + rank += upper; + rank /= 2; + } + else if (cmp == MSGPACK_COMPARE_LESS) { + if (rank == lower) { + break; + } + + upper = rank; + rank += lower; + rank /= 2; + } + else { + return false; + } + } + + find->result = rank; + + return true; +} + +uint32_t +order_index_get_ele_size(const order_index *ordidx, uint32_t count, + const offset_index *full_offidx) +{ + uint32_t sz = 0; + + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = order_index_get(ordidx, i); + + if (idx == ordidx->max_idx) { + continue; + } + + sz += offset_index_get_delta_const(full_offidx, idx); + } + + return sz; +} + +uint8_t * +order_index_write_eles(const order_index *ordidx, uint32_t count, + const offset_index *full_offidx, uint8_t *ptr, bool invert) +{ + uint32_t start = 0; + uint32_t offset = 0; + uint32_t sz = 0; + + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = order_index_get(ordidx, i); + + if (idx == ordidx->max_idx) { + continue; + } + + offset = offset_index_get_const(full_offidx, idx); + sz = offset_index_get_delta_const(full_offidx, idx); + + if (! invert) { + memcpy(ptr, full_offidx->contents + offset, sz); + ptr += sz; + } + else { + uint32_t invert_sz = offset - start; + + if (invert_sz != 0) { + memcpy(ptr, full_offidx->contents + start, invert_sz); + ptr += invert_sz; + } + } + + start = offset + sz; + } + + if (! invert) { + return ptr; + } + + uint32_t invert_sz = full_offidx->content_sz - start; + + memcpy(ptr, full_offidx->contents + start, invert_sz); + + return ptr + invert_sz; +} + +uint32_t +order_index_adjust_value(const order_index_adjust *via, uint32_t src) +{ + if (via) { + return via->f(via, src); + } + + return src; +} + +void +order_index_copy(order_index *dest, const order_index *src, uint32_t d_start, + uint32_t s_start, uint32_t count, const order_index_adjust *adjust) +{ + if (dest->_.ele_sz == src->_.ele_sz && ! 
adjust) { + memcpy(order_index_get_mem(dest, d_start), + order_index_get_mem(src, s_start), + src->_.ele_sz * count); + } + else { + for (uint32_t i = 0; i < count; i++) { + uint32_t value = order_index_get(src, s_start + i); + + value = order_index_adjust_value(adjust, value); + order_index_set(dest, d_start + i, value); + } + } +} + +size_t +order_index_calc_size(uint32_t max_idx, uint32_t ele_count) +{ + return order_index_ele_sz(max_idx) * ele_count; +} + +void +order_index_print(const order_index *ordidx, const char *name) +{ + if (! name) { + name = "value"; + } + + msgpacked_index_print(&ordidx->_, name); +} + + +//========================================================== +// order_heap +// + +bool +order_heap_init_build_by_range(order_heap *heap, uint8_t *heap_mem, + uint32_t idx, uint32_t count, uint32_t ele_count, + order_heap_compare_fn cmp_fn, const void *udata) +{ + uint32_t tail_distance = ele_count - idx - count; + uint32_t discard; + msgpack_compare_t cmp; + + if (idx <= tail_distance) { + cmp = MSGPACK_COMPARE_LESS; // min k + discard = idx; + } + else { + cmp = MSGPACK_COMPARE_GREATER; // max k + discard = tail_distance; + } + + order_index_init(&heap->_, heap_mem, ele_count); + heap->filled = 0; + heap->userdata = udata; + heap->cmp = cmp; + heap->cmp_fn = cmp_fn; + order_heap_build(heap, true); + + if (! order_heap_order_at_end(heap, count + discard)) { + return false; + } + + return true; +} + +void +order_heap_swap(order_heap *heap, uint32_t index1, uint32_t index2) +{ + uint32_t temp = order_heap_get(heap, index1); + order_heap_set(heap, index1, order_heap_get(heap, index2)); + order_heap_set(heap, index2, temp); +} + +bool +order_heap_remove_top(order_heap *heap) +{ + if (heap->filled == 0) { + return true; + } + + uint32_t index = order_heap_get(heap, (heap->filled--) - 1); + + return order_heap_replace_top(heap, index); +} + +bool +order_heap_replace_top(order_heap *heap, uint32_t value) +{ + order_heap_set(heap, 0, value); + + return order_heap_heapify(heap, 0); +} + +bool +order_heap_heapify(order_heap *heap, uint32_t index) +{ + while (true) { + uint32_t child1 = 2 * index + 1; + uint32_t child2 = 2 * index + 2; + uint32_t child; + + if (child1 >= heap->filled) { + break; + } + + if (child2 >= heap->filled) { + child = child1; + } + else { + msgpack_compare_t cmp = heap->cmp_fn(heap->userdata, + order_heap_get(heap, child1), + order_heap_get(heap, child2)); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return false; + } + + if (cmp == heap->cmp || cmp == MSGPACK_COMPARE_EQUAL) { + child = child1; + } + else { + child = child2; + } + } + + msgpack_compare_t cmp = heap->cmp_fn(heap->userdata, + order_heap_get(heap, child), + order_heap_get(heap, index)); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return false; + } + + if (cmp == heap->cmp) { + order_heap_swap(heap, index, child); + index = child; + } + else { + break; + } + } + + return true; +} + +// O(n) +bool +order_heap_build(order_heap *heap, bool init) +{ + if (init) { + heap->filled = heap->_._.ele_count; + + for (size_t i = 0; i < heap->filled; i++) { + order_heap_set(heap, i, i); + } + } + + int64_t start = (int64_t)heap->filled / 2 - 1; + + for (int64_t i = start; i >= 0; i--) { + if (! order_heap_heapify(heap, (uint32_t)i)) { + return false; + } + } + + return true; +} + +bool +order_heap_order_at_end(order_heap *heap, uint32_t count) +{ + uint32_t end_index = heap->filled - 1; + + for (uint32_t i = 0; i < count; i++) { + uint32_t value = order_heap_get(heap, 0); + + if (! 
order_heap_remove_top(heap)) { + return false; + } + + order_heap_set(heap, end_index--, value); + } + + cf_assert(heap->filled == end_index + 1, AS_PARTICLE, "FIXME"); // FIXME + heap->filled = end_index + 1; + + return true; +} + +// Reverse order of end indexes. +void +order_heap_reverse_end(order_heap *heap, uint32_t count) +{ + uint32_t start = heap->filled; + uint32_t end = start + count; + uint32_t stop = (start + end) / 2; + + end--; + + for (uint32_t i = start; i < stop; i++) { + uint32_t left = order_heap_get(heap, i); + uint32_t right = order_heap_get(heap, end); + + order_heap_set(heap, end--, left); + order_heap_set(heap, i, right); + } +} + +void +order_heap_print(const order_heap *heap) +{ + order_index_print(&heap->_, "heap"); +} + + +//========================================================== +// cdt_idx_mask +// + +size_t +cdt_idx_mask_count(uint32_t ele_count) +{ + return (ele_count + 63) / 64; +} + +void +cdt_idx_mask_init(uint64_t *mask, uint32_t ele_count) +{ + memset(mask, 0, cdt_idx_mask_count(ele_count) * sizeof(uint64_t)); +} + +void +cdt_idx_mask_set(uint64_t *mask, uint32_t idx) +{ + uint32_t shift = idx % 64; + + mask[idx / 64] |= 1ULL << shift; +} + +void +cdt_idx_mask_set_by_ordidx(uint64_t *mask, const order_index *ordidx, + uint32_t start, uint32_t count, bool inverted) +{ + for (uint32_t i = 0; i < count; i++) { + cdt_idx_mask_set(mask, order_index_get(ordidx, start + i)); + } + + if (inverted) { + cdt_idx_mask_invert(mask, ordidx->max_idx); + } +} + +void +cdt_idx_mask_set_by_irc(uint64_t *mask, const order_index *irc, + const order_index *idx_map, bool inverted) +{ + uint32_t items_count = irc->_.ele_count / 2; + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t rank = order_index_get(irc, 2 * i); + uint32_t count = order_index_get(irc, (2 * i) + 1); + + if (count == 0) { + continue; + } + + uint32_t end = rank + count; + + for (uint32_t j = rank; j < end; j++) { + cdt_idx_mask_set(mask, idx_map ? order_index_get(idx_map, j) : j); + } + } + + if (inverted) { + cdt_idx_mask_invert(mask, irc->max_idx); + } +} + +void +cdt_idx_mask_invert(uint64_t *mask, uint32_t ele_count) +{ + uint32_t mask_count = cdt_idx_mask_count(ele_count); + + for (uint32_t i = 0; i < mask_count; i++) { + mask[i] = ~mask[i]; + } +} + +uint64_t +cdt_idx_mask_get(const uint64_t *mask, uint32_t idx) +{ + return mask[idx / 64]; +} + +bool +cdt_idx_mask_is_set(const uint64_t *mask, uint32_t idx) +{ + uint32_t shift = idx % 64; + + return (mask[idx / 64] & (1ULL << shift)) != 0; +} + +// Find first 1 or 0. +uint32_t +cdt_idx_mask_find(const uint64_t *mask, uint32_t start, uint32_t end, + bool is_find0) +{ + cf_assert(start <= end, AS_PARTICLE, "start %u > end %u", start, end); + + if (start == end) { + return end; + } + + uint32_t offset = start % 64; + uint32_t i = start / 64; + uint64_t bit_mask = ~((1ULL << offset) - 1); + uint64_t bits = (is_find0 ? ~mask[i] : mask[i]) & bit_mask; + uint32_t count = cf_lsb64(bits); + + if (count != 64) { + offset = start - offset + count; + + if (offset > end) { + return end; + } + + return offset; + } + + uint32_t i_end = (end + 63) / 64; + + for (i++; i < i_end; i++) { + count = cf_lsb64(is_find0 ? ~mask[i] : mask[i]); + + if (count != 64) { + break; + } + } + + offset = (i * 64) + count; + + if (offset > end) { + return end; + } + + return offset; +} + +uint8_t * +cdt_idx_mask_write_eles(const uint64_t *mask, uint32_t count, + const offset_index *full_offidx, uint8_t *ptr, bool invert) +{ + if (count == 0) { + if (! 
invert) { + return ptr; + } + + memcpy(ptr, full_offidx->contents, full_offidx->content_sz); + return ptr + full_offidx->content_sz; + } + + uint32_t ele_count = full_offidx->_.ele_count; + uint32_t start_offset = 0; + uint32_t idx = 0; + uint32_t count_left = count; + + while (idx < ele_count) { + uint32_t idx0 = cdt_idx_mask_find(mask, idx, ele_count, false); + + cf_assert(idx0 < ele_count, AS_PARTICLE, "idx0 %u out of bounds from idx %u ele_count %u", idx0, idx, ele_count); + idx = cdt_idx_mask_find(mask, idx0 + 1, ele_count, true); + + if (idx - idx0 > count_left) { + idx = idx0 + count_left; + } + + uint32_t offset0 = offset_index_get_const(full_offidx, idx0); + uint32_t offset1 = offset_index_get_const(full_offidx, idx); + + if (invert) { + uint32_t sz = offset0 - start_offset; + + memcpy(ptr, full_offidx->contents + start_offset, sz); + ptr += sz; + start_offset = offset1; + } + else { + uint32_t sz = offset1 - offset0; + + memcpy(ptr, full_offidx->contents + offset0, sz); + ptr += sz; + } + + count_left -= idx - idx0; + + if (count_left == 0) { + break; + } + + idx++; + } + + if (invert) { + uint32_t sz = full_offidx->content_sz - start_offset; + + memcpy(ptr, full_offidx->contents + start_offset, sz); + ptr += sz; + } + + return ptr; +} + +uint32_t +cdt_idx_mask_get_content_sz(const uint64_t *mask, uint32_t count, + const offset_index *full_offidx) +{ + uint32_t sz = 0; + uint32_t idx = 0; + uint32_t ele_count = full_offidx->_.ele_count; + + for (uint32_t i = 0; i < count; i++) { + idx = cdt_idx_mask_find(mask, idx, ele_count, false); + sz += offset_index_get_delta_const(full_offidx, idx); + idx++; + } + + return sz; +} + +void +cdt_idx_mask_print(const uint64_t *mask, uint32_t ele_count, const char *name) +{ + if (! name) { + name = "mask"; + } + + size_t max = (ele_count + 63) / 64; + char buf[1024]; + char *ptr = buf; + + for (size_t i = 0; i < max; i++) { + if (buf + 1024 - ptr < 18) { + break; + } + + ptr += sprintf(ptr, "%016lX, ", mask[i]); + } + + if (ele_count != 0) { + ptr -= 2; + } + + *ptr = '\0'; + + cf_warning(AS_PARTICLE, "%s: index[%u]={%s}", name, ele_count, buf); +} + + +//========================================================== +// list +// + +bool +list_param_parse(const cdt_payload *items, as_unpacker *pk, uint32_t *count_r) +{ + pk->buffer = items->ptr; + pk->offset = 0; + pk->length = items->sz; + + int64_t items_hdr = as_unpack_list_header_element_count(pk); + + if (items_hdr > 0 && as_unpack_peek_is_ext(pk)) { + if (as_unpack_size(pk) <= 0) { + cf_warning(AS_PARTICLE, "list_param_parse() invalid parameter"); + return false; + } + + items_hdr--; + } + + if (items_hdr < 0 || items_hdr > CDT_MAX_PARAM_LIST_COUNT) { + cf_warning(AS_PARTICLE, "list_param_parse() invalid param items_hdr %ld", items_hdr); + return false; + } + + *count_r = (uint32_t)items_hdr; + + return true; +} + + +//========================================================== +// Debugging support. 
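+// These helpers dump packed CDT state via cf_warning() and are meant for
+// ad-hoc diagnostics only - e.g. cdt_bin_print(b, "bad-bin") hex-dumps a
+// suspect map or list bin. None of them belong on a data path.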
+//
+
+void
+print_hex(const uint8_t *packed, uint32_t packed_sz, char *buf, uint32_t buf_sz)
+{
+	// Capacity in hex pairs, reserving room for a ".." truncation marker
+	// and a null terminator.
+	uint32_t n = (buf_sz - 3) / 2;
+	bool truncated = true;
+
+	if (n >= packed_sz) {
+		n = packed_sz;
+		truncated = false;
+	}
+
+	char *ptr = buf;
+
+	for (uint32_t i = 0; i < n; i++) {
+		sprintf(ptr, "%02X", packed[i]);
+		ptr += 2;
+	}
+
+	if (truncated) {
+		// Flag that the dump didn't fit in buf.
+		*ptr++ = '.';
+		*ptr++ = '.';
+	}
+
+	*ptr = '\0';
+}
+
+void
+print_packed(const uint8_t *packed, uint32_t sz, const char *name)
+{
+	cf_warning(AS_PARTICLE, "%s: data=%p sz=%u", name, packed, sz);
+
+	const uint32_t limit = 256;
+	uint32_t n = (sz + limit - 1) / limit;
+	uint32_t line_sz = limit;
+	char mem[1024];
+
+	for (uint32_t i = 0; i < n; i++) {
+		if (i == n - 1) {
+			// Last line may be partial - note sz - (limit * i) rather than
+			// sz % limit, which would print nothing when sz is an exact
+			// multiple of limit.
+			line_sz = sz - (limit * i);
+		}
+
+		print_hex(packed + limit * i, line_sz, mem, sizeof(mem));
+		cf_warning(AS_PARTICLE, "%s:%0X: [%s]", name, i, mem);
+	}
+}
+
+void
+cdt_bin_print(const as_bin *b, const char *name)
+{
+	typedef struct {
+		uint8_t type;
+		uint32_t sz;
+		uint8_t data[];
+	} __attribute__ ((__packed__)) cdt_mem;
+
+	const cdt_mem *p = (const cdt_mem *)b->particle;
+	uint8_t bintype = as_bin_get_particle_type(b);
+
+	if (! p || (bintype != AS_PARTICLE_TYPE_MAP &&
+			bintype != AS_PARTICLE_TYPE_LIST)) {
+		cf_warning(AS_PARTICLE, "%s: particle NULL type %u", name, bintype);
+		return;
+	}
+
+	cf_warning(AS_PARTICLE, "%s: btype %u data=%p sz=%u type=%d", name, bintype, p->data, p->sz, p->type);
+	char buf[4096];
+	print_hex(p->data, p->sz, buf, 4096);
+	cf_warning(AS_PARTICLE, "%s: buf=%s", name, buf);
+}
diff --git a/as/src/base/cfg.c b/as/src/base/cfg.c
new file mode 100644
index 00000000..b1f54c2d
--- /dev/null
+++ b/as/src/base/cfg.c
@@ -0,0 +1,4671 @@
+/*
+ * cfg.c
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/cfg.h"
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/resource.h>
+
+#include "aerospike/mod_lua_config.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_vector.h"
+
+#include "bits.h"
+#include "cf_str.h"
+#include "dynbuf.h"
+#include "fault.h"
+#include "hardware.h"
+#include "hist.h"
+#include "hist_track.h"
+#include "msg.h"
+#include "node.h"
+#include "olock.h"
+#include "socket.h"
+#include "tls.h"
+
+#include "base/datamodel.h"
+#include "base/proto.h"
+#include "base/secondary_index.h"
+#include "base/security_config.h"
+#include "base/thr_demarshal.h"
+#include "base/thr_info.h"
+#include "base/thr_info_port.h"
+#include "base/thr_query.h"
+#include "base/thr_sindex.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction_policy.h"
+#include "base/xdr_config.h"
+#include "base/xdr_serverside.h"
+#include "fabric/fabric.h"
+#include "fabric/hb.h"
+#include "fabric/migrate.h"
+#include "fabric/partition_balance.h"
+#include "storage/drv_ssd.h"
+
+
+//==========================================================
+// Globals.
+//
+
+// The runtime configuration instance.
+as_config g_config;
+
+
+//==========================================================
+// Forward declarations.
+//
+
+void init_addr_list(cf_addr_list* addrs);
+void add_addr(const char* name, cf_addr_list* addrs);
+void add_tls_peer_name(const char* name, cf_serv_spec* spec);
+void copy_addrs(const cf_addr_list* from, cf_addr_list* to);
+void default_addrs(cf_addr_list* one, cf_addr_list* two);
+void bind_to_access(const cf_serv_spec* from, cf_addr_list* to);
+void cfg_add_addr_bind(const char* name, cf_serv_spec* spec);
+void cfg_add_addr_std(const char* name, cf_serv_spec* spec);
+void cfg_add_addr_alt(const char* name, cf_serv_spec* spec);
+void cfg_mserv_config_from_addrs(cf_addr_list* addrs, cf_addr_list* bind_addrs, cf_mserv_cfg* serv_cfg, cf_ip_port port, cf_sock_owner owner, uint8_t ttl);
+void cfg_serv_spec_to_bind(const cf_serv_spec* spec, const cf_serv_spec* def_spec, cf_serv_cfg* bind, cf_sock_owner owner);
+void cfg_serv_spec_std_to_access(const cf_serv_spec* spec, cf_addr_list* access);
+void cfg_serv_spec_alt_to_access(const cf_serv_spec* spec, cf_addr_list* access);
+void cfg_add_mesh_seed_addr_port(char* addr, cf_ip_port port, bool tls);
+as_set* cfg_add_set(as_namespace* ns);
+void cfg_add_storage_file(as_namespace* ns, char* file_name);
+void cfg_add_storage_device(as_namespace* ns, char* device_name, char* shadow_name);
+uint32_t cfg_obj_size_hist_max(uint32_t hist_max);
+void cfg_set_cluster_name(char* cluster_name);
+void create_and_check_hist_track(cf_hist_track** h, const char* name, histogram_scale scale);
+void cfg_create_all_histograms();
+void cfg_init_serv_spec(cf_serv_spec* spec_p);
+cf_tls_spec* cfg_create_tls_spec(as_config* cfg, char* name);
+char* cfg_resolve_tls_name(char* tls_name, const char* cluster_name, const char* which);
+
+void xdr_cfg_add_datacenter(char* dc, uint32_t nsid);
+void xdr_cfg_add_node_addr_port(dc_config_opt *dc_cfg, char* addr, int port);
+void xdr_cfg_add_tls_node(dc_config_opt *dc_cfg, char* addr, char *tls_name, int port);
+
+
+//==========================================================
+// Helper - set as_config defaults.
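+// (Note that cfg_set_defaults() zeroes all of g_config up front, so any field
+// not explicitly assigned below defaults to 0 / false / NULL.)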
+// + +void +cfg_set_defaults() +{ + as_config* c = &g_config; + + memset(c, 0, sizeof(as_config)); + + cfg_init_serv_spec(&c->service); + cfg_init_serv_spec(&c->tls_service); + cfg_init_serv_spec(&c->hb_serv_spec); + cfg_init_serv_spec(&c->hb_tls_serv_spec); + cfg_init_serv_spec(&c->fabric); + cfg_init_serv_spec(&c->tls_fabric); + cfg_init_serv_spec(&c->info); + + c->paxos_single_replica_limit = 1; // by default all clusters obey replication counts + c->n_proto_fd_max = 15000; + c->n_batch_threads = 4; + c->batch_max_buffers_per_queue = 255; // maximum number of buffers allowed in a single queue + c->batch_max_requests = 5000; // maximum requests/digests in a single batch + c->batch_max_unused_buffers = 256; // maximum number of buffers allowed in batch buffer pool + c->batch_priority = 200; // # of rows between a quick context switch? + c->feature_key_file = "/etc/aerospike/features.conf"; + c->hist_track_back = 300; + c->hist_track_slice = 10; + c->n_info_threads = 16; + c->migrate_max_num_incoming = AS_MIGRATE_DEFAULT_MAX_NUM_INCOMING; // for receiver-side migration flow-control + c->n_migrate_threads = 1; + c->nsup_delete_sleep = 100; // 100 microseconds means a delete rate of 10k TPS + c->nsup_period = 120; // run nsup once every 2 minutes + c->nsup_startup_evict = true; + c->proto_fd_idle_ms = 60000; // 1 minute reaping of proto file descriptors + c->proto_slow_netio_sleep_ms = 1; // 1 ms sleep between retry for slow queries + c->run_as_daemon = true; // set false only to run in debugger & see console output + c->scan_max_active = 100; + c->scan_max_done = 100; + c->scan_max_udf_transactions = 32; + c->scan_threads = 4; + c->ticker_interval = 10; + c->transaction_max_ns = 1000 * 1000 * 1000; // 1 second + c->transaction_pending_limit = 20; + c->transaction_retry_ms = 1000 + 2; // 1 second + epsilon, so default timeout happens first + c->n_transaction_threads_per_queue = 4; + as_sindex_gconfig_default(c); + as_query_gconfig_default(c); + c->work_directory = "/opt/aerospike"; + c->debug_allocations = CF_ALLOC_DEBUG_NONE; + c->fabric_dump_msgs = false; + + // Network heartbeat defaults. + c->hb_config.mode = AS_HB_MODE_UNDEF; + c->hb_config.tx_interval = 150; + c->hb_config.max_intervals_missed = 10; + c->hb_config.protocol = AS_HB_PROTOCOL_V3; + c->hb_config.override_mtu = 0; + + // Fabric defaults. + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_BULK] = 2; + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK] = 4; + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_CTRL] = 1; + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_CTRL] = 4; + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_META] = 1; + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_META] = 4; + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_RW] = 8; + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW] = 16; + c->fabric_keepalive_enabled = true; + c->fabric_keepalive_intvl = 1; // seconds + c->fabric_keepalive_probes = 10; // tries + c->fabric_keepalive_time = 1; // seconds + c->fabric_latency_max_ms = 5; // assume a one way latency of 5 milliseconds by default + c->fabric_recv_rearm_threshold = 1024; + c->n_fabric_send_threads = 8; + + // Clustering defaults. + c->clustering_config.cluster_size_min = 1; + c->clustering_config.clique_based_eviction_enabled = true; + + // XDR defaults. 
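+	// (Clear the XDR peer list and cluster map, and zero the per-DC
+	// last-ship times.)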
+ for (int i = 0; i < AS_CLUSTER_SZ ; i++) { + c->xdr_peers_lst[i].node = 0; + + for (int j = 0; j < DC_MAX_NUM; j++) { + c->xdr_peers_lst[i].time[j] = 0; + } + + c->xdr_clmap[i] = 0; + } + + for (int j = 0; j < DC_MAX_NUM; j++) { + c->xdr_self_lastshiptime[j] = 0; + } + + // Mod-lua defaults. + c->mod_lua.server_mode = true; + c->mod_lua.cache_enabled = true; + strcpy(c->mod_lua.system_path, "/opt/aerospike/sys/udf/lua"); + strcpy(c->mod_lua.user_path, "/opt/aerospike/usr/udf/lua"); + + // TODO - security set default config API? + c->sec_cfg.privilege_refresh_period = 60 * 5; // refresh socket privileges every 5 minutes + c->sec_cfg.syslog_local = AS_SYSLOG_NONE; +} + +//========================================================== +// All configuration items must have a switch case +// identifier somewhere in this enum. The order is not +// important, other than for organizational sanity. +// + +typedef enum { + // Generic: + // Token not found: + CASE_NOT_FOUND, + // Start of parsing context: + CASE_CONTEXT_BEGIN, + // End of parsing context: + CASE_CONTEXT_END, + + // Top-level options: + // In canonical configuration file order: + CASE_SERVICE_BEGIN, + CASE_LOGGING_BEGIN, + CASE_NETWORK_BEGIN, + CASE_NAMESPACE_BEGIN, + CASE_MOD_LUA_BEGIN, + CASE_CLUSTER_BEGIN, + // Enterprise-only: + CASE_SECURITY_BEGIN, + CASE_XDR_BEGIN, + + // Service options: + // Normally visible, in canonical configuration file order: + CASE_SERVICE_USER, + CASE_SERVICE_GROUP, + CASE_SERVICE_PAXOS_SINGLE_REPLICA_LIMIT, + CASE_SERVICE_PIDFILE, + CASE_SERVICE_CLIENT_FD_MAX, // renamed + CASE_SERVICE_PROTO_FD_MAX, + // Normally hidden: + CASE_SERVICE_ADVERTISE_IPV6, + CASE_SERVICE_AUTO_PIN, + CASE_SERVICE_BATCH_THREADS, + CASE_SERVICE_BATCH_MAX_BUFFERS_PER_QUEUE, + CASE_SERVICE_BATCH_MAX_REQUESTS, + CASE_SERVICE_BATCH_MAX_UNUSED_BUFFERS, + CASE_SERVICE_BATCH_PRIORITY, + CASE_SERVICE_BATCH_INDEX_THREADS, + CASE_SERVICE_CLUSTER_NAME, + CASE_SERVICE_ENABLE_BENCHMARKS_FABRIC, + CASE_SERVICE_ENABLE_BENCHMARKS_SVC, + CASE_SERVICE_ENABLE_HIST_INFO, + CASE_SERVICE_FEATURE_KEY_FILE, + CASE_SERVICE_HIST_TRACK_BACK, + CASE_SERVICE_HIST_TRACK_SLICE, + CASE_SERVICE_HIST_TRACK_THRESHOLDS, + CASE_SERVICE_INFO_THREADS, + CASE_SERVICE_LOG_LOCAL_TIME, + CASE_SERVICE_LOG_MILLIS, + CASE_SERVICE_MIGRATE_MAX_NUM_INCOMING, + CASE_SERVICE_MIGRATE_THREADS, + CASE_SERVICE_MIN_CLUSTER_SIZE, + CASE_SERVICE_NODE_ID, + CASE_SERVICE_NODE_ID_INTERFACE, + CASE_SERVICE_NSUP_DELETE_SLEEP, + CASE_SERVICE_NSUP_PERIOD, + CASE_SERVICE_NSUP_STARTUP_EVICT, + CASE_SERVICE_PROTO_FD_IDLE_MS, + CASE_SERVICE_QUERY_BATCH_SIZE, + CASE_SERVICE_QUERY_BUFPOOL_SIZE, + CASE_SERVICE_QUERY_IN_TRANSACTION_THREAD, + CASE_SERVICE_QUERY_LONG_Q_MAX_SIZE, + CASE_SERVICE_QUERY_PRE_RESERVE_PARTITIONS, + CASE_SERVICE_QUERY_PRIORITY, + CASE_SERVICE_QUERY_PRIORITY_SLEEP_US, + CASE_SERVICE_QUERY_REC_COUNT_BOUND, + CASE_SERVICE_QUERY_REQ_IN_QUERY_THREAD, + CASE_SERVICE_QUERY_REQ_MAX_INFLIGHT, + CASE_SERVICE_QUERY_SHORT_Q_MAX_SIZE, + CASE_SERVICE_QUERY_THREADS, + CASE_SERVICE_QUERY_THRESHOLD, + CASE_SERVICE_QUERY_UNTRACKED_TIME_MS, + CASE_SERVICE_QUERY_WORKER_THREADS, + CASE_SERVICE_RUN_AS_DAEMON, + CASE_SERVICE_SCAN_MAX_ACTIVE, + CASE_SERVICE_SCAN_MAX_DONE, + CASE_SERVICE_SCAN_MAX_UDF_TRANSACTIONS, + CASE_SERVICE_SCAN_THREADS, + CASE_SERVICE_SERVICE_THREADS, + CASE_SERVICE_SINDEX_BUILDER_THREADS, + CASE_SERVICE_SINDEX_GC_MAX_RATE, + CASE_SERVICE_SINDEX_GC_PERIOD, + CASE_SERVICE_TICKER_INTERVAL, + CASE_SERVICE_TRANSACTION_MAX_MS, + CASE_SERVICE_TRANSACTION_PENDING_LIMIT, + 
CASE_SERVICE_TRANSACTION_QUEUES, + CASE_SERVICE_TRANSACTION_RETRY_MS, + CASE_SERVICE_TRANSACTION_THREADS_PER_QUEUE, + CASE_SERVICE_WORK_DIRECTORY, + // For special debugging or bug-related repair: + CASE_SERVICE_DEBUG_ALLOCATIONS, + CASE_SERVICE_FABRIC_DUMP_MSGS, + CASE_SERVICE_PROLE_EXTRA_TTL, + // Obsoleted: + CASE_SERVICE_ALLOW_INLINE_TRANSACTIONS, + CASE_SERVICE_RESPOND_CLIENT_ON_MASTER_COMPLETION, + CASE_SERVICE_TRANSACTION_REPEATABLE_READ, + // Deprecated: + CASE_SERVICE_AUTO_DUN, + CASE_SERVICE_AUTO_UNDUN, + CASE_SERVICE_BATCH_RETRANSMIT, + CASE_SERVICE_CLIB_LIBRARY, + CASE_SERVICE_DEFRAG_QUEUE_ESCAPE, + CASE_SERVICE_DEFRAG_QUEUE_HWM, + CASE_SERVICE_DEFRAG_QUEUE_LWM, + CASE_SERVICE_DEFRAG_QUEUE_PRIORITY, + CASE_SERVICE_DUMP_MESSAGE_ABOVE_SIZE, + CASE_SERVICE_FABRIC_WORKERS, + CASE_SERVICE_FB_HEALTH_BAD_PCT, + CASE_SERVICE_FB_HEALTH_GOOD_PCT, + CASE_SERVICE_FB_HEALTH_MSG_PER_BURST, + CASE_SERVICE_FB_HEALTH_MSG_TIMEOUT, + CASE_SERVICE_GENERATION_DISABLE, + CASE_SERVICE_MAX_MSGS_PER_TYPE, + CASE_SERVICE_MIGRATE_READ_PRIORITY, + CASE_SERVICE_MIGRATE_READ_SLEEP, + CASE_SERVICE_MIGRATE_RX_LIFETIME_MS, + CASE_SERVICE_MIGRATE_XMIT_HWM, + CASE_SERVICE_MIGRATE_XMIT_LWM, + CASE_SERVICE_MIGRATE_PRIORITY, // renamed + CASE_SERVICE_MIGRATE_XMIT_PRIORITY, + CASE_SERVICE_MIGRATE_XMIT_SLEEP, + CASE_SERVICE_NSUP_AUTO_HWM, + CASE_SERVICE_NSUP_AUTO_HWM_PCT, + CASE_SERVICE_NSUP_MAX_DELETES, + CASE_SERVICE_NSUP_QUEUE_HWM, + CASE_SERVICE_NSUP_QUEUE_LWM, + CASE_SERVICE_NSUP_QUEUE_ESCAPE, + CASE_SERVICE_NSUP_REDUCE_PRIORITY, + CASE_SERVICE_NSUP_REDUCE_SLEEP, + CASE_SERVICE_NSUP_THREADS, + CASE_SERVICE_PAXOS_MAX_CLUSTER_SIZE, + CASE_SERVICE_PAXOS_PROTOCOL, + CASE_SERVICE_PAXOS_RECOVERY_POLICY, + CASE_SERVICE_PAXOS_RETRANSMIT_PERIOD, + CASE_SERVICE_REPLICATION_FIRE_AND_FORGET, + CASE_SERVICE_SCAN_MEMORY, + CASE_SERVICE_SCAN_PRIORITY, + CASE_SERVICE_SCAN_RETRANSMIT, + CASE_SERVICE_SCHEDULER_PRIORITY, + CASE_SERVICE_SCHEDULER_TYPE, + CASE_SERVICE_TRANSACTION_DUPLICATE_THREADS, + CASE_SERVICE_TRIAL_ACCOUNT_KEY, + CASE_SERVICE_UDF_RUNTIME_MAX_GMEMORY, + CASE_SERVICE_UDF_RUNTIME_MAX_MEMORY, + CASE_SERVICE_USE_QUEUE_PER_DEVICE, + CASE_SERVICE_WRITE_DUPLICATE_RESOLUTION_DISABLE, + + // Service auto-pin options (value tokens): + CASE_SERVICE_AUTO_PIN_NONE, + CASE_SERVICE_AUTO_PIN_CPU, + CASE_SERVICE_AUTO_PIN_NUMA, + + // Service debug-allocations options (value tokens): + CASE_SERVICE_DEBUG_ALLOCATIONS_NONE, + CASE_SERVICE_DEBUG_ALLOCATIONS_TRANSIENT, + CASE_SERVICE_DEBUG_ALLOCATIONS_PERSISTENT, + CASE_SERVICE_DEBUG_ALLOCATIONS_ALL, + + // Logging options: + // Normally visible: + CASE_LOG_FILE_BEGIN, + // Normally hidden: + CASE_LOG_CONSOLE_BEGIN, + + // Logging file options: + // Normally visible: + CASE_LOG_FILE_CONTEXT, + + // Logging console options: + // Normally visible: + CASE_LOG_CONSOLE_CONTEXT, + + // Network options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_SERVICE_BEGIN, + CASE_NETWORK_HEARTBEAT_BEGIN, + CASE_NETWORK_FABRIC_BEGIN, + CASE_NETWORK_INFO_BEGIN, + // Normally hidden: + CASE_NETWORK_TLS_BEGIN, + + // Network service options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_SERVICE_ADDRESS, + CASE_NETWORK_SERVICE_PORT, + // Normally hidden: + CASE_NETWORK_SERVICE_EXTERNAL_ADDRESS, // renamed + CASE_NETWORK_SERVICE_ACCESS_ADDRESS, + CASE_NETWORK_SERVICE_ACCESS_PORT, + CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_ADDRESS, + CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_PORT, + CASE_NETWORK_SERVICE_TLS_ACCESS_ADDRESS, + 
CASE_NETWORK_SERVICE_TLS_ACCESS_PORT, + CASE_NETWORK_SERVICE_TLS_ADDRESS, + CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_ADDRESS, + CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_PORT, + CASE_NETWORK_SERVICE_TLS_AUTHENTICATE_CLIENT, + CASE_NETWORK_SERVICE_TLS_NAME, + CASE_NETWORK_SERVICE_TLS_PORT, + // Obsoleted: + CASE_NETWORK_SERVICE_ALTERNATE_ADDRESS, + CASE_NETWORK_SERVICE_NETWORK_INTERFACE_NAME, + // Deprecated: + CASE_NETWORK_SERVICE_REUSE_ADDRESS, + + // Network heartbeat options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_HEARTBEAT_MODE, + CASE_NETWORK_HEARTBEAT_ADDRESS, + CASE_NETWORK_HEARTBEAT_MULTICAST_GROUP, + CASE_NETWORK_HEARTBEAT_PORT, + CASE_NETWORK_HEARTBEAT_MESH_SEED_ADDRESS_PORT, + CASE_NETWORK_HEARTBEAT_INTERVAL, + CASE_NETWORK_HEARTBEAT_TIMEOUT, + // Normally hidden: + CASE_NETWORK_HEARTBEAT_MTU, + CASE_NETWORK_HEARTBEAT_MCAST_TTL, // renamed + CASE_NETWORK_HEARTBEAT_MULTICAST_TTL, + CASE_NETWORK_HEARTBEAT_PROTOCOL, + CASE_NETWORK_HEARTBEAT_TLS_ADDRESS, + CASE_NETWORK_HEARTBEAT_TLS_MESH_SEED_ADDRESS_PORT, + CASE_NETWORK_HEARTBEAT_TLS_NAME, + CASE_NETWORK_HEARTBEAT_TLS_PORT, + // Obsoleted: + CASE_NETWORK_HEARTBEAT_INTERFACE_ADDRESS, + + // Network heartbeat mode options (value tokens): + CASE_NETWORK_HEARTBEAT_MODE_MESH, + CASE_NETWORK_HEARTBEAT_MODE_MULTICAST, + + // Network heartbeat protocol options (value tokens): + CASE_NETWORK_HEARTBEAT_PROTOCOL_NONE, + CASE_NETWORK_HEARTBEAT_PROTOCOL_V3, + + // Network fabric options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_FABRIC_ADDRESS, + CASE_NETWORK_FABRIC_PORT, + // Normally hidden: + CASE_NETWORK_FABRIC_CHANNEL_BULK_FDS, + CASE_NETWORK_FABRIC_CHANNEL_BULK_RECV_THREADS, + CASE_NETWORK_FABRIC_CHANNEL_CTRL_FDS, + CASE_NETWORK_FABRIC_CHANNEL_CTRL_RECV_THREADS, + CASE_NETWORK_FABRIC_CHANNEL_META_FDS, + CASE_NETWORK_FABRIC_CHANNEL_META_RECV_THREADS, + CASE_NETWORK_FABRIC_CHANNEL_RW_FDS, + CASE_NETWORK_FABRIC_CHANNEL_RW_RECV_THREADS, + CASE_NETWORK_FABRIC_KEEPALIVE_ENABLED, + CASE_NETWORK_FABRIC_KEEPALIVE_INTVL, + CASE_NETWORK_FABRIC_KEEPALIVE_PROBES, + CASE_NETWORK_FABRIC_KEEPALIVE_TIME, + CASE_NETWORK_FABRIC_LATENCY_MAX_MS, + CASE_NETWORK_FABRIC_RECV_REARM_THRESHOLD, + CASE_NETWORK_FABRIC_SEND_THREADS, + CASE_NETWORK_FABRIC_TLS_ADDRESS, + CASE_NETWORK_FABRIC_TLS_NAME, + CASE_NETWORK_FABRIC_TLS_PORT, + + // Network info options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_INFO_ADDRESS, + CASE_NETWORK_INFO_PORT, + // Deprecated: + CASE_NETWORK_INFO_ENABLE_FASTPATH, + + // Network TLS options: + CASE_NETWORK_TLS_CA_FILE, + CASE_NETWORK_TLS_CA_PATH, + CASE_NETWORK_TLS_CERT_BLACKLIST, + CASE_NETWORK_TLS_CERT_FILE, + CASE_NETWORK_TLS_CIPHER_SUITE, + CASE_NETWORK_TLS_KEY_FILE, + CASE_NETWORK_TLS_PROTOCOLS, + + // Namespace options: + // Normally visible, in canonical configuration file order: + CASE_NAMESPACE_REPLICATION_FACTOR, + CASE_NAMESPACE_LIMIT_SIZE, // renamed + CASE_NAMESPACE_MEMORY_SIZE, + CASE_NAMESPACE_DEFAULT_TTL, + CASE_NAMESPACE_STORAGE_ENGINE_BEGIN, + // For XDR only: + CASE_NAMESPACE_ENABLE_XDR, + CASE_NAMESPACE_SETS_ENABLE_XDR, + CASE_NAMESPACE_XDR_REMOTE_DATACENTER, + CASE_NAMESPACE_FORWARD_XDR_WRITES, + CASE_NAMESPACE_ALLOW_NONXDR_WRITES, + CASE_NAMESPACE_ALLOW_XDR_WRITES, + // Normally hidden: + CASE_NAMESPACE_COLD_START_EVICT_TTL, + CASE_NAMESPACE_CONFLICT_RESOLUTION_POLICY, + CASE_NAMESPACE_DATA_IN_INDEX, + CASE_NAMESPACE_DISABLE_WRITE_DUP_RES, + CASE_NAMESPACE_DISALLOW_NULL_SETNAME, + 
CASE_NAMESPACE_ENABLE_BENCHMARKS_BATCH_SUB, + CASE_NAMESPACE_ENABLE_BENCHMARKS_READ, + CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF, + CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF_SUB, + CASE_NAMESPACE_ENABLE_BENCHMARKS_WRITE, + CASE_NAMESPACE_ENABLE_HIST_PROXY, + CASE_NAMESPACE_EVICT_HIST_BUCKETS, + CASE_NAMESPACE_EVICT_TENTHS_PCT, + CASE_NAMESPACE_HIGH_WATER_DISK_PCT, + CASE_NAMESPACE_HIGH_WATER_MEMORY_PCT, + CASE_NAMESPACE_MAX_TTL, + CASE_NAMESPACE_MIGRATE_ORDER, + CASE_NAMESPACE_MIGRATE_RETRANSMIT_MS, + CASE_NAMESPACE_MIGRATE_SLEEP, + CASE_NAMESPACE_OBJ_SIZE_HIST_MAX, + CASE_NAMESPACE_PARTITION_TREE_LOCKS, + CASE_NAMESPACE_PARTITION_TREE_SPRIGS, + CASE_NAMESPACE_RACK_ID, + CASE_NAMESPACE_READ_CONSISTENCY_LEVEL_OVERRIDE, + CASE_NAMESPACE_SET_BEGIN, + CASE_NAMESPACE_SINDEX_BEGIN, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_BEGIN, + CASE_NAMESPACE_SINGLE_BIN, + CASE_NAMESPACE_STOP_WRITES_PCT, + CASE_NAMESPACE_STRONG_CONSISTENCY, + CASE_NAMESPACE_STRONG_CONSISTENCY_ALLOW_EXPUNGE, + CASE_NAMESPACE_TOMB_RAIDER_ELIGIBLE_AGE, + CASE_NAMESPACE_TOMB_RAIDER_PERIOD, + CASE_NAMESPACE_WRITE_COMMIT_LEVEL_OVERRIDE, + // Deprecated: + CASE_NAMESPACE_ALLOW_VERSIONS, + CASE_NAMESPACE_DEMO_READ_MULTIPLIER, + CASE_NAMESPACE_DEMO_WRITE_MULTIPLIER, + CASE_NAMESPACE_HIGH_WATER_PCT, + CASE_NAMESPACE_LOW_WATER_PCT, + CASE_NAMESPACE_SI_BEGIN, + + // Namespace conflict-resolution-policy options (value tokens): + CASE_NAMESPACE_CONFLICT_RESOLUTION_GENERATION, + CASE_NAMESPACE_CONFLICT_RESOLUTION_LAST_UPDATE_TIME, + + // Namespace read consistency level options: + CASE_NAMESPACE_READ_CONSISTENCY_ALL, + CASE_NAMESPACE_READ_CONSISTENCY_OFF, + CASE_NAMESPACE_READ_CONSISTENCY_ONE, + + // Namespace write commit level options: + CASE_NAMESPACE_WRITE_COMMIT_ALL, + CASE_NAMESPACE_WRITE_COMMIT_MASTER, + CASE_NAMESPACE_WRITE_COMMIT_OFF, + + // Namespace storage-engine options (value tokens): + CASE_NAMESPACE_STORAGE_MEMORY, + CASE_NAMESPACE_STORAGE_SSD, + CASE_NAMESPACE_STORAGE_DEVICE, + + // Namespace storage-engine device options: + // Normally visible, in canonical configuration file order: + CASE_NAMESPACE_STORAGE_DEVICE_DEVICE, + CASE_NAMESPACE_STORAGE_DEVICE_FILE, + CASE_NAMESPACE_STORAGE_DEVICE_FILESIZE, + CASE_NAMESPACE_STORAGE_DEVICE_SCHEDULER_MODE, + CASE_NAMESPACE_STORAGE_DEVICE_WRITE_BLOCK_SIZE, + CASE_NAMESPACE_STORAGE_DEVICE_MEMORY_ALL, // renamed + CASE_NAMESPACE_STORAGE_DEVICE_DATA_IN_MEMORY, + // Normally hidden: + CASE_NAMESPACE_STORAGE_DEVICE_COLD_START_EMPTY, + CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_TO_DEVICE, + CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_MIN_SIZE, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_LWM_PCT, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_QUEUE_MIN, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_SLEEP, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_STARTUP_MINIMUM, + CASE_NAMESPACE_STORAGE_DEVICE_DISABLE_ODIRECT, + CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_BENCHMARKS_STORAGE, + CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_OSYNC, + CASE_NAMESPACE_STORAGE_DEVICE_ENCRYPTION_KEY_FILE, + CASE_NAMESPACE_STORAGE_DEVICE_FLUSH_MAX_MS, + CASE_NAMESPACE_STORAGE_DEVICE_FSYNC_MAX_SEC, + CASE_NAMESPACE_STORAGE_DEVICE_MAX_WRITE_CACHE, + CASE_NAMESPACE_STORAGE_DEVICE_MIN_AVAIL_PCT, + CASE_NAMESPACE_STORAGE_DEVICE_POST_WRITE_QUEUE, + CASE_NAMESPACE_STORAGE_DEVICE_TOMB_RAIDER_SLEEP, + CASE_NAMESPACE_STORAGE_DEVICE_WRITE_THREADS, + // Deprecated: + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_MAX_BLOCKS, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_PERIOD, + CASE_NAMESPACE_STORAGE_DEVICE_LOAD_AT_STARTUP, + CASE_NAMESPACE_STORAGE_DEVICE_PERSIST, + CASE_NAMESPACE_STORAGE_DEVICE_READONLY, 
+ CASE_NAMESPACE_STORAGE_DEVICE_SIGNATURE, + CASE_NAMESPACE_STORAGE_DEVICE_WRITE_SMOOTHING_PERIOD, + + // Namespace set options: + CASE_NAMESPACE_SET_DISABLE_EVICTION, + CASE_NAMESPACE_SET_ENABLE_XDR, + CASE_NAMESPACE_SET_STOP_WRITES_COUNT, + // Deprecated: + CASE_NAMESPACE_SET_EVICT_HWM_COUNT, + CASE_NAMESPACE_SET_EVICT_HWM_PCT, + CASE_NAMESPACE_SET_STOP_WRITE_COUNT, + CASE_NAMESPACE_SET_STOP_WRITE_PCT, + + // Namespace set set-enable-xdr options (value tokens): + CASE_NAMESPACE_SET_ENABLE_XDR_USE_DEFAULT, + CASE_NAMESPACE_SET_ENABLE_XDR_FALSE, + CASE_NAMESPACE_SET_ENABLE_XDR_TRUE, + + // Namespace secondary-index options: + // Deprecated: + CASE_NAMESPACE_SI_GC_PERIOD, + CASE_NAMESPACE_SI_GC_MAX_UNITS, + CASE_NAMESPACE_SI_HISTOGRAM, + CASE_NAMESPACE_SI_IGNORE_NOT_SYNC, + + // Namespace sindex options: + CASE_NAMESPACE_SINDEX_NUM_PARTITIONS, + + // Namespace geo2dsphere within options: + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_STRICT, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MIN_LEVEL, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_LEVEL, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_CELLS, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_LEVEL_MOD, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_EARTH_RADIUS_METERS, + + // Mod-lua options: + CASE_MOD_LUA_CACHE_ENABLED, + CASE_MOD_LUA_SYSTEM_PATH, + CASE_MOD_LUA_USER_PATH, + + // Security options: + CASE_SECURITY_ENABLE_SECURITY, + CASE_SECURITY_PRIVILEGE_REFRESH_PERIOD, + CASE_SECURITY_LOG_BEGIN, + CASE_SECURITY_SYSLOG_BEGIN, + + // Security (Aerospike) log options: + CASE_SECURITY_LOG_REPORT_AUTHENTICATION, + CASE_SECURITY_LOG_REPORT_DATA_OP, + CASE_SECURITY_LOG_REPORT_SYS_ADMIN, + CASE_SECURITY_LOG_REPORT_USER_ADMIN, + CASE_SECURITY_LOG_REPORT_VIOLATION, + + // Security syslog options: + CASE_SECURITY_SYSLOG_LOCAL, + CASE_SECURITY_SYSLOG_REPORT_AUTHENTICATION, + CASE_SECURITY_SYSLOG_REPORT_DATA_OP, + CASE_SECURITY_SYSLOG_REPORT_SYS_ADMIN, + CASE_SECURITY_SYSLOG_REPORT_USER_ADMIN, + CASE_SECURITY_SYSLOG_REPORT_VIOLATION, + + // XDR options: + // Normally visible, in canonical configuration file order: + CASE_XDR_ENABLE_XDR, + CASE_XDR_DIGESTLOG_PATH, + CASE_XDR_DATACENTER_BEGIN, + // Normally hidden: + CASE_XDR_CLIENT_THREADS, + CASE_XDR_COMPRESSION_THRESHOLD, + CASE_XDR_DELETE_SHIPPING_ENABLED, + CASE_XDR_DIGESTLOG_IOWAIT_MS, + CASE_XDR_FORWARD_XDR_WRITES, + CASE_XDR_HOTKEY_TIME_MS, + CASE_XDR_INFO_PORT, + CASE_XDR_INFO_TIMEOUT, + CASE_XDR_MAX_SHIP_BANDWIDTH, + CASE_XDR_MAX_SHIP_THROUGHPUT, + CASE_XDR_MIN_DIGESTLOG_FREE_PCT, + CASE_XDR_NSUP_DELETES_ENABLED, + CASE_XDR_READ_THREADS, + CASE_XDR_SHIP_BINS, + CASE_XDR_SHIP_DELAY, + CASE_XDR_SHIPPING_ENABLED, + CASE_XDR_WRITE_TIMEOUT, + + // XDR (remote) datacenter options: + // Normally visible, in canonical configuration file order: + CASE_XDR_DATACENTER_DC_NODE_ADDRESS_PORT, + // Normally hidden: + CASE_XDR_DATACENTER_DC_CONNECTIONS, + CASE_XDR_DATACENTER_DC_CONNECTIONS_IDLE_MS, + CASE_XDR_DATACENTER_DC_INT_EXT_IPMAP, + CASE_XDR_DATACENTER_DC_SECURITY_CONFIG_FILE, + CASE_XDR_DATACENTER_DC_USE_ALTERNATE_SERVICES, + CASE_XDR_DATACENTER_TLS_NAME, + CASE_XDR_DATACENTER_TLS_NODE, + + // Used parsing separate file, but share this enum: + + // XDR security top-level options: + XDR_SEC_CASE_CREDENTIALS_BEGIN, + + // XDR security credentials options: + // Normally visible, in canonical configuration file order: + XDR_SEC_CASE_CREDENTIALS_USERNAME, + XDR_SEC_CASE_CREDENTIALS_PASSWORD + +} cfg_case_id; + + +//========================================================== +// All configuration items must appear below as a cfg_opt +// struct in the 
appropriate array. Order within an array is +// not important, other than for organizational sanity. +// + +typedef struct cfg_opt_s { + const char* tok; + cfg_case_id case_id; +} cfg_opt; + +const cfg_opt GLOBAL_OPTS[] = { + { "service", CASE_SERVICE_BEGIN }, + { "logging", CASE_LOGGING_BEGIN }, + { "network", CASE_NETWORK_BEGIN }, + { "namespace", CASE_NAMESPACE_BEGIN }, + { "mod-lua", CASE_MOD_LUA_BEGIN }, + { "cluster", CASE_CLUSTER_BEGIN }, + { "security", CASE_SECURITY_BEGIN }, + { "xdr", CASE_XDR_BEGIN } +}; + +const cfg_opt SERVICE_OPTS[] = { + { "user", CASE_SERVICE_USER }, + { "group", CASE_SERVICE_GROUP }, + { "paxos-single-replica-limit", CASE_SERVICE_PAXOS_SINGLE_REPLICA_LIMIT }, + { "pidfile", CASE_SERVICE_PIDFILE }, + { "client-fd-max", CASE_SERVICE_CLIENT_FD_MAX }, + { "proto-fd-max", CASE_SERVICE_PROTO_FD_MAX }, + { "advertise-ipv6", CASE_SERVICE_ADVERTISE_IPV6 }, + { "auto-pin", CASE_SERVICE_AUTO_PIN }, + { "batch-threads", CASE_SERVICE_BATCH_THREADS }, + { "batch-max-buffers-per-queue", CASE_SERVICE_BATCH_MAX_BUFFERS_PER_QUEUE }, + { "batch-max-requests", CASE_SERVICE_BATCH_MAX_REQUESTS }, + { "batch-max-unused-buffers", CASE_SERVICE_BATCH_MAX_UNUSED_BUFFERS }, + { "batch-priority", CASE_SERVICE_BATCH_PRIORITY }, + { "batch-index-threads", CASE_SERVICE_BATCH_INDEX_THREADS }, + { "cluster-name", CASE_SERVICE_CLUSTER_NAME }, + { "enable-benchmarks-fabric", CASE_SERVICE_ENABLE_BENCHMARKS_FABRIC }, + { "enable-benchmarks-svc", CASE_SERVICE_ENABLE_BENCHMARKS_SVC }, + { "enable-hist-info", CASE_SERVICE_ENABLE_HIST_INFO }, + { "feature-key-file", CASE_SERVICE_FEATURE_KEY_FILE }, + { "hist-track-back", CASE_SERVICE_HIST_TRACK_BACK }, + { "hist-track-slice", CASE_SERVICE_HIST_TRACK_SLICE }, + { "hist-track-thresholds", CASE_SERVICE_HIST_TRACK_THRESHOLDS }, + { "info-threads", CASE_SERVICE_INFO_THREADS }, + { "log-local-time", CASE_SERVICE_LOG_LOCAL_TIME }, + { "log-millis", CASE_SERVICE_LOG_MILLIS}, + { "migrate-max-num-incoming", CASE_SERVICE_MIGRATE_MAX_NUM_INCOMING }, + { "migrate-threads", CASE_SERVICE_MIGRATE_THREADS }, + { "min-cluster-size", CASE_SERVICE_MIN_CLUSTER_SIZE }, + { "node-id", CASE_SERVICE_NODE_ID }, + { "node-id-interface", CASE_SERVICE_NODE_ID_INTERFACE }, + { "nsup-delete-sleep", CASE_SERVICE_NSUP_DELETE_SLEEP }, + { "nsup-period", CASE_SERVICE_NSUP_PERIOD }, + { "nsup-startup-evict", CASE_SERVICE_NSUP_STARTUP_EVICT }, + { "proto-fd-idle-ms", CASE_SERVICE_PROTO_FD_IDLE_MS }, + { "query-batch-size", CASE_SERVICE_QUERY_BATCH_SIZE }, + { "query-bufpool-size", CASE_SERVICE_QUERY_BUFPOOL_SIZE }, + { "query-in-transaction-thread", CASE_SERVICE_QUERY_IN_TRANSACTION_THREAD }, + { "query-long-q-max-size", CASE_SERVICE_QUERY_LONG_Q_MAX_SIZE }, + { "query-pre-reserve-partitions", CASE_SERVICE_QUERY_PRE_RESERVE_PARTITIONS }, + { "query-priority", CASE_SERVICE_QUERY_PRIORITY }, + { "query-priority-sleep-us", CASE_SERVICE_QUERY_PRIORITY_SLEEP_US }, + { "query-rec-count-bound", CASE_SERVICE_QUERY_REC_COUNT_BOUND }, + { "query-req-in-query-thread", CASE_SERVICE_QUERY_REQ_IN_QUERY_THREAD }, + { "query-req-max-inflight", CASE_SERVICE_QUERY_REQ_MAX_INFLIGHT }, + { "query-short-q-max-size", CASE_SERVICE_QUERY_SHORT_Q_MAX_SIZE }, + { "query-threads", CASE_SERVICE_QUERY_THREADS }, + { "query-threshold", CASE_SERVICE_QUERY_THRESHOLD }, + { "query-untracked-time-ms", CASE_SERVICE_QUERY_UNTRACKED_TIME_MS }, + { "query-worker-threads", CASE_SERVICE_QUERY_WORKER_THREADS }, + { "run-as-daemon", CASE_SERVICE_RUN_AS_DAEMON }, + { "scan-max-active", CASE_SERVICE_SCAN_MAX_ACTIVE }, + { 
"scan-max-done", CASE_SERVICE_SCAN_MAX_DONE }, + { "scan-max-udf-transactions", CASE_SERVICE_SCAN_MAX_UDF_TRANSACTIONS }, + { "scan-threads", CASE_SERVICE_SCAN_THREADS }, + { "service-threads", CASE_SERVICE_SERVICE_THREADS }, + { "sindex-builder-threads", CASE_SERVICE_SINDEX_BUILDER_THREADS }, + { "sindex-gc-max-rate", CASE_SERVICE_SINDEX_GC_MAX_RATE }, + { "sindex-gc-period", CASE_SERVICE_SINDEX_GC_PERIOD }, + { "ticker-interval", CASE_SERVICE_TICKER_INTERVAL }, + { "transaction-max-ms", CASE_SERVICE_TRANSACTION_MAX_MS }, + { "transaction-pending-limit", CASE_SERVICE_TRANSACTION_PENDING_LIMIT }, + { "transaction-queues", CASE_SERVICE_TRANSACTION_QUEUES }, + { "transaction-retry-ms", CASE_SERVICE_TRANSACTION_RETRY_MS }, + { "transaction-threads-per-queue", CASE_SERVICE_TRANSACTION_THREADS_PER_QUEUE }, + { "work-directory", CASE_SERVICE_WORK_DIRECTORY }, + { "debug-allocations", CASE_SERVICE_DEBUG_ALLOCATIONS }, + { "fabric-dump-msgs", CASE_SERVICE_FABRIC_DUMP_MSGS }, + { "prole-extra-ttl", CASE_SERVICE_PROLE_EXTRA_TTL }, + { "allow-inline-transactions", CASE_SERVICE_ALLOW_INLINE_TRANSACTIONS }, + { "respond-client-on-master-completion", CASE_SERVICE_RESPOND_CLIENT_ON_MASTER_COMPLETION }, + { "transaction-repeatable-read", CASE_SERVICE_TRANSACTION_REPEATABLE_READ }, + { "auto-dun", CASE_SERVICE_AUTO_DUN }, + { "auto-undun", CASE_SERVICE_AUTO_UNDUN }, + { "batch-retransmit", CASE_SERVICE_BATCH_RETRANSMIT }, + { "clib-library", CASE_SERVICE_CLIB_LIBRARY }, + { "defrag-queue-escape", CASE_SERVICE_DEFRAG_QUEUE_ESCAPE }, + { "defrag-queue-hwm", CASE_SERVICE_DEFRAG_QUEUE_HWM }, + { "defrag-queue-lwm", CASE_SERVICE_DEFRAG_QUEUE_LWM }, + { "defrag-queue-priority", CASE_SERVICE_DEFRAG_QUEUE_PRIORITY }, + { "dump-message-above-size", CASE_SERVICE_DUMP_MESSAGE_ABOVE_SIZE }, + { "fabric-workers", CASE_SERVICE_FABRIC_WORKERS }, + { "fb-health-bad-pct", CASE_SERVICE_FB_HEALTH_BAD_PCT }, + { "fb-health-good-pct", CASE_SERVICE_FB_HEALTH_GOOD_PCT }, + { "fb-health-msg-per-burst", CASE_SERVICE_FB_HEALTH_MSG_PER_BURST }, + { "fb-health-msg-timeout", CASE_SERVICE_FB_HEALTH_MSG_TIMEOUT }, + { "generation-disable", CASE_SERVICE_GENERATION_DISABLE }, + { "max-msgs-per-type", CASE_SERVICE_MAX_MSGS_PER_TYPE }, + { "migrate-read-priority", CASE_SERVICE_MIGRATE_READ_PRIORITY }, + { "migrate-read-sleep", CASE_SERVICE_MIGRATE_READ_SLEEP }, + { "migrate-rx-lifetime-ms", CASE_SERVICE_MIGRATE_RX_LIFETIME_MS }, + { "migrate-xmit-hwm", CASE_SERVICE_MIGRATE_XMIT_HWM }, + { "migrate-xmit-lwm", CASE_SERVICE_MIGRATE_XMIT_LWM }, + { "migrate-priority", CASE_SERVICE_MIGRATE_PRIORITY }, + { "migrate-xmit-priority", CASE_SERVICE_MIGRATE_XMIT_PRIORITY }, + { "migrate-xmit-sleep", CASE_SERVICE_MIGRATE_XMIT_SLEEP }, + { "nsup-auto-hwm", CASE_SERVICE_NSUP_AUTO_HWM }, + { "nsup-auto-hwm-pct", CASE_SERVICE_NSUP_AUTO_HWM_PCT }, + { "nsup-max-deletes", CASE_SERVICE_NSUP_MAX_DELETES }, + { "nsup-queue-escape", CASE_SERVICE_NSUP_QUEUE_ESCAPE }, + { "nsup-queue-hwm", CASE_SERVICE_NSUP_QUEUE_HWM }, + { "nsup-queue-lwm", CASE_SERVICE_NSUP_QUEUE_LWM }, + { "nsup-reduce-priority", CASE_SERVICE_NSUP_REDUCE_PRIORITY }, + { "nsup-reduce-sleep", CASE_SERVICE_NSUP_REDUCE_SLEEP }, + { "nsup-threads", CASE_SERVICE_NSUP_THREADS }, + { "paxos-max-cluster-size", CASE_SERVICE_PAXOS_MAX_CLUSTER_SIZE }, + { "paxos-protocol", CASE_SERVICE_PAXOS_PROTOCOL }, + { "paxos-recovery-policy", CASE_SERVICE_PAXOS_RECOVERY_POLICY }, + { "paxos-retransmit-period", CASE_SERVICE_PAXOS_RETRANSMIT_PERIOD }, + { "replication-fire-and-forget", 
CASE_SERVICE_REPLICATION_FIRE_AND_FORGET }, + { "scan-memory", CASE_SERVICE_SCAN_MEMORY }, + { "scan-priority", CASE_SERVICE_SCAN_PRIORITY }, + { "scan-retransmit", CASE_SERVICE_SCAN_RETRANSMIT }, + { "scheduler-priority", CASE_SERVICE_SCHEDULER_PRIORITY }, + { "scheduler-type", CASE_SERVICE_SCHEDULER_TYPE }, + { "transaction-duplicate-threads", CASE_SERVICE_TRANSACTION_DUPLICATE_THREADS }, + { "trial-account-key", CASE_SERVICE_TRIAL_ACCOUNT_KEY }, + { "udf-runtime-max-gmemory", CASE_SERVICE_UDF_RUNTIME_MAX_GMEMORY }, + { "udf-runtime-max-memory", CASE_SERVICE_UDF_RUNTIME_MAX_MEMORY }, + { "use-queue-per-device", CASE_SERVICE_USE_QUEUE_PER_DEVICE }, + { "write-duplicate-resolution-disable", CASE_SERVICE_WRITE_DUPLICATE_RESOLUTION_DISABLE }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt SERVICE_AUTO_PIN_OPTS[] = { + { "none", CASE_SERVICE_AUTO_PIN_NONE }, + { "cpu", CASE_SERVICE_AUTO_PIN_CPU }, + { "numa", CASE_SERVICE_AUTO_PIN_NUMA } +}; + +const cfg_opt SERVICE_DEBUG_ALLOCATIONS_OPTS[] = { + { "none", CASE_SERVICE_DEBUG_ALLOCATIONS_NONE }, + { "transient", CASE_SERVICE_DEBUG_ALLOCATIONS_TRANSIENT }, + { "persistent", CASE_SERVICE_DEBUG_ALLOCATIONS_PERSISTENT }, + { "all", CASE_SERVICE_DEBUG_ALLOCATIONS_ALL } +}; + +const cfg_opt LOGGING_OPTS[] = { + { "file", CASE_LOG_FILE_BEGIN }, + { "console", CASE_LOG_CONSOLE_BEGIN }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt LOGGING_FILE_OPTS[] = { + { "context", CASE_LOG_FILE_CONTEXT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt LOGGING_CONSOLE_OPTS[] = { + { "context", CASE_LOG_CONSOLE_CONTEXT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_OPTS[] = { + { "service", CASE_NETWORK_SERVICE_BEGIN }, + { "heartbeat", CASE_NETWORK_HEARTBEAT_BEGIN }, + { "fabric", CASE_NETWORK_FABRIC_BEGIN }, + { "info", CASE_NETWORK_INFO_BEGIN }, + { "tls", CASE_NETWORK_TLS_BEGIN }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_SERVICE_OPTS[] = { + { "address", CASE_NETWORK_SERVICE_ADDRESS }, + { "port", CASE_NETWORK_SERVICE_PORT }, + { "external-address", CASE_NETWORK_SERVICE_EXTERNAL_ADDRESS }, + { "access-address", CASE_NETWORK_SERVICE_ACCESS_ADDRESS }, + { "access-port", CASE_NETWORK_SERVICE_ACCESS_PORT }, + { "alternate-access-address", CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_ADDRESS }, + { "alternate-access-port", CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_PORT }, + { "tls-access-address", CASE_NETWORK_SERVICE_TLS_ACCESS_ADDRESS }, + { "tls-access-port", CASE_NETWORK_SERVICE_TLS_ACCESS_PORT }, + { "tls-address", CASE_NETWORK_SERVICE_TLS_ADDRESS }, + { "tls-alternate-access-address", CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_ADDRESS }, + { "tls-alternate-access-port", CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_PORT }, + { "tls-authenticate-client", CASE_NETWORK_SERVICE_TLS_AUTHENTICATE_CLIENT }, + { "tls-name", CASE_NETWORK_SERVICE_TLS_NAME }, + { "tls-port", CASE_NETWORK_SERVICE_TLS_PORT }, + { "alternate-address", CASE_NETWORK_SERVICE_ALTERNATE_ADDRESS }, + { "network-interface-name", CASE_NETWORK_SERVICE_NETWORK_INTERFACE_NAME }, + { "reuse-address", CASE_NETWORK_SERVICE_REUSE_ADDRESS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_HEARTBEAT_OPTS[] = { + { "mode", CASE_NETWORK_HEARTBEAT_MODE }, + { "address", CASE_NETWORK_HEARTBEAT_ADDRESS }, + { "multicast-group", CASE_NETWORK_HEARTBEAT_MULTICAST_GROUP }, + { "port", CASE_NETWORK_HEARTBEAT_PORT }, + { "mesh-seed-address-port", CASE_NETWORK_HEARTBEAT_MESH_SEED_ADDRESS_PORT }, + { "interval", CASE_NETWORK_HEARTBEAT_INTERVAL }, + { "timeout", CASE_NETWORK_HEARTBEAT_TIMEOUT 
}, + { "mtu", CASE_NETWORK_HEARTBEAT_MTU }, + { "mcast-ttl", CASE_NETWORK_HEARTBEAT_MCAST_TTL }, + { "multicast-ttl", CASE_NETWORK_HEARTBEAT_MULTICAST_TTL }, + { "protocol", CASE_NETWORK_HEARTBEAT_PROTOCOL }, + { "tls-address", CASE_NETWORK_HEARTBEAT_TLS_ADDRESS }, + { "tls-mesh-seed-address-port", CASE_NETWORK_HEARTBEAT_TLS_MESH_SEED_ADDRESS_PORT }, + { "tls-name", CASE_NETWORK_HEARTBEAT_TLS_NAME }, + { "tls-port", CASE_NETWORK_HEARTBEAT_TLS_PORT }, + { "interface-address", CASE_NETWORK_HEARTBEAT_INTERFACE_ADDRESS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_HEARTBEAT_MODE_OPTS[] = { + { "mesh", CASE_NETWORK_HEARTBEAT_MODE_MESH }, + { "multicast", CASE_NETWORK_HEARTBEAT_MODE_MULTICAST } +}; + +const cfg_opt NETWORK_HEARTBEAT_PROTOCOL_OPTS[] = { + { "none", CASE_NETWORK_HEARTBEAT_PROTOCOL_NONE }, + { "v3", CASE_NETWORK_HEARTBEAT_PROTOCOL_V3} +}; + +const cfg_opt NETWORK_FABRIC_OPTS[] = { + { "address", CASE_NETWORK_FABRIC_ADDRESS }, + { "port", CASE_NETWORK_FABRIC_PORT }, + { "channel-bulk-fds", CASE_NETWORK_FABRIC_CHANNEL_BULK_FDS }, + { "channel-bulk-recv-threads", CASE_NETWORK_FABRIC_CHANNEL_BULK_RECV_THREADS }, + { "channel-ctrl-fds", CASE_NETWORK_FABRIC_CHANNEL_CTRL_FDS }, + { "channel-ctrl-recv-threads", CASE_NETWORK_FABRIC_CHANNEL_CTRL_RECV_THREADS }, + { "channel-meta-fds", CASE_NETWORK_FABRIC_CHANNEL_META_FDS }, + { "channel-meta-recv-threads", CASE_NETWORK_FABRIC_CHANNEL_META_RECV_THREADS }, + { "channel-rw-fds", CASE_NETWORK_FABRIC_CHANNEL_RW_FDS }, + { "channel-rw-recv-threads", CASE_NETWORK_FABRIC_CHANNEL_RW_RECV_THREADS }, + { "keepalive-enabled", CASE_NETWORK_FABRIC_KEEPALIVE_ENABLED }, + { "keepalive-intvl", CASE_NETWORK_FABRIC_KEEPALIVE_INTVL }, + { "keepalive-probes", CASE_NETWORK_FABRIC_KEEPALIVE_PROBES }, + { "keepalive-time", CASE_NETWORK_FABRIC_KEEPALIVE_TIME }, + { "latency-max-ms", CASE_NETWORK_FABRIC_LATENCY_MAX_MS }, + { "recv-rearm-threshold", CASE_NETWORK_FABRIC_RECV_REARM_THRESHOLD }, + { "send-threads", CASE_NETWORK_FABRIC_SEND_THREADS }, + { "tls-address", CASE_NETWORK_FABRIC_TLS_ADDRESS }, + { "tls-name", CASE_NETWORK_FABRIC_TLS_NAME }, + { "tls-port", CASE_NETWORK_FABRIC_TLS_PORT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_INFO_OPTS[] = { + { "address", CASE_NETWORK_INFO_ADDRESS }, + { "port", CASE_NETWORK_INFO_PORT }, + { "enable-fastpath", CASE_NETWORK_INFO_ENABLE_FASTPATH }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_TLS_OPTS[] = { + { "ca-file", CASE_NETWORK_TLS_CA_FILE }, + { "ca-path", CASE_NETWORK_TLS_CA_PATH }, + { "cert-blacklist", CASE_NETWORK_TLS_CERT_BLACKLIST }, + { "cert-file", CASE_NETWORK_TLS_CERT_FILE }, + { "cipher-suite", CASE_NETWORK_TLS_CIPHER_SUITE }, + { "key-file", CASE_NETWORK_TLS_KEY_FILE }, + { "protocols", CASE_NETWORK_TLS_PROTOCOLS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_OPTS[] = { + { "replication-factor", CASE_NAMESPACE_REPLICATION_FACTOR }, + { "limit-size", CASE_NAMESPACE_LIMIT_SIZE }, + { "memory-size", CASE_NAMESPACE_MEMORY_SIZE }, + { "default-ttl", CASE_NAMESPACE_DEFAULT_TTL }, + { "storage-engine", CASE_NAMESPACE_STORAGE_ENGINE_BEGIN }, + { "enable-xdr", CASE_NAMESPACE_ENABLE_XDR }, + { "sets-enable-xdr", CASE_NAMESPACE_SETS_ENABLE_XDR }, + { "xdr-remote-datacenter", CASE_NAMESPACE_XDR_REMOTE_DATACENTER }, + { "ns-forward-xdr-writes", CASE_NAMESPACE_FORWARD_XDR_WRITES }, + { "allow-nonxdr-writes", CASE_NAMESPACE_ALLOW_NONXDR_WRITES }, + { "allow-xdr-writes", CASE_NAMESPACE_ALLOW_XDR_WRITES }, + { "cold-start-evict-ttl", 
CASE_NAMESPACE_COLD_START_EVICT_TTL }, + { "conflict-resolution-policy", CASE_NAMESPACE_CONFLICT_RESOLUTION_POLICY }, + { "data-in-index", CASE_NAMESPACE_DATA_IN_INDEX }, + { "disable-write-dup-res", CASE_NAMESPACE_DISABLE_WRITE_DUP_RES }, + { "disallow-null-setname", CASE_NAMESPACE_DISALLOW_NULL_SETNAME }, + { "enable-benchmarks-batch-sub", CASE_NAMESPACE_ENABLE_BENCHMARKS_BATCH_SUB }, + { "enable-benchmarks-read", CASE_NAMESPACE_ENABLE_BENCHMARKS_READ }, + { "enable-benchmarks-udf", CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF }, + { "enable-benchmarks-udf-sub", CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF_SUB }, + { "enable-benchmarks-write", CASE_NAMESPACE_ENABLE_BENCHMARKS_WRITE }, + { "enable-hist-proxy", CASE_NAMESPACE_ENABLE_HIST_PROXY }, + { "evict-hist-buckets", CASE_NAMESPACE_EVICT_HIST_BUCKETS }, + { "evict-tenths-pct", CASE_NAMESPACE_EVICT_TENTHS_PCT }, + { "high-water-disk-pct", CASE_NAMESPACE_HIGH_WATER_DISK_PCT }, + { "high-water-memory-pct", CASE_NAMESPACE_HIGH_WATER_MEMORY_PCT }, + { "max-ttl", CASE_NAMESPACE_MAX_TTL }, + { "migrate-order", CASE_NAMESPACE_MIGRATE_ORDER }, + { "migrate-retransmit-ms", CASE_NAMESPACE_MIGRATE_RETRANSMIT_MS }, + { "migrate-sleep", CASE_NAMESPACE_MIGRATE_SLEEP }, + { "obj-size-hist-max", CASE_NAMESPACE_OBJ_SIZE_HIST_MAX }, + { "partition-tree-locks", CASE_NAMESPACE_PARTITION_TREE_LOCKS }, + { "partition-tree-sprigs", CASE_NAMESPACE_PARTITION_TREE_SPRIGS }, + { "rack-id", CASE_NAMESPACE_RACK_ID }, + { "read-consistency-level-override", CASE_NAMESPACE_READ_CONSISTENCY_LEVEL_OVERRIDE }, + { "set", CASE_NAMESPACE_SET_BEGIN }, + { "sindex", CASE_NAMESPACE_SINDEX_BEGIN }, + { "geo2dsphere-within", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_BEGIN }, + { "single-bin", CASE_NAMESPACE_SINGLE_BIN }, + { "stop-writes-pct", CASE_NAMESPACE_STOP_WRITES_PCT }, + { "strong-consistency", CASE_NAMESPACE_STRONG_CONSISTENCY }, + { "strong-consistency-allow-expunge", CASE_NAMESPACE_STRONG_CONSISTENCY_ALLOW_EXPUNGE }, + { "tomb-raider-eligible-age", CASE_NAMESPACE_TOMB_RAIDER_ELIGIBLE_AGE }, + { "tomb-raider-period", CASE_NAMESPACE_TOMB_RAIDER_PERIOD }, + { "write-commit-level-override", CASE_NAMESPACE_WRITE_COMMIT_LEVEL_OVERRIDE }, + { "allow-versions", CASE_NAMESPACE_ALLOW_VERSIONS }, + { "demo-read-multiplier", CASE_NAMESPACE_DEMO_READ_MULTIPLIER }, + { "demo-write-multiplier", CASE_NAMESPACE_DEMO_WRITE_MULTIPLIER }, + { "high-water-pct", CASE_NAMESPACE_HIGH_WATER_PCT }, + { "low-water-pct", CASE_NAMESPACE_LOW_WATER_PCT }, + { "si", CASE_NAMESPACE_SI_BEGIN }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_CONFLICT_RESOLUTION_OPTS[] = { + { "generation", CASE_NAMESPACE_CONFLICT_RESOLUTION_GENERATION }, + { "last-update-time", CASE_NAMESPACE_CONFLICT_RESOLUTION_LAST_UPDATE_TIME } +}; + +const cfg_opt NAMESPACE_READ_CONSISTENCY_OPTS[] = { + { "all", CASE_NAMESPACE_READ_CONSISTENCY_ALL }, + { "off", CASE_NAMESPACE_READ_CONSISTENCY_OFF }, + { "one", CASE_NAMESPACE_READ_CONSISTENCY_ONE } +}; + +const cfg_opt NAMESPACE_WRITE_COMMIT_OPTS[] = { + { "all", CASE_NAMESPACE_WRITE_COMMIT_ALL }, + { "master", CASE_NAMESPACE_WRITE_COMMIT_MASTER }, + { "off", CASE_NAMESPACE_WRITE_COMMIT_OFF } +}; + +const cfg_opt NAMESPACE_STORAGE_OPTS[] = { + { "memory", CASE_NAMESPACE_STORAGE_MEMORY }, + { "ssd", CASE_NAMESPACE_STORAGE_SSD }, + { "device", CASE_NAMESPACE_STORAGE_DEVICE } +}; + +const cfg_opt NAMESPACE_STORAGE_DEVICE_OPTS[] = { + { "device", CASE_NAMESPACE_STORAGE_DEVICE_DEVICE }, + { "file", CASE_NAMESPACE_STORAGE_DEVICE_FILE }, + { "filesize", CASE_NAMESPACE_STORAGE_DEVICE_FILESIZE }, + { 
"scheduler-mode", CASE_NAMESPACE_STORAGE_DEVICE_SCHEDULER_MODE }, + { "write-block-size", CASE_NAMESPACE_STORAGE_DEVICE_WRITE_BLOCK_SIZE }, + { "memory-all", CASE_NAMESPACE_STORAGE_DEVICE_MEMORY_ALL }, + { "data-in-memory", CASE_NAMESPACE_STORAGE_DEVICE_DATA_IN_MEMORY }, + { "cold-start-empty", CASE_NAMESPACE_STORAGE_DEVICE_COLD_START_EMPTY }, + { "commit-to-device", CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_TO_DEVICE }, + { "commit-min-size", CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_MIN_SIZE }, + { "defrag-lwm-pct", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_LWM_PCT }, + { "defrag-queue-min", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_QUEUE_MIN }, + { "defrag-sleep", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_SLEEP }, + { "defrag-startup-minimum", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_STARTUP_MINIMUM }, + { "disable-odirect", CASE_NAMESPACE_STORAGE_DEVICE_DISABLE_ODIRECT }, + { "enable-benchmarks-storage", CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_BENCHMARKS_STORAGE }, + { "enable-osync", CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_OSYNC }, + { "encryption-key-file", CASE_NAMESPACE_STORAGE_DEVICE_ENCRYPTION_KEY_FILE }, + { "flush-max-ms", CASE_NAMESPACE_STORAGE_DEVICE_FLUSH_MAX_MS }, + { "fsync-max-sec", CASE_NAMESPACE_STORAGE_DEVICE_FSYNC_MAX_SEC }, + { "max-write-cache", CASE_NAMESPACE_STORAGE_DEVICE_MAX_WRITE_CACHE }, + { "min-avail-pct", CASE_NAMESPACE_STORAGE_DEVICE_MIN_AVAIL_PCT }, + { "post-write-queue", CASE_NAMESPACE_STORAGE_DEVICE_POST_WRITE_QUEUE }, + { "tomb-raider-sleep", CASE_NAMESPACE_STORAGE_DEVICE_TOMB_RAIDER_SLEEP }, + { "write-threads", CASE_NAMESPACE_STORAGE_DEVICE_WRITE_THREADS }, + { "defrag-max-blocks", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_MAX_BLOCKS }, + { "defrag-period", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_PERIOD }, + { "load-at-startup", CASE_NAMESPACE_STORAGE_DEVICE_LOAD_AT_STARTUP }, + { "persist", CASE_NAMESPACE_STORAGE_DEVICE_PERSIST }, + { "readonly", CASE_NAMESPACE_STORAGE_DEVICE_READONLY }, + { "signature", CASE_NAMESPACE_STORAGE_DEVICE_SIGNATURE }, + { "write-smoothing-period", CASE_NAMESPACE_STORAGE_DEVICE_WRITE_SMOOTHING_PERIOD }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_SET_OPTS[] = { + { "set-disable-eviction", CASE_NAMESPACE_SET_DISABLE_EVICTION }, + { "set-enable-xdr", CASE_NAMESPACE_SET_ENABLE_XDR }, + { "set-stop-writes-count", CASE_NAMESPACE_SET_STOP_WRITES_COUNT }, + { "set-evict-hwm-count", CASE_NAMESPACE_SET_EVICT_HWM_COUNT }, + { "set-evict-hwm-pct", CASE_NAMESPACE_SET_EVICT_HWM_PCT }, + { "set-stop-write-count", CASE_NAMESPACE_SET_STOP_WRITE_COUNT }, + { "set-stop-write-pct", CASE_NAMESPACE_SET_STOP_WRITE_PCT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_SET_ENABLE_XDR_OPTS[] = { + { "use-default", CASE_NAMESPACE_SET_ENABLE_XDR_USE_DEFAULT }, + { "false", CASE_NAMESPACE_SET_ENABLE_XDR_FALSE }, + { "true", CASE_NAMESPACE_SET_ENABLE_XDR_TRUE } +}; + +const cfg_opt NAMESPACE_SI_OPTS[] = { + { "si-gc-period", CASE_NAMESPACE_SI_GC_PERIOD }, + { "si-gc-max-units", CASE_NAMESPACE_SI_GC_MAX_UNITS }, + { "si-histogram", CASE_NAMESPACE_SI_HISTOGRAM }, + { "si-ignore-not-sync", CASE_NAMESPACE_SI_IGNORE_NOT_SYNC }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_SINDEX_OPTS[] = { + { "num-partitions", CASE_NAMESPACE_SINDEX_NUM_PARTITIONS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_GEO2DSPHERE_WITHIN_OPTS[] = { + { "strict", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_STRICT }, + { "min-level", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MIN_LEVEL }, + { "max-level", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_LEVEL }, + { "max-cells", 
CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_CELLS }, + { "level-mod", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_LEVEL_MOD }, + { "earth-radius-meters", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_EARTH_RADIUS_METERS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt MOD_LUA_OPTS[] = { + { "cache-enabled", CASE_MOD_LUA_CACHE_ENABLED }, + { "system-path", CASE_MOD_LUA_SYSTEM_PATH }, + { "user-path", CASE_MOD_LUA_USER_PATH }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt SECURITY_OPTS[] = { + { "enable-security", CASE_SECURITY_ENABLE_SECURITY }, + { "privilege-refresh-period", CASE_SECURITY_PRIVILEGE_REFRESH_PERIOD }, + { "log", CASE_SECURITY_LOG_BEGIN }, + { "syslog", CASE_SECURITY_SYSLOG_BEGIN }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt SECURITY_LOG_OPTS[] = { + { "report-authentication", CASE_SECURITY_LOG_REPORT_AUTHENTICATION }, + { "report-data-op", CASE_SECURITY_LOG_REPORT_DATA_OP }, + { "report-sys-admin", CASE_SECURITY_LOG_REPORT_SYS_ADMIN }, + { "report-user-admin", CASE_SECURITY_LOG_REPORT_USER_ADMIN }, + { "report-violation", CASE_SECURITY_LOG_REPORT_VIOLATION }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt SECURITY_SYSLOG_OPTS[] = { + { "local", CASE_SECURITY_SYSLOG_LOCAL }, + { "report-authentication", CASE_SECURITY_SYSLOG_REPORT_AUTHENTICATION }, + { "report-data-op", CASE_SECURITY_SYSLOG_REPORT_DATA_OP }, + { "report-sys-admin", CASE_SECURITY_SYSLOG_REPORT_SYS_ADMIN }, + { "report-user-admin", CASE_SECURITY_SYSLOG_REPORT_USER_ADMIN }, + { "report-violation", CASE_SECURITY_SYSLOG_REPORT_VIOLATION }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt XDR_OPTS[] = { + { "{", CASE_CONTEXT_BEGIN }, + { "enable-xdr", CASE_XDR_ENABLE_XDR }, + { "xdr-digestlog-path", CASE_XDR_DIGESTLOG_PATH }, + { "datacenter", CASE_XDR_DATACENTER_BEGIN }, + { "xdr-client-threads", CASE_XDR_CLIENT_THREADS }, + { "xdr-compression-threshold", CASE_XDR_COMPRESSION_THRESHOLD }, + { "xdr-delete-shipping-enabled", CASE_XDR_DELETE_SHIPPING_ENABLED }, + { "xdr-digestlog-iowait-ms", CASE_XDR_DIGESTLOG_IOWAIT_MS }, + { "forward-xdr-writes", CASE_XDR_FORWARD_XDR_WRITES }, + { "xdr-hotkey-time-ms", CASE_XDR_HOTKEY_TIME_MS }, + { "xdr-info-port", CASE_XDR_INFO_PORT }, + { "xdr-info-timeout", CASE_XDR_INFO_TIMEOUT }, + { "xdr-max-ship-bandwidth", CASE_XDR_MAX_SHIP_BANDWIDTH }, + { "xdr-max-ship-throughput", CASE_XDR_MAX_SHIP_THROUGHPUT }, + { "xdr-min-digestlog-free-pct", CASE_XDR_MIN_DIGESTLOG_FREE_PCT }, + { "xdr-nsup-deletes-enabled", CASE_XDR_NSUP_DELETES_ENABLED }, + { "xdr-read-threads", CASE_XDR_READ_THREADS}, + { "xdr-ship-bins", CASE_XDR_SHIP_BINS }, + { "xdr-ship-delay", CASE_XDR_SHIP_DELAY }, // hidden + { "xdr-shipping-enabled", CASE_XDR_SHIPPING_ENABLED }, + { "xdr-write-timeout", CASE_XDR_WRITE_TIMEOUT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt XDR_DATACENTER_OPTS[] = { + { "{", CASE_CONTEXT_BEGIN }, + { "dc-node-address-port", CASE_XDR_DATACENTER_DC_NODE_ADDRESS_PORT }, + { "dc-connections", CASE_XDR_DATACENTER_DC_CONNECTIONS }, + { "dc-connections-idle-ms", CASE_XDR_DATACENTER_DC_CONNECTIONS_IDLE_MS }, + { "dc-int-ext-ipmap", CASE_XDR_DATACENTER_DC_INT_EXT_IPMAP }, + { "dc-security-config-file", CASE_XDR_DATACENTER_DC_SECURITY_CONFIG_FILE }, + { "dc-use-alternate-services", CASE_XDR_DATACENTER_DC_USE_ALTERNATE_SERVICES }, + { "tls-name", CASE_XDR_DATACENTER_TLS_NAME }, + { "tls-node", CASE_XDR_DATACENTER_TLS_NODE }, + { "}", CASE_CONTEXT_END } +}; + +// Used parsing separate file, but share cfg_case_id enum. 
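+// (The XDR credentials file named by dc-security-config-file is parsed with
+// the same tokenizer, so its option tables live here with the main tables.)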
+ +const cfg_opt XDR_SEC_GLOBAL_OPTS[] = { + { "credentials", XDR_SEC_CASE_CREDENTIALS_BEGIN } +}; + +const cfg_opt XDR_SEC_CREDENTIALS_OPTS[] = { + { "{", CASE_CONTEXT_BEGIN }, + { "username", XDR_SEC_CASE_CREDENTIALS_USERNAME }, + { "password", XDR_SEC_CASE_CREDENTIALS_PASSWORD }, + { "}", CASE_CONTEXT_END } +}; + +const int NUM_GLOBAL_OPTS = sizeof(GLOBAL_OPTS) / sizeof(cfg_opt); +const int NUM_SERVICE_OPTS = sizeof(SERVICE_OPTS) / sizeof(cfg_opt); +const int NUM_SERVICE_AUTO_PIN_OPTS = sizeof(SERVICE_AUTO_PIN_OPTS) / sizeof(cfg_opt); +const int NUM_SERVICE_DEBUG_ALLOCATIONS_OPTS = sizeof(SERVICE_DEBUG_ALLOCATIONS_OPTS) / sizeof(cfg_opt); +const int NUM_LOGGING_OPTS = sizeof(LOGGING_OPTS) / sizeof(cfg_opt); +const int NUM_LOGGING_FILE_OPTS = sizeof(LOGGING_FILE_OPTS) / sizeof(cfg_opt); +const int NUM_LOGGING_CONSOLE_OPTS = sizeof(LOGGING_CONSOLE_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_OPTS = sizeof(NETWORK_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_SERVICE_OPTS = sizeof(NETWORK_SERVICE_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_HEARTBEAT_OPTS = sizeof(NETWORK_HEARTBEAT_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_HEARTBEAT_MODE_OPTS = sizeof(NETWORK_HEARTBEAT_MODE_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_HEARTBEAT_PROTOCOL_OPTS = sizeof(NETWORK_HEARTBEAT_PROTOCOL_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_FABRIC_OPTS = sizeof(NETWORK_FABRIC_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_INFO_OPTS = sizeof(NETWORK_INFO_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_TLS_OPTS = sizeof(NETWORK_TLS_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_OPTS = sizeof(NAMESPACE_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_CONFLICT_RESOLUTION_OPTS = sizeof(NAMESPACE_CONFLICT_RESOLUTION_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_READ_CONSISTENCY_OPTS = sizeof(NAMESPACE_READ_CONSISTENCY_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_WRITE_COMMIT_OPTS = sizeof(NAMESPACE_WRITE_COMMIT_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_STORAGE_OPTS = sizeof(NAMESPACE_STORAGE_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_STORAGE_DEVICE_OPTS = sizeof(NAMESPACE_STORAGE_DEVICE_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_SET_OPTS = sizeof(NAMESPACE_SET_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_SET_ENABLE_XDR_OPTS = sizeof(NAMESPACE_SET_ENABLE_XDR_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_SI_OPTS = sizeof(NAMESPACE_SI_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_SINDEX_OPTS = sizeof(NAMESPACE_SINDEX_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_GEO2DSPHERE_WITHIN_OPTS = sizeof(NAMESPACE_GEO2DSPHERE_WITHIN_OPTS) / sizeof(cfg_opt); +const int NUM_MOD_LUA_OPTS = sizeof(MOD_LUA_OPTS) / sizeof(cfg_opt); +const int NUM_SECURITY_OPTS = sizeof(SECURITY_OPTS) / sizeof(cfg_opt); +const int NUM_SECURITY_LOG_OPTS = sizeof(SECURITY_LOG_OPTS) / sizeof(cfg_opt); +const int NUM_SECURITY_SYSLOG_OPTS = sizeof(SECURITY_SYSLOG_OPTS) / sizeof(cfg_opt); +const int NUM_XDR_OPTS = sizeof(XDR_OPTS) / sizeof(cfg_opt); +const int NUM_XDR_DATACENTER_OPTS = sizeof(XDR_DATACENTER_OPTS) / sizeof(cfg_opt); + +// Used parsing separate file, but share cfg_case_id enum. + +const int NUM_XDR_SEC_GLOBAL_OPTS = sizeof(XDR_SEC_GLOBAL_OPTS) / sizeof(cfg_opt); +const int NUM_XDR_SEC_CREDENTIALS_OPTS = sizeof(XDR_SEC_CREDENTIALS_OPTS) / sizeof(cfg_opt); + + +//========================================================== +// Configuration value constants not for switch cases. 
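+// (These values are matched as plain strings - e.g. via cfg_strdup_one_of() -
+// rather than through cfg_find_tok() and the cfg_case_id switch.)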
+//
+
+const char* DEVICE_SCHEDULER_MODES[] = {
+    "anticipatory",
+    "cfq",      // best for rotational drives
+    "deadline",
+    "noop"      // best for SSDs
+};
+
+const int NUM_DEVICE_SCHEDULER_MODES = sizeof(DEVICE_SCHEDULER_MODES) / sizeof(const char*);
+
+
+//==========================================================
+// Generic parsing utilities.
+//
+
+// Don't use these functions. Use the cf_str functions, which have better error
+// handling, and support K, M, B/G, etc.
+#undef atoi
+#define atoi() DO_NOT_USE
+#undef atol
+#define atol() DO_NOT_USE
+#undef atoll
+#define atoll() DO_NOT_USE
+
+//------------------------------------------------
+// Parsing state (context) tracking & switching.
+//
+
+typedef enum {
+    GLOBAL,
+    SERVICE,
+    LOGGING, LOGGING_FILE, LOGGING_CONSOLE,
+    NETWORK, NETWORK_SERVICE, NETWORK_HEARTBEAT, NETWORK_FABRIC, NETWORK_INFO, NETWORK_TLS,
+    NAMESPACE, NAMESPACE_STORAGE_DEVICE, NAMESPACE_SET, NAMESPACE_SI, NAMESPACE_SINDEX, NAMESPACE_GEO2DSPHERE_WITHIN,
+    MOD_LUA,
+    SECURITY, SECURITY_LOG, SECURITY_SYSLOG,
+    XDR, XDR_DATACENTER,
+    // Used parsing separate file, but shares this enum:
+    XDR_SEC_CREDENTIALS,
+    // Must be last, use for sanity-checking:
+    PARSER_STATE_MAX_PLUS_1
+} as_config_parser_state;
+
+// For detail logging only - keep in sync with as_config_parser_state.
+const char* CFG_PARSER_STATES[] = {
+    "GLOBAL",
+    "SERVICE",
+    "LOGGING", "LOGGING_FILE", "LOGGING_CONSOLE",
+    "NETWORK", "NETWORK_SERVICE", "NETWORK_HEARTBEAT", "NETWORK_FABRIC", "NETWORK_INFO", "NETWORK_TLS",
+    "NAMESPACE", "NAMESPACE_STORAGE_DEVICE", "NAMESPACE_SET", "NAMESPACE_SI", "NAMESPACE_SINDEX", "NAMESPACE_GEO2DSPHERE_WITHIN",
+    "MOD_LUA",
+    "SECURITY", "SECURITY_LOG", "SECURITY_SYSLOG",
+    "XDR", "XDR_DATACENTER",
+    // Used parsing separate file, but shares corresponding enum:
+    "XDR_SEC_CREDENTIALS"
+};
+
+#define MAX_STACK_DEPTH 8
+
+typedef struct cfg_parser_state_s {
+    as_config_parser_state current;
+    as_config_parser_state stack[MAX_STACK_DEPTH];
+    int depth;
+} cfg_parser_state;
+
+void
+cfg_parser_state_init(cfg_parser_state* p_state)
+{
+    p_state->current = p_state->stack[0] = GLOBAL;
+    p_state->depth = 0;
+}
+
+void
+cfg_begin_context(cfg_parser_state* p_state, as_config_parser_state context)
+{
+    if (context < 0 || context >= PARSER_STATE_MAX_PLUS_1) {
+        cf_crash(AS_CFG, "parsing - unknown context");
+    }
+
+    as_config_parser_state prev_context = p_state->stack[p_state->depth];
+
+    if (++p_state->depth >= MAX_STACK_DEPTH) {
+        cf_crash(AS_CFG, "parsing - context too deep");
+    }
+
+    p_state->current = p_state->stack[p_state->depth] = context;
+
+    // To see this log, change NO_SINKS_LIMIT in fault.c:
+    cf_detail(AS_CFG, "begin context: %s -> %s", CFG_PARSER_STATES[prev_context], CFG_PARSER_STATES[context]);
+}
+
+void
+cfg_end_context(cfg_parser_state* p_state)
+{
+    as_config_parser_state prev_context = p_state->stack[p_state->depth];
+
+    if (--p_state->depth < 0) {
+        cf_crash(AS_CFG, "parsing - can't end context depth 0");
+    }
+
+    p_state->current = p_state->stack[p_state->depth];
+
+    // To see this log, change NO_SINKS_LIMIT in fault.c:
+    cf_detail(AS_CFG, "end context: %s -> %s", CFG_PARSER_STATES[prev_context], CFG_PARSER_STATES[p_state->current]);
+}
+
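To make the stack mechanics concrete, here is a hypothetical trace, using only the functions above, of the contexts pushed and popped while parsing a nested namespace stanza:

    // namespace test {             -> push NAMESPACE
    //     storage-engine device {  -> push NAMESPACE_STORAGE_DEVICE
    //     }                        -> pop, back to NAMESPACE
    // }                            -> pop, back to GLOBAL
    cfg_parser_state state;
    cfg_parser_state_init(&state);                        // depth 0, current == GLOBAL
    cfg_begin_context(&state, NAMESPACE);                 // depth 1
    cfg_begin_context(&state, NAMESPACE_STORAGE_DEVICE);  // depth 2
    cfg_end_context(&state);                              // depth 1, current == NAMESPACE
    cfg_end_context(&state);                              // depth 0, current == GLOBAL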
+//------------------------------------------------
+// Given a token, return switch case identifier.
+//
+
+cfg_case_id
+cfg_find_tok(const char* tok, const cfg_opt opts[], int num_opts)
+{
+    for (int i = 0; i < num_opts; i++) {
+        if (strcmp(tok, opts[i].tok) == 0) {
+            return opts[i].case_id;
+        }
+    }
+
+    return CASE_NOT_FOUND;
+}
+
+//------------------------------------------------
+// Value parsing and sanity-checking utilities.
+//
+
+void
+cfg_renamed_name_tok(const cfg_line* p_line, const char* new_tok)
+{
+    cf_warning(AS_CFG, "line %d :: %s was renamed - please use '%s'",
+            p_line->num, p_line->name_tok, new_tok);
+}
+
+void
+cfg_renamed_val_tok_1(const cfg_line* p_line, const char* new_tok)
+{
+    cf_warning(AS_CFG, "line %d :: %s value '%s' was renamed - please use '%s'",
+            p_line->num, p_line->name_tok, p_line->val_tok_1, new_tok);
+}
+
+void
+cfg_deprecated_name_tok(const cfg_line* p_line)
+{
+    cf_warning(AS_CFG, "line %d :: %s is deprecated - please remove",
+            p_line->num, p_line->name_tok);
+}
+
+void
+cfg_deprecated_val_tok_1(const cfg_line* p_line)
+{
+    cf_warning(AS_CFG, "line %d :: %s value '%s' is deprecated - please remove",
+            p_line->num, p_line->name_tok, p_line->val_tok_1);
+}
+
+void
+cfg_unknown_name_tok(const cfg_line* p_line)
+{
+    cf_crash_nostack(AS_CFG, "line %d :: unknown config parameter name '%s'",
+            p_line->num, p_line->name_tok);
+}
+
+void
+cfg_unknown_val_tok_1(const cfg_line* p_line)
+{
+    cf_crash_nostack(AS_CFG, "line %d :: %s has unknown value '%s'",
+            p_line->num, p_line->name_tok, p_line->val_tok_1);
+}
+
+void
+cfg_obsolete(const cfg_line* p_line, const char* message)
+{
+    // Guard both %s arguments - message may be NULL.
+    cf_crash_nostack(AS_CFG, "line %d :: '%s' is obsolete%s%s",
+            p_line->num, p_line->name_tok, message ? " - " : "",
+            message ? message : "");
+}
+
+void
+cfg_not_supported(const cfg_line* p_line, const char* feature)
+{
+    cf_crash_nostack(AS_CFG, "line %d :: illegal value '%s' for config parameter '%s' - feature %s is not supported",
+            p_line->num, p_line->val_tok_1, p_line->name_tok, feature);
+}
+
+char*
+cfg_strdup_no_checks(const cfg_line* p_line)
+{
+    return cf_strdup(p_line->val_tok_1);
+}
+
+char*
+cfg_strdup_val2_no_checks(const cfg_line* p_line)
+{
+    return cf_strdup(p_line->val_tok_2);
+}
+
+char*
+cfg_strdup_anyval(const cfg_line* p_line, const char* val_tok, bool is_required)
+{
+    if (val_tok[0] == 0) {
+        if (is_required) {
+            cf_crash_nostack(AS_CFG, "line %d :: %s must have a value specified",
+                    p_line->num, p_line->name_tok);
+        }
+
+        // Do not duplicate empty strings.
+        return NULL;
+    }
+
+    return cf_strdup(val_tok);
+}
+
+char*
+cfg_strdup(const cfg_line* p_line, bool is_required)
+{
+    return cfg_strdup_anyval(p_line, p_line->val_tok_1, is_required);
+}
+
+char*
+cfg_strdup_val2(const cfg_line* p_line, bool is_required)
+{
+    return cfg_strdup_anyval(p_line, p_line->val_tok_2, is_required);
+}
+
+char*
+cfg_strdup_one_of(const cfg_line* p_line, const char* toks[], int num_toks)
+{
+    for (int i = 0; i < num_toks; i++) {
+        if (strcmp(p_line->val_tok_1, toks[i]) == 0) {
+            return cfg_strdup_no_checks(p_line);
+        }
+    }
+
+    // 2 bytes per ", " separator, plus 1 for the null terminator.
+    uint32_t valid_toks_size = (num_toks * 2) + 1;
+
+    for (int i = 0; i < num_toks; i++) {
+        valid_toks_size += strlen(toks[i]);
+    }
+
+    char valid_toks[valid_toks_size];
+
+    valid_toks[0] = 0;
+
+    for (int i = 0; i < num_toks; i++) {
+        strcat(valid_toks, toks[i]);
+        strcat(valid_toks, ", ");
+    }
+
+    cf_crash_nostack(AS_CFG, "line %d :: %s must be one of: %snot %s",
+            p_line->num, p_line->name_tok, valid_toks, p_line->val_tok_1);
+
+    // Won't get here, but quiet warnings...
+ return NULL; +} + +void +cfg_strcpy(const cfg_line* p_line, char* p_str, size_t max_size) +{ + size_t tok1_len = strlen(p_line->val_tok_1); + + if (tok1_len == 0) { + cf_crash_nostack(AS_CFG, "line %d :: %s must have a value specified", + p_line->num, p_line->name_tok); + } + + if (tok1_len >= max_size) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be < %lu characters long, not %s", + p_line->num, p_line->name_tok, max_size, p_line->val_tok_1); + } + + strcpy(p_str, p_line->val_tok_1); +} + +bool +cfg_bool(const cfg_line* p_line) +{ + if (strcasecmp(p_line->val_tok_1, "true") == 0 || strcasecmp(p_line->val_tok_1, "yes") == 0) { + return true; + } + + if (strcasecmp(p_line->val_tok_1, "false") == 0 || strcasecmp(p_line->val_tok_1, "no") == 0) { + return false; + } + + if (*p_line->val_tok_1 == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must be true or false or yes or no", + p_line->num, p_line->name_tok); + } + + cf_crash_nostack(AS_CFG, "line %d :: %s must be true or false or yes or no, not %s", + p_line->num, p_line->name_tok, p_line->val_tok_1); + + // Won't get here, but quiet warnings... + return false; +} + +bool +cfg_bool_no_value_is_true(const cfg_line* p_line) +{ + return (*p_line->val_tok_1 == '\0') ? true : cfg_bool(p_line); +} + +int64_t +cfg_i64_anyval_no_checks(const cfg_line* p_line, char* val_tok) +{ + if (*val_tok == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must specify an integer value", + p_line->num, p_line->name_tok); + } + + int64_t value; + + if (0 != cf_str_atoi_64(val_tok, &value)) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be a number, not %s", + p_line->num, p_line->name_tok, val_tok); + } + + return value; +} + +int64_t +cfg_i64_no_checks(const cfg_line* p_line) +{ + return cfg_i64_anyval_no_checks(p_line, p_line->val_tok_1); +} + +int64_t +cfg_i64_val2_no_checks(const cfg_line* p_line) +{ + return cfg_i64_anyval_no_checks(p_line, p_line->val_tok_2); +} + +int64_t +cfg_i64_val3_no_checks(const cfg_line* p_line) +{ + return cfg_i64_anyval_no_checks(p_line, p_line->val_tok_3); +} + +int64_t +cfg_i64(const cfg_line* p_line, int64_t min, int64_t max) +{ + int64_t value = cfg_i64_no_checks(p_line); + + if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %ld and <= %ld, not %ld", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +int +cfg_int_no_checks(const cfg_line* p_line) +{ + int64_t value = cfg_i64_no_checks(p_line); + + if (value < INT_MIN || value > INT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %ld overflows int", + p_line->num, p_line->name_tok, value); + } + + return (int)value; +} + +int +cfg_int(const cfg_line* p_line, int min, int max) +{ + int value = cfg_int_no_checks(p_line); + + if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %d and <= %d, not %d", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +int +cfg_int_val2_no_checks(const cfg_line* p_line) +{ + int64_t value = cfg_i64_val2_no_checks(p_line); + + if (value < INT_MIN || value > INT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %ld overflows int", + p_line->num, p_line->name_tok, value); + } + + return (int)value; +} + +int +cfg_int_val3_no_checks(const cfg_line* p_line) +{ + int64_t value = cfg_i64_val3_no_checks(p_line); + + if (value < INT_MIN || value > INT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %ld overflows int", + p_line->num, p_line->name_tok, value); + } + + return (int)value; +} +int +cfg_int_val2(const 
cfg_line* p_line, int min, int max) +{ + int value = cfg_int_val2_no_checks(p_line); + + if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %d and <= %d, not %d", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +int +cfg_int_val3(const cfg_line* p_line, int min, int max) +{ + int value = cfg_int_val3_no_checks(p_line); + + if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %d and <= %d, not %d", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint64_t +cfg_x64_anyval_no_checks(const cfg_line* p_line, char* val_tok) +{ + if (*val_tok == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must specify a hex value", + p_line->num, p_line->name_tok); + } + + uint64_t value; + + if (0 != cf_str_atoi_x64(val_tok, &value)) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be a 64-bit hex number, not %s", + p_line->num, p_line->name_tok, val_tok); + } + + return value; +} + +uint64_t +cfg_x64_no_checks(const cfg_line* p_line) +{ + return cfg_x64_anyval_no_checks(p_line, p_line->val_tok_1); +} + +uint64_t +cfg_x64(const cfg_line* p_line, uint64_t min, uint64_t max) +{ + uint64_t value = cfg_x64_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %lx, not %lx", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %lx and <= %lx, not %lx", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint64_t +cfg_u64_anyval_no_checks(const cfg_line* p_line, char* val_tok) +{ + if (*val_tok == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must specify an unsigned integer value", + p_line->num, p_line->name_tok); + } + + uint64_t value; + + if (0 != cf_str_atoi_u64(val_tok, &value)) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be an unsigned number, not %s", + p_line->num, p_line->name_tok, val_tok); + } + + return value; +} + +uint64_t +cfg_u64_no_checks(const cfg_line* p_line) +{ + return cfg_u64_anyval_no_checks(p_line, p_line->val_tok_1); +} + +uint64_t +cfg_u64_val2_no_checks(const cfg_line* p_line) +{ + return cfg_u64_anyval_no_checks(p_line, p_line->val_tok_2); +} + +uint64_t +cfg_u64(const cfg_line* p_line, uint64_t min, uint64_t max) +{ + uint64_t value = cfg_u64_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %lu, not %lu", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %lu and <= %lu, not %lu", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint32_t +cfg_u32_no_checks(const cfg_line* p_line) +{ + uint64_t value = cfg_u64_no_checks(p_line); + + if (value > UINT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %lu overflows unsigned int", + p_line->num, p_line->name_tok, value); + } + + return (uint32_t)value; +} + +uint32_t +cfg_u32(const cfg_line* p_line, uint32_t min, uint32_t max) +{ + uint32_t value = cfg_u32_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %u, not %u", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %u and <= %u, not %u", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +// Note - 
accepts 0 if min is 0. +uint32_t +cfg_u32_power_of_2(const cfg_line* p_line, uint32_t min, uint32_t max) +{ + uint32_t value = cfg_u32(p_line, min, max); + + if ((value & (value - 1)) != 0) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be an exact power of 2, not %u", + p_line->num, p_line->name_tok, value); + } + + return value; +} + +uint16_t +cfg_u16_no_checks(const cfg_line* p_line) +{ + uint64_t value = cfg_u64_no_checks(p_line); + + if (value > USHRT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %lu overflows unsigned short", + p_line->num, p_line->name_tok, value); + } + + return (uint16_t)value; +} + +uint16_t +cfg_u16(const cfg_line* p_line, uint16_t min, uint16_t max) +{ + uint16_t value = cfg_u16_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %u, not %u", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %u and <= %u, not %u", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint8_t +cfg_u8_no_checks(const cfg_line* p_line) +{ + uint64_t value = cfg_u64_no_checks(p_line); + + if (value > UCHAR_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %lu overflows unsigned char", + p_line->num, p_line->name_tok, value); + } + + return (uint8_t)value; +} + +uint8_t +cfg_u8(const cfg_line* p_line, uint8_t min, uint8_t max) +{ + uint8_t value = cfg_u8_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %u, not %u", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %u and <= %u, not %u", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint32_t +cfg_seconds_no_checks(const cfg_line* p_line) +{ + if (*p_line->val_tok_1 == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must specify an unsigned integer value with time unit (s, m, h, or d)", + p_line->num, p_line->name_tok); + } + + uint64_t value; + + // TODO - should fix this to guard against overflow, give uint32_t. + if (0 != cf_str_atoi_seconds(p_line->val_tok_1, &value)) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be an unsigned number with time unit (s, m, h, or d), not %s", + p_line->num, p_line->name_tok, p_line->val_tok_1); + } + + return (uint32_t)value; +} + +uint32_t +cfg_seconds(const cfg_line* p_line, uint32_t min, uint32_t max) +{ + uint32_t value = cfg_seconds_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %u seconds, not %u seconds", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %u seconds and <= %u seconds, not %u seconds", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +// Minimum & maximum port numbers: +const int CFG_MIN_PORT = 1024; +const int CFG_MAX_PORT = USHRT_MAX; + +cf_ip_port +cfg_port(const cfg_line* p_line) +{ + return (cf_ip_port)cfg_int(p_line, CFG_MIN_PORT, CFG_MAX_PORT); +} + +cf_ip_port +cfg_port_val2(const cfg_line* p_line) +{ + return (cf_ip_port)cfg_int_val2(p_line, CFG_MIN_PORT, CFG_MAX_PORT); +} + +cf_ip_port +cfg_port_val3(const cfg_line* p_line) +{ + return (cf_ip_port)cfg_int_val3(p_line, CFG_MIN_PORT, CFG_MAX_PORT); +} + +//------------------------------------------------ +// Constants used in parsing. 
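The seconds parsers above accept a bare count or a unit suffix - s, m, h, or d, per the error text. A worked example, with an illustrative config line rather than one from this patch:

    default-ttl 30d    # cf_str_atoi_seconds() yields 30 * 24 * 3600 = 2592000 seconds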
+//
+
+// Token delimiter characters:
+const char CFG_WHITESPACE[] = " \t\n\r\f\v";
+
+
+//==========================================================
+// Public API - parse the configuration file.
+//
+
+as_config*
+as_config_init(const char* config_file)
+{
+    as_config* c = &g_config; // shortcut pointer
+
+    // Set the service context defaults. Values parsed from the config file will
+    // override the defaults.
+    cfg_set_defaults();
+    xdr_config_defaults();
+
+    FILE* FD;
+    char iobuf[256];
+    int line_num = 0;
+    cfg_parser_state state;
+
+    cfg_parser_state_init(&state);
+
+    as_namespace* ns = NULL;
+    dc_config_opt* cur_dc_cfg = NULL;
+    cf_tls_spec* tls_spec = NULL;
+    cf_fault_sink* sink = NULL;
+    as_set* p_set = NULL; // local variable used for set initialization
+
+    // Open the configuration file for reading.
+    if (NULL == (FD = fopen(config_file, "r"))) {
+        cf_crash_nostack(AS_CFG, "couldn't open configuration file %s: %s", config_file, cf_strerror(errno));
+    }
+
+    // Parse the configuration file, line by line.
+    while (fgets(iobuf, sizeof(iobuf), FD)) {
+        line_num++;
+
+        // First chop the comment off, if there is one.
+
+        char* p_comment = strchr(iobuf, '#');
+
+        if (p_comment) {
+            *p_comment = '\0';
+        }
+
+        // Find (and null-terminate) up to four whitespace-delimited tokens in
+        // the line, a 'name' token and up to three 'value' tokens.
+
+        cfg_line line = { line_num, NULL, NULL, NULL, NULL };
+
+        line.name_tok = strtok(iobuf, CFG_WHITESPACE);
+
+        // If there are no tokens, ignore this line, get the next line.
+        if (! line.name_tok) {
+            continue;
+        }
+
+        line.val_tok_1 = strtok(NULL, CFG_WHITESPACE);
+
+        if (! line.val_tok_1) {
+            line.val_tok_1 = ""; // in case it's used where NULL can't be used
+        }
+        else {
+            line.val_tok_2 = strtok(NULL, CFG_WHITESPACE);
+        }
+
+        if (! line.val_tok_2) {
+            line.val_tok_2 = ""; // in case it's used where NULL can't be used
+        }
+        else {
+            line.val_tok_3 = strtok(NULL, CFG_WHITESPACE);
+        }
+
+        if (! line.val_tok_3) {
+            line.val_tok_3 = ""; // in case it's used where NULL can't be used
+        }
+
+        // Note that we can't see this output until a logging sink is specified.
+        cf_detail(AS_CFG, "line %d :: %s %s %s %s", line_num, line.name_tok, line.val_tok_1, line.val_tok_2, line.val_tok_3);
+
+        // Parse the directive.
+        switch (state.current) {
+
+        //==================================================
+        // Parse top-level items.
+        //
+        case GLOBAL:
+            switch (cfg_find_tok(line.name_tok, GLOBAL_OPTS, NUM_GLOBAL_OPTS)) {
+            case CASE_SERVICE_BEGIN:
+                cfg_begin_context(&state, SERVICE);
+                break;
+            case CASE_LOGGING_BEGIN:
+                cfg_begin_context(&state, LOGGING);
+                break;
+            case CASE_NETWORK_BEGIN:
+                cfg_begin_context(&state, NETWORK);
+                break;
+            case CASE_NAMESPACE_BEGIN:
+                // Create the namespace object.
+                ns = as_namespace_create(line.val_tok_1);
+                cfg_begin_context(&state, NAMESPACE);
+                break;
+            case CASE_MOD_LUA_BEGIN:
+                cfg_begin_context(&state, MOD_LUA);
+                break;
+            case CASE_SECURITY_BEGIN:
+                cfg_enterprise_only(&line);
+                cfg_begin_context(&state, SECURITY);
+                break;
+            case CASE_XDR_BEGIN:
+                g_xcfg.xdr_section_configured = true;
+                cfg_enterprise_only(&line);
+                cfg_begin_context(&state, XDR);
+                break;
+            case CASE_NOT_FOUND:
+            default:
+                cfg_unknown_name_tok(&line);
+                break;
+            }
+            break;
+
+        //==================================================
+        // Parse service context items.
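A concrete example of the tokenizing above - a hypothetical heartbeat line and the tokens it yields:

    // Input line:  "mesh-seed-address-port 10.1.1.5 3002"
    // line.name_tok  == "mesh-seed-address-port"
    // line.val_tok_1 == "10.1.1.5"
    // line.val_tok_2 == "3002"
    // line.val_tok_3 == ""   (empty string, not NULL, so cf_detail() can print it)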
+ // + case SERVICE: + switch (cfg_find_tok(line.name_tok, SERVICE_OPTS, NUM_SERVICE_OPTS)) { + case CASE_SERVICE_USER: + { + struct passwd* pwd; + if (NULL == (pwd = getpwnam(line.val_tok_1))) { + cf_crash_nostack(AS_CFG, "line %d :: user not found: %s", line_num, line.val_tok_1); + } + c->uid = pwd->pw_uid; + endpwent(); + } + break; + case CASE_SERVICE_GROUP: + { + struct group* grp; + if (NULL == (grp = getgrnam(line.val_tok_1))) { + cf_crash_nostack(AS_CFG, "line %d :: group not found: %s", line_num, line.val_tok_1); + } + c->gid = grp->gr_gid; + endgrent(); + } + break; + case CASE_SERVICE_PAXOS_SINGLE_REPLICA_LIMIT: + c->paxos_single_replica_limit = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_PIDFILE: + c->pidfile = cfg_strdup_no_checks(&line); + break; + case CASE_SERVICE_CLIENT_FD_MAX: + cfg_renamed_name_tok(&line, "proto-fd-max"); + // No break. + case CASE_SERVICE_PROTO_FD_MAX: + c->n_proto_fd_max = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_ADVERTISE_IPV6: + cf_socket_set_advertise_ipv6(cfg_bool(&line)); + break; + case CASE_SERVICE_AUTO_PIN: + switch (cfg_find_tok(line.val_tok_1, SERVICE_AUTO_PIN_OPTS, NUM_SERVICE_AUTO_PIN_OPTS)) { + case CASE_SERVICE_AUTO_PIN_NONE: + c->auto_pin = CF_TOPO_AUTO_PIN_NONE; + break; + case CASE_SERVICE_AUTO_PIN_CPU: + c->auto_pin = CF_TOPO_AUTO_PIN_CPU; + break; + case CASE_SERVICE_AUTO_PIN_NUMA: + c->auto_pin = CF_TOPO_AUTO_PIN_NUMA; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_SERVICE_BATCH_THREADS: + c->n_batch_threads = cfg_int(&line, 0, MAX_BATCH_THREADS); + break; + case CASE_SERVICE_BATCH_MAX_BUFFERS_PER_QUEUE: + c->batch_max_buffers_per_queue = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_BATCH_MAX_REQUESTS: + c->batch_max_requests = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_BATCH_MAX_UNUSED_BUFFERS: + c->batch_max_unused_buffers = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_BATCH_PRIORITY: + c->batch_priority = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_BATCH_INDEX_THREADS: + c->n_batch_index_threads = cfg_u32(&line, 1, MAX_BATCH_THREADS); + break; + case CASE_SERVICE_CLUSTER_NAME: + cfg_set_cluster_name(line.val_tok_1); + break; + case CASE_SERVICE_ENABLE_BENCHMARKS_FABRIC: + c->fabric_benchmarks_enabled = cfg_bool(&line); + break; + case CASE_SERVICE_ENABLE_BENCHMARKS_SVC: + c->svc_benchmarks_enabled = cfg_bool(&line); + break; + case CASE_SERVICE_ENABLE_HIST_INFO: + c->info_hist_enabled = cfg_bool(&line); + break; + case CASE_SERVICE_FEATURE_KEY_FILE: + c->feature_key_file = cfg_strdup(&line, true); + break; + case CASE_SERVICE_HIST_TRACK_BACK: + c->hist_track_back = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_HIST_TRACK_SLICE: + c->hist_track_slice = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_HIST_TRACK_THRESHOLDS: + c->hist_track_thresholds = cfg_strdup_no_checks(&line); + // TODO - if config key present but no value (not even space) failure mode is bad... 
+ break; + case CASE_SERVICE_INFO_THREADS: + c->n_info_threads = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_LOG_LOCAL_TIME: + cf_fault_use_local_time(cfg_bool(&line)); + break; + case CASE_SERVICE_LOG_MILLIS: + cf_fault_log_millis(cfg_bool(&line)); + break; + case CASE_SERVICE_MIGRATE_MAX_NUM_INCOMING: + c->migrate_max_num_incoming = cfg_u32(&line, 0, AS_MIGRATE_LIMIT_MAX_NUM_INCOMING); + break; + case CASE_SERVICE_MIGRATE_THREADS: + c->n_migrate_threads = cfg_u32(&line, 0, MAX_NUM_MIGRATE_XMIT_THREADS); + break; + case CASE_SERVICE_MIN_CLUSTER_SIZE: + c->clustering_config.cluster_size_min = cfg_u32(&line, 0, AS_CLUSTER_SZ); + break; + case CASE_SERVICE_NODE_ID: + c->self_node = cfg_x64(&line, 1, UINT64_MAX); + break; + case CASE_SERVICE_NODE_ID_INTERFACE: + c->node_id_interface = cfg_strdup_no_checks(&line); + break; + case CASE_SERVICE_NSUP_DELETE_SLEEP: + c->nsup_delete_sleep = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_NSUP_PERIOD: + c->nsup_period = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_NSUP_STARTUP_EVICT: + c->nsup_startup_evict = cfg_bool(&line); + break; + case CASE_SERVICE_PROTO_FD_IDLE_MS: + c->proto_fd_idle_ms = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_QUERY_BATCH_SIZE: + c->query_bsize = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_QUERY_BUFPOOL_SIZE: + c->query_bufpool_size = cfg_u32(&line, 1, UINT32_MAX); + break; + case CASE_SERVICE_QUERY_IN_TRANSACTION_THREAD: + c->query_in_transaction_thr = cfg_bool(&line); + break; + case CASE_SERVICE_QUERY_LONG_Q_MAX_SIZE: + c->query_long_q_max_size = cfg_u32(&line, 1, UINT32_MAX); + break; + case CASE_SERVICE_QUERY_PRE_RESERVE_PARTITIONS: + c->partitions_pre_reserved = cfg_bool(&line); + break; + case CASE_SERVICE_QUERY_PRIORITY: + c->query_priority = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_QUERY_PRIORITY_SLEEP_US: + c->query_sleep_us = cfg_u64_no_checks(&line); + break; + case CASE_SERVICE_QUERY_REC_COUNT_BOUND: + c->query_rec_count_bound = cfg_u64(&line, 1, UINT64_MAX); + break; + case CASE_SERVICE_QUERY_REQ_IN_QUERY_THREAD: + c->query_req_in_query_thread = cfg_bool(&line); + break; + case CASE_SERVICE_QUERY_REQ_MAX_INFLIGHT: + c->query_req_max_inflight = cfg_u32(&line, 1, UINT32_MAX); + break; + case CASE_SERVICE_QUERY_SHORT_Q_MAX_SIZE: + c->query_short_q_max_size = cfg_u32(&line, 1, UINT32_MAX); + break; + case CASE_SERVICE_QUERY_THREADS: + c->query_threads = cfg_u32(&line, 1, AS_QUERY_MAX_THREADS); + break; + case CASE_SERVICE_QUERY_THRESHOLD: + c->query_threshold = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_QUERY_UNTRACKED_TIME_MS: + c->query_untracked_time_ms = cfg_u64_no_checks(&line); + break; + case CASE_SERVICE_QUERY_WORKER_THREADS: + c->query_worker_threads = cfg_u32(&line, 1, AS_QUERY_MAX_WORKER_THREADS); + break; + case CASE_SERVICE_RUN_AS_DAEMON: + c->run_as_daemon = cfg_bool_no_value_is_true(&line); + break; + case CASE_SERVICE_SCAN_MAX_ACTIVE: + c->scan_max_active = cfg_u32(&line, 0, 200); + break; + case CASE_SERVICE_SCAN_MAX_DONE: + c->scan_max_done = cfg_u32(&line, 0, 1000); + break; + case CASE_SERVICE_SCAN_MAX_UDF_TRANSACTIONS: + c->scan_max_udf_transactions = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_SCAN_THREADS: + c->scan_threads = cfg_u32(&line, 0, 128); + break; + case CASE_SERVICE_SERVICE_THREADS: + c->n_service_threads = cfg_u32(&line, 1, MAX_DEMARSHAL_THREADS); + break; + case CASE_SERVICE_SINDEX_BUILDER_THREADS: + c->sindex_builder_threads = cfg_u32(&line, 1, MAX_SINDEX_BUILDER_THREADS); + break; + case 
CASE_SERVICE_SINDEX_GC_MAX_RATE: + c->sindex_gc_max_rate = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_SINDEX_GC_PERIOD: + c->sindex_gc_period = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_TICKER_INTERVAL: + c->ticker_interval = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_TRANSACTION_MAX_MS: + c->transaction_max_ns = cfg_u64_no_checks(&line) * 1000000; + break; + case CASE_SERVICE_TRANSACTION_PENDING_LIMIT: + c->transaction_pending_limit = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_TRANSACTION_QUEUES: + c->n_transaction_queues = cfg_u32(&line, 1, MAX_TRANSACTION_QUEUES); + break; + case CASE_SERVICE_TRANSACTION_RETRY_MS: + c->transaction_retry_ms = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_TRANSACTION_THREADS_PER_QUEUE: + c->n_transaction_threads_per_queue = cfg_u32(&line, 1, MAX_TRANSACTION_THREADS_PER_QUEUE); + break; + case CASE_SERVICE_WORK_DIRECTORY: + c->work_directory = cfg_strdup_no_checks(&line); + break; + case CASE_SERVICE_DEBUG_ALLOCATIONS: + switch (cfg_find_tok(line.val_tok_1, SERVICE_DEBUG_ALLOCATIONS_OPTS, NUM_SERVICE_DEBUG_ALLOCATIONS_OPTS)) { + case CASE_SERVICE_DEBUG_ALLOCATIONS_NONE: + c->debug_allocations = CF_ALLOC_DEBUG_NONE; + break; + case CASE_SERVICE_DEBUG_ALLOCATIONS_TRANSIENT: + c->debug_allocations = CF_ALLOC_DEBUG_TRANSIENT; + break; + case CASE_SERVICE_DEBUG_ALLOCATIONS_PERSISTENT: + c->debug_allocations = CF_ALLOC_DEBUG_PERSISTENT; + break; + case CASE_SERVICE_DEBUG_ALLOCATIONS_ALL: + c->debug_allocations = CF_ALLOC_DEBUG_ALL; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_SERVICE_FABRIC_DUMP_MSGS: + c->fabric_dump_msgs = cfg_bool(&line); + break; + case CASE_SERVICE_PROLE_EXTRA_TTL: + c->prole_extra_ttl = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_ALLOW_INLINE_TRANSACTIONS: + cfg_obsolete(&line, "please configure 'service-threads' carefully"); + break; + case CASE_SERVICE_RESPOND_CLIENT_ON_MASTER_COMPLETION: + cfg_obsolete(&line, "please use namespace-context 'write-commit-level-override' and/or write transaction policy"); + break; + case CASE_SERVICE_TRANSACTION_REPEATABLE_READ: + cfg_obsolete(&line, "please use namespace-context 'read-consistency-level-override' and/or read transaction policy"); + break; + case CASE_SERVICE_AUTO_DUN: + case CASE_SERVICE_AUTO_UNDUN: + case CASE_SERVICE_BATCH_RETRANSMIT: + case CASE_SERVICE_CLIB_LIBRARY: + case CASE_SERVICE_DEFRAG_QUEUE_ESCAPE: + case CASE_SERVICE_DEFRAG_QUEUE_HWM: + case CASE_SERVICE_DEFRAG_QUEUE_LWM: + case CASE_SERVICE_DEFRAG_QUEUE_PRIORITY: + case CASE_SERVICE_DUMP_MESSAGE_ABOVE_SIZE: + case CASE_SERVICE_FABRIC_WORKERS: + case CASE_SERVICE_FB_HEALTH_BAD_PCT: + case CASE_SERVICE_FB_HEALTH_GOOD_PCT: + case CASE_SERVICE_FB_HEALTH_MSG_PER_BURST: + case CASE_SERVICE_FB_HEALTH_MSG_TIMEOUT: + case CASE_SERVICE_GENERATION_DISABLE: + case CASE_SERVICE_MAX_MSGS_PER_TYPE: + case CASE_SERVICE_MIGRATE_READ_PRIORITY: + case CASE_SERVICE_MIGRATE_READ_SLEEP: + case CASE_SERVICE_MIGRATE_RX_LIFETIME_MS: + case CASE_SERVICE_MIGRATE_XMIT_HWM: + case CASE_SERVICE_MIGRATE_XMIT_LWM: + case CASE_SERVICE_MIGRATE_PRIORITY: + case CASE_SERVICE_MIGRATE_XMIT_PRIORITY: + case CASE_SERVICE_MIGRATE_XMIT_SLEEP: + case CASE_SERVICE_NSUP_AUTO_HWM: + case CASE_SERVICE_NSUP_AUTO_HWM_PCT: + case CASE_SERVICE_NSUP_MAX_DELETES: + case CASE_SERVICE_NSUP_QUEUE_ESCAPE: + case CASE_SERVICE_NSUP_QUEUE_HWM: + case CASE_SERVICE_NSUP_QUEUE_LWM: + case CASE_SERVICE_NSUP_REDUCE_PRIORITY: + case 
CASE_SERVICE_NSUP_REDUCE_SLEEP: + case CASE_SERVICE_NSUP_THREADS: + case CASE_SERVICE_PAXOS_MAX_CLUSTER_SIZE: + case CASE_SERVICE_PAXOS_PROTOCOL: + case CASE_SERVICE_PAXOS_RECOVERY_POLICY: + case CASE_SERVICE_PAXOS_RETRANSMIT_PERIOD: + case CASE_SERVICE_REPLICATION_FIRE_AND_FORGET: + case CASE_SERVICE_SCAN_MEMORY: + case CASE_SERVICE_SCAN_PRIORITY: + case CASE_SERVICE_SCAN_RETRANSMIT: + case CASE_SERVICE_SCHEDULER_PRIORITY: + case CASE_SERVICE_SCHEDULER_TYPE: + case CASE_SERVICE_TRANSACTION_DUPLICATE_THREADS: + case CASE_SERVICE_TRIAL_ACCOUNT_KEY: + case CASE_SERVICE_UDF_RUNTIME_MAX_GMEMORY: + case CASE_SERVICE_UDF_RUNTIME_MAX_MEMORY: + case CASE_SERVICE_USE_QUEUE_PER_DEVICE: + case CASE_SERVICE_WRITE_DUPLICATE_RESOLUTION_DISABLE: + cfg_deprecated_name_tok(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse logging context items. + // + case LOGGING: + switch (cfg_find_tok(line.name_tok, LOGGING_OPTS, NUM_LOGGING_OPTS)) { + case CASE_LOG_FILE_BEGIN: + if ((sink = cf_fault_sink_hold(line.val_tok_1)) == NULL) { + cf_crash_nostack(AS_CFG, "line %d :: can't add file %s as log sink", line_num, line.val_tok_1); + } + cfg_begin_context(&state, LOGGING_FILE); + break; + case CASE_LOG_CONSOLE_BEGIN: + if ((sink = cf_fault_sink_hold("stderr")) == NULL) { + cf_crash_nostack(AS_CFG, "line %d :: can't add stderr as log sink", line_num); + } + cfg_begin_context(&state, LOGGING_CONSOLE); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse logging::file context items. + // + case LOGGING_FILE: + switch (cfg_find_tok(line.name_tok, LOGGING_FILE_OPTS, NUM_LOGGING_FILE_OPTS)) { + case CASE_LOG_FILE_CONTEXT: + if (0 != cf_fault_sink_addcontext(sink, line.val_tok_1, line.val_tok_2)) { + cf_crash_nostack(AS_CFG, "line %d :: can't add logging file context %s %s", line_num, line.val_tok_1, line.val_tok_2); + } + break; + case CASE_CONTEXT_END: + sink = NULL; + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse logging::console context items. + // + case LOGGING_CONSOLE: + switch (cfg_find_tok(line.name_tok, LOGGING_CONSOLE_OPTS, NUM_LOGGING_CONSOLE_OPTS)) { + case CASE_LOG_CONSOLE_CONTEXT: + if (0 != cf_fault_sink_addcontext(sink, line.val_tok_1, line.val_tok_2)) { + cf_crash_nostack(AS_CFG, "line %d :: can't add logging console context %s %s", line_num, line.val_tok_1, line.val_tok_2); + } + break; + case CASE_CONTEXT_END: + sink = NULL; + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse network context items. 
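For orientation before the network case below, a typical minimal network stanza of the shape parsed here - each sub-stanza pushes the corresponding NETWORK_* context (addresses and ports are illustrative):

    network {
        service {
            address any
            port 3000
        }
        heartbeat {
            mode mesh
            port 3002
        }
        fabric {
            port 3001
        }
        info {
            port 3003
        }
    }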
+ // + case NETWORK: + switch (cfg_find_tok(line.name_tok, NETWORK_OPTS, NUM_NETWORK_OPTS)) { + case CASE_NETWORK_SERVICE_BEGIN: + cfg_begin_context(&state, NETWORK_SERVICE); + break; + case CASE_NETWORK_HEARTBEAT_BEGIN: + cfg_begin_context(&state, NETWORK_HEARTBEAT); + break; + case CASE_NETWORK_FABRIC_BEGIN: + cfg_begin_context(&state, NETWORK_FABRIC); + break; + case CASE_NETWORK_INFO_BEGIN: + cfg_begin_context(&state, NETWORK_INFO); + break; + case CASE_NETWORK_TLS_BEGIN: + tls_spec = cfg_create_tls_spec(c, line.val_tok_1); + cfg_begin_context(&state, NETWORK_TLS); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::service context items. + // + case NETWORK_SERVICE: + switch (cfg_find_tok(line.name_tok, NETWORK_SERVICE_OPTS, NUM_NETWORK_SERVICE_OPTS)) { + case CASE_NETWORK_SERVICE_ADDRESS: + cfg_add_addr_bind(line.val_tok_1, &c->service); + break; + case CASE_NETWORK_SERVICE_PORT: + c->service.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_EXTERNAL_ADDRESS: + cfg_renamed_name_tok(&line, "access-address"); + // No break. + case CASE_NETWORK_SERVICE_ACCESS_ADDRESS: + cfg_add_addr_std(line.val_tok_1, &c->service); + break; + case CASE_NETWORK_SERVICE_ACCESS_PORT: + c->service.std_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_ADDRESS: + cfg_add_addr_alt(line.val_tok_1, &c->service); + break; + case CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_PORT: + c->service.alt_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_TLS_ACCESS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_std(line.val_tok_1, &c->tls_service); + break; + case CASE_NETWORK_SERVICE_TLS_ACCESS_PORT: + cfg_enterprise_only(&line); + c->tls_service.std_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_TLS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_bind(line.val_tok_1, &c->tls_service); + break; + case CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_alt(line.val_tok_1, &c->tls_service); + break; + case CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_PORT: + cfg_enterprise_only(&line); + c->tls_service.alt_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_TLS_AUTHENTICATE_CLIENT: + cfg_enterprise_only(&line); + add_tls_peer_name(line.val_tok_1, &c->tls_service); + break; + case CASE_NETWORK_SERVICE_TLS_NAME: + cfg_enterprise_only(&line); + c->tls_service.tls_our_name = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_SERVICE_TLS_PORT: + cfg_enterprise_only(&line); + c->tls_service.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_ALTERNATE_ADDRESS: + cfg_obsolete(&line, "see Aerospike documentation http://www.aerospike.com/docs/operations/upgrade/network_to_3_10"); + break; + case CASE_NETWORK_SERVICE_NETWORK_INTERFACE_NAME: + cfg_obsolete(&line, "see Aerospike documentation http://www.aerospike.com/docs/operations/upgrade/network_to_3_10"); + break; + case CASE_NETWORK_SERVICE_REUSE_ADDRESS: + cfg_deprecated_name_tok(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::heartbeat context items. 
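Note the two-level lookup used below for mode (and elsewhere for auto-pin, storage-engine, and the like): cfg_find_tok() runs once on the name token, then again on the value token. A hypothetical trace for a "mode mesh" line:

    // cfg_find_tok("mode", NETWORK_HEARTBEAT_OPTS, ...)      -> CASE_NETWORK_HEARTBEAT_MODE
    // cfg_find_tok("mesh", NETWORK_HEARTBEAT_MODE_OPTS, ...) -> CASE_NETWORK_HEARTBEAT_MODE_MESH
    // c->hb_config.mode = AS_HB_MODE_MESH;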
+ // + case NETWORK_HEARTBEAT: + switch (cfg_find_tok(line.name_tok, NETWORK_HEARTBEAT_OPTS, NUM_NETWORK_HEARTBEAT_OPTS)) { + case CASE_NETWORK_HEARTBEAT_MODE: + switch (cfg_find_tok(line.val_tok_1, NETWORK_HEARTBEAT_MODE_OPTS, NUM_NETWORK_HEARTBEAT_MODE_OPTS)) { + case CASE_NETWORK_HEARTBEAT_MODE_MULTICAST: + c->hb_config.mode = AS_HB_MODE_MULTICAST; + break; + case CASE_NETWORK_HEARTBEAT_MODE_MESH: + c->hb_config.mode = AS_HB_MODE_MESH; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NETWORK_HEARTBEAT_ADDRESS: + cfg_add_addr_bind(line.val_tok_1, &c->hb_serv_spec); + break; + case CASE_NETWORK_HEARTBEAT_MULTICAST_GROUP: + add_addr(line.val_tok_1, &c->hb_multicast_groups); + break; + case CASE_NETWORK_HEARTBEAT_PORT: + c->hb_serv_spec.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_HEARTBEAT_MESH_SEED_ADDRESS_PORT: + cfg_add_mesh_seed_addr_port(cfg_strdup_no_checks(&line), cfg_port_val2(&line), false); + break; + case CASE_NETWORK_HEARTBEAT_INTERVAL: + c->hb_config.tx_interval = cfg_u32(&line, AS_HB_TX_INTERVAL_MS_MIN, AS_HB_TX_INTERVAL_MS_MAX); + break; + case CASE_NETWORK_HEARTBEAT_TIMEOUT: + c->hb_config.max_intervals_missed = cfg_u32(&line, AS_HB_MAX_INTERVALS_MISSED_MIN, UINT_MAX); + break; + case CASE_NETWORK_HEARTBEAT_MTU: + c->hb_config.override_mtu = cfg_u32_no_checks(&line); + break; + case CASE_NETWORK_HEARTBEAT_MCAST_TTL: + cfg_renamed_name_tok(&line, "multicast-ttl"); + // No break. + case CASE_NETWORK_HEARTBEAT_MULTICAST_TTL: + c->hb_config.multicast_ttl = cfg_u8_no_checks(&line); + break; + case CASE_NETWORK_HEARTBEAT_PROTOCOL: + switch (cfg_find_tok(line.val_tok_1, NETWORK_HEARTBEAT_PROTOCOL_OPTS, NUM_NETWORK_HEARTBEAT_PROTOCOL_OPTS)) { + case CASE_NETWORK_HEARTBEAT_PROTOCOL_NONE: + c->hb_config.protocol = AS_HB_PROTOCOL_NONE; + break; + case CASE_NETWORK_HEARTBEAT_PROTOCOL_V3: + c->hb_config.protocol = AS_HB_PROTOCOL_V3; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NETWORK_HEARTBEAT_TLS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_bind(line.val_tok_1, &c->hb_tls_serv_spec); + break; + case CASE_NETWORK_HEARTBEAT_TLS_MESH_SEED_ADDRESS_PORT: + cfg_add_mesh_seed_addr_port(cfg_strdup_no_checks(&line), cfg_port_val2(&line), true); + break; + case CASE_NETWORK_HEARTBEAT_TLS_NAME: + cfg_enterprise_only(&line); + c->hb_tls_serv_spec.tls_our_name = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_HEARTBEAT_TLS_PORT: + cfg_enterprise_only(&line); + c->hb_tls_serv_spec.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_HEARTBEAT_INTERFACE_ADDRESS: + cfg_obsolete(&line, "see Aerospike documentation http://www.aerospike.com/docs/operations/upgrade/network_to_3_10"); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::fabric context items. 
+ // + case NETWORK_FABRIC: + switch (cfg_find_tok(line.name_tok, NETWORK_FABRIC_OPTS, NUM_NETWORK_FABRIC_OPTS)) { + case CASE_NETWORK_FABRIC_ADDRESS: + cfg_add_addr_bind(line.val_tok_1, &c->fabric); + break; + case CASE_NETWORK_FABRIC_PORT: + c->fabric.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_FABRIC_CHANNEL_BULK_FDS: + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_BULK] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_SOCKETS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_BULK_RECV_THREADS: + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_CTRL_FDS: + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_CTRL] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_SOCKETS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_CTRL_RECV_THREADS: + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_CTRL] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_META_FDS: + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_META] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_SOCKETS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_META_RECV_THREADS: + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_META] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_RW_FDS: + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_RW] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_SOCKETS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_RW_RECV_THREADS: + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_KEEPALIVE_ENABLED: + c->fabric_keepalive_enabled = cfg_bool(&line); + break; + case CASE_NETWORK_FABRIC_KEEPALIVE_INTVL: + c->fabric_keepalive_intvl = cfg_int_no_checks(&line); + break; + case CASE_NETWORK_FABRIC_KEEPALIVE_PROBES: + c->fabric_keepalive_probes = cfg_int_no_checks(&line); + break; + case CASE_NETWORK_FABRIC_KEEPALIVE_TIME: + c->fabric_keepalive_time = cfg_int_no_checks(&line); + break; + case CASE_NETWORK_FABRIC_LATENCY_MAX_MS: + c->fabric_latency_max_ms = cfg_int(&line, 0, 1000); + break; + case CASE_NETWORK_FABRIC_RECV_REARM_THRESHOLD: + c->fabric_recv_rearm_threshold = cfg_u32(&line, 0, 1024 * 1024); + break; + case CASE_NETWORK_FABRIC_SEND_THREADS: + c->n_fabric_send_threads = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_TLS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_bind(line.val_tok_1, &c->tls_fabric); + break; + case CASE_NETWORK_FABRIC_TLS_NAME: + cfg_enterprise_only(&line); + c->tls_fabric.tls_our_name = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_FABRIC_TLS_PORT: + cfg_enterprise_only(&line); + c->tls_fabric.bind_port = cfg_port(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::info context items. 
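The info sub-stanza parsed below is the smallest of the four - typically just a bind port for the plain-text info protocol (3003 by convention; the value here is illustrative):

    info {
        port 3003
    }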
+ // + case NETWORK_INFO: + switch (cfg_find_tok(line.name_tok, NETWORK_INFO_OPTS, NUM_NETWORK_INFO_OPTS)) { + case CASE_NETWORK_INFO_ADDRESS: + cfg_add_addr_bind(line.val_tok_1, &c->info); + break; + case CASE_NETWORK_INFO_PORT: + c->info.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_INFO_ENABLE_FASTPATH: + cfg_deprecated_name_tok(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::tls context items. + // + case NETWORK_TLS: + switch (cfg_find_tok(line.name_tok, NETWORK_TLS_OPTS, NUM_NETWORK_TLS_OPTS)) { + case CASE_NETWORK_TLS_CA_FILE: + cfg_enterprise_only(&line); + tls_spec->ca_file = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_CA_PATH: + cfg_enterprise_only(&line); + tls_spec->ca_path = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_CERT_BLACKLIST: + cfg_enterprise_only(&line); + tls_spec->cert_blacklist = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_CERT_FILE: + cfg_enterprise_only(&line); + tls_spec->cert_file = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_CIPHER_SUITE: + cfg_enterprise_only(&line); + tls_spec->cipher_suite = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_KEY_FILE: + cfg_enterprise_only(&line); + tls_spec->key_file = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_PROTOCOLS: + cfg_enterprise_only(&line); + tls_spec->protocols = cfg_strdup_no_checks(&line); + break; + case CASE_CONTEXT_END: + tls_spec = NULL; + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse namespace items. + // + case NAMESPACE: + switch (cfg_find_tok(line.name_tok, NAMESPACE_OPTS, NUM_NAMESPACE_OPTS)) { + case CASE_NAMESPACE_REPLICATION_FACTOR: + ns->cfg_replication_factor = cfg_u32(&line, 1, AS_CLUSTER_SZ); + break; + case CASE_NAMESPACE_LIMIT_SIZE: + cfg_renamed_name_tok(&line, "memory-size"); + // No break. + case CASE_NAMESPACE_MEMORY_SIZE: + ns->memory_size = cfg_u64_no_checks(&line); + break; + case CASE_NAMESPACE_DEFAULT_TTL: + ns->default_ttl = cfg_seconds_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_ENGINE_BEGIN: + switch (cfg_find_tok(line.val_tok_1, NAMESPACE_STORAGE_OPTS, NUM_NAMESPACE_STORAGE_OPTS)) { + case CASE_NAMESPACE_STORAGE_MEMORY: + ns->storage_type = AS_STORAGE_ENGINE_MEMORY; + ns->storage_data_in_memory = true; + break; + case CASE_NAMESPACE_STORAGE_SSD: + cfg_renamed_val_tok_1(&line, "device"); + // No break. 
+ case CASE_NAMESPACE_STORAGE_DEVICE: + ns->storage_type = AS_STORAGE_ENGINE_SSD; + ns->storage_data_in_memory = false; + cfg_begin_context(&state, NAMESPACE_STORAGE_DEVICE); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NAMESPACE_ENABLE_XDR: + cfg_enterprise_only(&line); + ns->enable_xdr = cfg_bool(&line); + break; + case CASE_NAMESPACE_SETS_ENABLE_XDR: + ns->sets_enable_xdr = cfg_bool(&line); + break; + case CASE_NAMESPACE_FORWARD_XDR_WRITES: + ns->ns_forward_xdr_writes = cfg_bool(&line); + break; + case CASE_NAMESPACE_XDR_REMOTE_DATACENTER: + xdr_cfg_add_datacenter(cfg_strdup(&line, true), ns->id); + break; + case CASE_NAMESPACE_ALLOW_NONXDR_WRITES: + ns->ns_allow_nonxdr_writes = cfg_bool(&line); + break; + case CASE_NAMESPACE_ALLOW_XDR_WRITES: + ns->ns_allow_xdr_writes = cfg_bool(&line); + break; + case CASE_NAMESPACE_COLD_START_EVICT_TTL: + ns->cold_start_evict_ttl = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_CONFLICT_RESOLUTION_POLICY: + switch (cfg_find_tok(line.val_tok_1, NAMESPACE_CONFLICT_RESOLUTION_OPTS, NUM_NAMESPACE_CONFLICT_RESOLUTION_OPTS)) { + case CASE_NAMESPACE_CONFLICT_RESOLUTION_GENERATION: + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION; + break; + case CASE_NAMESPACE_CONFLICT_RESOLUTION_LAST_UPDATE_TIME: + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NAMESPACE_DATA_IN_INDEX: + ns->data_in_index = cfg_bool(&line); + break; + case CASE_NAMESPACE_DISABLE_WRITE_DUP_RES: + ns->write_dup_res_disabled = cfg_bool(&line); + break; + case CASE_NAMESPACE_DISALLOW_NULL_SETNAME: + ns->disallow_null_setname = cfg_bool(&line); + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_BATCH_SUB: + ns->batch_sub_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_READ: + ns->read_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF: + ns->udf_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF_SUB: + ns->udf_sub_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_WRITE: + ns->write_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_HIST_PROXY: + ns->proxy_hist_enabled = cfg_bool(&line); + break; + case CASE_NAMESPACE_EVICT_HIST_BUCKETS: + ns->evict_hist_buckets = cfg_u32(&line, 100, 10000000); + break; + case CASE_NAMESPACE_EVICT_TENTHS_PCT: + ns->evict_tenths_pct = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_HIGH_WATER_DISK_PCT: + ns->hwm_disk_pct = cfg_u32(&line, 0, 100); + break; + case CASE_NAMESPACE_HIGH_WATER_MEMORY_PCT: + ns->hwm_memory_pct = cfg_u32(&line, 0, 100); + break; + case CASE_NAMESPACE_MAX_TTL: + ns->max_ttl = cfg_seconds(&line, 1, MAX_ALLOWED_TTL); + break; + case CASE_NAMESPACE_MIGRATE_ORDER: + ns->migrate_order = cfg_u32(&line, 1, 10); + break; + case CASE_NAMESPACE_MIGRATE_RETRANSMIT_MS: + ns->migrate_retransmit_ms = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_MIGRATE_SLEEP: + ns->migrate_sleep = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_OBJ_SIZE_HIST_MAX: + ns->obj_size_hist_max = cfg_obj_size_hist_max(cfg_u32_no_checks(&line)); + break; + case CASE_NAMESPACE_PARTITION_TREE_LOCKS: + ns->tree_shared.n_lock_pairs = cfg_u32_power_of_2(&line, 1, 256); + break; + case CASE_NAMESPACE_PARTITION_TREE_SPRIGS: + ns->tree_shared.n_sprigs = 
cfg_u32_power_of_2(&line, 16, 4096); + break; + case CASE_NAMESPACE_RACK_ID: + cfg_enterprise_only(&line); + ns->rack_id = cfg_u32(&line, 0, MAX_RACK_ID); + break; + case CASE_NAMESPACE_READ_CONSISTENCY_LEVEL_OVERRIDE: + switch (cfg_find_tok(line.val_tok_1, NAMESPACE_READ_CONSISTENCY_OPTS, NUM_NAMESPACE_READ_CONSISTENCY_OPTS)) { + case CASE_NAMESPACE_READ_CONSISTENCY_ALL: + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_ALL; + break; + case CASE_NAMESPACE_READ_CONSISTENCY_OFF: + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_PROTO; + break; + case CASE_NAMESPACE_READ_CONSISTENCY_ONE: + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_ONE; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NAMESPACE_SET_BEGIN: + p_set = cfg_add_set(ns); + cfg_strcpy(&line, p_set->name, AS_SET_NAME_MAX_SIZE); + cfg_begin_context(&state, NAMESPACE_SET); + break; + case CASE_NAMESPACE_SINDEX_BEGIN: + cfg_begin_context(&state, NAMESPACE_SINDEX); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_BEGIN: + cfg_begin_context(&state, NAMESPACE_GEO2DSPHERE_WITHIN); + break; + case CASE_NAMESPACE_SINGLE_BIN: + ns->single_bin = cfg_bool(&line); + break; + case CASE_NAMESPACE_STOP_WRITES_PCT: + ns->stop_writes_pct = cfg_u32(&line, 0, 100); + break; + case CASE_NAMESPACE_STRONG_CONSISTENCY: + cfg_enterprise_only(&line); + ns->cp = cfg_bool(&line); + break; + case CASE_NAMESPACE_STRONG_CONSISTENCY_ALLOW_EXPUNGE: + cfg_enterprise_only(&line); + ns->cp_allow_drops = cfg_bool(&line); + break; + case CASE_NAMESPACE_TOMB_RAIDER_ELIGIBLE_AGE: + cfg_enterprise_only(&line); + ns->tomb_raider_eligible_age = cfg_seconds_no_checks(&line); + break; + case CASE_NAMESPACE_TOMB_RAIDER_PERIOD: + cfg_enterprise_only(&line); + ns->tomb_raider_period = cfg_seconds_no_checks(&line); + break; + case CASE_NAMESPACE_WRITE_COMMIT_LEVEL_OVERRIDE: + switch (cfg_find_tok(line.val_tok_1, NAMESPACE_WRITE_COMMIT_OPTS, NUM_NAMESPACE_WRITE_COMMIT_OPTS)) { + case CASE_NAMESPACE_WRITE_COMMIT_ALL: + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_ALL; + break; + case CASE_NAMESPACE_WRITE_COMMIT_MASTER: + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_MASTER; + break; + case CASE_NAMESPACE_WRITE_COMMIT_OFF: + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_PROTO; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NAMESPACE_ALLOW_VERSIONS: + case CASE_NAMESPACE_DEMO_READ_MULTIPLIER: + case CASE_NAMESPACE_DEMO_WRITE_MULTIPLIER: + case CASE_NAMESPACE_HIGH_WATER_PCT: + case CASE_NAMESPACE_LOW_WATER_PCT: + cfg_deprecated_name_tok(&line); + break; + case CASE_NAMESPACE_SI_BEGIN: + cfg_deprecated_name_tok(&line); + // Entire section is deprecated but needs to begin and end the + // context to avoid crash. + cfg_begin_context(&state, NAMESPACE_SI); + break; + case CASE_CONTEXT_END: + if (ns->data_in_index && ! 
(ns->single_bin && ns->storage_data_in_memory && ns->storage_type == AS_STORAGE_ENGINE_SSD)) { + cf_crash_nostack(AS_CFG, "ns %s data-in-index can't be true unless storage-engine is device and both single-bin and data-in-memory are true", ns->name); + } + if (ns->default_ttl > ns->max_ttl) { + cf_crash_nostack(AS_CFG, "ns %s default-ttl can't be > max-ttl", ns->name); + } + if (ns->tree_shared.n_lock_pairs > ns->tree_shared.n_sprigs) { + cf_crash_nostack(AS_CFG, "ns %s partition-tree-locks can't be > partition-tree-sprigs", ns->name); + } + if (ns->storage_data_in_memory) { + ns->storage_post_write_queue = 0; // override default (or configuration mistake) + } + if (ns->storage_data_in_memory && + ! ns->storage_commit_to_device) { + c->n_namespaces_inlined++; + } + else { + c->n_namespaces_not_inlined++; + } + ns = NULL; + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse namespace::storage-engine device context items. + // + case NAMESPACE_STORAGE_DEVICE: + switch (cfg_find_tok(line.name_tok, NAMESPACE_STORAGE_DEVICE_OPTS, NUM_NAMESPACE_STORAGE_DEVICE_OPTS)) { + case CASE_NAMESPACE_STORAGE_DEVICE_DEVICE: + cfg_add_storage_device(ns, cfg_strdup(&line, true), cfg_strdup_val2(&line, false)); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_FILE: + cfg_add_storage_file(ns, cfg_strdup(&line, true)); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_FILESIZE: + ns->storage_filesize = cfg_u64(&line, 1024 * 1024, AS_STORAGE_MAX_DEVICE_SIZE); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_SCHEDULER_MODE: + ns->storage_scheduler_mode = cfg_strdup_one_of(&line, DEVICE_SCHEDULER_MODES, NUM_DEVICE_SCHEDULER_MODES); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_WRITE_BLOCK_SIZE: + ns->storage_write_block_size = cfg_u32_power_of_2(&line, MIN_WRITE_BLOCK_SIZE, MAX_WRITE_BLOCK_SIZE); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_MEMORY_ALL: + cfg_renamed_name_tok(&line, "data-in-memory"); + // No break. 
+ case CASE_NAMESPACE_STORAGE_DEVICE_DATA_IN_MEMORY: + ns->storage_data_in_memory = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_COLD_START_EMPTY: + ns->storage_cold_start_empty = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_TO_DEVICE: + cfg_enterprise_only(&line); + ns->storage_commit_to_device = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_MIN_SIZE: + cfg_enterprise_only(&line); + ns->storage_commit_min_size = cfg_u32_power_of_2(&line, 0, MAX_WRITE_BLOCK_SIZE); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_LWM_PCT: + ns->storage_defrag_lwm_pct = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_QUEUE_MIN: + ns->storage_defrag_queue_min = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_SLEEP: + ns->storage_defrag_sleep = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_STARTUP_MINIMUM: + ns->storage_defrag_startup_minimum = cfg_int(&line, 1, 99); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DISABLE_ODIRECT: + ns->storage_disable_odirect = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_BENCHMARKS_STORAGE: + ns->storage_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_OSYNC: + ns->storage_enable_osync = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_ENCRYPTION_KEY_FILE: + cfg_enterprise_only(&line); + ns->storage_encryption_key_file = cfg_strdup(&line, true); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_FLUSH_MAX_MS: + ns->storage_flush_max_us = cfg_u64_no_checks(&line) * 1000; + break; + case CASE_NAMESPACE_STORAGE_DEVICE_FSYNC_MAX_SEC: + ns->storage_fsync_max_us = cfg_u64_no_checks(&line) * 1000000; + break; + case CASE_NAMESPACE_STORAGE_DEVICE_MAX_WRITE_CACHE: + ns->storage_max_write_cache = cfg_u64_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_MIN_AVAIL_PCT: + ns->storage_min_avail_pct = cfg_u32(&line, 0, 100); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_POST_WRITE_QUEUE: + ns->storage_post_write_queue = cfg_u32(&line, 0, 4 * 1024); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_TOMB_RAIDER_SLEEP: + cfg_enterprise_only(&line); + ns->storage_tomb_raider_sleep = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_WRITE_THREADS: + ns->storage_write_threads = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_MAX_BLOCKS: + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_PERIOD: + case CASE_NAMESPACE_STORAGE_DEVICE_LOAD_AT_STARTUP: + case CASE_NAMESPACE_STORAGE_DEVICE_PERSIST: + case CASE_NAMESPACE_STORAGE_DEVICE_READONLY: + case CASE_NAMESPACE_STORAGE_DEVICE_SIGNATURE: + case CASE_NAMESPACE_STORAGE_DEVICE_WRITE_SMOOTHING_PERIOD: + cfg_deprecated_name_tok(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse namespace::set context items. 
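A sketch of the per-set override block parsed below - the set name itself was already copied into p_set->name by cfg_strcpy() in the namespace case above (set name and values are illustrative):

    namespace test {
        set demo {
            set-disable-eviction true
            set-stop-writes-count 100000
        }
    }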
+	//
+	case NAMESPACE_SET:
+		switch (cfg_find_tok(line.name_tok, NAMESPACE_SET_OPTS, NUM_NAMESPACE_SET_OPTS)) {
+		case CASE_NAMESPACE_SET_DISABLE_EVICTION:
+			DISABLE_SET_EVICTION(p_set, cfg_bool(&line));
+			break;
+		case CASE_NAMESPACE_SET_ENABLE_XDR:
+			switch (cfg_find_tok(line.val_tok_1, NAMESPACE_SET_ENABLE_XDR_OPTS, NUM_NAMESPACE_SET_ENABLE_XDR_OPTS)) {
+			case CASE_NAMESPACE_SET_ENABLE_XDR_USE_DEFAULT:
+				p_set->enable_xdr = AS_SET_ENABLE_XDR_DEFAULT;
+				break;
+			case CASE_NAMESPACE_SET_ENABLE_XDR_FALSE:
+				p_set->enable_xdr = AS_SET_ENABLE_XDR_FALSE;
+				break;
+			case CASE_NAMESPACE_SET_ENABLE_XDR_TRUE:
+				p_set->enable_xdr = AS_SET_ENABLE_XDR_TRUE;
+				break;
+			case CASE_NOT_FOUND:
+			default:
+				cfg_unknown_val_tok_1(&line);
+				break;
+			}
+			break;
+		case CASE_NAMESPACE_SET_STOP_WRITES_COUNT:
+			p_set->stop_writes_count = cfg_u64_no_checks(&line);
+			break;
+		case CASE_NAMESPACE_SET_EVICT_HWM_COUNT:
+		case CASE_NAMESPACE_SET_EVICT_HWM_PCT:
+		case CASE_NAMESPACE_SET_STOP_WRITE_COUNT:
+		case CASE_NAMESPACE_SET_STOP_WRITE_PCT:
+			cfg_deprecated_name_tok(&line);
+			break;
+		case CASE_CONTEXT_END:
+			cfg_end_context(&state);
+			break;
+		case CASE_NOT_FOUND:
+		default:
+			cfg_unknown_name_tok(&line);
+			break;
+		}
+		break;
+
+	//----------------------------------------
+	// Parse namespace::si context items.
+	//
+	case NAMESPACE_SI:
+		// All si context items are deprecated - handle them identically.
+		switch (cfg_find_tok(line.name_tok, NAMESPACE_SI_OPTS, NUM_NAMESPACE_SI_OPTS)) {
+		case CASE_NAMESPACE_SI_GC_PERIOD:
+		case CASE_NAMESPACE_SI_GC_MAX_UNITS:
+		case CASE_NAMESPACE_SI_HISTOGRAM:
+		case CASE_NAMESPACE_SI_IGNORE_NOT_SYNC:
+			cfg_deprecated_name_tok(&line);
+			break;
+		case CASE_CONTEXT_END:
+			cfg_end_context(&state);
+			break;
+		case CASE_NOT_FOUND:
+		default:
+			cfg_unknown_name_tok(&line);
+			break;
+		}
+		break;
+
+	//----------------------------------------
+	// Parse namespace::sindex context items.
+	//
+	case NAMESPACE_SINDEX:
+		switch (cfg_find_tok(line.name_tok, NAMESPACE_SINDEX_OPTS, NUM_NAMESPACE_SINDEX_OPTS)) {
+		case CASE_NAMESPACE_SINDEX_NUM_PARTITIONS:
+			// FIXME - minimum should be 1, but currently crashes.
+			ns->sindex_num_partitions = cfg_u32(&line, MIN_PARTITIONS_PER_INDEX, MAX_PARTITIONS_PER_INDEX);
+			break;
+		case CASE_CONTEXT_END:
+			cfg_end_context(&state);
+			break;
+		case CASE_NOT_FOUND:
+		default:
+			cfg_unknown_name_tok(&line);
+			break;
+		}
+		break;
+
+	//----------------------------------------
+	// Parse namespace::2dsphere-within context items.
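+	//
+	// Illustrative example (not part of this patch, assuming the usual
+	// hyphenated token names) of a sub-context the cases below accept:
+	//
+	//	geo2dsphere-within {
+	//		max-level 20
+	//		max-cells 12
+	//	}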
+ // + case NAMESPACE_GEO2DSPHERE_WITHIN: + switch (cfg_find_tok(line.name_tok, NAMESPACE_GEO2DSPHERE_WITHIN_OPTS, NUM_NAMESPACE_GEO2DSPHERE_WITHIN_OPTS)) { + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_STRICT: + ns->geo2dsphere_within_strict = cfg_bool(&line); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MIN_LEVEL: + ns->geo2dsphere_within_min_level = cfg_u16(&line, 0, 30); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_LEVEL: + ns->geo2dsphere_within_max_level = cfg_u16(&line, 0, 30); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_CELLS: + ns->geo2dsphere_within_max_cells = cfg_u16(&line, 1, MAX_REGION_CELLS); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_LEVEL_MOD: + ns->geo2dsphere_within_level_mod = cfg_u16(&line, 1, 3); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_EARTH_RADIUS_METERS: + ns->geo2dsphere_within_earth_radius_meters = cfg_u32_no_checks(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse mod-lua context items. + // + case MOD_LUA: + switch (cfg_find_tok(line.name_tok, MOD_LUA_OPTS, NUM_MOD_LUA_OPTS)) { + case CASE_MOD_LUA_CACHE_ENABLED: + c->mod_lua.cache_enabled = cfg_bool(&line); + break; + case CASE_MOD_LUA_SYSTEM_PATH: + cfg_strcpy(&line, c->mod_lua.system_path, sizeof(c->mod_lua.system_path)); + break; + case CASE_MOD_LUA_USER_PATH: + cfg_strcpy(&line, c->mod_lua.user_path, sizeof(c->mod_lua.user_path)); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse security context items. + // + case SECURITY: + switch (cfg_find_tok(line.name_tok, SECURITY_OPTS, NUM_SECURITY_OPTS)) { + case CASE_SECURITY_ENABLE_SECURITY: + c->sec_cfg.security_enabled = cfg_bool(&line); + break; + case CASE_SECURITY_PRIVILEGE_REFRESH_PERIOD: + c->sec_cfg.privilege_refresh_period = cfg_u32(&line, 10, 60 * 60 * 24); + break; + case CASE_SECURITY_LOG_BEGIN: + cfg_begin_context(&state, SECURITY_LOG); + break; + case CASE_SECURITY_SYSLOG_BEGIN: + cfg_begin_context(&state, SECURITY_SYSLOG); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse security::log context items. + // + case SECURITY_LOG: + switch (cfg_find_tok(line.name_tok, SECURITY_LOG_OPTS, NUM_SECURITY_LOG_OPTS)) { + case CASE_SECURITY_LOG_REPORT_AUTHENTICATION: + c->sec_cfg.report.authentication |= cfg_bool(&line) ? AS_SEC_SINK_LOG : 0; + break; + case CASE_SECURITY_LOG_REPORT_DATA_OP: + as_security_config_log_scope(AS_SEC_SINK_LOG, line.val_tok_1, line.val_tok_2); + break; + case CASE_SECURITY_LOG_REPORT_SYS_ADMIN: + c->sec_cfg.report.sys_admin |= cfg_bool(&line) ? AS_SEC_SINK_LOG : 0; + break; + case CASE_SECURITY_LOG_REPORT_USER_ADMIN: + c->sec_cfg.report.user_admin |= cfg_bool(&line) ? AS_SEC_SINK_LOG : 0; + break; + case CASE_SECURITY_LOG_REPORT_VIOLATION: + c->sec_cfg.report.violation |= cfg_bool(&line) ? AS_SEC_SINK_LOG : 0; + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse security::syslog context items. 
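+	//
+	// Illustrative example (not part of this patch): a sub-context such as
+	//
+	//	syslog {
+	//		local 0
+	//		report-authentication true
+	//	}
+	//
+	// would OR AS_SEC_SINK_SYSLOG into the corresponding report masks below.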
+ // + case SECURITY_SYSLOG: + switch (cfg_find_tok(line.name_tok, SECURITY_SYSLOG_OPTS, NUM_SECURITY_SYSLOG_OPTS)) { + case CASE_SECURITY_SYSLOG_LOCAL: + c->sec_cfg.syslog_local = (as_sec_syslog_local)cfg_int(&line, AS_SYSLOG_MIN, AS_SYSLOG_MAX); + break; + case CASE_SECURITY_SYSLOG_REPORT_AUTHENTICATION: + c->sec_cfg.report.authentication |= cfg_bool(&line) ? AS_SEC_SINK_SYSLOG : 0; + break; + case CASE_SECURITY_SYSLOG_REPORT_DATA_OP: + as_security_config_log_scope(AS_SEC_SINK_SYSLOG, line.val_tok_1, line.val_tok_2); + break; + case CASE_SECURITY_SYSLOG_REPORT_SYS_ADMIN: + c->sec_cfg.report.sys_admin |= cfg_bool(&line) ? AS_SEC_SINK_SYSLOG : 0; + break; + case CASE_SECURITY_SYSLOG_REPORT_USER_ADMIN: + c->sec_cfg.report.user_admin |= cfg_bool(&line) ? AS_SEC_SINK_SYSLOG : 0; + break; + case CASE_SECURITY_SYSLOG_REPORT_VIOLATION: + c->sec_cfg.report.violation |= cfg_bool(&line) ? AS_SEC_SINK_SYSLOG : 0; + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse xdr context items. + // + case XDR: + switch (cfg_find_tok(line.name_tok, XDR_OPTS, NUM_XDR_OPTS)) { + case CASE_CONTEXT_BEGIN: + // Allow open brace on its own line to begin this context. + break; + case CASE_XDR_ENABLE_XDR: + g_xcfg.xdr_global_enabled = cfg_bool(&line); + break; + case CASE_XDR_DIGESTLOG_PATH: + g_xcfg.xdr_digestlog_path = cfg_strdup(&line, true); + g_xcfg.xdr_digestlog_file_size = cfg_u64_val2_no_checks(&line); + break; + case CASE_XDR_DATACENTER_BEGIN: + if (g_dc_count == DC_MAX_NUM) { + cf_crash_nostack(AS_CFG, "Cannot have more than %d datacenters", DC_MAX_NUM); + } + + cur_dc_cfg = &g_dc_xcfg_opt[g_dc_count]; + cur_dc_cfg->dc_name = cfg_strdup(&line, true); + cur_dc_cfg->dc_id = g_dc_count; + cf_vector_pointer_init(&cur_dc_cfg->dc_node_v, 10, 0); + cf_vector_pointer_init(&cur_dc_cfg->dc_addr_map_v, 10, 0); + cfg_begin_context(&state, XDR_DATACENTER); + break; + case CASE_XDR_CLIENT_THREADS: + g_xcfg.xdr_client_threads = cfg_u32_no_checks(&line); + break; + case CASE_XDR_COMPRESSION_THRESHOLD: + g_xcfg.xdr_compression_threshold = cfg_u32_no_checks(&line); + break; + case CASE_XDR_DELETE_SHIPPING_ENABLED: + g_xcfg.xdr_delete_shipping_enabled = cfg_bool(&line); + break; + case CASE_XDR_DIGESTLOG_IOWAIT_MS: + g_xcfg.xdr_digestlog_iowait_ms = cfg_u32_no_checks(&line); + break; + case CASE_XDR_FORWARD_XDR_WRITES: + g_xcfg.xdr_forward_xdrwrites = cfg_bool(&line); + break; + case CASE_XDR_HOTKEY_TIME_MS: + g_xcfg.xdr_hotkey_time_ms = cfg_u32_no_checks(&line); + break; + case CASE_XDR_INFO_PORT: + g_xcfg.xdr_info_port = cfg_port(&line); + break; + case CASE_XDR_INFO_TIMEOUT: + g_xcfg.xdr_info_request_timeout_ms = cfg_u32_no_checks(&line); + break; + case CASE_XDR_MAX_SHIP_BANDWIDTH: + g_xcfg.xdr_max_ship_bandwidth = cfg_u32_no_checks(&line); + break; + case CASE_XDR_MAX_SHIP_THROUGHPUT: + g_xcfg.xdr_max_ship_throughput = cfg_u32_no_checks(&line); + break; + case CASE_XDR_MIN_DIGESTLOG_FREE_PCT: + g_xcfg.xdr_min_dlog_free_pct = cfg_u32(&line, 0, 100); + break; + case CASE_XDR_NSUP_DELETES_ENABLED: + g_xcfg.xdr_nsup_deletes_enabled = cfg_bool(&line); + break; + case CASE_XDR_READ_THREADS: + g_xcfg.xdr_read_threads = cfg_u32_no_checks(&line); + break; + case CASE_XDR_SHIP_BINS: + g_xcfg.xdr_ship_bins = cfg_bool(&line); + break; + case CASE_XDR_SHIP_DELAY: + g_xcfg.xdr_internal_shipping_delay = cfg_u32_no_checks(&line); + break; + case 
CASE_XDR_SHIPPING_ENABLED: + g_xcfg.xdr_shipping_enabled = cfg_bool(&line); + break; + case CASE_XDR_WRITE_TIMEOUT: + g_xcfg.xdr_write_timeout = cfg_u32_no_checks(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse xdr::datacenter context items. + // + case XDR_DATACENTER: + switch (cfg_find_tok(line.name_tok, XDR_DATACENTER_OPTS, NUM_XDR_DATACENTER_OPTS)) { + case CASE_CONTEXT_BEGIN: + // Allow open brace on its own line to begin this context. + break; + case CASE_XDR_DATACENTER_DC_NODE_ADDRESS_PORT: + xdr_cfg_add_node_addr_port(cur_dc_cfg, cfg_strdup(&line, true), cfg_port_val2(&line)); + break; + case CASE_XDR_DATACENTER_DC_CONNECTIONS: + cur_dc_cfg->dc_connections = cfg_u32_no_checks(&line); + break; + case CASE_XDR_DATACENTER_DC_CONNECTIONS_IDLE_MS: + cur_dc_cfg->dc_connections_idle_ms = cfg_u32_no_checks(&line); + break; + case CASE_XDR_DATACENTER_DC_INT_EXT_IPMAP: + xdr_cfg_add_int_ext_mapping(cur_dc_cfg, cfg_strdup(&line, true), cfg_strdup_val2(&line, true)); + break; + case CASE_XDR_DATACENTER_DC_SECURITY_CONFIG_FILE: + cur_dc_cfg->dc_security_cfg.sec_config_file = cfg_strdup(&line, true); + break; + case CASE_XDR_DATACENTER_DC_USE_ALTERNATE_SERVICES: + cur_dc_cfg->dc_use_alternate_services = cfg_bool(&line); + break; + case CASE_XDR_DATACENTER_TLS_NAME: + cur_dc_cfg->tls_our_name = cfg_strdup_no_checks(&line); + break; + case CASE_XDR_DATACENTER_TLS_NODE: + xdr_cfg_add_tls_node(cur_dc_cfg, cfg_strdup(&line, true), cfg_strdup_val2(&line, true), cfg_port_val3(&line)); + break; + case CASE_CONTEXT_END: + g_dc_count++; + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parser state is corrupt. + // + default: + cf_crash_nostack(AS_CFG, "line %d :: invalid parser top-level state %d", line_num, state.current); + break; + } + } + + fclose(FD); + + //-------------------------------------------- + // Checks that must wait until everything is parsed. Alternatively, such + // checks can be done in as_config_post_process() - doing them here means + // failure logs show in the console, doing them in as_config_post_process() + // means failure logs show in the log file. + // + + as_security_config_check(); + + return &g_config; +} + + +//========================================================== +// Public API - configuration-related tasks after parsing. +// + +void +as_config_post_process(as_config* c, const char* config_file) +{ + //-------------------------------------------- + // Re-read the configuration file and print it to the logs, line by line. + // This will be the first thing to appear in the log file(s). + // + + FILE* FD; + + if (NULL == (FD = fopen(config_file, "r"))) { + cf_crash_nostack(AS_CFG, "couldn't re-open configuration file %s: %s", config_file, cf_strerror(errno)); + } + + char iobuf[256]; + + while (fgets(iobuf, sizeof(iobuf), FD)) { + char* p = iobuf; + char* p_last = p + (strlen(p) - 1); + + if ('\n' == *p_last) { + *p_last-- = '\0'; + } + + if (p_last >= p && '\r' == *p_last) { + *p_last = '\0'; + } + + cf_info(AS_CFG, "%s", p); + } + + fclose(FD); + + // + // Done echoing configuration file to log. + //-------------------------------------------- + + // Configuration checks and special defaults that differ between CE and EE. 
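+	// (The community-edition implementation of cfg_post_process() is added by
+	// this patch in cfg_ce.c, below.)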
+	cfg_post_process();
+
+	cf_alloc_set_debug(c->debug_allocations);
+
+	// Check the configured file descriptor limit against the system limit.
+	struct rlimit fd_limit;
+
+	getrlimit(RLIMIT_NOFILE, &fd_limit);
+
+	if (c->n_proto_fd_max < 0 || (rlim_t)c->n_proto_fd_max > fd_limit.rlim_cur) {
+		cf_crash_nostack(AS_CFG, "%lu system file descriptors not enough, config specified %d", fd_limit.rlim_cur, c->n_proto_fd_max);
+	}
+
+	cf_info(AS_CFG, "system file descriptor limit: %lu, proto-fd-max: %d", fd_limit.rlim_cur, c->n_proto_fd_max);
+
+	// Output NUMA topology information.
+	cf_topo_info();
+
+	if (c->auto_pin != CF_TOPO_AUTO_PIN_NONE) {
+		if (c->n_service_threads != 0) {
+			cf_crash_nostack(AS_CFG, "can't configure 'service-threads' and 'auto-pin' at the same time");
+		}
+
+		if (c->n_transaction_queues != 0) {
+			cf_crash_nostack(AS_CFG, "can't configure 'transaction-queues' and 'auto-pin' at the same time");
+		}
+	}
+
+	uint16_t n_cpus = cf_topo_count_cpus();
+
+	if (c->n_service_threads == 0) {
+		c->n_service_threads = n_cpus;
+	}
+
+	if (c->n_transaction_queues == 0) {
+		// If there's at least one SSD namespace, use CPU count. Otherwise, be
+		// modest - only proxies, internal retries, and background scans &
+		// queries will use these queues & threads.
+		c->n_transaction_queues = g_config.n_namespaces_not_inlined != 0 ? n_cpus : 4;
+	}
+
+	// Allocate and initialize the record locks (olocks). Maybe not the best
+	// place for this, unless we make the number of locks configurable.
+	g_record_locks = olock_create(16 * 1024, true);
+
+	// Set up performance metrics histograms.
+	cfg_create_all_histograms();
+
+	// If node-id was not configured, generate one.
+	if (c->self_node == 0) {
+		cf_ip_port id_port = c->fabric.bind_port != 0 ? c->fabric.bind_port : c->tls_fabric.bind_port;
+
+		if (cf_node_id_get(id_port, c->node_id_interface, &c->self_node) < 0) {
+			cf_crash_nostack(AS_CFG, "could not get node id");
+		}
+	}
+	else if (c->node_id_interface) {
+		cf_crash_nostack(AS_CFG, "may not configure both 'node-id' and 'node-id-interface'");
+	}
+
+	cf_info(AS_CFG, "node-id %lx", c->self_node);
+
+	// Resolve TLS names in all TLS configurations.
+
+	for (uint32_t i = 0; i < g_config.n_tls_specs; ++i) {
+		if (g_config.tls_specs[i].name == NULL) {
+			cf_crash_nostack(AS_CFG, "nameless TLS configuration section");
+		}
+
+		g_config.tls_specs[i].name =
+				cfg_resolve_tls_name(g_config.tls_specs[i].name, g_config.cluster_name, NULL);
+	}
+
+	// Populate access ports from configuration.
+
+	g_access.service.port = g_config.service.std_port != 0 ?
+			g_config.service.std_port : g_config.service.bind_port;
+
+	g_access.alt_service.port = g_config.service.alt_port != 0 ?
+			g_config.service.alt_port : g_access.service.port;
+
+	g_access.tls_service.port = g_config.tls_service.std_port != 0 ?
+			g_config.tls_service.std_port : g_config.tls_service.bind_port;
+
+	g_access.alt_tls_service.port = g_config.tls_service.alt_port != 0 ?
+			g_config.tls_service.alt_port : g_access.tls_service.port;
+
+	// Populate access addresses from configuration.
+
+	cfg_serv_spec_std_to_access(&g_config.service, &g_access.service.addrs);
+	cfg_serv_spec_alt_to_access(&g_config.service, &g_access.alt_service.addrs);
+	cfg_serv_spec_std_to_access(&g_config.tls_service, &g_access.tls_service.addrs);
+	cfg_serv_spec_alt_to_access(&g_config.tls_service, &g_access.alt_tls_service.addrs);
+
+	// By default, use bind addresses also as access addresses.
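+	// (Example, not from this patch: if the service context binds only
+	// 192.168.10.5 and no access-address is configured, bind_to_access()
+	// publishes 192.168.10.5 as the access address - wildcard and loopback
+	// bind addresses are skipped, as bind_to_access(), defined later in this
+	// file, shows.)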
+ + if (g_access.service.addrs.n_addrs == 0) { + bind_to_access(&g_config.service, &g_access.service.addrs); + } + + if (g_access.tls_service.addrs.n_addrs == 0) { + bind_to_access(&g_config.tls_service, &g_access.tls_service.addrs); + } + + // By default, use non-TLS access addresses also for TLS - and vice versa. + + default_addrs(&g_access.service.addrs, &g_access.tls_service.addrs); + default_addrs(&g_access.alt_service.addrs, &g_access.alt_tls_service.addrs); + + cf_serv_cfg_init(&g_service_bind); + + // Client service bind addresses. + + if (g_config.service.bind_port != 0) { + cfg_serv_spec_to_bind(&g_config.service, &g_config.tls_service, &g_service_bind, + CF_SOCK_OWNER_SERVICE); + } + + // Client TLS service bind addresses. + + if (g_config.tls_service.bind_port != 0) { + cfg_serv_spec_to_bind(&g_config.tls_service, &g_config.service, &g_service_bind, + CF_SOCK_OWNER_SERVICE_TLS); + + cf_tls_spec* tls_spec = cfg_link_tls("service", &g_config.tls_service.tls_our_name); + + uint32_t n_peer_names = g_config.tls_service.n_tls_peer_names; + char **peer_names = g_config.tls_service.tls_peer_names; + + bool has_any = false; + bool has_false = false; + + for (uint32_t i = 0; i < n_peer_names; ++i) { + has_any = has_any || strcmp(peer_names[i], "any") == 0; + has_false = has_false || strcmp(peer_names[i], "false") == 0; + } + + if ((has_any || has_false) && n_peer_names > 1) { + cf_crash_nostack(AS_CFG, "\"any\" and \"false\" are incompatible with other tls-authenticate-client arguments"); + } + + bool auth_client; + + if (has_any || n_peer_names == 0) { + auth_client = true; + n_peer_names = 0; + peer_names = NULL; + } + else if (has_false) { + auth_client = false; + n_peer_names = 0; + peer_names = NULL; + } + else { + auth_client = true; + } + + g_service_tls = tls_config_server_context(tls_spec, auth_client, n_peer_names, peer_names); + } + + if (g_service_bind.n_cfgs == 0) { + cf_crash_nostack(AS_CFG, "no service ports configured"); + } + + // Heartbeat service bind addresses. + + cf_serv_cfg_init(&g_config.hb_config.bind_cfg); + + if (c->hb_serv_spec.bind_port != 0) { + cfg_serv_spec_to_bind(&c->hb_serv_spec, &c->hb_tls_serv_spec, &c->hb_config.bind_cfg, + CF_SOCK_OWNER_HEARTBEAT); + } + + // Heartbeat TLS service bind addresses. + + if (c->hb_tls_serv_spec.bind_port != 0) { + if (c->hb_config.mode != AS_HB_MODE_MESH) { + cf_crash_nostack(AS_CFG, "multicast heartbeats do not support TLS"); + } + + cfg_serv_spec_to_bind(&c->hb_tls_serv_spec, &c->hb_serv_spec, &c->hb_config.bind_cfg, + CF_SOCK_OWNER_HEARTBEAT_TLS); + + cf_tls_spec* tls_spec = cfg_link_tls("heartbeat", &c->hb_tls_serv_spec.tls_our_name); + c->hb_config.tls = tls_config_intra_context(tls_spec, "heartbeat"); + } + + if (g_config.hb_config.bind_cfg.n_cfgs == 0) { + cf_crash_nostack(AS_CFG, "no heartbeat ports configured"); + } + + // Heartbeat multicast groups. + + if (c->hb_multicast_groups.n_addrs > 0) { + cfg_mserv_config_from_addrs(&c->hb_multicast_groups, &c->hb_serv_spec.bind, + &g_config.hb_config.multicast_group_cfg, c->hb_serv_spec.bind_port, + CF_SOCK_OWNER_HEARTBEAT, g_config.hb_config.multicast_ttl); + } + + // Fabric service bind addresses. + + cf_serv_cfg_init(&g_fabric_bind); + + if (g_config.fabric.bind_port != 0) { + cfg_serv_spec_to_bind(&g_config.fabric, &g_config.tls_fabric, &g_fabric_bind, + CF_SOCK_OWNER_FABRIC); + } + + // Fabric TLS service bind addresses. 
+ + if (g_config.tls_fabric.bind_port != 0) { + cfg_serv_spec_to_bind(&g_config.tls_fabric, &g_config.fabric, &g_fabric_bind, + CF_SOCK_OWNER_FABRIC_TLS); + + cf_tls_spec* tls_spec = cfg_link_tls("fabric", &g_config.tls_fabric.tls_our_name); + g_fabric_tls = tls_config_intra_context(tls_spec, "fabric"); + } + + if (g_fabric_bind.n_cfgs == 0) { + cf_crash_nostack(AS_CFG, "no fabric ports configured"); + } + + // Info service port. + + g_info_port = g_config.info.bind_port; + + // Info service bind addresses. + + cf_serv_cfg_init(&g_info_bind); + cfg_serv_spec_to_bind(&g_config.info, NULL, &g_info_bind, CF_SOCK_OWNER_INFO); + + // Validate heartbeat configuration. + as_hb_config_validate(); + + //-------------------------------------------- + // Per-namespace config post-processing. + // + + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + client_replica_maps_create(ns); + + ns->tree_shared.destructor = (as_index_value_destructor)&as_record_destroy; + ns->tree_shared.destructor_udata = (void*)ns; + ns->tree_shared.locks_shift = 12 - cf_msb(ns->tree_shared.n_lock_pairs); + ns->tree_shared.sprigs_shift = 12 - cf_msb(ns->tree_shared.n_sprigs); + ns->tree_shared.sprigs_offset = sizeof(as_lock_pair) * ns->tree_shared.n_lock_pairs; + + ssd_init_encryption_key(ns); + + char hist_name[HISTOGRAM_NAME_SIZE]; + + // One-way activated histograms (may be tracked histograms). + + sprintf(hist_name, "{%s}-read", ns->name); + create_and_check_hist_track(&ns->read_hist, hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-write", ns->name); + create_and_check_hist_track(&ns->write_hist, hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-udf", ns->name); + create_and_check_hist_track(&ns->udf_hist, hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-query", ns->name); + create_and_check_hist_track(&ns->query_hist, hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-query-rec-count", ns->name); + ns->query_rec_count_hist = histogram_create(hist_name, HIST_COUNT); + + sprintf(hist_name, "{%s}-re-repl", ns->name); + ns->re_repl_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + // Activate-by-config histograms (can't be tracked histograms). 
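+		// (The histograms above go through create_and_check_hist_track(),
+		// defined later in this file, so they can participate in histogram
+		// tracking; the ones below are plain histogram_create() instances.)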
+ + sprintf(hist_name, "{%s}-proxy", ns->name); + ns->proxy_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-read-start", ns->name); + ns->read_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-restart", ns->name); + ns->read_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-dup-res", ns->name); + ns->read_dup_res_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-repl-ping", ns->name); + ns->read_repl_ping_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-local", ns->name); + ns->read_local_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-response", ns->name); + ns->read_response_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-write-start", ns->name); + ns->write_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-restart", ns->name); + ns->write_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-dup-res", ns->name); + ns->write_dup_res_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-master", ns->name); + ns->write_master_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-repl-write", ns->name); + ns->write_repl_write_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-response", ns->name); + ns->write_response_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-udf-start", ns->name); + ns->udf_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-restart", ns->name); + ns->udf_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-dup-res", ns->name); + ns->udf_dup_res_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-master", ns->name); + ns->udf_master_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-repl-write", ns->name); + ns->udf_repl_write_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-response", ns->name); + ns->udf_response_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-batch-sub-start", ns->name); + ns->batch_sub_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-restart", ns->name); + ns->batch_sub_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-dup-res", ns->name); + ns->batch_sub_dup_res_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-repl-ping", ns->name); + ns->batch_sub_repl_ping_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-read-local", ns->name); + ns->batch_sub_read_local_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-response", ns->name); + ns->batch_sub_response_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-udf-sub-start", ns->name); + ns->udf_sub_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-sub-restart", ns->name); + ns->udf_sub_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-sub-dup-res", 
ns->name);
+		ns->udf_sub_dup_res_hist = histogram_create(hist_name, HIST_MILLISECONDS);
+		sprintf(hist_name, "{%s}-udf-sub-master", ns->name);
+		ns->udf_sub_master_hist = histogram_create(hist_name, HIST_MILLISECONDS);
+		sprintf(hist_name, "{%s}-udf-sub-repl-write", ns->name);
+		ns->udf_sub_repl_write_hist = histogram_create(hist_name, HIST_MILLISECONDS);
+		sprintf(hist_name, "{%s}-udf-sub-response", ns->name);
+		ns->udf_sub_response_hist = histogram_create(hist_name, HIST_MILLISECONDS);
+
+		// Linear 'nsup' histograms.
+		// Note - histograms' ranges MUST be set before use.
+
+		sprintf(hist_name, "%s object size histogram", ns->name);
+		ns->obj_size_hist = linear_hist_create(hist_name, 0, 0, OBJ_SIZE_HIST_NUM_BUCKETS);
+
+		sprintf(hist_name, "%s evict histogram", ns->name);
+		ns->evict_hist = linear_hist_create(hist_name, 0, 0, ns->evict_hist_buckets);
+
+		sprintf(hist_name, "%s ttl histogram", ns->name);
+		ns->ttl_hist = linear_hist_create(hist_name, 0, 0, TTL_HIST_NUM_BUCKETS);
+	}
+}
+
+
+//==========================================================
+// Public API - cluster name.
+//
+
+pthread_mutex_t g_config_lock = PTHREAD_MUTEX_INITIALIZER;
+
+void
+as_config_cluster_name_get(char* cluster_name)
+{
+	pthread_mutex_lock(&g_config_lock);
+	strcpy(cluster_name, g_config.cluster_name);
+	pthread_mutex_unlock(&g_config_lock);
+}
+
+bool
+as_config_cluster_name_set(const char* cluster_name)
+{
+	if (cluster_name[0] == '\0') {
+		cf_warning(AS_CFG, "empty cluster name is not allowed. Ignoring.");
+		return false;
+	}
+
+	if (strlen(cluster_name) >= AS_CLUSTER_NAME_SZ) {
+		cf_warning(AS_CFG, "cluster name longer than %d characters is not allowed. Ignoring cluster name '%s'.",
+				AS_CLUSTER_NAME_SZ - 1, cluster_name);
+		return false;
+	}
+
+	pthread_mutex_lock(&g_config_lock);
+
+	if (strcmp(cluster_name, "null") == 0) {
+		// 'null' is a special value representing an unset cluster-name.
+		strcpy(g_config.cluster_name, "");
+	}
+	else {
+		strcpy(g_config.cluster_name, cluster_name);
+	}
+
+	pthread_mutex_unlock(&g_config_lock);
+
+	return true;
+}
+
+bool
+as_config_cluster_name_matches(const char* cluster_name)
+{
+	pthread_mutex_lock(&g_config_lock);
+	bool matches = strcmp(cluster_name, g_config.cluster_name) == 0;
+	pthread_mutex_unlock(&g_config_lock);
+	return matches;
+}
+
+
+//==========================================================
+// Public API - XDR.
+//
+
+bool
+xdr_read_security_configfile(xdr_security_config* sc)
+{
+	FILE* FD;
+	char iobuf[256];
+	int line_num = 0;
+	cfg_parser_state state;
+
+	cfg_parser_state_init(&state);
+
+	// Initialize the XDR config values to the defaults.
+	sc->username = NULL;
+	sc->password = NULL;
+	iobuf[0] = 0;
+
+	// Open the configuration file for reading. Don't crash if this fails, as
+	// this function can be called at runtime (when the credentials file
+	// changes).
+	if (NULL == (FD = fopen(sc->sec_config_file, "r"))) {
+		cf_warning(AS_XDR, "couldn't open configuration file %s: %s",
+				sc->sec_config_file, cf_strerror(errno));
+		return false;
+	}
+
+	// Parse the configuration file, line by line.
+	while (fgets(iobuf, sizeof(iobuf), FD)) {
+		line_num++;
+
+		// First chop the comment off, if there is one.
+
+		char* p_comment = strchr(iobuf, '#');
+
+		if (p_comment) {
+			*p_comment = '\0';
+		}
+
+		// Find (and null-terminate) up to four whitespace-delimited tokens in
+		// the line, a 'name' token and up to three 'value' tokens.
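+		// For illustration (example values, not part of this patch), a
+		// credentials file this parser accepts looks like:
+		//
+		//	credentials {
+		//		username xdr_user
+		//		password xdr_pass
+		//	}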
+
+		cfg_line line = { line_num, NULL, NULL, NULL, NULL };
+
+		line.name_tok = strtok(iobuf, CFG_WHITESPACE);
+
+		// If there are no tokens, ignore this line, get the next line.
+		if (! line.name_tok) {
+			continue;
+		}
+
+		line.val_tok_1 = strtok(NULL, CFG_WHITESPACE);
+
+		if (! line.val_tok_1) {
+			line.val_tok_1 = ""; // in case it's used where NULL can't be used
+		}
+		else {
+			line.val_tok_2 = strtok(NULL, CFG_WHITESPACE);
+		}
+
+		if (! line.val_tok_2) {
+			line.val_tok_2 = ""; // in case it's used where NULL can't be used
+		}
+		else {
+			line.val_tok_3 = strtok(NULL, CFG_WHITESPACE);
+		}
+
+		if (! line.val_tok_3) {
+			line.val_tok_3 = ""; // in case it's used where NULL can't be used
+		}
+
+		// Note that we can't see this output until a logging sink is specified.
+		cf_detail(AS_CFG, "line %d :: %s %s %s %s", line_num, line.name_tok,
+				line.val_tok_1, line.val_tok_2, line.val_tok_3);
+
+		// Parse the directive.
+		switch (state.current) {
+
+		// Parse top-level items.
+		case GLOBAL:
+			switch (cfg_find_tok(line.name_tok, XDR_SEC_GLOBAL_OPTS, NUM_XDR_SEC_GLOBAL_OPTS)) {
+			case XDR_SEC_CASE_CREDENTIALS_BEGIN:
+				cfg_begin_context(&state, XDR_SEC_CREDENTIALS);
+				break;
+			case CASE_NOT_FOUND:
+			default:
+				cfg_unknown_name_tok(&line);
+				break;
+			}
+			break;
+
+		// Parse credentials context items.
+		case XDR_SEC_CREDENTIALS:
+			switch (cfg_find_tok(line.name_tok, XDR_SEC_CREDENTIALS_OPTS, NUM_XDR_SEC_CREDENTIALS_OPTS)) {
+			case CASE_CONTEXT_BEGIN:
+				// Allow open brace on its own line to begin this context.
+				break;
+			case XDR_SEC_CASE_CREDENTIALS_USERNAME:
+				sc->username = cfg_strdup(&line, true);
+				break;
+			case XDR_SEC_CASE_CREDENTIALS_PASSWORD:
+				sc->password = cfg_strdup(&line, true);
+				break;
+			case CASE_CONTEXT_END:
+				cfg_end_context(&state);
+				break;
+			case CASE_NOT_FOUND:
+			default:
+				cfg_unknown_name_tok(&line);
+				break;
+			}
+			break;
+
+		// Parser state is corrupt.
+		default:
+			cf_warning(AS_XDR, "line %d :: invalid parser top-level state %d",
+					line_num, state.current);
+			break;
+		}
+	}
+
+	// Close the file.
+	fclose(FD);
+
+	return true;
+}
+
+
+//==========================================================
+// Item-specific parsing utilities.
+// + +void +init_addr_list(cf_addr_list* addrs) +{ + addrs->n_addrs = 0; + memset(&addrs->addrs, '\0', sizeof(addrs->addrs)); +} + +void +add_addr(const char* name, cf_addr_list* addrs) +{ + uint32_t n = addrs->n_addrs; + + if (n >= CF_SOCK_CFG_MAX) { + cf_crash_nostack(CF_SOCKET, "Too many addresses: %s", name); + } + + addrs->addrs[n] = cf_strdup(name); + ++addrs->n_addrs; +} + +void +add_tls_peer_name(const char* name, cf_serv_spec* spec) +{ + uint32_t n = spec->n_tls_peer_names; + + if (n >= CF_SOCK_CFG_MAX) { + cf_crash_nostack(CF_SOCKET, "Too many TLS peer names: %s", name); + } + + spec->tls_peer_names[n] = cf_strdup(name); + ++spec->n_tls_peer_names; +} + +void +copy_addrs(const cf_addr_list* from, cf_addr_list* to) +{ + for (uint32_t i = 0; i < from->n_addrs; ++i) { + to->addrs[i] = from->addrs[i]; + } + + to->n_addrs = from->n_addrs; +} + +void +default_addrs(cf_addr_list* one, cf_addr_list* two) +{ + if (one->n_addrs == 0) { + copy_addrs(two, one); + } + + if (two->n_addrs == 0) { + copy_addrs(one, two); + } +} + +void +bind_to_access(const cf_serv_spec* from, cf_addr_list* to) +{ + cf_serv_spec spec; + spec.bind_port = 0; + init_addr_list(&spec.bind); + spec.std_port = 0; + init_addr_list(&spec.std); + spec.alt_port = 0; + init_addr_list(&spec.alt); + + for (uint32_t i = 0; i < from->bind.n_addrs; ++i) { + cf_ip_addr resol[CF_SOCK_CFG_MAX]; + uint32_t n_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(from->bind.addrs[i], resol, &n_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid default access address: %s", from->bind.addrs[i]); + } + + bool valid = true; + + for (uint32_t k = 0; k < n_resol; ++k) { + if (cf_ip_addr_is_any(&resol[k]) || cf_ip_addr_is_local(&resol[k])) { + cf_debug(AS_CFG, "Skipping invalid default access address: %s", + from->bind.addrs[i]); + valid = false; + break; + } + } + + if (valid) { + uint32_t n = spec.std.n_addrs; + spec.std.addrs[n] = from->bind.addrs[i]; + ++spec.std.n_addrs; + } + } + + cfg_serv_spec_std_to_access(&spec, to); +} + +void +cfg_add_addr_bind(const char* name, cf_serv_spec* spec) +{ + add_addr(name, &spec->bind); +} + +void +cfg_add_addr_std(const char* name, cf_serv_spec* spec) +{ + add_addr(name, &spec->std); +} + +void +cfg_add_addr_alt(const char* name, cf_serv_spec* spec) +{ + add_addr(name, &spec->alt); +} + +void +cfg_mserv_config_from_addrs(cf_addr_list* addrs, cf_addr_list* bind_addrs, + cf_mserv_cfg* serv_cfg, cf_ip_port port, cf_sock_owner owner, + uint8_t ttl) +{ + static cf_addr_list def_addrs = { + .n_addrs = 1, .addrs = { "any" } + }; + + if (bind_addrs->n_addrs == 0) { + bind_addrs = &def_addrs; + } + + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + + cf_ip_addr resol[CF_SOCK_CFG_MAX]; + uint32_t n_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(addrs->addrs[i], resol, + &n_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid multicast group: %s", + addrs->addrs[i]); + } + + for (uint32_t j = 0; j < bind_addrs->n_addrs; j++) { + + cf_ip_addr bind_resol[CF_SOCK_CFG_MAX]; + uint32_t n_bind_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(bind_addrs->addrs[j], + bind_resol, + &n_bind_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid address: %s", + bind_addrs->addrs[j]); + } + + for (int32_t k = 0; k < n_resol; ++k) { + for (int32_t l = 0; l < n_bind_resol; ++l) { + if (cf_mserv_cfg_add_combo(serv_cfg, owner, port, + &resol[k], &bind_resol[l], ttl) < 0) { + cf_crash_nostack(AS_CFG, "Too many IP addresses"); + } + } + } + } + } +} + +void +cfg_serv_spec_to_bind(const cf_serv_spec* 
spec, const cf_serv_spec* def_spec, cf_serv_cfg* bind, + cf_sock_owner owner) +{ + static cf_addr_list def_addrs = { + .n_addrs = 1, .addrs = { "any" } + }; + + cf_sock_cfg cfg; + cf_sock_cfg_init(&cfg, owner); + cfg.port = spec->bind_port; + + const cf_addr_list* addrs; + + if (spec->bind.n_addrs != 0) { + addrs = &spec->bind; + } + else if (def_spec != NULL && def_spec->bind.n_addrs != 0) { + addrs = &def_spec->bind; + } + else { + addrs = &def_addrs; + } + + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + cf_ip_addr resol[CF_SOCK_CFG_MAX]; + uint32_t n_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(addrs->addrs[i], resol, &n_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid address: %s", addrs->addrs[i]); + } + + for (uint32_t k = 0; k < n_resol; ++k) { + cf_ip_addr_copy(&resol[k], &cfg.addr); + + if (cf_serv_cfg_add_sock_cfg(bind, &cfg) < 0) { + cf_crash_nostack(AS_CFG, "Too many IP addresses: %s", addrs->addrs[i]); + } + } + } +} + +static void +addrs_to_access(const cf_addr_list* addrs, cf_addr_list* access) +{ + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + cf_ip_addr resol[CF_SOCK_CFG_MAX]; + uint32_t n_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(addrs->addrs[i], resol, &n_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid access address: %s", addrs->addrs[i]); + } + + for (uint32_t k = 0; k < n_resol; ++k) { + if (cf_ip_addr_is_any(&resol[k])) { + cf_crash_nostack(AS_CFG, "Invalid access address: %s", addrs->addrs[i]); + } + } + + if (cf_ip_addr_is_dns_name(addrs->addrs[i])) { + add_addr(addrs->addrs[i], access); + } + else { + for (uint32_t k = 0; k < n_resol; ++k) { + char tmp[250]; + cf_ip_addr_to_string_safe(&resol[k], tmp, sizeof(tmp)); + add_addr(tmp, access); + } + } + } +} + +void +cfg_serv_spec_std_to_access(const cf_serv_spec* spec, cf_addr_list* access) +{ + addrs_to_access(&spec->std, access); +} + +void +cfg_serv_spec_alt_to_access(const cf_serv_spec* spec, cf_addr_list* access) +{ + addrs_to_access(&spec->alt, access); +} + +void +cfg_add_mesh_seed_addr_port(char* addr, cf_ip_port port, bool tls) +{ + int32_t i; + + for (i = 0; i < AS_CLUSTER_SZ; i++) { + if (g_config.hb_config.mesh_seed_addrs[i] == NULL) { + g_config.hb_config.mesh_seed_addrs[i] = addr; + g_config.hb_config.mesh_seed_ports[i] = port; + g_config.hb_config.mesh_seed_tls[i] = tls; + break; + } + } + + if (i == AS_CLUSTER_SZ) { + cf_crash_nostack(AS_CFG, "can't configure more than %d mesh-seed-address-port entries", AS_CLUSTER_SZ); + } +} + +as_set* +cfg_add_set(as_namespace* ns) +{ + if (ns->sets_cfg_count >= AS_SET_MAX_COUNT) { + cf_crash_nostack(AS_CFG, "namespace %s - too many sets", ns->name); + } + + // Lazily allocate temporary sets config array. + if (! ns->sets_cfg_array) { + size_t array_size = AS_SET_MAX_COUNT * sizeof(as_set); + + ns->sets_cfg_array = (as_set*)cf_malloc(array_size); + memset(ns->sets_cfg_array, 0, array_size); + } + + return &ns->sets_cfg_array[ns->sets_cfg_count++]; +} + +void +cfg_add_storage_file(as_namespace* ns, char* file_name) +{ + int i; + + for (i = 0; i < AS_STORAGE_MAX_FILES; i++) { + if (! ns->storage_files[i]) { + ns->storage_files[i] = file_name; + break; + } + } + + if (i == AS_STORAGE_MAX_FILES) { + cf_crash_nostack(AS_CFG, "namespace %s - too many storage files", ns->name); + } +} + +void +cfg_add_storage_device(as_namespace* ns, char* device_name, char* shadow_name) +{ + int i; + + for (i = 0; i < AS_STORAGE_MAX_DEVICES; i++) { + if (! 
ns->storage_devices[i]) {
+			ns->storage_devices[i] = device_name;
+			ns->storage_shadows[i] = shadow_name;
+			break;
+		}
+	}
+
+	if (i == AS_STORAGE_MAX_DEVICES) {
+		cf_crash_nostack(AS_CFG, "namespace %s - too many storage devices", ns->name);
+	}
+}
+
+uint32_t
+cfg_obj_size_hist_max(uint32_t hist_max)
+{
+	uint32_t round_to = OBJ_SIZE_HIST_NUM_BUCKETS;
+	uint32_t round_max = hist_max != 0 ?
+			((hist_max + round_to - 1) / round_to) * round_to : round_to;
+
+	if (round_max != hist_max) {
+		cf_info(AS_CFG, "rounding obj-size-hist-max %u up to %u", hist_max, round_max);
+	}
+
+	return round_max; // in 128-byte blocks
+}
+
+void
+cfg_set_cluster_name(char* cluster_name)
+{
+	if (! as_config_cluster_name_set(cluster_name)) {
+		cf_crash_nostack(AS_CFG, "cluster name '%s' is not allowed", cluster_name);
+	}
+}
+
+
+//==========================================================
+// Other (non-item-specific) utilities.
+//
+
+void
+create_and_check_hist_track(cf_hist_track** h, const char* name,
+		histogram_scale scale)
+{
+	*h = cf_hist_track_create(name, scale);
+
+	as_config* c = &g_config;
+
+	if (c->hist_track_back != 0 &&
+			! cf_hist_track_start(*h, c->hist_track_back, c->hist_track_slice, c->hist_track_thresholds)) {
+		cf_crash_nostack(AS_AS, "couldn't enable histogram tracking: %s", name);
+	}
+}
+
+// TODO - not really a config method any more, reorg needed.
+void
+cfg_create_all_histograms()
+{
+	g_stats.batch_index_hist = histogram_create("batch-index", HIST_MILLISECONDS);
+	g_stats.info_hist = histogram_create("info", HIST_MILLISECONDS);
+	g_stats.svc_demarshal_hist = histogram_create("svc-demarshal", HIST_MILLISECONDS);
+	g_stats.svc_queue_hist = histogram_create("svc-queue", HIST_MILLISECONDS);
+
+	g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_BULK] = histogram_create("fabric-bulk-send-init", HIST_MILLISECONDS);
+	g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_BULK] = histogram_create("fabric-bulk-send-fragment", HIST_MILLISECONDS);
+	g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_BULK] = histogram_create("fabric-bulk-recv-fragment", HIST_MILLISECONDS);
+	g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_BULK] = histogram_create("fabric-bulk-recv-cb", HIST_MILLISECONDS);
+	g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_CTRL] = histogram_create("fabric-ctrl-send-init", HIST_MILLISECONDS);
+	g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_CTRL] = histogram_create("fabric-ctrl-send-fragment", HIST_MILLISECONDS);
+	g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_CTRL] = histogram_create("fabric-ctrl-recv-fragment", HIST_MILLISECONDS);
+	g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_CTRL] = histogram_create("fabric-ctrl-recv-cb", HIST_MILLISECONDS);
+	g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_META] = histogram_create("fabric-meta-send-init", HIST_MILLISECONDS);
+	g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_META] = histogram_create("fabric-meta-send-fragment", HIST_MILLISECONDS);
+	g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_META] = histogram_create("fabric-meta-recv-fragment", HIST_MILLISECONDS);
+	g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_META] = histogram_create("fabric-meta-recv-cb", HIST_MILLISECONDS);
+	g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_RW] = histogram_create("fabric-rw-send-init", HIST_MILLISECONDS);
+	g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_RW] = histogram_create("fabric-rw-send-fragment", HIST_MILLISECONDS);
+	g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_RW] =
histogram_create("fabric-rw-recv-fragment", HIST_MILLISECONDS); + g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_RW] = histogram_create("fabric-rw-recv-cb", HIST_MILLISECONDS); +} + +void +cfg_init_serv_spec(cf_serv_spec* spec_p) +{ + spec_p->bind_port = 0; + init_addr_list(&spec_p->bind); + spec_p->std_port = 0; + init_addr_list(&spec_p->std); + spec_p->alt_port = 0; + init_addr_list(&spec_p->alt); + spec_p->tls_our_name = NULL; + spec_p->n_tls_peer_names = 0; + memset(spec_p->tls_peer_names, 0, sizeof(spec_p->tls_peer_names)); +} + +cf_tls_spec* +cfg_create_tls_spec(as_config* cfg, char* name) +{ + uint32_t ind = cfg->n_tls_specs++; + + if (ind >= MAX_TLS_SPECS) { + cf_crash_nostack(AS_CFG, "too many TLS configuration sections"); + } + + cf_tls_spec* tls_spec = cfg->tls_specs + ind; + tls_spec->name = cf_strdup(name); + return tls_spec; +} + +char* +cfg_resolve_tls_name(char* tls_name, const char* cluster_name, const char* which) +{ + bool expanded = false; + + if (strcmp(tls_name, "") == 0) { + char hostname[1024]; + int rv = gethostname(hostname, sizeof(hostname)); + if (rv != 0) { + cf_crash_nostack(AS_CFG, + "trouble resolving hostname for tls-name: %s", cf_strerror(errno)); + } + hostname[sizeof(hostname)-1] = '\0'; // POSIX.1-2001 + cf_free(tls_name); + tls_name = cf_strdup(hostname); + expanded = true; + } + else if (strcmp(tls_name, "") == 0) { + if (strlen(cluster_name) == 0) { + cf_crash_nostack + (AS_CFG, "can't resolve tls-name to non-existent cluster-name"); + } + cf_free(tls_name); + tls_name = cf_strdup(cluster_name); + expanded = true; + } + + if (expanded && which != NULL) { + cf_info(AS_CFG, "%s tls-name %s", which, tls_name); + } + + return tls_name; +} + +cf_tls_spec* +cfg_link_tls(const char* which, char** our_name) +{ + if (*our_name == NULL) { + cf_crash_nostack(AS_CFG, "%s TLS configuration requires tls-name", which); + } + + *our_name = cfg_resolve_tls_name(*our_name, g_config.cluster_name, which); + cf_tls_spec* tls_spec = NULL; + + for (uint32_t i = 0; i < g_config.n_tls_specs; ++i) { + if (strcmp(*our_name, g_config.tls_specs[i].name) == 0) { + tls_spec = g_config.tls_specs + i; + break; + } + } + + if (tls_spec == NULL) { + cf_crash_nostack(AS_CFG, "invalid tls-name in TLS configuration: %s", + *our_name); + } + + return tls_spec; +} + +//========================================================== +// XDR utilities. +// + +void +xdr_cfg_add_datacenter(char* dc, uint32_t nsid) +{ + cf_vector *v = &g_config.namespaces[nsid-1]->xdr_dclist_v; + + // Crash if datacenter with same name already exists. + for (uint32_t index = 0; index < cf_vector_size(v); index++) { + if (strcmp((char *)cf_vector_pointer_get(v, index), dc) == 0) { + cf_crash_nostack(AS_XDR, "datacenter %s already exists for namespace %s - please remove duplicate entries from config file", + dc, g_config.namespaces[nsid-1]->name); + } + } + + // Add the string pointer (of the datacenter name) to the vector. + cf_vector_pointer_append(v, dc); +} + +void +xdr_cfg_add_node_addr_port(dc_config_opt *dc_cfg, char* addr, int port) +{ + xdr_cfg_add_tls_node(dc_cfg, addr, NULL, port); +} + +void +xdr_cfg_add_tls_node(dc_config_opt *dc_cfg, char* addr, char *tls_name, int port) +{ + // Add the element to the vector. 
+	node_addr_port* nap = (node_addr_port*)cf_malloc(sizeof(node_addr_port));
+
+	nap->addr = addr;
+	nap->tls_name = tls_name;
+	nap->port = port;
+
+	cf_vector_pointer_append(&dc_cfg->dc_node_v, nap);
+}
diff --git a/as/src/base/cfg_ce.c b/as/src/base/cfg_ce.c
new file mode 100644
index 00000000..e90f5e8a
--- /dev/null
+++ b/as/src/base/cfg_ce.c
@@ -0,0 +1,90 @@
+/*
+ * cfg_ce.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "base/cfg.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "fault.h"
+
+#include "base/datamodel.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+void post_process_namespace(as_namespace* ns);
+
+
+//==========================================================
+// Public API.
+//
+
+bool
+as_config_error_enterprise_only()
+{
+	return true;
+}
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+//
+
+void
+cfg_enterprise_only(const cfg_line* p_line)
+{
+	cf_crash_nostack(AS_CFG, "line %d :: '%s' is enterprise-only",
+			p_line->num, p_line->name_tok);
+}
+
+
+void
+cfg_post_process()
+{
+	// So far, no other context handled.
+
+	for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) {
+		post_process_namespace(g_config.namespaces[ns_ix]);
+	}
+}
+
+
+//==========================================================
+// Local helpers.
+//
+
+void
+post_process_namespace(as_namespace* ns)
+{
+	if (ns->conflict_resolution_policy ==
+			AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_UNDEF) {
+		ns->conflict_resolution_policy =
+				AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION;
+	}
+}
diff --git a/as/src/base/features_ce.c b/as/src/base/features_ce.c
new file mode 100644
index 00000000..b07c8ffb
--- /dev/null
+++ b/as/src/base/features_ce.c
@@ -0,0 +1,38 @@
+/*
+ * features_ce.c
+ *
+ * Copyright (C) 2018 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
+ * If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "base/features.h"
+
+
+//==========================================================
+// Public API.
+//
+
+const char *
+as_features_info()
+{
+	return "null";
+}
diff --git a/as/src/base/index.c b/as/src/base/index.c
new file mode 100644
index 00000000..6d3de8df
--- /dev/null
+++ b/as/src/base/index.c
@@ -0,0 +1,1254 @@
+/*
+ * index.c
+ *
+ * Copyright (C) 2012-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "base/index.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "arenax.h"
+#include "cf_mutex.h"
+#include "fault.h"
+#include "olock.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/stats.h"
+
+
+//==========================================================
+// Constants and typedefs.
+//
+
+typedef enum {
+	AS_BLACK = 0,
+	AS_RED = 1
+} as_index_color;
+
+typedef struct as_index_ph_s {
+	as_index *r;
+	cf_arenax_handle r_h;
+} as_index_ph;
+
+typedef struct as_index_ph_array_s {
+	uint64_t alloc_sz;
+	uint64_t pos;
+	as_index_ph indexes[];
+} as_index_ph_array;
+
+typedef struct as_index_ele_s {
+	struct as_index_ele_s *parent;
+	cf_arenax_handle me_h;
+	as_index *me;
+} as_index_ele;
+
+const size_t MAX_STACK_ARRAY_BYTES = 128 * 1024;
+
+
+//==========================================================
+// Globals.
+//
+
+static cf_queue g_gc_queue;
+
+
+//==========================================================
+// Forward declarations.
+// + +void *run_index_tree_gc(void *unused); +void as_index_tree_destroy(as_index_tree *tree); +void as_index_sprig_done(as_index_sprig *isprig, as_index *r, cf_arenax_handle r_h); +bool as_index_sprig_invalid_record_done(as_index_sprig *isprig, as_index_ref *index_ref); + +uint64_t as_index_sprig_reduce_partial(as_index_sprig *isprig, uint64_t sample_count, as_index_reduce_fn cb, void *udata); +void as_index_sprig_traverse(as_index_sprig *isprig, cf_arenax_handle r_h, as_index_ph_array *v_a); +void as_index_sprig_traverse_purge(as_index_sprig *isprig, cf_arenax_handle r_h); + +int as_index_sprig_exists(as_index_sprig *isprig, cf_digest *keyd); +int as_index_sprig_get_vlock(as_index_sprig *isprig, cf_digest *keyd, as_index_ref *index_ref); +int as_index_sprig_get_insert_vlock(as_index_sprig *isprig, cf_digest *keyd, as_index_ref *index_ref); +int as_index_sprig_delete(as_index_sprig *isprig, cf_digest *keyd); + +int as_index_sprig_search_lockless(as_index_sprig *isprig, cf_digest *keyd, as_index **ret, cf_arenax_handle *ret_h); +void as_index_sprig_insert_rebalance(as_index_sprig *isprig, as_index *root_parent, as_index_ele *ele); +void as_index_sprig_delete_rebalance(as_index_sprig *isprig, as_index *root_parent, as_index_ele *ele); +void as_index_rotate_left(as_index_ele *a, as_index_ele *b); +void as_index_rotate_right(as_index_ele *a, as_index_ele *b); + +static inline void +as_index_sprig_from_i(as_index_tree *tree, as_index_sprig *isprig, + uint32_t sprig_i) +{ + uint32_t lock_i = sprig_i >> + (tree->shared->locks_shift - tree->shared->sprigs_shift); + + isprig->destructor = tree->shared->destructor; + isprig->destructor_udata = tree->shared->destructor_udata; + isprig->arena = tree->arena; + isprig->pair = tree_locks(tree) + lock_i; + isprig->sprig = tree_sprigs(tree) + sprig_i; +} + +static inline void +as_index_sprig_from_keyd(as_index_tree *tree, as_index_sprig *isprig, + const cf_digest *keyd) +{ + // Get the 12 most significant non-pid bits in the digest. Note - this is + // hardwired around the way we currently extract the (12 bit) partition-ID + // from the digest. + uint32_t bits = (((uint32_t)keyd->digest[1] & 0xF0) << 4) | + (uint32_t)keyd->digest[2]; + + uint32_t lock_i = bits >> tree->shared->locks_shift; + uint32_t sprig_i = bits >> tree->shared->sprigs_shift; + + isprig->destructor = tree->shared->destructor; + isprig->destructor_udata = tree->shared->destructor_udata; + isprig->arena = tree->arena; + isprig->pair = tree_locks(tree) + lock_i; + isprig->sprig = tree_sprigs(tree) + sprig_i; +} + + +//========================================================== +// Public API - initialize garbage collection system. +// + +void +as_index_tree_gc_init() +{ + cf_queue_init(&g_gc_queue, sizeof(as_index_tree*), 4096, true); + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + if (pthread_create(&thread, &attrs, run_index_tree_gc, NULL) != 0) { + cf_crash(AS_INDEX, "failed to create garbage collection thread"); + } +} + + +int +as_index_tree_gc_queue_size() +{ + return cf_queue_sz(&g_gc_queue); +} + + +//========================================================== +// Public API - create/destroy/size a tree. +// + +// Create a new red-black tree. 
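+// For a concrete sense of the sprig selection above (illustrative numbers,
+// not from this patch, assuming cf_msb() of a power of 2 is its log2): with
+// n_lock_pairs = 8 and n_sprigs = 256, locks_shift is 12 - 3 = 9 and
+// sprigs_shift is 12 - 8 = 4, so the 12 extracted digest bits select one of
+// 256 sprigs (bits >> 4) and one of 8 lock pairs (bits >> 9).
+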
+as_index_tree * +as_index_tree_create(as_index_tree_shared *shared, cf_arenax *arena) +{ + size_t locks_size = sizeof(cf_mutex) * shared->n_lock_pairs * 2; + size_t sprigs_size = sizeof(as_sprig) * shared->n_sprigs; + size_t tree_size = sizeof(as_index_tree) + locks_size + sprigs_size; + + as_index_tree *tree = cf_rc_alloc(tree_size); + + tree->shared = shared; + tree->arena = arena; + + as_lock_pair *pair = tree_locks(tree); + as_lock_pair *pair_end = pair + shared->n_lock_pairs; + + while (pair < pair_end) { + cf_mutex_init(&pair->lock); + cf_mutex_init(&pair->reduce_lock); + pair++; + } + + // The tree starts empty. + memset(tree_sprigs(tree), 0, sprigs_size); + + return tree; +} + + +// Destroy a red-black tree; return 0 if the tree was destroyed or 1 otherwise. +// TODO - nobody cares about the return value, make it void? +int +as_index_tree_release(as_index_tree *tree) +{ + int rc = cf_rc_release(tree); + + if (rc > 0) { + return 1; + } + + cf_assert(rc == 0, AS_INDEX, "tree ref-count %d", rc); + + // TODO - call as_index_tree_destroy() directly if tree is empty? + + cf_queue_push(&g_gc_queue, &tree); + + return 0; +} + + +// Get the number of elements in the tree. +uint64_t +as_index_tree_size(as_index_tree *tree) +{ + uint64_t n_elements = 0; + as_sprig* sprig = tree_sprigs(tree); + as_sprig* sprig_end = sprig + tree->shared->n_sprigs; + + while (sprig < sprig_end) { + n_elements += sprig->n_elements; + sprig++; + } + + return n_elements; +} + + +//========================================================== +// Public API - reduce a tree. +// + +// Make a callback for every element in the tree, from outside the tree lock. +void +as_index_reduce(as_index_tree *tree, as_index_reduce_fn cb, void *udata) +{ + as_index_reduce_partial(tree, AS_REDUCE_ALL, cb, udata); +} + + +// Make a callback for a specified number of elements in the tree, from outside +// the tree lock. +void +as_index_reduce_partial(as_index_tree *tree, uint64_t sample_count, + as_index_reduce_fn cb, void *udata) +{ + // Reduce sprigs from largest to smallest digests to preserve this order for + // the whole tree. (Rapid rebalance requires exact order.) + + for (int i = (int)tree->shared->n_sprigs - 1; i >= 0; i--) { + as_index_sprig isprig; + as_index_sprig_from_i(tree, &isprig, (uint32_t)i); + + sample_count -= as_index_sprig_reduce_partial(&isprig, sample_count, cb, + udata); + + if (sample_count == 0) { + break; + } + } +} + + +//========================================================== +// Public API - get/insert/delete an element in a tree. +// + +// Is there an element with specified digest in the tree? +// +// Returns: +// 0 - found (yes) +// -1 - not found (no) +int +as_index_exists(as_index_tree *tree, cf_digest *keyd) +{ + as_index_sprig isprig; + as_index_sprig_from_keyd(tree, &isprig, keyd); + + return as_index_sprig_exists(&isprig, keyd); +} + + +// If there's an element with specified digest in the tree, return a locked +// and reserved reference to it in index_ref. +// +// Returns: +// 0 - found (reference returned in index_ref) +// -1 - not found (index_ref untouched) +int +as_index_get_vlock(as_index_tree *tree, cf_digest *keyd, + as_index_ref *index_ref) +{ + as_index_sprig isprig; + as_index_sprig_from_keyd(tree, &isprig, keyd); + + return as_index_sprig_get_vlock(&isprig, keyd, index_ref); +} + + +// If there's an element with specified digest in the tree, return a locked +// and reserved reference to it in index_ref. 
If not, create an element with +// this digest, insert it into the tree, and return a locked and reserved +// reference to it in index_ref. +// +// Returns: +// 1 - created and inserted (reference returned in index_ref) +// 0 - found already existing (reference returned in index_ref) +// -1 - error - could not allocate arena stage +// -2 - error - found "half created" or deleted record +int +as_index_get_insert_vlock(as_index_tree *tree, cf_digest *keyd, + as_index_ref *index_ref) +{ + as_index_sprig isprig; + as_index_sprig_from_keyd(tree, &isprig, keyd); + + return as_index_sprig_get_insert_vlock(&isprig, keyd, index_ref); +} + + +// If there's an element with specified digest in the tree, delete it. +// +// Returns: +// 0 - found and deleted +// -1 - not found +// TODO - nobody cares about the return value, make it void? +int +as_index_delete(as_index_tree *tree, cf_digest *keyd) +{ + as_index_sprig isprig; + as_index_sprig_from_keyd(tree, &isprig, keyd); + + return as_index_sprig_delete(&isprig, keyd); +} + + +//========================================================== +// Local helpers - garbage collection, generic. +// + +void * +run_index_tree_gc(void *unused) +{ + as_index_tree *tree; + + while (cf_queue_pop(&g_gc_queue, &tree, CF_QUEUE_FOREVER) == CF_QUEUE_OK) { + as_index_tree_destroy(tree); + } + + return NULL; +} + + +void +as_index_tree_destroy(as_index_tree *tree) +{ + as_sprig* sprig = tree_sprigs(tree); + as_sprig* sprig_end = sprig + tree->shared->n_sprigs; + + while (sprig < sprig_end) { + as_index_sprig isprig; + + isprig.destructor = tree->shared->destructor; + isprig.destructor_udata = tree->shared->destructor_udata; + isprig.arena = tree->arena; + isprig.sprig = sprig; + + as_index_sprig_traverse_purge(&isprig, isprig.sprig->root_h); + sprig++; + } + + as_lock_pair *pair = tree_locks(tree); + as_lock_pair *pair_end = pair + tree->shared->n_lock_pairs; + + while (pair < pair_end) { + cf_mutex_destroy(&pair->lock); + cf_mutex_destroy(&pair->reduce_lock); + pair++; + } + + cf_rc_free(tree); +} + + +void +as_index_sprig_done(as_index_sprig *isprig, as_index *r, cf_arenax_handle r_h) +{ + int rc = as_index_release(r); + + if (rc > 0) { + return; + } + + cf_assert(rc == 0, AS_INDEX, "index ref-count %d", rc); + + if (isprig->destructor) { + isprig->destructor(r, isprig->destructor_udata); + } + + cf_arenax_free(isprig->arena, r_h); +} + + +bool +as_index_sprig_invalid_record_done(as_index_sprig *isprig, + as_index_ref *index_ref) +{ + if (as_index_is_valid_record(index_ref->r)) { + return false; + } + + if (! index_ref->skip_lock) { + cf_mutex_unlock(index_ref->olock); + } + + as_index_sprig_done(isprig, index_ref->r, index_ref->r_h); + + return true; +} + + +//========================================================== +// Local helpers - reduce a sprig. +// + +// Make a callback for a specified number of elements in the tree, from outside +// the tree lock. +uint64_t +as_index_sprig_reduce_partial(as_index_sprig *isprig, uint64_t sample_count, + as_index_reduce_fn cb, void *udata) +{ + bool reduce_all = sample_count == AS_REDUCE_ALL; + + cf_mutex_lock(&isprig->pair->reduce_lock); + + if (reduce_all || sample_count > isprig->sprig->n_elements) { + sample_count = isprig->sprig->n_elements; + } + + // Common to encounter empty sprigs. 
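+	// (With 4096 partitions and the default 64 sprigs per partition tree,
+	// records spread over ~262K sprigs per namespace, so a sparsely
+	// populated namespace leaves most sprigs with n_elements == 0.)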
+ if (sample_count == 0) { + cf_mutex_unlock(&isprig->pair->reduce_lock); + return 0; + } + + size_t sz = sizeof(as_index_ph_array) + + (sizeof(as_index_ph) * sample_count); + as_index_ph_array *v_a; + uint8_t buf[MAX_STACK_ARRAY_BYTES]; + + v_a = sz > MAX_STACK_ARRAY_BYTES ? cf_malloc(sz) : (as_index_ph_array*)buf; + + v_a->alloc_sz = sample_count; + v_a->pos = 0; + + uint64_t start_ms = cf_getms(); + + // Recursively, fetch all the value pointers into this array, so we can make + // all the callbacks outside the big lock. + as_index_sprig_traverse(isprig, isprig->sprig->root_h, v_a); + + cf_detail(AS_INDEX, "sprig reduce took %lu ms", cf_getms() - start_ms); + + cf_mutex_unlock(&isprig->pair->reduce_lock); + + uint64_t i; + + for (i = 0; i < v_a->pos; i++) { + as_index_ref r_ref; + + r_ref.skip_lock = false; + r_ref.r = v_a->indexes[i].r; + r_ref.r_h = v_a->indexes[i].r_h; + + olock_vlock(g_record_locks, &r_ref.r->keyd, &r_ref.olock); + + // Ignore this record if it's "half created" or deleted. + if (as_index_sprig_invalid_record_done(isprig, &r_ref)) { + continue; + } + + // Callback MUST call as_record_done() to unlock and release record. + cb(&r_ref, udata); + } + + if (v_a != (as_index_ph_array*)buf) { + cf_free(v_a); + } + + // In reduce-all mode, return 0 so outside loop continues to pass + // sample_count = AS_REDUCE_ALL. + return reduce_all ? 0 : i; +} + + +void +as_index_sprig_traverse(as_index_sprig *isprig, cf_arenax_handle r_h, + as_index_ph_array *v_a) +{ + if (r_h == SENTINEL_H) { + return; + } + + as_index *r = RESOLVE_H(r_h); + + as_index_sprig_traverse(isprig, r->left_h, v_a); + + if (v_a->pos >= v_a->alloc_sz) { + return; + } + + as_index_reserve(r); + + v_a->indexes[v_a->pos].r = r; + v_a->indexes[v_a->pos].r_h = r_h; + v_a->pos++; + + as_index_sprig_traverse(isprig, r->right_h, v_a); +} + + +void +as_index_sprig_traverse_purge(as_index_sprig *isprig, cf_arenax_handle r_h) +{ + if (r_h == SENTINEL_H) { + return; + } + + as_index *r = RESOLVE_H(r_h); + + as_index_sprig_traverse_purge(isprig, r->left_h); + as_index_sprig_traverse_purge(isprig, r->right_h); + + as_index_sprig_done(isprig, r, r_h); +} + + +//========================================================== +// Local helpers - get/insert/delete an element in a sprig. +// + +int +as_index_sprig_exists(as_index_sprig *isprig, cf_digest *keyd) +{ + cf_mutex_lock(&isprig->pair->lock); + + int rv = as_index_sprig_search_lockless(isprig, keyd, NULL, NULL); + + cf_mutex_unlock(&isprig->pair->lock); + + return rv; +} + + +int +as_index_sprig_get_vlock(as_index_sprig *isprig, cf_digest *keyd, + as_index_ref *index_ref) +{ + cf_mutex_lock(&isprig->pair->lock); + + int rv = as_index_sprig_search_lockless(isprig, keyd, &index_ref->r, + &index_ref->r_h); + + if (rv != 0) { + cf_mutex_unlock(&isprig->pair->lock); + return rv; + } + + as_index_reserve(index_ref->r); + + cf_mutex_unlock(&isprig->pair->lock); + + if (! index_ref->skip_lock) { + olock_vlock(g_record_locks, keyd, &index_ref->olock); + } + + // Treat record as not found if it's "half created" or deleted. + if (as_index_sprig_invalid_record_done(isprig, index_ref)) { + return -1; + } + + return 0; +} + + +int +as_index_sprig_get_insert_vlock(as_index_sprig *isprig, cf_digest *keyd, + as_index_ref *index_ref) +{ + int cmp = 0; + bool retry; + + // Use a stack as_index object for the root's parent, for convenience. + as_index root_parent; + + // Save parents as we search for the specified element's insertion point. 
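+	// (A red-black tree's height is bounded by ~2*log2(n), so 64 stack
+	// entries comfortably cover sprigs of up to roughly 2^31 elements.)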
+ as_index_ele eles[64]; // FIXME - increase this appropriately + as_index_ele *ele; + + do { + ele = eles; + + cf_mutex_lock(&isprig->pair->lock); + + // Search for the specified element, or a parent to insert it under. + + root_parent.left_h = isprig->sprig->root_h; + root_parent.color = AS_BLACK; + + ele->parent = NULL; // we'll never look this far up + ele->me_h = 0; // root parent has no handle, never used + ele->me = &root_parent; + + cf_arenax_handle t_h = isprig->sprig->root_h; + as_index *t = RESOLVE_H(t_h); + + while (t_h != SENTINEL_H) { + ele++; + ele->parent = ele - 1; + ele->me_h = t_h; + ele->me = t; + + _mm_prefetch(t, _MM_HINT_NTA); + + if ((cmp = cf_digest_compare(keyd, &t->keyd)) == 0) { + // The element already exists, simply return it. + + as_index_reserve(t); + + cf_mutex_unlock(&isprig->pair->lock); + + if (! index_ref->skip_lock) { + olock_vlock(g_record_locks, keyd, &index_ref->olock); + } + + index_ref->r = t; + index_ref->r_h = t_h; + + // Fail if the record is "half created" or deleted. + if (as_index_sprig_invalid_record_done(isprig, index_ref)) { + return -2; + } + + return 0; + } + + t_h = cmp > 0 ? t->left_h : t->right_h; + t = RESOLVE_H(t_h); + } + + // We didn't find the tree element, so we'll be inserting it. + + retry = false; + + if (! cf_mutex_trylock(&isprig->pair->reduce_lock)) { + // The tree is being reduced - could take long, unlock so reads and + // overwrites aren't blocked. + cf_mutex_unlock(&isprig->pair->lock); + + // Wait until the tree reduce is done... + cf_mutex_lock(&isprig->pair->reduce_lock); + cf_mutex_unlock(&isprig->pair->reduce_lock); + + // ... and start over - we unlocked, so the tree may have changed. + retry = true; + } + } while (retry); + + // Create a new element and insert it. + + // Save the root so we can detect whether it changes. + cf_arenax_handle old_root = isprig->sprig->root_h; + + // Make the new element. + cf_arenax_handle n_h = cf_arenax_alloc(isprig->arena); + + if (n_h == 0) { + cf_warning(AS_INDEX, "arenax alloc failed"); + cf_mutex_unlock(&isprig->pair->reduce_lock); + cf_mutex_unlock(&isprig->pair->lock); + return -1; + } + + as_index *n = RESOLVE_H(n_h); + + n->rc = 2; // one for create (eventually balanced by delete), one for caller + + n->keyd = *keyd; + + n->left_h = n->right_h = SENTINEL_H; // n starts as a leaf element + n->color = AS_RED; // n's color starts as red + + // Make sure we can detect that the record isn't initialized. + as_index_clear_record_info(n); + + // Insert the new element n under parent ele. + if (ele->me == &root_parent || 0 < cmp) { + ele->me->left_h = n_h; + } + else { + ele->me->right_h = n_h; + } + + ele++; + ele->parent = ele - 1; + ele->me_h = n_h; + ele->me = n; + + // Rebalance the sprig as needed. + as_index_sprig_insert_rebalance(isprig, &root_parent, ele); + + // If insertion caused the root to change, save the new root. + if (root_parent.left_h != old_root) { + isprig->sprig->root_h = root_parent.left_h; + } + + isprig->sprig->n_elements++; + + cf_mutex_unlock(&isprig->pair->reduce_lock); + cf_mutex_unlock(&isprig->pair->lock); + + if (! index_ref->skip_lock) { + olock_vlock(g_record_locks, keyd, &index_ref->olock); + } + + index_ref->r = n; + index_ref->r_h = n_h; + + return 1; +} + + +int +as_index_sprig_delete(as_index_sprig *isprig, cf_digest *keyd) +{ + as_index *r; + cf_arenax_handle r_h; + bool retry; + + // Use a stack as_index object for the root's parent, for convenience. 
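+	// (Treating the real root as the left child of a dummy black parent lets
+	// the search and rebalance code handle the root like any other node - a
+	// root change just shows up as a new root_parent.left_h, saved below.)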
+ as_index root_parent; + + // Save parents as we search for the specified element (or its successor). + as_index_ele eles[(64 * 2) + 3]; // FIXME - increase this appropriately + as_index_ele *ele; + + do { + ele = eles; + + cf_mutex_lock(&isprig->pair->lock); + + root_parent.left_h = isprig->sprig->root_h; + root_parent.color = AS_BLACK; + + ele->parent = NULL; // we'll never look this far up + ele->me_h = 0; // root parent has no handle, never used + ele->me = &root_parent; + + r_h = isprig->sprig->root_h; + r = RESOLVE_H(r_h); + + while (r_h != SENTINEL_H) { + ele++; + ele->parent = ele - 1; + ele->me_h = r_h; + ele->me = r; + + _mm_prefetch(r, _MM_HINT_NTA); + + int cmp = cf_digest_compare(keyd, &r->keyd); + + if (cmp == 0) { + break; // found, we'll be deleting it + } + + r_h = cmp > 0 ? r->left_h : r->right_h; + r = RESOLVE_H(r_h); + } + + if (r_h == SENTINEL_H) { + cf_mutex_unlock(&isprig->pair->lock); + return -1; // not found, nothing to delete + } + + // We found the tree element, so we'll be deleting it. + + retry = false; + + if (! cf_mutex_trylock(&isprig->pair->reduce_lock)) { + // The tree is being reduced - could take long, unlock so reads and + // overwrites aren't blocked. + cf_mutex_unlock(&isprig->pair->lock); + + // Wait until the tree reduce is done... + cf_mutex_lock(&isprig->pair->reduce_lock); + cf_mutex_unlock(&isprig->pair->reduce_lock); + + // ... and start over - we unlocked, so the tree may have changed. + retry = true; + } + } while (retry); + + // Delete the element. + + // Save the root so we can detect whether it changes. + cf_arenax_handle old_root = isprig->sprig->root_h; + + // Snapshot the element to delete, r. (Already have r_h and r shortcuts.) + as_index_ele *r_e = ele; + + if (r->left_h != SENTINEL_H && r->right_h != SENTINEL_H) { + // Search down for a "successor"... + + ele++; + ele->parent = ele - 1; + ele->me_h = r->right_h; + ele->me = RESOLVE_H(ele->me_h); + + while (ele->me->left_h != SENTINEL_H) { + ele++; + ele->parent = ele - 1; + ele->me_h = ele->parent->me->left_h; + ele->me = RESOLVE_H(ele->me_h); + } + } + // else ele is left at r, i.e. s == r + + // Snapshot the successor, s. (Note - s could be r.) + as_index_ele *s_e = ele; + cf_arenax_handle s_h = s_e->me_h; + as_index *s = s_e->me; + + // Get the appropriate child of s. (Note - child could be sentinel.) + ele++; + + if (s->left_h == SENTINEL_H) { + ele->me_h = s->right_h; + } + else { + ele->me_h = s->left_h; + } + + ele->me = RESOLVE_H(ele->me_h); + + // Cut s (remember, it could be r) out of the tree. + ele->parent = s_e->parent; + + if (s_h == s_e->parent->me->left_h) { + s_e->parent->me->left_h = ele->me_h; + } + else { + s_e->parent->me->right_h = ele->me_h; + } + + // Rebalance at ele if necessary. (Note - if r != s, r is in the tree, and + // its parent may change during rebalancing.) + if (s->color == AS_BLACK) { + as_index_sprig_delete_rebalance(isprig, &root_parent, ele); + } + + if (s != r) { + // s was a successor distinct from r, put it in r's place in the tree. + s->left_h = r->left_h; + s->right_h = r->right_h; + s->color = r->color; + + if (r_h == r_e->parent->me->left_h) { + r_e->parent->me->left_h = s_h; + } + else { + r_e->parent->me->right_h = s_h; + } + } + + // If delete caused the root to change, save the new root. + if (root_parent.left_h != old_root) { + isprig->sprig->root_h = root_parent.left_h; + } + + // Flag record as deleted. + as_index_invalidate_record(r); + + // We may now destroy r, which is no longer in the sprig. 
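+	// (as_index_sprig_done() only frees r's arena slot when its ref-count
+	// drops to zero - concurrent readers that reserved r keep it alive until
+	// they release it.)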
+ as_index_sprig_done(isprig, r, r_h); + + isprig->sprig->n_elements--; + + cf_mutex_unlock(&isprig->pair->reduce_lock); + cf_mutex_unlock(&isprig->pair->lock); + + return 0; +} + + +//========================================================== +// Local helpers - search/rebalance a sprig. +// + +int +as_index_sprig_search_lockless(as_index_sprig *isprig, cf_digest *keyd, + as_index **ret, cf_arenax_handle *ret_h) +{ + cf_arenax_handle r_h = isprig->sprig->root_h; + as_index *r = RESOLVE_H(r_h); + + while (r_h != SENTINEL_H) { + _mm_prefetch(r, _MM_HINT_NTA); + + int cmp = cf_digest_compare(keyd, &r->keyd); + + if (cmp == 0) { + if (ret_h) { + *ret_h = r_h; + } + + if (ret) { + *ret = r; + } + + return 0; // found + } + + r_h = cmp > 0 ? r->left_h : r->right_h; + r = RESOLVE_H(r_h); + } + + return -1; // not found +} + + +void +as_index_sprig_insert_rebalance(as_index_sprig *isprig, as_index *root_parent, + as_index_ele *ele) +{ + // Entering here, ele is the last element on the stack. It turns out during + // insert rebalancing we won't ever need new elements on the stack, but make + // this resemble delete rebalance - define r_e to go back up the tree. + as_index_ele *r_e = ele; + as_index_ele *parent_e = r_e->parent; + + while (parent_e->me->color == AS_RED) { + as_index_ele *grandparent_e = parent_e->parent; + + if (r_e->parent->me_h == grandparent_e->me->left_h) { + // Element u is r's 'uncle'. + cf_arenax_handle u_h = grandparent_e->me->right_h; + as_index *u = RESOLVE_H(u_h); + + if (u->color == AS_RED) { + u->color = AS_BLACK; + parent_e->me->color = AS_BLACK; + grandparent_e->me->color = AS_RED; + + // Move up two layers - r becomes old r's grandparent. + r_e = parent_e->parent; + parent_e = r_e->parent; + } + else { + if (r_e->me_h == parent_e->me->right_h) { + // Save original r, which will become new r's parent. + as_index_ele *r0_e = r_e; + + // Move up one layer - r becomes old r's parent. + r_e = parent_e; + + // Then rotate r back down a layer. + as_index_rotate_left(r_e, r0_e); + + parent_e = r_e->parent; + // Note - grandparent_e is unchanged. + } + + parent_e->me->color = AS_BLACK; + grandparent_e->me->color = AS_RED; + + // r and parent move up a layer as grandparent rotates down. + as_index_rotate_right(grandparent_e, parent_e); + } + } + else { + // Element u is r's 'uncle'. + cf_arenax_handle u_h = grandparent_e->me->left_h; + as_index *u = RESOLVE_H(u_h); + + if (u->color == AS_RED) { + u->color = AS_BLACK; + parent_e->me->color = AS_BLACK; + grandparent_e->me->color = AS_RED; + + // Move up two layers - r becomes old r's grandparent. + r_e = parent_e->parent; + parent_e = r_e->parent; + } + else { + if (r_e->me_h == parent_e->me->left_h) { + // Save original r, which will become new r's parent. + as_index_ele *r0_e = r_e; + + // Move up one layer - r becomes old r's parent. + r_e = parent_e; + + // Then rotate r back down a layer. + as_index_rotate_right(r_e, r0_e); + + parent_e = r_e->parent; + // Note - grandparent_e is unchanged. + } + + parent_e->me->color = AS_BLACK; + grandparent_e->me->color = AS_RED; + + // r and parent move up a layer as grandparent rotates down. + as_index_rotate_left(grandparent_e, parent_e); + } + } + } + + RESOLVE_H(root_parent->left_h)->color = AS_BLACK; +} + + +void +as_index_sprig_delete_rebalance(as_index_sprig *isprig, as_index *root_parent, + as_index_ele *ele) +{ + // Entering here, ele is the last element on the stack. 
It's possible as r_e + // crawls up the tree, we'll need new elements on the stack, in which case + // ele keeps building the stack down while r_e goes up. + as_index_ele *r_e = ele; + + while (r_e->me->color == AS_BLACK && r_e->me_h != root_parent->left_h) { + as_index *r_parent = r_e->parent->me; + + if (r_e->me_h == r_parent->left_h) { + cf_arenax_handle s_h = r_parent->right_h; + as_index *s = RESOLVE_H(s_h); + + if (s->color == AS_RED) { + s->color = AS_BLACK; + r_parent->color = AS_RED; + + ele++; + // ele->parent will be set by rotation. + ele->me_h = s_h; + ele->me = s; + + as_index_rotate_left(r_e->parent, ele); + + s_h = r_parent->right_h; + s = RESOLVE_H(s_h); + } + + as_index *s_left = RESOLVE_H(s->left_h); + as_index *s_right = RESOLVE_H(s->right_h); + + if (s_left->color == AS_BLACK && s_right->color == AS_BLACK) { + s->color = AS_RED; + + r_e = r_e->parent; + } + else { + if (s_right->color == AS_BLACK) { + s_left->color = AS_BLACK; + s->color = AS_RED; + + ele++; + ele->parent = r_e->parent; + ele->me_h = s_h; + ele->me = s; + + as_index_ele *s_e = ele; + + ele++; + // ele->parent will be set by rotation. + ele->me_h = s->left_h; + ele->me = s_left; + + as_index_rotate_right(s_e, ele); + + s_h = r_parent->right_h; + s = s_left; // same as RESOLVE_H(s_h) + } + + s->color = r_parent->color; + r_parent->color = AS_BLACK; + RESOLVE_H(s->right_h)->color = AS_BLACK; + + ele++; + // ele->parent will be set by rotation. + ele->me_h = s_h; + ele->me = s; + + as_index_rotate_left(r_e->parent, ele); + + RESOLVE_H(root_parent->left_h)->color = AS_BLACK; + + return; + } + } + else { + cf_arenax_handle s_h = r_parent->left_h; + as_index *s = RESOLVE_H(s_h); + + if (s->color == AS_RED) { + s->color = AS_BLACK; + r_parent->color = AS_RED; + + ele++; + // ele->parent will be set by rotation. + ele->me_h = s_h; + ele->me = s; + + as_index_rotate_right(r_e->parent, ele); + + s_h = r_parent->left_h; + s = RESOLVE_H(s_h); + } + + as_index *s_left = RESOLVE_H(s->left_h); + as_index *s_right = RESOLVE_H(s->right_h); + + if (s_left->color == AS_BLACK && s_right->color == AS_BLACK) { + s->color = AS_RED; + + r_e = r_e->parent; + } + else { + if (s_left->color == AS_BLACK) { + s_right->color = AS_BLACK; + s->color = AS_RED; + + ele++; + ele->parent = r_e->parent; + ele->me_h = s_h; + ele->me = s; + + as_index_ele *s_e = ele; + + ele++; + // ele->parent will be set by rotation. + ele->me_h = s->right_h; + ele->me = s_right; + + as_index_rotate_left(s_e, ele); + + s_h = r_parent->left_h; + s = s_right; // same as RESOLVE_H(s_h) + } + + s->color = r_parent->color; + r_parent->color = AS_BLACK; + RESOLVE_H(s->left_h)->color = AS_BLACK; + + ele++; + // ele->parent will be set by rotation. + ele->me_h = s_h; + ele->me = s; + + as_index_rotate_right(r_e->parent, ele); + + RESOLVE_H(root_parent->left_h)->color = AS_BLACK; + + return; + } + } + } + + r_e->me->color = AS_BLACK; +} + + +void +as_index_rotate_left(as_index_ele *a, as_index_ele *b) +{ + // Element b is element a's right child - a will become b's left child. + + /* p --> p + * | | + * a b + * / \ / \ + * [x] b a [y] + * / \ / \ + * c [y] [x] c + */ + + // Set a's right child to c, b's former left child. + a->me->right_h = b->me->left_h; + + // Set p's left or right child (whichever a was) to b. + if (a->me_h == a->parent->me->left_h) { + a->parent->me->left_h = b->me_h; + } + else { + a->parent->me->right_h = b->me_h; + } + + // Set b's parent to p, a's old parent. + b->parent = a->parent; + + // Set b's left child to a, and a's parent to b. 
+ b->me->left_h = a->me_h; + a->parent = b; +} + + +void +as_index_rotate_right(as_index_ele *a, as_index_ele *b) +{ + // Element b is element a's left child - a will become b's right child. + + /* p --> p + * | | + * a b + * / \ / \ + * b [x] [y] a + * / \ / \ + * [y] c c [x] + */ + + // Set a's left child to c, b's former right child. + a->me->left_h = b->me->right_h; + + // Set p's left or right child (whichever a was) to b. + if (a->me_h == a->parent->me->left_h) { + a->parent->me->left_h = b->me_h; + } + else { + a->parent->me->right_h = b->me_h; + } + + // Set b's parent to p, a's old parent. + b->parent = a->parent; + + // Set b's right child to a, and a's parent to b. + b->me->right_h = a->me_h; + a->parent = b; +} diff --git a/as/src/base/index_ce.c b/as/src/base/index_ce.c new file mode 100644 index 00000000..faf94500 --- /dev/null +++ b/as/src/base/index_ce.c @@ -0,0 +1,67 @@ +/* + * index_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "base/index.h" + +#include "arenax.h" +#include "fault.h" + +#include "base/datamodel.h" + + +//========================================================== +// Public API. +// + +as_index_tree * +as_index_tree_resume(as_index_tree_shared *shared, cf_arenax *arena, + as_treex *treex) +{ + cf_crash(AS_INDEX, "CE code called as_index_tree_resume()"); + return NULL; +} + + +void +as_index_tree_shutdown(as_index_tree *tree, as_treex *treex) +{ + // For enterprise version only. +} + + +void +as_index_reduce_live(as_index_tree *tree, as_index_reduce_fn cb, void *udata) +{ + as_index_reduce(tree, cb, udata); +} + + +void +as_index_reduce_partial_live(as_index_tree *tree, uint64_t sample_count, + as_index_reduce_fn cb, void *udata) +{ + as_index_reduce_partial(tree, sample_count, cb, udata); +} diff --git a/as/src/base/job_manager.c b/as/src/base/job_manager.c new file mode 100644 index 00000000..87fe0a94 --- /dev/null +++ b/as/src/base/job_manager.c @@ -0,0 +1,806 @@ +/* + * job_manager.c + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==============================================================================
+// Includes.
+//
+
+#include "base/job_manager.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aerospike/as_string.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_queue.h"
+#include "citrusleaf/cf_queue_priority.h"
+
+#include "fault.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/monitor.h"
+#include "fabric/partition.h"
+
+
+//==============================================================================
+// Globals.
+//
+
+static cf_atomic32 g_job_trid = 0;
+
+
+
+//==============================================================================
+// Non-class-specific utilities.
+//
+
+static inline uint64_t
+job_trid(uint64_t trid)
+{
+    return trid != 0 ? trid : (uint64_t)cf_atomic32_incr(&g_job_trid);
+}
+
+static inline const char*
+job_result_str(int result_code)
+{
+    switch (result_code) {
+    case 0:
+        return "ok";
+    case AS_JOB_FAIL_UNKNOWN:
+        return "abandoned-unknown";
+    case AS_JOB_FAIL_CLUSTER_KEY:
+        return "abandoned-cluster-key";
+    case AS_JOB_FAIL_USER_ABORT:
+        return "user-aborted";
+    case AS_JOB_FAIL_RESPONSE_ERROR:
+        return "abandoned-response-error";
+    case AS_JOB_FAIL_RESPONSE_TIMEOUT:
+        return "abandoned-response-timeout";
+    default:
+        return "abandoned-?";
+    }
+}
+
+static inline int
+safe_priority(int priority)
+{
+    // Handles priority 0, the 'auto' priority.
+    return priority < AS_JOB_PRIORITY_LOW || priority > AS_JOB_PRIORITY_HIGH ?
+            AS_JOB_PRIORITY_MEDIUM : priority;
+}
+
+
+
+//==============================================================================
+// as_priority_thread_pool class implementation.
+// TODO - move to common.
+//
+
+//----------------------------------------------------------
+// as_priority_thread_pool typedefs and forward declarations.
+//
+
+typedef struct queue_task_s {
+    as_priority_thread_pool_task_fn task_fn;
+    void* task;
+} queue_task;
+
+uint32_t create_threads(as_priority_thread_pool* pool, uint32_t count);
+void shutdown_threads(as_priority_thread_pool* pool, uint32_t count);
+void* run_pool_thread(void* udata);
+int compare_cb(void* buf, void* task);
+
+//----------------------------------------------------------
+// as_priority_thread_pool public API.
+//
+
+bool
+as_priority_thread_pool_init(as_priority_thread_pool* pool, uint32_t n_threads)
+{
+    pthread_mutex_init(&pool->lock, NULL);
+
+    // Initialize queues.
+    pool->dispatch_queue = cf_queue_priority_create(sizeof(queue_task), true);
+    pool->complete_queue = cf_queue_create(sizeof(uint32_t), true);
+
+    // Start detached threads.
+    pool->n_threads = create_threads(pool, n_threads);
+
+    return pool->n_threads == n_threads;
+}
+
+void
+as_priority_thread_pool_shutdown(as_priority_thread_pool* pool)
+{
+    shutdown_threads(pool, pool->n_threads);
+    cf_queue_priority_destroy(pool->dispatch_queue);
+    cf_queue_destroy(pool->complete_queue);
+    pthread_mutex_destroy(&pool->lock);
+}
+
+bool
+as_priority_thread_pool_resize(as_priority_thread_pool* pool,
+        uint32_t n_threads)
+{
+    pthread_mutex_lock(&pool->lock);
+
+    bool result = true;
+
+    if (n_threads != pool->n_threads) {
+        if (n_threads < pool->n_threads) {
+            // Shut down excess threads.
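+            // (Each excess thread exits when it pops one of the NULL
+            // terminator tasks that shutdown_threads() pushes at high
+            // priority.)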
+ shutdown_threads(pool, pool->n_threads - n_threads); + pool->n_threads = n_threads; + } + else { + // Start new detached threads. + pool->n_threads += create_threads(pool, + n_threads - pool->n_threads); + result = pool->n_threads == n_threads; + } + } + + pthread_mutex_unlock(&pool->lock); + + return result; +} + +bool +as_priority_thread_pool_queue_task(as_priority_thread_pool* pool, + as_priority_thread_pool_task_fn task_fn, void* task, int priority) +{ + queue_task qtask = { task_fn, task }; + + return cf_queue_priority_push(pool->dispatch_queue, &qtask, priority) == + CF_QUEUE_OK; +} + +bool +as_priority_thread_pool_remove_task(as_priority_thread_pool* pool, void* task) +{ + queue_task qtask = { NULL, NULL }; + + cf_queue_priority_reduce_pop(pool->dispatch_queue, &qtask, compare_cb, + task); + + return qtask.task != NULL; +} + +void +as_priority_thread_pool_change_task_priority(as_priority_thread_pool* pool, + void* task, int new_priority) +{ + cf_queue_priority_reduce_change(pool->dispatch_queue, new_priority, + compare_cb, task); +} + +//---------------------------------------------------------- +// as_priority_thread_pool utilities. +// + +uint32_t +create_threads(as_priority_thread_pool* pool, uint32_t count) +{ + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + uint32_t n_threads_created = 0; + pthread_t thread; + + for (uint32_t i = 0; i < count; i++) { + if (pthread_create(&thread, &attrs, run_pool_thread, pool) == 0) { + n_threads_created++; + } + } + + return n_threads_created; +} + +void +shutdown_threads(as_priority_thread_pool* pool, uint32_t count) +{ + // Send terminator tasks to kill 'count' threads. + queue_task task = { NULL, NULL }; + + for (uint32_t i = 0; i < count; i++) { + cf_queue_priority_push(pool->dispatch_queue, &task, + CF_QUEUE_PRIORITY_HIGH); + } + + // Wait till threads finish. + uint32_t complete; + + for (uint32_t i = 0; i < count; i++) { + cf_queue_pop(pool->complete_queue, &complete, CF_QUEUE_FOREVER); + } +} + +void* +run_pool_thread(void* udata) +{ + as_priority_thread_pool* pool = (as_priority_thread_pool*)udata; + queue_task qtask; + + // Retrieve tasks from queue and execute. + while (cf_queue_priority_pop(pool->dispatch_queue, &qtask, + CF_QUEUE_FOREVER) == CF_QUEUE_OK) { + // A null task indicates thread should be shut down. + if (! qtask.task_fn) { + break; + } + + // Run task. + qtask.task_fn(qtask.task); + } + + // Send thread completion event back to caller. + uint32_t complete = 1; + + cf_queue_push(pool->complete_queue, &complete); + + return NULL; +} + +int +compare_cb(void* buf, void* task) +{ + return ((queue_task*)buf)->task == task ? -1 : 0; +} + + + +//============================================================================== +// as_job base class implementation. +// + +//---------------------------------------------------------- +// as_job typedefs and forward declarations. +// + +static inline const char* as_job_safe_set_name(as_job* _job); +static inline float as_job_progress(as_job* _job); +int as_job_partition_reserve(as_job* _job, int pid, as_partition_reservation* rsv); + +//---------------------------------------------------------- +// as_job public API. 
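+// A job runs as a chain of thread pool tasks - each as_job_slice() call
+// reserves one partition, requeues the job for the next partition, then
+// processes the reserved partition via the vtable's slice_fn. The last
+// active-count release triggers as_job_finish().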
+// + +void +as_job_init(as_job* _job, const as_job_vtable* vtable, + as_job_manager* mgr, as_job_rsv_type rsv_type, uint64_t trid, + as_namespace* ns, uint16_t set_id, int priority) +{ + memset(_job, 0, sizeof(as_job)); + + _job->vtable = *vtable; + _job->mgr = mgr; + _job->rsv_type = rsv_type; + _job->trid = job_trid(trid); + _job->ns = ns; + _job->set_id = set_id; + _job->priority = safe_priority(priority); + + pthread_mutex_init(&_job->requeue_lock, NULL); +} + +void +as_job_slice(void* task) +{ + as_job* _job = (as_job*)task; + + int pid = _job->next_pid; + as_partition_reservation rsv; + + if ((pid = as_job_partition_reserve(_job, pid, &rsv)) == AS_PARTITIONS) { + _job->next_pid = AS_PARTITIONS; + as_job_active_release(_job); + return; + } + + pthread_mutex_lock(&_job->requeue_lock); + + if (_job->abandoned != 0) { + pthread_mutex_unlock(&_job->requeue_lock); + as_partition_release(&rsv); + as_job_active_release(_job); + return; + } + + if ((_job->next_pid = pid + 1) < AS_PARTITIONS) { + as_job_active_reserve(_job); + as_job_manager_requeue_job(_job->mgr, _job); + } + + pthread_mutex_unlock(&_job->requeue_lock); + + _job->vtable.slice_fn(_job, &rsv); + + as_partition_release(&rsv); + as_job_active_release(_job); +} + +void +as_job_finish(as_job* _job) +{ + _job->vtable.finish_fn(_job); + as_job_manager_finish_job(_job->mgr, _job); +} + +void +as_job_destroy(as_job* _job) +{ + _job->vtable.destroy_fn(_job); + + pthread_mutex_destroy(&_job->requeue_lock); + cf_free(_job); +} + +void +as_job_info(as_job* _job, as_mon_jobstat* stat) +{ + uint64_t now = cf_getms(); + bool done = _job->finish_ms != 0; + uint64_t since_start_ms = now - _job->start_ms; + uint64_t since_finish_ms = done ? now - _job->finish_ms : 0; + uint64_t active_ms = done ? + _job->finish_ms - _job->start_ms : since_start_ms; + + stat->trid = _job->trid; + stat->priority = (uint32_t)_job->priority; + stat->progress_pct = as_job_progress(_job); + stat->run_time = active_ms; + stat->time_since_done = since_finish_ms; + stat->recs_read = cf_atomic64_get(_job->n_records_read); + + strcpy(stat->ns, _job->ns->name); + strcpy(stat->set, as_job_safe_set_name(_job)); + + char status[64]; + sprintf(status, "%s(%s)", done ? "done" : "active", + job_result_str(_job->abandoned)); + as_strncpy(stat->status, status, sizeof(stat->status)); + + _job->vtable.info_mon_fn(_job, stat); +} + +void +as_job_active_reserve(as_job* _job) +{ + cf_atomic32_incr(&_job->active_rc); +} + +void +as_job_active_release(as_job* _job) +{ + if (cf_atomic32_decr(&_job->active_rc) == 0) { + as_job_finish(_job); + } +} + +//---------------------------------------------------------- +// as_job utilities. +// + +static inline const char* +as_job_safe_set_name(as_job* _job) +{ + const char* set_name = as_namespace_get_set_name(_job->ns, _job->set_id); + + return set_name ? 
set_name : ""; // empty string means no set name displayed +} + +static inline float +as_job_progress(as_job* _job) +{ + return ((float)(_job->next_pid * 100)) / (float)AS_PARTITIONS; +} + +int +as_job_partition_reserve(as_job* _job, int pid, as_partition_reservation* rsv) +{ + if (_job->rsv_type == RSV_WRITE) { + while (pid < AS_PARTITIONS && as_partition_reserve_write(_job->ns, pid, + rsv, NULL) != 0) { + pid++; + } + } + else if (_job->rsv_type == RSV_MIGRATE) { + as_partition_reserve(_job->ns, pid, rsv); + } + else { + cf_crash(AS_JOB, "bad job rsv type %d", _job->rsv_type); + } + + return pid; +} + + + +//============================================================================== +// as_job_manager class implementation. +// + +//---------------------------------------------------------- +// as_job_manager typedefs and forward declarations. +// + +typedef struct find_item_s { + uint64_t trid; + as_job* _job; + bool remove; +} find_item; + +typedef struct info_item_s { + as_job** p_job; +} info_item; + +void as_job_manager_evict_finished_jobs(as_job_manager* mgr); +int as_job_manager_find_cb(void* buf, void* udata); +as_job* as_job_manager_find_job(cf_queue* jobs, uint64_t trid, bool remove); +static inline as_job* as_job_manager_find_any(as_job_manager* mgr, uint64_t trid); +static inline as_job* as_job_manager_find_active(as_job_manager* mgr, uint64_t trid); +static inline as_job* as_job_manager_remove_active(as_job_manager* mgr, uint64_t trid); +int as_job_manager_info_cb(void* buf, void* udata); + +//---------------------------------------------------------- +// as_job_manager public API. +// + +void +as_job_manager_init(as_job_manager* mgr, uint32_t max_active, uint32_t max_done, + uint32_t n_threads) +{ + mgr->max_active = max_active; + mgr->max_done = max_done; + + if (pthread_mutex_init(&mgr->lock, NULL) != 0) { + cf_crash(AS_JOB, "job manager failed mutex init"); + } + + mgr->active_jobs = cf_queue_create(sizeof(as_job*), false); + mgr->finished_jobs = cf_queue_create(sizeof(as_job*), false); + + if (! as_priority_thread_pool_init(&mgr->thread_pool, n_threads)) { + cf_crash(AS_JOB, "job manager failed thread pool init"); + } +} + +int +as_job_manager_start_job(as_job_manager* mgr, as_job* _job) +{ + pthread_mutex_lock(&mgr->lock); + + if (cf_queue_sz(mgr->active_jobs) >= mgr->max_active) { + cf_warning(AS_JOB, "max of %u jobs currently active", mgr->max_active); + pthread_mutex_unlock(&mgr->lock); + return AS_JOB_FAIL_FORBIDDEN; + } + + // Make sure trid is unique. 
+ if (as_job_manager_find_any(mgr, _job->trid)) { + cf_warning(AS_JOB, "job with trid %lu already active", _job->trid); + pthread_mutex_unlock(&mgr->lock); + return AS_JOB_FAIL_PARAMETER; + } + + _job->start_ms = cf_getms(); + as_job_active_reserve(_job); + cf_queue_push(mgr->active_jobs, &_job); + as_priority_thread_pool_queue_task(&mgr->thread_pool, as_job_slice, _job, + _job->priority); + + pthread_mutex_unlock(&mgr->lock); + return 0; +} + +void +as_job_manager_requeue_job(as_job_manager* mgr, as_job* _job) +{ + as_priority_thread_pool_queue_task(&mgr->thread_pool, as_job_slice, _job, + _job->priority); +} + +void +as_job_manager_finish_job(as_job_manager* mgr, as_job* _job) +{ + pthread_mutex_lock(&mgr->lock); + + as_job_manager_remove_active(mgr, _job->trid); + _job->finish_ms = cf_getms(); + cf_queue_push(mgr->finished_jobs, &_job); + as_job_manager_evict_finished_jobs(mgr); + + pthread_mutex_unlock(&mgr->lock); +} + +void +as_job_manager_abandon_job(as_job_manager* mgr, as_job* _job, int reason) +{ + pthread_mutex_lock(&_job->requeue_lock); + _job->abandoned = reason; + bool found = as_priority_thread_pool_remove_task(&mgr->thread_pool, _job); + pthread_mutex_unlock(&_job->requeue_lock); + + if (found) { + as_job_active_release(_job); + } +} + +bool +as_job_manager_abort_job(as_job_manager* mgr, uint64_t trid) +{ + pthread_mutex_lock(&mgr->lock); + + as_job* _job = as_job_manager_find_active(mgr, trid); + + if (! _job) { + pthread_mutex_unlock(&mgr->lock); + return false; + } + + pthread_mutex_lock(&_job->requeue_lock); + _job->abandoned = AS_JOB_FAIL_USER_ABORT; + bool found = as_priority_thread_pool_remove_task(&mgr->thread_pool, _job); + pthread_mutex_unlock(&_job->requeue_lock); + + pthread_mutex_unlock(&mgr->lock); + + if (found) { + as_job_active_release(_job); + } + + return true; +} + +int +as_job_manager_abort_all_jobs(as_job_manager* mgr) +{ + pthread_mutex_lock(&mgr->lock); + + int n_jobs = cf_queue_sz(mgr->active_jobs); + + if (n_jobs == 0) { + pthread_mutex_unlock(&mgr->lock); + return 0; + } + + as_job* _jobs[n_jobs]; + info_item item = { _jobs }; + + cf_queue_reduce(mgr->active_jobs, as_job_manager_info_cb, &item); + + bool found[n_jobs]; + + for (int i = 0; i < n_jobs; i++) { + as_job* _job = _jobs[i]; + + pthread_mutex_lock(&_job->requeue_lock); + _job->abandoned = AS_JOB_FAIL_USER_ABORT; + found[i] = as_priority_thread_pool_remove_task(&mgr->thread_pool, _job); + pthread_mutex_unlock(&_job->requeue_lock); + } + + pthread_mutex_unlock(&mgr->lock); + + for (int i = 0; i < n_jobs; i++) { + if (found[i]) { + as_job_active_release(_jobs[i]); + } + } + + return n_jobs; +} + +bool +as_job_manager_change_job_priority(as_job_manager* mgr, uint64_t trid, + int priority) +{ + pthread_mutex_lock(&mgr->lock); + + as_job* _job = as_job_manager_find_active(mgr, trid); + + if (! 
_job) { + pthread_mutex_unlock(&mgr->lock); + return false; + } + + pthread_mutex_lock(&_job->requeue_lock); + _job->priority = safe_priority(priority); + as_priority_thread_pool_change_task_priority(&mgr->thread_pool, _job, + _job->priority); + pthread_mutex_unlock(&_job->requeue_lock); + + pthread_mutex_unlock(&mgr->lock); + return true; +} + +void +as_job_manager_limit_active_jobs(as_job_manager* mgr, uint32_t max_active) +{ + mgr->max_active = max_active; +} + +void +as_job_manager_limit_finished_jobs(as_job_manager* mgr, uint32_t max_done) +{ + pthread_mutex_lock(&mgr->lock); + mgr->max_done = max_done; + as_job_manager_evict_finished_jobs(mgr); + pthread_mutex_unlock(&mgr->lock); +} + +void +as_job_manager_resize_thread_pool(as_job_manager* mgr, uint32_t n_threads) +{ + as_priority_thread_pool_resize(&mgr->thread_pool, n_threads); +} + +as_mon_jobstat* +as_job_manager_get_job_info(as_job_manager* mgr, uint64_t trid) +{ + pthread_mutex_lock(&mgr->lock); + + as_job* _job = as_job_manager_find_any(mgr, trid); + + if (! _job) { + pthread_mutex_unlock(&mgr->lock); + return NULL; + } + + as_mon_jobstat* stat = cf_malloc(sizeof(as_mon_jobstat)); + + memset(stat, 0, sizeof(as_mon_jobstat)); + as_job_info(_job, stat); + + pthread_mutex_unlock(&mgr->lock); + return stat; // caller must free this +} + +as_mon_jobstat* +as_job_manager_get_info(as_job_manager* mgr, int* size) +{ + *size = 0; + + pthread_mutex_lock(&mgr->lock); + + int n_jobs = cf_queue_sz(mgr->active_jobs) + + cf_queue_sz(mgr->finished_jobs); + + if (n_jobs == 0) { + pthread_mutex_unlock(&mgr->lock); + return NULL; + } + + as_job* _jobs[n_jobs]; + info_item item = { _jobs }; + + cf_queue_reduce_reverse(mgr->active_jobs, as_job_manager_info_cb, &item); + cf_queue_reduce_reverse(mgr->finished_jobs, as_job_manager_info_cb, &item); + + size_t stats_size = sizeof(as_mon_jobstat) * n_jobs; + as_mon_jobstat* stats = cf_malloc(stats_size); + + memset(stats, 0, stats_size); + + for (int i = 0; i < n_jobs; i++) { + as_job_info(_jobs[i], &stats[i]); + } + + pthread_mutex_unlock(&mgr->lock); + + *size = n_jobs; + return stats; // caller must free this +} + +int +as_job_manager_get_active_job_count(as_job_manager* mgr) +{ + pthread_mutex_lock(&mgr->lock); + int n_jobs = cf_queue_sz(mgr->active_jobs); + pthread_mutex_unlock(&mgr->lock); + + return n_jobs; +} + +//---------------------------------------------------------- +// as_job_manager utilities. +// + +void +as_job_manager_evict_finished_jobs(as_job_manager* mgr) +{ + int max_allowed = (int)mgr->max_done; + + while (cf_queue_sz(mgr->finished_jobs) > max_allowed) { + as_job* _job; + + cf_queue_pop(mgr->finished_jobs, &_job, 0); + as_job_destroy(_job); + } +} + +int +as_job_manager_find_cb(void* buf, void* udata) +{ + as_job* _job = *(as_job**)buf; + find_item* match = (find_item*)udata; + + if (match->trid == _job->trid) { + match->_job = _job; + return match->remove ? -2 : -1; + } + + return 0; +} + +as_job* +as_job_manager_find_job(cf_queue* jobs, uint64_t trid, bool remove) +{ + find_item item = { trid, NULL, remove }; + + cf_queue_reduce(jobs, as_job_manager_find_cb, &item); + + return item._job; +} + +static inline as_job* +as_job_manager_find_any(as_job_manager* mgr, uint64_t trid) +{ + as_job* _job = as_job_manager_find_job(mgr->active_jobs, trid, false); + + if (! 
_job) { + _job = as_job_manager_find_job(mgr->finished_jobs, trid, false); + } + + return _job; +} + +static inline as_job* +as_job_manager_find_active(as_job_manager* mgr, uint64_t trid) +{ + return as_job_manager_find_job(mgr->active_jobs, trid, false); +} + +static inline as_job* +as_job_manager_remove_active(as_job_manager* mgr, uint64_t trid) +{ + return as_job_manager_find_job(mgr->active_jobs, trid, true); +} + +int +as_job_manager_info_cb(void* buf, void* udata) +{ + as_job* _job = *(as_job**)buf; + info_item* item = (info_item*)udata; + + *item->p_job++ = _job; + + return 0; +} diff --git a/as/src/base/json_init.c b/as/src/base/json_init.c new file mode 100644 index 00000000..a7c93f9e --- /dev/null +++ b/as/src/base/json_init.c @@ -0,0 +1,62 @@ +/* + * json_init.c + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "jansson.h" +#include "citrusleaf/alloc.h" +#include "base/json_init.h" + +/* SYNOPSIS + * This module handles initialization of the Jansson JSON API by + * setting the memory allocation functions to be used internally + * by Jansson to the CF allocation-related functions. + */ + +/* + * Note that actual wrapper functions are needed instead of simply + * using the names of the CF malloc() and free() functions, since the + * memory allocation instrumentation infrastructure uses macroexpansion + * of the CF allocation-related function names to track all allocations. + */ + +/* + * Wrapper function to call the CF malloc() function. + */ +static void *as_json_malloc(size_t size) +{ + return cf_malloc(size); +} + +/* + * Wrapper function to call the CF free() function. + */ +static void as_json_free(void *ptr) +{ + cf_free(ptr); +} + +/* + * Initialize the JSON module by setting the memory allocation functions. + */ +void as_json_init() +{ + json_set_alloc_funcs(as_json_malloc, as_json_free); +} diff --git a/as/src/base/monitor.c b/as/src/base/monitor.c new file mode 100644 index 00000000..e0b22540 --- /dev/null +++ b/as/src/base/monitor.c @@ -0,0 +1,474 @@ +/* + * monitor.c + * + * Copyright (C) 2013-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * Aerospike Long Running Job Monitoring interface
+ *
+ * This file implements the generic interface for long running jobs in
+ * Aerospike, such as query / scan / batch, making it possible to see what
+ * is going on in the system.
+ *
+ * Each module that needs to show up in the monitoring must register and
+ * implement the interfaces.
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "base/secondary_index.h"
+#include "base/monitor.h"
+#include "base/scan.h"
+#include "base/thr_sindex.h"
+
+
+#define AS_MON_MAX_MODULE 10
+
+// Indexed by as_mon_module_slot - keep in sync.
+const char * AS_MON_MODULES[] = {
+    "query",
+    "scan",
+    "sindex-builder"
+};
+
+// Function declarations.
+int as_mon_populate_jobstat(as_mon_jobstat * stat, cf_dyn_buf *db);
+static as_mon * g_as_mon_module[AS_MON_MAX_MODULE];
+static uint32_t g_as_mon_curr_mod_count;
+int as_mon_register(const char *module);
+
+/*
+ * This is called to init the mon subsystem.
+ */
+int
+as_mon_init()
+{
+    g_as_mon_curr_mod_count = 0;
+    as_mon_register(AS_MON_MODULES[QUERY_MOD]);
+    as_mon_register(AS_MON_MODULES[SCAN_MOD]);
+    as_mon_register(AS_MON_MODULES[SBLD_MOD]);
+
+    // TODO - add any locks or extra stats needed here.
+    return AS_MON_OK;
+}
+
+as_mon *
+as_mon_get_module(const char * module)
+{
+    as_mon_module_slot mod;
+    if (strcmp(module, AS_MON_MODULES[QUERY_MOD]) == 0) {
+        mod = QUERY_MOD;
+    }
+    else if (strcmp(module, AS_MON_MODULES[SCAN_MOD]) == 0) {
+        mod = SCAN_MOD;
+    }
+    else if (strcmp(module, AS_MON_MODULES[SBLD_MOD]) == 0) {
+        mod = SBLD_MOD;
+    }
+    else {
+        return NULL;
+    }
+
+    return g_as_mon_module[mod];
+}
+
+/*
+ * The call to register a module to be tracked under the as_mon interface.
+ * Returns -
+ *     AS_MON_OK - on successful registration.
+ *     AS_MON_ERR - on failure.
+ */
+int
+as_mon_register(const char *module)
+{
+    if (!module) return AS_MON_ERR;
+    as_mon *mon_obj = (as_mon *) cf_rc_alloc(sizeof(as_mon));
+    as_mon_cb *cb = cf_malloc(sizeof(as_mon_cb));
+    as_mon_module_slot mod;
+
+    if (!strcmp(module, AS_MON_MODULES[QUERY_MOD])) {
+        cb->get_jobstat = as_query_get_jobstat;
+        cb->get_jobstat_all = as_query_get_jobstat_all;
+
+        cb->set_priority = as_query_set_priority;
+        cb->kill = as_query_kill;
+        cb->suspend = NULL;
+        cb->set_pendingmax = NULL;
+        cb->set_maxinflight = NULL;
+        cb->set_maxpriority = NULL;
+        mod = QUERY_MOD;
+    }
+    else if (!strcmp(module, AS_MON_MODULES[SCAN_MOD])) {
+        cb->get_jobstat = as_scan_get_jobstat;
+        cb->get_jobstat_all = as_scan_get_jobstat_all;
+
+        cb->set_priority = as_scan_change_job_priority;
+        cb->kill = as_scan_abort;
+        cb->suspend = NULL;
+        cb->set_pendingmax = NULL;
+        cb->set_maxinflight = NULL;
+        cb->set_maxpriority = NULL;
+        mod = SCAN_MOD;
+    }
+    else if (!strcmp(module, AS_MON_MODULES[SBLD_MOD])) {
+        cb->get_jobstat = as_sbld_get_jobstat;
+        cb->get_jobstat_all = as_sbld_get_jobstat_all;
+
+        cb->set_priority = NULL;
+        cb->kill = as_sbld_abort;
+        cb->suspend = NULL;
+        cb->set_pendingmax = NULL;
+        cb->set_maxinflight = NULL;
+        cb->set_maxpriority = NULL;
+        mod = SBLD_MOD;
+    }
+    else {
+        cf_warning(AS_MON, "wrong module parameter.");
+        return AS_MON_ERR;
+    }
+
+    // Set up the mon object.
+    mon_obj->type = cf_strdup(module);
+    memcpy(&mon_obj->cb, cb, sizeof(as_mon_cb));
+    cf_free(cb); // cb was only a local staging copy
+
+    g_as_mon_curr_mod_count++;
+    g_as_mon_module[mod] = mon_obj;
+    return AS_MON_OK;
+}
+
+/*
+ * Calls the callback function to kill a job.
+ *
+ * Returns
+ *     AS_MON_OK - on success.
+ *     AS_MON_ERR - on failure.
+ *
+ */
+int
+as_mon_killjob(const char *module, uint64_t id, cf_dyn_buf *db)
+{
+    int retval = AS_MON_ERR;
+    as_mon * mon_object = as_mon_get_module(module);
+
+    if (!mon_object) {
+        cf_warning(AS_MON, "Failed to find module %s", module);
+        cf_dyn_buf_append_string(db, "ERROR:");
+        cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+        cf_dyn_buf_append_string(db, ":module \"");
+        cf_dyn_buf_append_string(db, module);
+        cf_dyn_buf_append_string(db, "\" not found");
+        return retval;
+    }
+
+    if (mon_object->cb.kill) {
+        retval = mon_object->cb.kill(id);
+
+        if (retval == AS_MON_OK) {
+            cf_dyn_buf_append_string(db, "OK");
+        }
+        else {
+            cf_dyn_buf_append_string(db, "ERROR:");
+            cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+            cf_dyn_buf_append_string(db, ":job not active");
+        }
+    }
+    else {
+        cf_dyn_buf_append_string(db, "ERROR:");
+        cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER);
+        cf_dyn_buf_append_string(db, ":kill-job not supported for module \"");
+        cf_dyn_buf_append_string(db, module);
+        cf_dyn_buf_append_string(db, "\"");
+    }
+    return retval;
+}
+
+/*
+ * Calls the callback function to set the priority of a job.
+ *
+ * Returns
+ *     AS_MON_OK - on success.
+ *     AS_MON_ERR - on failure.
+ * + */ +int +as_mon_set_priority(const char *module, uint64_t id, uint32_t priority, cf_dyn_buf *db) +{ + if (priority == 0) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":priority value must be greater than zero"); + return AS_MON_ERR; + } + int retval = AS_MON_ERR; + as_mon * mon_object = as_mon_get_module(module); + + if (!mon_object) { + cf_warning(AS_MON, "Failed to find module %s", module); + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND); + cf_dyn_buf_append_string(db, ":module \""); + cf_dyn_buf_append_string(db, module); + cf_dyn_buf_append_string(db, "\" not found"); + return retval; + } + + if (mon_object->cb.set_priority) { + retval = mon_object->cb.set_priority(id, priority); + + if (retval == AS_MON_OK) { + cf_dyn_buf_append_string(db, "OK"); + } + else { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND); + cf_dyn_buf_append_string(db, ":job not active"); + } + } + else { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":set-priority not supported for module \""); + cf_dyn_buf_append_string(db, module); + cf_dyn_buf_append_string(db, "\""); + } + return retval; +} + +/* + * Calls the callback function to populate the stat of a particular job. + * + * Returns + * AS_MON_OK - On success. + * AS_MON_ERR - on failure. + * + */ +int +as_mon_populate_jobstat(as_mon_jobstat * job_stat, cf_dyn_buf *db) +{ + cf_dyn_buf_append_string(db, "trid="); + cf_dyn_buf_append_uint64(db, job_stat->trid); + + if (job_stat->job_type[0]) { + cf_dyn_buf_append_string(db, ":job-type="); + cf_dyn_buf_append_string(db, job_stat->job_type); + } + + cf_dyn_buf_append_string(db, ":ns="); + cf_dyn_buf_append_string(db, job_stat->ns); + + if (job_stat->set[0]) { + cf_dyn_buf_append_string(db, ":set="); + cf_dyn_buf_append_string(db, job_stat->set); + } + + cf_dyn_buf_append_string(db, ":priority="); + cf_dyn_buf_append_uint32(db, job_stat->priority); + + if (job_stat->status[0]) { + cf_dyn_buf_append_string(db, ":status="); + cf_dyn_buf_append_string(db, job_stat->status); + } + + char progress_pct[8]; + sprintf(progress_pct, "%.2f", job_stat->progress_pct); + + cf_dyn_buf_append_string(db, ":job-progress="); + cf_dyn_buf_append_string(db, progress_pct); + + cf_dyn_buf_append_string(db, ":run-time="); + cf_dyn_buf_append_uint64(db, job_stat->run_time); + + cf_dyn_buf_append_string(db, ":time-since-done="); + cf_dyn_buf_append_uint64(db, job_stat->time_since_done); + + cf_dyn_buf_append_string(db, ":recs-read="); + cf_dyn_buf_append_uint64(db, job_stat->recs_read); + + cf_dyn_buf_append_string(db, ":net-io-bytes="); + cf_dyn_buf_append_uint64(db, job_stat->net_io_bytes); + + // char cpu_data[100]; + // sprintf(cpu_data, "%f", job_stat->cpu); + // cf_dyn_buf_append_string(db, cpu_data); + + if (job_stat->jdata[0]) { + cf_dyn_buf_append_string(db, job_stat->jdata); + } + + return AS_MON_OK; +} + +static int +as_mon_get_jobstat_reduce_fn(as_mon *mon_object, cf_dyn_buf *db) +{ + int size = 0; + as_mon_jobstat * job_stats = NULL; + if (mon_object->cb.get_jobstat_all) { + job_stats = mon_object->cb.get_jobstat_all(&size); + } + + // return OK to go to next module + if (!job_stats) return AS_MON_OK; + + as_mon_jobstat * job; + job = job_stats; + + for (int i = 0; i < size; i++) { + cf_dyn_buf_append_string(db, "module="); + 
cf_dyn_buf_append_string(db, mon_object->type);
+        cf_dyn_buf_append_string(db, ":");
+        as_mon_populate_jobstat(job, db);
+        cf_dyn_buf_append_string(db, ";");
+        job++;
+    }
+    cf_free(job_stats);
+    return AS_MON_OK;
+}
+
+/*
+ * This is called when the info call is triggered, to get info about all
+ * the jobs.
+ *
+ * parameter:
+ *     @db: in/out buffer which gets populated. Each module's stats are
+ *     colon-separated key:value pairs, and each module's info is semicolon
+ *     separated, e.g.
+ *     module:query:cpu:<val>:mem:<val>;module:query:cpu:<val>:mem:<val>;
+ *
+ * returns: 0 in case of success
+ *     negative value in case of failure
+ */
+int
+as_mon_get_jobstat_all(const char *module, cf_dyn_buf *db)
+{
+    bool found_module = false;
+
+    for (uint32_t i = 0; i < g_as_mon_curr_mod_count; i++) {
+        if ((module && !strcmp(g_as_mon_module[i]->type, module))
+                || (!module)) {
+            as_mon_get_jobstat_reduce_fn(g_as_mon_module[i], db);
+            if (module) {
+                found_module = true;
+            }
+        }
+    }
+
+    if (module && !found_module) {
+        cf_dyn_buf_append_string(db, "ERROR:");
+        cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+        cf_dyn_buf_append_string(db, ":module \"");
+        cf_dyn_buf_append_string(db, module);
+        cf_dyn_buf_append_string(db, "\" not found");
+    }
+    else {
+        cf_dyn_buf_chomp(db);
+    }
+    return 0;
+}
+
+/*
+ * This is called when the info call is triggered, to get info about a
+ * particular job in a particular module.
+ *
+ * parameter:
+ *     @db: in/out buffer which gets populated, in the same colon-separated
+ *     format as as_mon_get_jobstat_all() above.
+ *
+ * returns: 0 in case of success
+ *     negative value in case of failure
+ */
+int
+as_mon_get_jobstat(const char *module, uint64_t id, cf_dyn_buf *db)
+{
+    int retval = AS_MON_ERR;
+    as_mon * mon_object = as_mon_get_module(module);
+
+    if (!mon_object) {
+        cf_warning(AS_MON, "Failed to find module %s", module);
+        cf_dyn_buf_append_string(db, "ERROR:");
+        cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+        cf_dyn_buf_append_string(db, ":module \"");
+        cf_dyn_buf_append_string(db, module);
+        cf_dyn_buf_append_string(db, "\" not found");
+        return retval;
+    }
+
+    as_mon_jobstat * job_stat = NULL;
+
+    if (mon_object->cb.get_jobstat) {
+        job_stat = mon_object->cb.get_jobstat(id);
+    }
+    else {
+        cf_dyn_buf_append_string(db, "ERROR:");
+        cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER);
+        cf_dyn_buf_append_string(db, ":get-job not supported for module \"");
+        cf_dyn_buf_append_string(db, module);
+        cf_dyn_buf_append_string(db, "\"");
+        return retval;
+    }
+
+    if (job_stat) {
+        retval = as_mon_populate_jobstat(job_stat, db);
+        cf_free(job_stat);
+    }
+    else {
+        cf_dyn_buf_append_string(db, "ERROR:");
+        cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+        cf_dyn_buf_append_string(db, ":job not found");
+    }
+    return retval;
+}
+
+/*
+ * Manipulates the monitor system - dispatches the get-job, kill-job and
+ * set-priority commands to the registered modules.
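+ *
+ * Typically reached via an info command of the form below (assuming the
+ * "jobs" info handler passes module/cmd/trid/value through unchanged):
+ *
+ *     jobs:module=scan;cmd=kill-job;trid=<trid>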
+ * + */ + +void +as_mon_info_cmd(const char *module, char *cmd, uint64_t trid, uint32_t value, cf_dyn_buf *db) +{ + if (module == NULL) { + as_mon_get_jobstat_all(NULL, db); + return; + } + + if (cmd == NULL) { + as_mon_get_jobstat_all(module, db); + return; + } + + if (!strcmp(cmd, "get-job")) { + as_mon_get_jobstat(module, trid, db); + } + else if (!strcmp(cmd, "kill-job")) { + as_mon_killjob(module, trid, db); + } + else if (!strcmp(cmd, "set-priority")) { + as_mon_set_priority(module, trid, value, db); + } + else { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":unrecognized command \""); + cf_dyn_buf_append_string(db, cmd); + cf_dyn_buf_append_string(db, "\""); + } +} diff --git a/as/src/base/namespace.c b/as/src/base/namespace.c new file mode 100644 index 00000000..d721f3d2 --- /dev/null +++ b/as/src/base/namespace.c @@ -0,0 +1,746 @@ +/* + * namespace.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_hash_math.h" + +#include "dynbuf.h" +#include "fault.h" +#include "hist.h" +#include "linear_hist.h" +#include "vmapx.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/secondary_index.h" +#include "base/truncate.h" +#include "fabric/partition.h" +#include "fabric/roster.h" +#include "storage/storage.h" + + +//========================================================== +// Typedefs & constants. +// + + +//========================================================== +// Globals. +// + + +//========================================================== +// Forward declarations. +// + +static void append_set_props(as_set *p_set, cf_dyn_buf *db); + + +//========================================================== +// Inlines & macros. +// + +static inline uint32_t +ns_name_hash(char *name) +{ + uint32_t hv = cf_hash_fnv32((const uint8_t *)name, strlen(name)); + + // Don't collide with a ns-id. + if (hv <= AS_NAMESPACE_SZ) { + hv += AS_NAMESPACE_SZ; + } + + return hv; +} + + +//========================================================== +// Public API. 
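ns_name_hash() above maps a namespace name to a 32-bit hash and then bumps any value that would land in [0, AS_NAMESPACE_SZ], so a name hash can never be mistaken for a (1-based) namespace id. A self-contained sketch of the same logic, with a textbook FNV-1a standing in for cf_hash_fnv32() and an illustrative AS_NAMESPACE_SZ (both are assumptions, not the build's definitions):

#include <stdint.h>
#include <string.h>

#define AS_NAMESPACE_SZ 32 // illustrative limit, not the server's actual value

// Textbook 32-bit FNV-1a, standing in for cf_hash_fnv32().
static uint32_t
fnv1a_32(const uint8_t *buf, size_t len)
{
	uint32_t hv = 2166136261u;

	for (size_t i = 0; i < len; i++) {
		hv ^= buf[i];
		hv *= 16777619u;
	}

	return hv;
}

static uint32_t
ns_name_hash_sketch(const char *name)
{
	uint32_t hv = fnv1a_32((const uint8_t *)name, strlen(name));

	// Keep name hashes clear of the range reserved for namespace ids.
	return hv <= AS_NAMESPACE_SZ ? hv + AS_NAMESPACE_SZ : hv;
}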
+ + +//========================================================== +// Public API. +// + +as_namespace * +as_namespace_create(char *name) +{ + cf_assert_nostack(strlen(name) < AS_ID_NAMESPACE_SZ, + AS_NAMESPACE, "{%s} namespace name too long (max length is %u)", + name, AS_ID_NAMESPACE_SZ - 1); + + cf_assert_nostack(g_config.n_namespaces < AS_NAMESPACE_SZ, + AS_NAMESPACE, "too many namespaces (max is %u)", AS_NAMESPACE_SZ); + + uint32_t namehash = ns_name_hash(name); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + if (strcmp(ns->name, name) == 0) { + cf_crash_nostack(AS_NAMESPACE, "{%s} duplicate namespace", name); + } + + // Check for CE also, in case deployment later becomes EE with XDR. + if (ns->namehash == namehash) { + cf_crash_nostack(AS_XDR, "{%s} {%s} namespace name hashes collide", + ns->name, name); + } + } + + as_namespace *ns = cf_malloc(sizeof(as_namespace)); + + g_config.namespaces[g_config.n_namespaces++] = ns; + + // Set all members 0/NULL/false to start with. + memset(ns, 0, sizeof(as_namespace)); + + strcpy(ns->name, name); + ns->id = g_config.n_namespaces; // note that id is 1-based + ns->namehash = namehash; + + ns->jem_arena = cf_alloc_create_arena(); + cf_info(AS_NAMESPACE, "{%s} uses JEMalloc arena %d", name, ns->jem_arena); + + ns->cold_start = false; // try warm or cool restart unless told not to + ns->arena = NULL; // can't create the arena until the configuration has been done + + //-------------------------------------------- + // Non-0/NULL/false configuration defaults. + // + + ns->cfg_replication_factor = 2; + ns->replication_factor = 0; // gets set on rebalance + ns->memory_size = 1024LL * 1024LL * 1024LL * 4LL; // default memory limit is 4G per namespace + + ns->sets_enable_xdr = true; // ship all the sets by default + ns->ns_allow_nonxdr_writes = true; // allow nonxdr writes by default + ns->ns_allow_xdr_writes = true; // allow xdr writes by default + cf_vector_pointer_init(&ns->xdr_dclist_v, 3, 0); + + ns->cold_start_evict_ttl = 0xFFFFffff; // unless this is specified via config file, use evict void-time saved in device header + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_UNDEF; + ns->evict_hist_buckets = 10000; // for 30 day TTL, bucket width is 4 minutes 20 seconds + ns->evict_tenths_pct = 5; // default eviction amount is 0.5% + ns->hwm_disk_pct = 50; // evict when device usage exceeds 50% + ns->hwm_memory_pct = 60; // evict when memory usage exceeds 60% of namespace memory-size + ns->max_ttl = MAX_ALLOWED_TTL; // 10 years + ns->migrate_order = 5; + ns->migrate_retransmit_ms = 1000 * 5; // 5 seconds + ns->migrate_sleep = 1; + ns->obj_size_hist_max = OBJ_SIZE_HIST_NUM_BUCKETS; + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_PROTO; + ns->stop_writes_pct = 90; // stop writes when 90% of either memory or disk is used + ns->tomb_raider_eligible_age = 60 * 60 * 24; // 1 day + ns->tomb_raider_period = 60 * 60 * 24; // 1 day + ns->tree_shared.n_lock_pairs = 8; + ns->tree_shared.n_sprigs = 64; + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_PROTO; + + ns->storage_type = AS_STORAGE_ENGINE_MEMORY; + ns->storage_data_in_memory = true; + // Note - default true is consistent with AS_STORAGE_ENGINE_MEMORY, but + // cfg.c will set default false for AS_STORAGE_ENGINE_SSD.
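+ // Sanity check on evict_hist_buckets above: a 30-day TTL spans 30 * 24 * 3600 = + // 2,592,000 seconds, and 2,592,000 / 10,000 buckets = 259.2 seconds per bucket - + // the "4 minutes 20 seconds" noted there.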
+ + ns->storage_filesize = 1024UL * 1024UL * 1024UL * 16UL; // default file size is 16G per file + ns->storage_scheduler_mode = NULL; // null indicates default is to not change scheduler mode + ns->storage_write_block_size = 1024 * 1024; + ns->storage_defrag_lwm_pct = 50; // defrag if occupancy of block is < 50% + ns->storage_defrag_sleep = 1000; // sleep this many microseconds between each wblock + ns->storage_defrag_startup_minimum = 10; // defrag until >= 10% disk is writable before joining cluster + ns->storage_flush_max_us = 1000 * 1000; // wait this many microseconds before flushing inactive current write buffer (0 = never) + ns->storage_max_write_cache = 1024 * 1024 * 64; + ns->storage_min_avail_pct = 5; // stop writes when < 5% disk is writable + ns->storage_post_write_queue = 256; // number of wblocks per device used as post-write cache + ns->storage_tomb_raider_sleep = 1000; // sleep this many microseconds between each device read + ns->storage_write_threads = 1; + + ns->sindex_num_partitions = DEFAULT_PARTITIONS_PER_INDEX; + + ns->geo2dsphere_within_strict = true; + ns->geo2dsphere_within_min_level = 1; + ns->geo2dsphere_within_max_level = 30; + ns->geo2dsphere_within_max_cells = 12; + ns->geo2dsphere_within_level_mod = 1; + ns->geo2dsphere_within_earth_radius_meters = 6371000; // Wikipedia, mean + + return ns; +} + + +void +as_namespaces_init(bool cold_start_cmd, uint32_t instance) +{ + uint32_t stage_capacity = as_mem_check(); + + as_namespaces_setup(cold_start_cmd, instance, stage_capacity); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + // Done with temporary sets configuration array. + if (ns->sets_cfg_array) { + cf_free(ns->sets_cfg_array); + } + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition_init(ns, pid); + } + + as_truncate_init(ns); + as_sindex_init(ns); + } + + as_roster_init_smd(); + as_truncate_init_smd(); + as_sindex_init_smd(); // before as_storage_init() populates the indexes +} + + +bool +as_namespace_configure_sets(as_namespace *ns) +{ + for (uint32_t i = 0; i < ns->sets_cfg_count; i++) { + uint32_t idx; + cf_vmapx_err result = cf_vmapx_put_unique(ns->p_sets_vmap, + ns->sets_cfg_array[i].name, &idx); + + if (result == CF_VMAPX_OK || result == CF_VMAPX_ERR_NAME_EXISTS) { + as_set* p_set = NULL; + + if ((result = cf_vmapx_get_by_index(ns->p_sets_vmap, idx, + (void**)&p_set)) != CF_VMAPX_OK) { + // Should be impossible - just verified idx. + cf_crash(AS_NAMESPACE, "vmap error %d", result); + } + + // Transfer configurable metadata. + p_set->stop_writes_count = ns->sets_cfg_array[i].stop_writes_count; + p_set->disable_eviction = ns->sets_cfg_array[i].disable_eviction; + p_set->enable_xdr = ns->sets_cfg_array[i].enable_xdr; + } + else { + // Maybe exceeded max sets allowed, but try failing gracefully. 
+ cf_warning(AS_NAMESPACE, "vmap error %d", result); + return false; + } + } + + return true; +} + + +as_namespace * +as_namespace_get_byname(char *name) +{ + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + if (strcmp(ns->name, name) == 0) { + return ns; + } + } + + return NULL; +} + + +as_namespace * +as_namespace_get_byid(uint32_t id) +{ + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + if (id == ns->id) { + return ns; + } + } + + return NULL; +} + + +as_namespace * +as_namespace_get_bybuf(uint8_t *buf, size_t len) +{ + if (len >= AS_ID_NAMESPACE_SZ) { + return NULL; + } + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + if (memcmp(buf, ns->name, len) == 0 && ns->name[len] == 0) { + return ns; + } + } + + return NULL; +} + + +as_namespace * +as_namespace_get_bymsgfield(as_msg_field *fp) +{ + return as_namespace_get_bybuf(fp->data, as_msg_field_get_value_sz(fp)); +} + + +const char * +as_namespace_get_set_name(as_namespace *ns, uint16_t set_id) +{ + // Note that set_id is 1-based, but cf_vmap index is 0-based. + // (This is because 0 in the index structure means 'no set'.) + + if (set_id == INVALID_SET_ID) { + return NULL; + } + + as_set *p_set; + + return cf_vmapx_get_by_index(ns->p_sets_vmap, (uint32_t)set_id - 1, + (void**)&p_set) == CF_VMAPX_OK ? p_set->name : NULL; +} + + +uint16_t +as_namespace_get_set_id(as_namespace *ns, const char *set_name) +{ + uint32_t idx; + + return cf_vmapx_get_index(ns->p_sets_vmap, set_name, &idx) == CF_VMAPX_OK ? + (uint16_t)(idx + 1) : INVALID_SET_ID; +} + + +// At the moment this is only used by the enterprise build security feature. +uint16_t +as_namespace_get_create_set_id(as_namespace *ns, const char *set_name) +{ + if (! set_name) { + // Should be impossible. + cf_warning(AS_NAMESPACE, "null set name"); + return INVALID_SET_ID; + } + + uint32_t idx; + cf_vmapx_err result = cf_vmapx_get_index(ns->p_sets_vmap, set_name, &idx); + + if (result == CF_VMAPX_OK) { + return (uint16_t)(idx + 1); + } + + if (result == CF_VMAPX_ERR_NAME_NOT_FOUND) { + result = cf_vmapx_put_unique(ns->p_sets_vmap, set_name, &idx); + + if (result == CF_VMAPX_ERR_NAME_EXISTS) { + return (uint16_t)(idx + 1); + } + + if (result == CF_VMAPX_ERR_BAD_PARAM) { + cf_warning(AS_NAMESPACE, "set name %s too long", set_name); + return INVALID_SET_ID; + } + + if (result == CF_VMAPX_ERR_FULL) { + cf_warning(AS_NAMESPACE, "can't add %s (at sets limit)", set_name); + return INVALID_SET_ID; + } + + if (result != CF_VMAPX_OK) { + // Currently, remaining errors are all some form of out-of-memory. + cf_warning(AS_NAMESPACE, "can't add %s (%d)", set_name, result); + return INVALID_SET_ID; + } + + return (uint16_t)(idx + 1); + } + + // Should be impossible. 
+ cf_warning(AS_NAMESPACE, "unexpected error %d", result); + return INVALID_SET_ID; +} + + +int +as_namespace_set_set_w_len(as_namespace *ns, const char *set_name, size_t len, + uint16_t *p_set_id, bool apply_restrictions) +{ + as_set *p_set; + + if (as_namespace_get_create_set_w_len(ns, set_name, len, &p_set, + p_set_id) != 0) { + return -1; + } + + if (apply_restrictions && as_set_stop_writes(p_set)) { + return -2; + } + + cf_atomic64_incr(&p_set->n_objects); + + return 0; +} + + +int +as_namespace_get_create_set_w_len(as_namespace *ns, const char *set_name, + size_t len, as_set **pp_set, uint16_t *p_set_id) +{ + cf_assert(set_name, AS_NAMESPACE, "null set name"); + cf_assert(len != 0, AS_NAMESPACE, "empty set name"); + + uint32_t idx; + cf_vmapx_err result = cf_vmapx_get_index_w_len(ns->p_sets_vmap, set_name, + len, &idx); + + if (result == CF_VMAPX_ERR_NAME_NOT_FOUND) { + // Special case handling for name too long. + if (len >= AS_SET_NAME_MAX_SIZE) { + char bad_name[AS_SET_NAME_MAX_SIZE]; + + memcpy(bad_name, set_name, AS_SET_NAME_MAX_SIZE - 1); + bad_name[AS_SET_NAME_MAX_SIZE - 1] = 0; + + cf_warning(AS_NAMESPACE, "set name %s... too long", bad_name); + return -1; + } + + result = cf_vmapx_put_unique_w_len(ns->p_sets_vmap, set_name, len, + &idx); + + // Since this function can be called via many functions simultaneously. + // Need to handle race, So handle CF_VMAPX_ERR_NAME_EXISTS. + if (result == CF_VMAPX_ERR_FULL) { + cf_warning(AS_NAMESPACE, "at set names limit, can't add set"); + return -1; + } + + if (result != CF_VMAPX_OK && result != CF_VMAPX_ERR_NAME_EXISTS) { + cf_warning(AS_NAMESPACE, "error %d, can't add set", result); + return -1; + } + } + else if (result != CF_VMAPX_OK) { + // Should be impossible. + cf_warning(AS_NAMESPACE, "unexpected error %d", result); + return -1; + } + + if (pp_set) { + if ((result = cf_vmapx_get_by_index(ns->p_sets_vmap, idx, + (void**)pp_set)) != CF_VMAPX_OK) { + // Should be impossible - just verified idx. + cf_warning(AS_NAMESPACE, "unexpected error %d", result); + return -1; + } + } + + if (p_set_id) { + *p_set_id = (uint16_t)(idx + 1); + } + + return 0; +} + + +as_set * +as_namespace_get_set_by_name(as_namespace *ns, const char *set_name) +{ + uint32_t idx; + + if (cf_vmapx_get_index(ns->p_sets_vmap, set_name, &idx) != CF_VMAPX_OK) { + return NULL; + } + + as_set *p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, idx, (void**)&p_set) != + CF_VMAPX_OK) { + // Should be impossible - just verified idx. + cf_crash(AS_NAMESPACE, "unexpected vmap error"); + } + + return p_set; +} + + +as_set * +as_namespace_get_set_by_id(as_namespace *ns, uint16_t set_id) +{ + if (set_id == INVALID_SET_ID) { + return NULL; + } + + as_set *p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, set_id - 1, (void**)&p_set) != + CF_VMAPX_OK) { + // Should be impossible. 
+ cf_warning(AS_NAMESPACE, "unexpected - record with set-id not in vmap"); + return NULL; + } + + return p_set; +} + + +as_set * +as_namespace_get_record_set(as_namespace *ns, const as_record *r) +{ + return as_namespace_get_set_by_id(ns, as_index_get_set_id(r)); +} + + +void +as_namespace_get_set_info(as_namespace *ns, const char *set_name, + cf_dyn_buf *db) +{ + as_set *p_set; + + if (set_name) { + if (cf_vmapx_get_by_name(ns->p_sets_vmap, set_name, (void**)&p_set) == + CF_VMAPX_OK) { + append_set_props(p_set, db); + } + + return; + } + + for (uint32_t idx = 0; idx < cf_vmapx_count(ns->p_sets_vmap); idx++) { + if (cf_vmapx_get_by_index(ns->p_sets_vmap, idx, (void**)&p_set) == + CF_VMAPX_OK) { + cf_dyn_buf_append_string(db, "ns="); + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, "set="); + cf_dyn_buf_append_string(db, p_set->name); + cf_dyn_buf_append_char(db, ':'); + append_set_props(p_set, db); + } + } +} + + +void +as_namespace_adjust_set_memory(as_namespace *ns, uint16_t set_id, + int64_t delta_bytes) +{ + if (set_id == INVALID_SET_ID) { + return; + } + + as_set *p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, set_id - 1, (void**)&p_set) != + CF_VMAPX_OK) { + cf_warning(AS_NAMESPACE, "set-id %u - failed vmap get", set_id); + return; + } + + if (cf_atomic64_add(&p_set->n_bytes_memory, delta_bytes) < 0) { + cf_warning(AS_NAMESPACE, "set-id %u - negative memory!", set_id); + } +} + + +void +as_namespace_release_set_id(as_namespace *ns, uint16_t set_id) +{ + if (set_id == INVALID_SET_ID) { + return; + } + + as_set *p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, set_id - 1, (void**)&p_set) != + CF_VMAPX_OK) { + return; + } + + if (cf_atomic64_decr(&p_set->n_objects) < 0) { + cf_warning(AS_NAMESPACE, "set-id %u - negative objects!", set_id); + } +} + + +void +as_namespace_get_bins_info(as_namespace *ns, cf_dyn_buf *db, bool show_ns) +{ + if (show_ns) { + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + } + + if (ns->single_bin) { + cf_dyn_buf_append_string(db, "[single-bin]"); + } + else { + uint32_t bin_count = cf_vmapx_count(ns->p_bin_name_vmap); + + cf_dyn_buf_append_string(db, "bin_names="); + cf_dyn_buf_append_uint32(db, bin_count); + cf_dyn_buf_append_string(db, ",bin_names_quota="); + cf_dyn_buf_append_uint32(db, BIN_NAMES_QUOTA); + + for (uint16_t i = 0; i < (uint16_t)bin_count; i++) { + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_string(db, as_bin_get_name_from_id(ns, i)); + } + } + + if (show_ns) { + cf_dyn_buf_append_char(db, ';'); + } +} + + +void +as_namespace_get_hist_info(as_namespace *ns, char *set_name, char *hist_name, + cf_dyn_buf *db, bool show_ns) +{ + if (show_ns) { + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + } + + if (set_name == NULL || set_name[0] == 0) { + if (strcmp(hist_name, "ttl") == 0) { + cf_dyn_buf_append_string(db, "ttl="); + linear_hist_get_info(ns->ttl_hist, db); + cf_dyn_buf_append_char(db, ';'); + } + else if (strcmp(hist_name, "objsz") == 0) { + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + cf_dyn_buf_append_string(db, "objsz="); + linear_hist_get_info(ns->obj_size_hist, db); + cf_dyn_buf_append_char(db, ';'); + } + else { + cf_dyn_buf_append_string(db, "hist-not-applicable"); + } + } + else { + cf_dyn_buf_append_string(db, "error-unknown-hist-name"); + } + + return; + } + + uint16_t set_id = as_namespace_get_set_id(ns, set_name); + + if (set_id != INVALID_SET_ID) { + if (strcmp(hist_name, "ttl") == 0) { 
+ if (ns->set_ttl_hists[set_id]) { + cf_dyn_buf_append_string(db, "ttl="); + linear_hist_get_info(ns->set_ttl_hists[set_id], db); + cf_dyn_buf_append_char(db, ';'); + } + else { + cf_dyn_buf_append_string(db, "hist-unavailable"); + } + } + else if (strcmp(hist_name, "objsz") == 0) { + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + if (ns->set_obj_size_hists[set_id]) { + cf_dyn_buf_append_string(db, "objsz="); + linear_hist_get_info(ns->set_obj_size_hists[set_id], db); + cf_dyn_buf_append_char(db, ';'); + } + else { + cf_dyn_buf_append_string(db, "hist-unavailable"); + } + } + else { + cf_dyn_buf_append_string(db, "hist-not-applicable"); + } + } + else { + cf_dyn_buf_append_string(db, "error-unknown-hist-name"); + } + } + else { + cf_dyn_buf_append_string(db, "error-unknown-set-name"); + } +} + + +//========================================================== +// Local helpers. +// + +static void +append_set_props(as_set *p_set, cf_dyn_buf *db) +{ + // Statistics: + + cf_dyn_buf_append_string(db, "objects="); + cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->n_objects)); + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "tombstones="); + cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->n_tombstones)); + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "memory_data_bytes="); + cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->n_bytes_memory)); + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "truncate_lut="); + cf_dyn_buf_append_uint64(db, p_set->truncate_lut); + cf_dyn_buf_append_char(db, ':'); + + // Configuration: + + cf_dyn_buf_append_string(db, "stop-writes-count="); + cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->stop_writes_count)); + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "set-enable-xdr="); + + if (cf_atomic32_get(p_set->enable_xdr) == AS_SET_ENABLE_XDR_TRUE) { + cf_dyn_buf_append_string(db, "true"); + } + else if (cf_atomic32_get(p_set->enable_xdr) == AS_SET_ENABLE_XDR_FALSE) { + cf_dyn_buf_append_string(db, "false"); + } + else if (cf_atomic32_get(p_set->enable_xdr) == AS_SET_ENABLE_XDR_DEFAULT) { + cf_dyn_buf_append_string(db, "use-default"); + } + else { + cf_dyn_buf_append_uint32(db, cf_atomic32_get(p_set->enable_xdr)); + } + + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "disable-eviction="); + cf_dyn_buf_append_bool(db, IS_SET_EVICTION_DISABLED(p_set)); + cf_dyn_buf_append_char(db, ';'); +} diff --git a/as/src/base/namespace_ce.c b/as/src/base/namespace_ce.c new file mode 100644 index 00000000..2400b30b --- /dev/null +++ b/as/src/base/namespace_ce.c @@ -0,0 +1,142 @@ +/* + * namespace_cold.c + * + * Copyright (C) 2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include <stdbool.h> +#include <stdint.h> + +#include "citrusleaf/alloc.h" + +#include "arenax.h" +#include "fault.h" +#include "vmapx.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" + +static bool +check_capacity(uint32_t capacity) +{ + uint8_t* test_index_stages[g_config.n_namespaces]; + uint8_t* test_data_blocks[g_config.n_namespaces]; + uint32_t i; + + for (i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + uint64_t stage_size = (uint64_t)as_index_size_get(ns) * capacity; + + if ((test_index_stages[i] = cf_try_malloc(stage_size)) == NULL) { + break; + } + + // Memory for overhead and data, proportional to (= to) stage size. + if ((test_data_blocks[i] = cf_try_malloc(stage_size)) == NULL) { + cf_free(test_index_stages[i]); + break; + } + } + + for (uint32_t j = 0; j < i; j++) { + cf_free(test_index_stages[j]); + cf_free(test_data_blocks[j]); + } + + return i == g_config.n_namespaces; +} + +#define MIN_STAGE_CAPACITY (MAX_STAGE_CAPACITY / 8) +#define NS_MIN_MB (((sizeof(as_index) * MIN_STAGE_CAPACITY) * 2) / (1024 * 1024)) + +uint32_t +as_mem_check() +{ + uint32_t capacity; + + for (capacity = MAX_STAGE_CAPACITY; capacity >= MIN_STAGE_CAPACITY; capacity /= 2) { + if (check_capacity(capacity)) { + break; + } + } + + if (capacity < MIN_STAGE_CAPACITY) { + cf_crash_nostack(AS_NAMESPACE, "server requires at least %lu MB of memory per namespace", NS_MIN_MB); + } + + if (capacity < MAX_STAGE_CAPACITY) { + cf_info(AS_NAMESPACE, "detected small memory profile - will size arena stages 1/%u max", MAX_STAGE_CAPACITY / capacity); + } + + return capacity; +} + +static void +setup_namespace(as_namespace* ns, uint32_t stage_capacity) +{ + ns->cold_start = true; + + cf_info(AS_NAMESPACE, "{%s} beginning cold start", ns->name); + + //-------------------------------------------- + // Set up the set name vmap. + // + + ns->p_sets_vmap = (cf_vmapx*)cf_malloc(cf_vmapx_sizeof(sizeof(as_set), AS_SET_MAX_COUNT)); + + cf_vmapx_init(ns->p_sets_vmap, sizeof(as_set), AS_SET_MAX_COUNT, 1024, AS_SET_NAME_MAX_SIZE); + + // Transfer configuration file information about sets. + if (! as_namespace_configure_sets(ns)) { + cf_crash(AS_NAMESPACE, "{%s} can't configure sets", ns->name); + } + + //-------------------------------------------- + // Set up the bin name vmap. + // + + if (! ns->single_bin) { + ns->p_bin_name_vmap = (cf_vmapx*)cf_malloc(cf_vmapx_sizeof(VMAP_BIN_NAME_MAX_SZ, MAX_BIN_NAMES)); + + cf_vmapx_init(ns->p_bin_name_vmap, VMAP_BIN_NAME_MAX_SZ, MAX_BIN_NAMES, 4096, VMAP_BIN_NAME_MAX_SZ); + } + + //-------------------------------------------- + // Set up the index arena. + // + + ns->arena = (cf_arenax*)cf_malloc(cf_arenax_sizeof()); + + cf_arenax_init(ns->arena, 0, as_index_size_get(ns), stage_capacity, 0, CF_ARENAX_BIGLOCK); +} + +void +as_namespaces_setup(bool cold_start_cmd, uint32_t instance, uint32_t stage_capacity) +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + setup_namespace(g_config.namespaces[i], stage_capacity); + } +} + +void +as_namespace_xmem_trusted(as_namespace *ns) +{ + // For enterprise version only. +}
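check_capacity()/as_mem_check() above size the index arena by probing: try-allocate one stage per namespace (plus an equal data reservation), free everything, and halve the candidate capacity until a probe succeeds or the floor is reached. A stripped-down sketch of that halving probe, with plain malloc() in place of cf_try_malloc() and illustrative sizes (assumptions, not the server's real values):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_STAGE_CAPACITY (256 * 1024 * 1024UL) // illustrative stage size
#define MIN_STAGE_CAPACITY (MAX_STAGE_CAPACITY / 8)

// Probe: can we allocate a stage of this size, twice over (the real code
// also reserves matching data memory)?
static bool
check_capacity_sketch(uint64_t stage_size)
{
	void *index_stage = malloc(stage_size);
	void *data_block = index_stage ? malloc(stage_size) : NULL;
	bool ok = index_stage != NULL && data_block != NULL;

	free(data_block);
	free(index_stage);

	return ok;
}

int
main(void)
{
	uint64_t capacity;

	for (capacity = MAX_STAGE_CAPACITY; capacity >= MIN_STAGE_CAPACITY; capacity /= 2) {
		if (check_capacity_sketch(capacity)) {
			break;
		}
	}

	if (capacity < MIN_STAGE_CAPACITY) {
		fprintf(stderr, "not enough memory\n");
		return 1;
	}

	printf("using stage capacity %lu\n", (unsigned long)capacity);
	return 0;
}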
diff --git a/as/src/base/packet_compression.c b/as/src/base/packet_compression.c new file mode 100644 index 00000000..2eeb178f --- /dev/null +++ b/as/src/base/packet_compression.c @@ -0,0 +1,234 @@ +/* + * packet_compression.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include <stdint.h> +#include <string.h> +#include <zlib.h> + +#include "citrusleaf/alloc.h" + +#include "fault.h" + +#include "base/packet_compression.h" +#include "base/proto.h" + +#define STACK_BUF_SZ (1024 * 16) + +/** + * Function to decompress the given data + * @param type Type of compression + * @param buf_len Length of buffer to be decompressed + * @param buf Pointer to buffer to be decompressed + * @param out_buf_len In/out: length of buffer to hold decompressed data + * @param out_buf Pointer to buffer to hold decompressed data + * @return 0 if successful + */ +int +as_decompress(compression_type type, size_t buf_len, const uint8_t *buf, size_t *out_buf_len, uint8_t *out_buf) +{ + int ret_value = -1; + cf_debug(AS_COMPRESSION, "In as_decompress"); + switch (type) { + case COMPRESSION_ZLIB: { + // manual convert to match types just in case + uLongf converted_out_buf_len = *out_buf_len; + // zlib api to decompress the data + ret_value = uncompress(out_buf, &converted_out_buf_len, buf, (uLongf) buf_len); + *out_buf_len = converted_out_buf_len; + break; + } + default: + cf_warning(AS_COMPRESSION, "Unknown as_proto compression type: %d", type); + break; + } + cf_debug(AS_COMPRESSION, "Returned as_decompress : %d", ret_value); + return ret_value; +}
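The zlib calls above are the whole compression story in this file: compress2() on the way out (as_compress, below) and uncompress() on the way back, with the destination length passed as an in/out parameter. A self-contained round trip using plain zlib, handy for sanity-checking that in/out length convention:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int
main(void)
{
	const uint8_t msg[] = "hello hello hello hello";
	uint8_t packed[128];
	uint8_t unpacked[128];
	uLongf packed_len = sizeof(packed);   // in: capacity, out: used
	uLongf unpacked_len = sizeof(unpacked);

	// Same call as_compress() makes for COMPRESSION_ZLIB.
	if (compress2(packed, &packed_len, msg, sizeof(msg), Z_DEFAULT_COMPRESSION) != Z_OK) {
		return 1;
	}

	// Same call as_decompress() makes - length is again in/out.
	if (uncompress(unpacked, &unpacked_len, packed, packed_len) != Z_OK) {
		return 1;
	}

	printf("%lu -> %lu -> %lu bytes\n", (unsigned long)sizeof(msg),
			(unsigned long)packed_len, (unsigned long)unpacked_len);
	return memcmp(msg, unpacked, sizeof(msg)) == 0 ? 0 : 1;
}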
/** + * Function to get back decompressed packet from PROTO_TYPE_AS_MSG_COMPRESSED packet + * Packet : Header - Original size of message - Compressed message + * @param buf Pointer to PROTO_TYPE_AS_MSG_COMPRESSED packet. - Input + * @param output_packet Pointer holding address of decompressed packet. - Output + * @param output_packet_size Size of the decompressed packet (optional). - Output + */ +int +as_packet_decompression(uint8_t *buf, uint8_t **output_packet, size_t *output_packet_size) +{ + int ret_value = -1; + as_comp_proto *as_comp_protop = (as_comp_proto *) buf; + + cf_debug(AS_COMPRESSION, "In as_packet_decompression"); + + if (as_comp_protop->proto.type != PROTO_TYPE_AS_MSG_COMPRESSED) { + cf_warning(AS_COMPRESSION, "as_packet_decompression : Invalid input data : type received %d != PROTO_TYPE_AS_MSG_COMPRESSED (%d)", + as_comp_protop->proto.type, PROTO_TYPE_AS_MSG_COMPRESSED); + cf_warning(AS_COMPRESSION, "Returned as_packet_decompression : %d", ret_value); + return ret_value; + } + +#if 0 // enable this when byte swap also fixed on client side + as_comp_protop->org_sz = cf_swap_from_be64(as_comp_protop->org_sz); +#endif + size_t decompressed_as_packet_sz = as_comp_protop->org_sz; + // sanity check for client supplied size + if (decompressed_as_packet_sz > PROTO_SIZE_MAX) { + // the closest error for this case is "input data was corrupted or incomplete" + return Z_DATA_ERROR; + } + + // proto.sz covers the org_sz field plus the compressed payload - subtract + // the 8-byte org_sz field to get the payload size alone. + size_t buf_sz = as_comp_protop->proto.sz - 8; + buf += sizeof(as_comp_proto); + uint8_t *decompressed_packet = cf_malloc(decompressed_as_packet_sz); + ret_value = as_decompress(COMPRESSION_ZLIB, buf_sz, buf, &decompressed_as_packet_sz, decompressed_packet); + if (ret_value) { + cf_free(decompressed_packet); + } else { + *output_packet = decompressed_packet; + if (output_packet_size) { + *output_packet_size = decompressed_as_packet_sz; + } + } + cf_debug(AS_COMPRESSION, "Returned as_packet_decompression : %d", ret_value); + return (ret_value); +} + +/* + * Function to compress the given data + * Expected arguments + * 1. Type of compression + * 1 for zlib + * 2. Length of buffer to be compressed - mandatory + * 3. Pointer to buffer to be compressed - mandatory + * 4. Length of buffer to hold compressed data - mandatory + * 5. Pointer to buffer to hold compressed data - mandatory + * 6. Compression level - Optional, default Z_DEFAULT_COMPRESSION + */ +int +as_compress(int argc, uint8_t *argv[]) +{ +#define MANDATORY_NO_ARGUMENTS 5 + int compression_type; + uint8_t *buf; + size_t *buf_len; + uint8_t *out_buf; + size_t *out_buf_len; + int compression_level; + int ret_value = 0; + + cf_debug(AS_COMPRESSION, "In as_compress"); + + if (argc < MANDATORY_NO_ARGUMENTS) + { + // Insufficient arguments + cf_debug(AS_COMPRESSION, "as_compress : insufficient arguments"); + cf_debug(AS_COMPRESSION, "Returned as_compress : -1"); + return -1; + } + + compression_type = *argv[0]; + buf_len = (size_t *) argv[1]; + buf = argv[2]; + out_buf_len = (size_t *) argv[3]; + out_buf = argv[4]; + + // The optional compression level is the sixth argument, argv[5]. + compression_level = (argc > MANDATORY_NO_ARGUMENTS) ? (*argv[MANDATORY_NO_ARGUMENTS]) : Z_DEFAULT_COMPRESSION; + + switch (compression_type) + { + case COMPRESSION_ZLIB: + // zlib api to compress the data + ret_value = compress2(out_buf, out_buf_len, buf, *buf_len, compression_level); + break; + default: + cf_warning(AS_COMPRESSION, "Unknown as_proto compression type: %d", compression_type); + ret_value = -1; + break; + } + cf_debug(AS_COMPRESSION, "Returned as_compress : %d", ret_value); + return ret_value; +}
/* + * Function to create packet to send compressed data. + * Packet : Header - Original size of message - Compressed message. + * Input : buf - Pointer to data to be compressed. - Input + * buf_sz - Size of the data to be compressed. - Input + * compressed_packet : Pointer holding address of compressed packet. - Output + * compressed_as_packet_sz : Size of the compressed packet. - Output + */ +int +as_packet_compression(uint8_t *buf, size_t buf_sz, uint8_t **compressed_packet, size_t *compressed_as_packet_sz) +{ + uint8_t *tmp_buf; + uint8_t wr_stack_buf[STACK_BUF_SZ]; + uint8_t *wr_buf = wr_stack_buf; + size_t wr_buf_sz = sizeof(wr_stack_buf); + cf_debug(AS_COMPRESSION, "In as_packet_compression"); + + /* Compress the data using client API for compression. + * Expected arguments + * 1. Type of compression + * 1 for zlib + * 2. Length of buffer to be compressed - mandatory + * 3. Pointer to buffer to be compressed - mandatory + * 4. Length of buffer to hold compressed data - mandatory + * 5. Pointer to buffer to hold compressed data - mandatory + * 6. Compression level - Optional, default Z_DEFAULT_COMPRESSION + */ + uint8_t *argv[5]; + int argc = 5; + int compression_type = COMPRESSION_ZLIB; + argv[0] = (uint8_t *)&compression_type; + argv[1] = (uint8_t *)&buf_sz; + argv[2] = buf; + argv[3] = (uint8_t *)&wr_buf_sz; + argv[4] = wr_buf; + + if (as_compress(argc, argv)) + { + *compressed_packet = NULL; + *compressed_as_packet_sz = 0; + cf_debug(AS_COMPRESSION, "Returned as_packet_compression : -1"); + return -1; + } + + // Allocate buffer to hold new packet + *compressed_as_packet_sz = sizeof(as_comp_proto) + wr_buf_sz; + *compressed_packet = (uint8_t *) cf_calloc(*compressed_as_packet_sz, 1); + if (!*compressed_packet) + { + cf_debug(AS_COMPRESSION, "as_packet_compression : failed to allocate memory"); + cf_debug(AS_COMPRESSION, "Returned as_packet_compression : -1"); + return -1; + } + // Construct the packet for compressed data. + as_comp_proto *as_comp_protop = (as_comp_proto *) *compressed_packet; + as_comp_protop->proto.version = PROTO_VERSION; + as_comp_protop->proto.type = PROTO_TYPE_AS_MSG_COMPRESSED; + as_comp_protop->proto.sz = *compressed_as_packet_sz - 8; // everything after the 8-byte proto header + as_proto *proto = (as_proto *) *compressed_packet; + as_proto_swap(proto); + as_comp_protop->org_sz = buf_sz; + + tmp_buf = *compressed_packet + sizeof(as_comp_proto); + memcpy(tmp_buf, wr_buf, wr_buf_sz); + + cf_debug(AS_COMPRESSION, "Returned as_packet_compression : 0"); + return 0; +} diff --git a/as/src/base/particle.c b/as/src/base/particle.c new file mode 100644 index 00000000..1411c836 --- /dev/null +++ b/as/src/base/particle.c @@ -0,0 +1,1016 @@ +/* + * particle.c + * + * Copyright (C) 2008-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */
+ + +#include "base/particle.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "aerospike/as_buffer.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_serializer.h" +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_byte_order.h" + +#include "dynbuf.h" +#include "fault.h" + +#include "base/datamodel.h" +#include "base/proto.h" +#include "fabric/partition.h" +#include "storage/storage.h" + + +//========================================================== +// Typedefs & constants. +// + +extern const as_particle_vtable integer_vtable; +extern const as_particle_vtable float_vtable; +extern const as_particle_vtable string_vtable; +extern const as_particle_vtable blob_vtable; +extern const as_particle_vtable map_vtable; +extern const as_particle_vtable list_vtable; +extern const as_particle_vtable geojson_vtable; + +// Array of particle vtable pointers. +const as_particle_vtable *particle_vtable[] = { + [AS_PARTICLE_TYPE_NULL] = NULL, + [AS_PARTICLE_TYPE_INTEGER] = &integer_vtable, + [AS_PARTICLE_TYPE_FLOAT] = &float_vtable, + [AS_PARTICLE_TYPE_STRING] = &string_vtable, + [AS_PARTICLE_TYPE_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_TIMESTAMP] = &integer_vtable, + [AS_PARTICLE_TYPE_JAVA_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_CSHARP_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_PYTHON_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_RUBY_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_PHP_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_ERLANG_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_MAP] = &map_vtable, + [AS_PARTICLE_TYPE_LIST] = &list_vtable, + [AS_PARTICLE_TYPE_GEOJSON] = &geojson_vtable +}; + + +//========================================================== +// Local utilities. +// + +// Particle type check. +static inline as_particle_type +safe_particle_type(uint8_t type) +{ + switch ((as_particle_type)type) { + case AS_PARTICLE_TYPE_INTEGER: + case AS_PARTICLE_TYPE_FLOAT: + case AS_PARTICLE_TYPE_STRING: + case AS_PARTICLE_TYPE_BLOB: + case AS_PARTICLE_TYPE_TIMESTAMP: + case AS_PARTICLE_TYPE_JAVA_BLOB: + case AS_PARTICLE_TYPE_CSHARP_BLOB: + case AS_PARTICLE_TYPE_PYTHON_BLOB: + case AS_PARTICLE_TYPE_RUBY_BLOB: + case AS_PARTICLE_TYPE_PHP_BLOB: + case AS_PARTICLE_TYPE_ERLANG_BLOB: + case AS_PARTICLE_TYPE_MAP: + case AS_PARTICLE_TYPE_LIST: + case AS_PARTICLE_TYPE_GEOJSON: + return (as_particle_type)type; + // Note - AS_PARTICLE_TYPE_NULL is considered bad here. + default: + cf_warning(AS_PARTICLE, "encountered bad particle type %u", type); + return AS_PARTICLE_TYPE_BAD; + } +}
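The particle_vtable[] array above gives O(1) dispatch on the particle type byte, with several wire types (the language-specific blobs, TIMESTAMP) sharing one implementation. A stripped-down sketch of the same designated-initializer dispatch pattern (hypothetical two-slot table, not the server's vtable):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct vtable_sketch_s {
	uint32_t (*wire_size_fn)(const void *p);
} vtable_sketch;

static uint32_t
int_wire_size(const void *p)
{
	(void)p;
	return (uint32_t)sizeof(uint64_t);
}

static uint32_t
str_wire_size(const void *p)
{
	return (uint32_t)strlen((const char *)p);
}

// Designated initializers leave unlisted slots NULL, as in the real table.
static const vtable_sketch sketch_vtable[] = {
	[1] = { int_wire_size }, // cf. AS_PARTICLE_TYPE_INTEGER
	[3] = { str_wire_size }  // cf. AS_PARTICLE_TYPE_STRING
};

int
main(void)
{
	printf("%u\n", sketch_vtable[3].wire_size_fn("abc")); // prints 3
	return 0;
}

+ + +//========================================================== +// Particle "class static" functions.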
+// + +as_particle_type +as_particle_type_from_asval(const as_val *val) +{ + as_val_t vtype = as_val_type(val); + + switch (vtype) { + case AS_UNDEF: // if val was null - handle quietly + case AS_NIL: + return AS_PARTICLE_TYPE_NULL; + case AS_BOOLEAN: + case AS_INTEGER: + return AS_PARTICLE_TYPE_INTEGER; + case AS_DOUBLE: + return AS_PARTICLE_TYPE_FLOAT; + case AS_STRING: + return AS_PARTICLE_TYPE_STRING; + case AS_BYTES: + return AS_PARTICLE_TYPE_BLOB; + case AS_GEOJSON: + return AS_PARTICLE_TYPE_GEOJSON; + case AS_LIST: + return AS_PARTICLE_TYPE_LIST; + case AS_MAP: + return AS_PARTICLE_TYPE_MAP; + case AS_REC: + case AS_PAIR: + default: + cf_warning(AS_PARTICLE, "no particle type for as_val_t %d", vtype); + return AS_PARTICLE_TYPE_NULL; + } +} + +as_particle_type +as_particle_type_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + as_val_t vtype = as_unpack_buf_peek_type(packed, packed_size); + + switch (vtype) { + case AS_NIL: + return AS_PARTICLE_TYPE_NULL; + case AS_BOOLEAN: + case AS_INTEGER: + return AS_PARTICLE_TYPE_INTEGER; + case AS_DOUBLE: + return AS_PARTICLE_TYPE_FLOAT; + case AS_STRING: + return AS_PARTICLE_TYPE_STRING; + case AS_BYTES: + return AS_PARTICLE_TYPE_BLOB; + case AS_GEOJSON: + return AS_PARTICLE_TYPE_GEOJSON; + case AS_LIST: + return AS_PARTICLE_TYPE_LIST; + case AS_MAP: + return AS_PARTICLE_TYPE_MAP; + case AS_UNDEF: + case AS_REC: + case AS_PAIR: + default: + cf_warning(AS_PARTICLE, "encountered bad as_val_t %d", vtype); + return AS_PARTICLE_TYPE_BAD; + } +} + +uint32_t +as_particle_size_from_asval(const as_val *val) +{ + as_particle_type type = as_particle_type_from_asval(val); + + if (type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just skips unmanageable as_val types. + return 0; + } + + return particle_vtable[type]->size_from_asval_fn(val); +} + +uint32_t +as_particle_asval_client_value_size(const as_val *val) +{ + as_particle_type type = as_particle_type_from_asval(val); + + if (type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just sends bin-op with NULL particle to client. + return 0; + } + + return particle_vtable[type]->asval_wire_size_fn(val); +} + +uint32_t +as_particle_asval_to_client(const as_val *val, as_msg_op *op) +{ + as_particle_type type = as_particle_type_from_asval(val); + + op->particle_type = type; + + if (type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just sends bin-op with NULL particle to client. + return 0; + } + + uint8_t *value = (uint8_t *)op + sizeof(as_msg_op) + op->name_sz; + uint32_t added_size = particle_vtable[type]->asval_to_wire_fn(val, value); + + op->op_sz += added_size; + + return added_size; +} + + +//========================================================== +// as_bin particle functions. +// + +//------------------------------------------------ +// Destructor, etc. +// + +void +as_bin_particle_destroy(as_bin *b, bool free_particle) +{ + if (free_particle && as_bin_is_external_particle(b) && b->particle) { + particle_vtable[as_bin_get_particle_type(b)]->destructor_fn(b->particle); + } + + b->particle = NULL; +} + +uint32_t +as_bin_particle_size(as_bin *b) +{ + if (! as_bin_inuse(b)) { + // Single-bin will get here. + // TODO - clean up code paths so this doesn't happen? + return 0; + } + + return particle_vtable[as_bin_get_particle_type(b)]->size_fn(b->particle); +} + +//------------------------------------------------ +// Handle "wire" format. 
+// + +int +as_bin_particle_alloc_modify_from_client(as_bin *b, const as_msg_op *op) +{ + // This method does not destroy the existing particle, if any. We assume + // there is a copy of this bin (and particle reference) elsewhere, and that + // the copy will be responsible for the existing particle. Therefore it's + // important on failure to leave the existing particle intact. + + uint8_t operation = op->op; + as_particle_type op_type = safe_particle_type(op->particle_type); + + if (op_type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t op_value_size = as_msg_op_get_value_sz(op); + uint8_t *op_value = as_msg_op_get_value_p((as_msg_op *)op); + + // Currently all operations become creates if there's no existing particle. + if (! as_bin_inuse(b)) { + // Memcache increment is weird - manipulate to create integer. + if (operation == AS_MSG_OP_MC_INCR) { + if (op_value_size != 2 * sizeof(uint64_t) || op_type != AS_PARTICLE_TYPE_BLOB) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + op_type = AS_PARTICLE_TYPE_INTEGER; + op_value_size = sizeof(uint64_t); + op_value += sizeof(uint64_t); + } + + int32_t mem_size = particle_vtable[op_type]->size_from_wire_fn(op_value, op_value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + if (mem_size != 0) { + b->particle = cf_malloc_ns((size_t)mem_size); + } + + // Load the new particle into the bin. + int result = particle_vtable[op_type]->from_wire_fn(op_type, op_value, op_value_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, op_type); + } + else { + if (mem_size != 0) { + cf_free(b->particle); + } + + b->particle = old_particle; + } + + return result; + } + + // There is an existing particle, which we will modify. + uint8_t existing_type = as_bin_get_particle_type(b); + int32_t new_mem_size = 0; + as_particle *new_particle = NULL; + + as_particle *old_particle = b->particle; + int result = 0; + + switch (operation) { + case AS_MSG_OP_MC_INCR: + if (op_value_size != 2 * sizeof(uint64_t) || op_type != AS_PARTICLE_TYPE_BLOB) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + op_type = AS_PARTICLE_TYPE_INTEGER; + // op_value_size of 16 will flag operation as memcache increment... 
+ // no break + case AS_MSG_OP_INCR: + result = particle_vtable[existing_type]->incr_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + case AS_MSG_OP_MC_APPEND: + if (existing_type != AS_PARTICLE_TYPE_STRING) { + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + // no break + case AS_MSG_OP_APPEND: + new_mem_size = particle_vtable[existing_type]->concat_size_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + if (new_mem_size < 0) { + return new_mem_size; + } + new_particle = cf_malloc_ns((size_t)new_mem_size); + memcpy(new_particle, b->particle, particle_vtable[existing_type]->size_fn(b->particle)); + b->particle = new_particle; + result = particle_vtable[existing_type]->append_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + case AS_MSG_OP_MC_PREPEND: + if (existing_type != AS_PARTICLE_TYPE_STRING) { + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + // no break + case AS_MSG_OP_PREPEND: + new_mem_size = particle_vtable[existing_type]->concat_size_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + if (new_mem_size < 0) { + return new_mem_size; + } + new_particle = cf_malloc_ns((size_t)new_mem_size); + memcpy(new_particle, b->particle, particle_vtable[existing_type]->size_fn(b->particle)); + b->particle = new_particle; + result = particle_vtable[existing_type]->prepend_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + default: + // TODO - just crash? + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (result < 0) { + if (new_mem_size != 0) { + cf_free(b->particle); + } + + b->particle = old_particle; + } + + return result; +} + +int +as_bin_particle_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op) +{ + uint8_t operation = op->op; + as_particle_type op_type = safe_particle_type(op->particle_type); + + if (op_type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t op_value_size = as_msg_op_get_value_sz(op); + uint8_t *op_value = as_msg_op_get_value_p((as_msg_op *)op); + + // Currently all operations become creates if there's no existing particle. + if (! as_bin_inuse(b)) { + // Memcache increment is weird - manipulate to create integer. + if (operation == AS_MSG_OP_MC_INCR) { + if (op_value_size != 2 * sizeof(uint64_t) || op_type != AS_PARTICLE_TYPE_BLOB) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + op_type = AS_PARTICLE_TYPE_INTEGER; + op_value_size = sizeof(uint64_t); + op_value += sizeof(uint64_t); + } + + int32_t mem_size = particle_vtable[op_type]->size_from_wire_fn(op_value, op_value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + // Instead of allocating, we use the stack buffer provided. (Note that + // embedded types like integer will overwrite this with the value.) + cf_ll_buf_reserve(particles_llb, (size_t)mem_size, (uint8_t **)&b->particle); + + // Load the new particle into the bin. + int result = particle_vtable[op_type]->from_wire_fn(op_type, op_value, op_value_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, op_type); + } + else { + b->particle = old_particle; + } + + return result; + } + + // There is an existing particle, which we will modify. 
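+ // As in the alloc variant above, the memcache ops keep their historical + // constraints: MC_INCR's 16-byte blob is reinterpreted as a pair of 64-bit + // integers, and MC_APPEND/MC_PREPEND are only legal on STRING particles.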
+ uint8_t existing_type = as_bin_get_particle_type(b); + int32_t new_mem_size = 0; + + as_particle *old_particle = b->particle; + int result = 0; + + switch (operation) { + case AS_MSG_OP_MC_INCR: + if (op_value_size != 2 * sizeof(uint64_t) || op_type != AS_PARTICLE_TYPE_BLOB) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + op_type = AS_PARTICLE_TYPE_INTEGER; + // op_value_size of 16 will flag operation as memcache increment... + // no break + case AS_MSG_OP_INCR: + result = particle_vtable[existing_type]->incr_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + case AS_MSG_OP_MC_APPEND: + if (existing_type != AS_PARTICLE_TYPE_STRING) { + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + // no break + case AS_MSG_OP_APPEND: + new_mem_size = particle_vtable[existing_type]->concat_size_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + if (new_mem_size < 0) { + return (int)new_mem_size; + } + cf_ll_buf_reserve(particles_llb, (size_t)new_mem_size, (uint8_t **)&b->particle); + memcpy(b->particle, old_particle, particle_vtable[existing_type]->size_fn(old_particle)); + result = particle_vtable[existing_type]->append_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + case AS_MSG_OP_MC_PREPEND: + if (existing_type != AS_PARTICLE_TYPE_STRING) { + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + // no break + case AS_MSG_OP_PREPEND: + new_mem_size = particle_vtable[existing_type]->concat_size_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + if (new_mem_size < 0) { + return (int)new_mem_size; + } + cf_ll_buf_reserve(particles_llb, (size_t)new_mem_size, (uint8_t **)&b->particle); + memcpy(b->particle, old_particle, particle_vtable[existing_type]->size_fn(old_particle)); + result = particle_vtable[existing_type]->prepend_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + default: + // TODO - just crash? + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (result < 0) { + b->particle = old_particle; + } + + return result; +} + +int +as_bin_particle_alloc_from_client(as_bin *b, const as_msg_op *op) +{ + // This method does not destroy the existing particle, if any. We assume + // there is a copy of this bin (and particle reference) elsewhere, and that + // the copy will be responsible for the existing particle. Therefore it's + // important on failure to leave the existing particle intact. + + as_particle_type type = safe_particle_type(op->particle_type); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t value_size = as_msg_op_get_value_sz(op); + uint8_t *value = as_msg_op_get_value_p((as_msg_op *)op); + int32_t mem_size = particle_vtable[type]->size_from_wire_fn(value, value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + if (mem_size != 0) { + b->particle = cf_malloc_ns((size_t)mem_size); + } + + // Load the new particle into the bin. + int result = particle_vtable[type]->from_wire_fn(type, value, value_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, type); + } + else { + if (mem_size != 0) { + cf_free(b->particle); + } + + b->particle = old_particle; + } + + return result; +} + +int +as_bin_particle_stack_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op) +{ + // We assume that if we're using stack particles, the old particle is either + // nonexistent or also a stack particle - either way, don't destroy. 
+ + as_particle_type type = safe_particle_type(op->particle_type); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t value_size = as_msg_op_get_value_sz(op); + uint8_t *value = as_msg_op_get_value_p((as_msg_op *)op); + int32_t mem_size = particle_vtable[type]->size_from_wire_fn(value, value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + // Instead of allocating, we use the stack buffer provided. (Note that + // embedded types like integer will overwrite this with the value.) + cf_ll_buf_reserve(particles_llb, (size_t)mem_size, (uint8_t **)&b->particle); + + // Load the new particle into the bin. + int result = particle_vtable[type]->from_wire_fn(type, value, value_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, type); + } + else { + b->particle = old_particle; + } + + return result; +} + +int +as_bin_particle_alloc_from_pickled(as_bin *b, const uint8_t **p_pickled, const uint8_t *end) +{ + // This method does not destroy the existing particle, if any. We assume + // there is a copy of this bin (and particle reference) elsewhere, and that + // the copy will be responsible for the existing particle. Therefore it's + // important on failure to leave the existing particle intact. + + const uint8_t *pickled = (const uint8_t *)*p_pickled; + + if (pickled + 1 + 4 > end) { + cf_warning(AS_PARTICLE, "incomplete pickled particle"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + as_particle_type type = safe_particle_type(*pickled++); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + const uint32_t *p32 = (const uint32_t *)pickled; + uint32_t value_size = cf_swap_from_be32(*p32++); + const uint8_t *value = (const uint8_t *)p32; + + *p_pickled = value + value_size; + + // TODO - does this serve as a value_size sanity check? + if (*p_pickled > end) { + cf_warning(AS_PARTICLE, "incomplete pickled particle"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + int32_t mem_size = particle_vtable[type]->size_from_wire_fn(value, value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + if (mem_size != 0) { + b->particle = cf_malloc_ns((size_t)mem_size); + } + + // Load the new particle into the bin. + int result = particle_vtable[type]->from_wire_fn(type, value, value_size, &b->particle); + + if (result < 0) { + if (mem_size != 0) { + cf_free(b->particle); + } + + b->particle = old_particle; + return result; + } + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, type); + + return 0; +} + +int +as_bin_particle_stack_from_pickled(as_bin *b, cf_ll_buf *particles_llb, const uint8_t **p_pickled, const uint8_t *end) +{ + // We assume that if we're using stack particles, the old particle is either + // nonexistent or also a stack particle - either way, don't destroy. 
+ + const uint8_t *pickled = (const uint8_t *)*p_pickled; + + if (pickled + 1 + 4 > end) { + cf_warning(AS_PARTICLE, "incomplete pickled particle"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + as_particle_type type = safe_particle_type(*pickled++); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + const uint32_t *p32 = (const uint32_t *)pickled; + uint32_t value_size = cf_swap_from_be32(*p32++); + const uint8_t *value = (const uint8_t *)p32; + + *p_pickled = value + value_size; + + // TODO - does this serve as a value_size sanity check? + if (*p_pickled > end) { + cf_warning(AS_PARTICLE, "incomplete pickled particle"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + int32_t mem_size = particle_vtable[type]->size_from_wire_fn(value, value_size); + + if (mem_size < 0) { + // Leave existing particle intact. + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + // Instead of allocating, we use the stack buffer provided. (Note that + // embedded types like integer will overwrite this with the value.) + cf_ll_buf_reserve(particles_llb, (size_t)mem_size, (uint8_t **)&b->particle); + + // Load the new particle into the bin. + int result = particle_vtable[type]->from_wire_fn(type, value, value_size, &b->particle); + + if (result < 0) { + b->particle = old_particle; + return result; + } + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, type); + + return 0; +} + +int +as_bin_particle_compare_from_pickled(const as_bin *b, uint8_t **p_pickled) +{ + if (! as_bin_inuse(b)) { + // TODO - just crash? + cf_warning(AS_PARTICLE, "comparing to unused bin"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + const uint8_t *pickled = (const uint8_t *)*p_pickled; + as_particle_type type = safe_particle_type(*pickled++); + const uint32_t *p32 = (const uint32_t *)pickled; + uint32_t value_size = cf_swap_from_be32(*p32++); + const uint8_t *value = (const uint8_t *)p32; + + *p_pickled = (uint8_t *)value + value_size; + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return particle_vtable[as_bin_get_particle_type(b)]->compare_from_wire_fn(b->particle, type, value, value_size); +} + +uint32_t +as_bin_particle_client_value_size(const as_bin *b) +{ + if (! as_bin_inuse(b)) { + // UDF result bin (bin name "SUCCESS" or "FAILURE") will get here. + return 0; + } + + uint8_t type = as_bin_get_particle_type(b); + + return particle_vtable[type]->wire_size_fn(b->particle); +} + +uint32_t +as_bin_particle_to_client(const as_bin *b, as_msg_op *op) +{ + if (! (b && as_bin_inuse(b))) { + // UDF result bin (bin name "SUCCESS" or "FAILURE") will get here. + // Ordered ops that find no bin will get here. + op->particle_type = AS_PARTICLE_TYPE_NULL; + return 0; + } + + uint8_t type = as_bin_get_particle_type(b); + + op->particle_type = type; + + uint8_t *value = (uint8_t *)op + sizeof(as_msg_op) + op->name_sz; + uint32_t added_size = particle_vtable[type]->to_wire_fn(b->particle, value); + + op->op_sz += added_size; + + return added_size; +} + +uint32_t +as_bin_particle_pickled_size(const as_bin *b) +{ + uint8_t type = as_bin_get_particle_type(b); + + // Always a type byte and a 32-bit size. 
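+ // Pickled layout: [1-byte particle type][4-byte big-endian value size] + // [value bytes] - as written by as_bin_particle_to_pickled() below.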
+ return 1 + 4 + particle_vtable[type]->wire_size_fn(b->particle); +} + +uint32_t +as_bin_particle_to_pickled(const as_bin *b, uint8_t *pickled) +{ + uint8_t type = as_bin_get_particle_type(b); + + *pickled++ = type; + + uint32_t *p_size = (uint32_t *)pickled; + uint8_t *value = (uint8_t *)(p_size + 1); + uint32_t size = particle_vtable[type]->to_wire_fn(b->particle, value); + + *p_size = cf_swap_to_be32(size); + + return 1 + 4 + size; +} + +//------------------------------------------------ +// Handle as_val translation. +// + +int +as_bin_particle_replace_from_asval(as_bin *b, const as_val *val) +{ + uint8_t old_type = as_bin_get_particle_type(b); + as_particle_type new_type = as_particle_type_from_asval(val); + + if (new_type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just skips unmanageable as_val types. + return 0; + } + + uint32_t new_mem_size = particle_vtable[new_type]->size_from_asval_fn(val); + // TODO - could this ever fail? + + as_particle *old_particle = b->particle; + + if (new_mem_size != 0) { + b->particle = cf_malloc_ns(new_mem_size); + } + + // Load the new particle into the bin. + particle_vtable[new_type]->from_asval_fn(val, &b->particle); + // TODO - could this ever fail? + + if (as_bin_inuse(b)) { + // Destroy the old particle. + particle_vtable[old_type]->destructor_fn(old_particle); + } + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, new_type); + + return 0; +} + +void +as_bin_particle_stack_from_asval(as_bin *b, uint8_t* stack, const as_val *val) +{ + // We assume that if we're using stack particles, the old particle is either + // nonexistent or also a stack particle - either way, don't destroy. + + as_particle_type type = as_particle_type_from_asval(val); + + if (type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just skips unmanageable as_val types. + return; + } + + // Instead of allocating, we use the stack buffer provided. (Note that + // embedded types like integer will overwrite this with the value.) + b->particle = (as_particle *)stack; + + // Load the new particle into the bin. + particle_vtable[type]->from_asval_fn(val, &b->particle); + // TODO - could this ever fail? + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, type); + + // TODO - we don't bother returning size written, since nothing yet needs + // it and it's very expensive for CDTs to do an extra size_from_asval_fn() + // call. Perhaps we could have from_asval_fn() return the size if needed? +} + +as_val * +as_bin_particle_to_asval(const as_bin *b) +{ + uint8_t type = as_bin_get_particle_type(b); + + // Caller is responsible for freeing as_val returned here. + return particle_vtable[type]->to_asval_fn(b->particle); +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +int +as_bin_particle_alloc_from_msgpack(as_bin *b, const uint8_t *packed, uint32_t packed_size) +{ + // We assume the bin is empty. + + as_particle_type type = as_particle_type_from_msgpack(packed, packed_size); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (type == AS_PARTICLE_TYPE_NULL) { + return AS_PROTO_RESULT_OK; + } + + uint32_t mem_size = particle_vtable[type]->size_from_msgpack_fn(packed, packed_size); + + if (mem_size != 0) { + b->particle = cf_malloc(mem_size); // response, so not cf_malloc_ns() + } + + particle_vtable[type]->from_msgpack_fn(packed, packed_size, &b->particle); + + // Set the bin's iparticle metadata. 
+ as_bin_state_set_from_type(b, type); + + return AS_PROTO_RESULT_OK; +} + +//------------------------------------------------ +// Handle on-device "flat" format. +// + +// TODO - re-do to leave original intact on failure. +int +as_bin_particle_cast_from_flat(as_bin *b, uint8_t *flat, uint32_t flat_size) +{ + if (as_bin_inuse(b)) { + // TODO - just crash? + cf_warning(AS_PARTICLE, "cast from flat into used bin"); + return -1; + } + + as_particle_type type = safe_particle_type(*flat); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -1; + } + + // Cast the new particle into the bin. + int result = particle_vtable[type]->cast_from_flat_fn(flat, flat_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, type); + } + else { + as_bin_set_empty(b); + } + + return result; +} + +// TODO - re-do to leave original intact on failure. +int +as_bin_particle_replace_from_flat(as_bin *b, const uint8_t *flat, uint32_t flat_size) +{ + uint8_t old_type = as_bin_get_particle_type(b); + as_particle_type new_type = safe_particle_type(*flat); + + if (new_type == AS_PARTICLE_TYPE_BAD) { + return -1; + } + + // Just destroy the old particle, if any - we're replacing it. + if (as_bin_inuse(b)) { + particle_vtable[old_type]->destructor_fn(b->particle); + } + + // Load the new particle into the bin. + int result = particle_vtable[new_type]->from_flat_fn(flat, flat_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, new_type); + } + else { + as_bin_set_empty(b); + } + + return result; +} + +uint32_t +as_bin_particle_flat_size(as_bin *b) +{ + if (! as_bin_inuse(b)) { + // TODO - just crash? + cf_warning(AS_PARTICLE, "flat sizing unused bin"); + return 0; + } + + uint8_t type = as_bin_get_particle_type(b); + + return particle_vtable[type]->flat_size_fn(b->particle); +} + +uint32_t +as_bin_particle_to_flat(const as_bin *b, uint8_t *flat) +{ + if (! as_bin_inuse(b)) { + // TODO - just crash? + cf_warning(AS_PARTICLE, "flattening unused bin"); + return 0; + } + + uint8_t type = as_bin_get_particle_type(b); + + *flat = type; + + return particle_vtable[type]->to_flat_fn(b->particle, flat); +} + + +//========================================================== +// as_bin particle functions specific to CDTs. +// + +//------------------------------------------------ +// Handle "wire" format. +// + +int +as_bin_cdt_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result) +{ + return as_bin_cdt_packed_read(b, op, result); +} + +int +as_bin_cdt_alloc_modify_from_client(as_bin *b, as_msg_op *op, as_bin *result) +{ + return as_bin_cdt_packed_modify(b, op, result, NULL); +} + +int +as_bin_cdt_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op, as_bin *result) +{ + return as_bin_cdt_packed_modify(b, op, result, particles_llb); +} diff --git a/as/src/base/particle_blob.c b/as/src/base/particle_blob.c new file mode 100644 index 00000000..0c8ec98f --- /dev/null +++ b/as/src/base/particle_blob.c @@ -0,0 +1,432 @@ +/* + * particle_blob.c + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+
+#include "base/particle_blob.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "aerospike/as_bytes.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/alloc.h"
+
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/particle.h"
+#include "base/proto.h"
+
+
+// BLOB particle interface function declarations are in particle_blob.h since
+// BLOB functions are used by other particles derived from BLOB.
+
+
+//==========================================================
+// BLOB particle interface - vtable.
+//
+
+const as_particle_vtable blob_vtable = {
+		blob_destruct,
+		blob_size,
+
+		blob_concat_size_from_wire,
+		blob_append_from_wire,
+		blob_prepend_from_wire,
+		blob_incr_from_wire,
+		blob_size_from_wire,
+		blob_from_wire,
+		blob_compare_from_wire,
+		blob_wire_size,
+		blob_to_wire,
+
+		blob_size_from_asval,
+		blob_from_asval,
+		blob_to_asval,
+		blob_asval_wire_size,
+		blob_asval_to_wire,
+
+		blob_size_from_msgpack,
+		blob_from_msgpack,
+
+		blob_size_from_flat,
+		blob_cast_from_flat,
+		blob_from_flat,
+		blob_flat_size,
+		blob_to_flat
+};
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+typedef struct blob_mem_s {
+	uint8_t type;
+	uint32_t sz;
+	uint8_t data[];
+} __attribute__ ((__packed__)) blob_mem;
+
+typedef struct blob_flat_s {
+	uint8_t type;
+	uint32_t size; // host order on device
+	uint8_t data[];
+} __attribute__ ((__packed__)) blob_flat;
+
+
+//==========================================================
+// Forward declarations.
+//
+
+static inline as_particle_type blob_bytes_type_to_particle_type(as_bytes_type type);
+
+
+//==========================================================
+// BLOB particle interface - function definitions.
+//
+
+//------------------------------------------------
+// Destructor, etc.
+//
+
+void
+blob_destruct(as_particle *p)
+{
+	cf_free(p);
+}
+
+uint32_t
+blob_size(const as_particle *p)
+{
+	return (uint32_t)(sizeof(blob_mem) + ((blob_mem *)p)->sz);
+}
+
+//------------------------------------------------
+// Handle "wire" format.
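+// (For blob/string particles the client "wire" value is just the raw value
+// bytes - no extra header - so blob_wire_size() is the stored size and
+// blob_to_wire() is a plain memcpy.)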
+// + +int32_t +blob_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + blob_mem *p_blob_mem = (blob_mem *)*pp; + + if (wire_type != p_blob_mem->type) { + cf_warning(AS_PARTICLE, "type mismatch concat sizing blob/string, %d:%d", p_blob_mem->type, wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + return (int32_t)(sizeof(blob_mem) + p_blob_mem->sz + value_size); +} + +int +blob_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + blob_mem *p_blob_mem = (blob_mem *)*pp; + + if (wire_type != p_blob_mem->type) { + cf_warning(AS_PARTICLE, "type mismatch appending to blob/string, %d:%d", p_blob_mem->type, wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + memcpy(p_blob_mem->data + p_blob_mem->sz, wire_value, value_size); + p_blob_mem->sz += value_size; + + return 0; +} + +int +blob_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + blob_mem *p_blob_mem = (blob_mem *)*pp; + + if (wire_type != p_blob_mem->type) { + cf_warning(AS_PARTICLE, "type mismatch prepending to blob/string, %d:%d", p_blob_mem->type, wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + memmove(p_blob_mem->data + value_size, p_blob_mem->data, p_blob_mem->sz); + memcpy(p_blob_mem->data, wire_value, value_size); + p_blob_mem->sz += value_size; + + return 0; +} + +int +blob_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "unexpected increment of blob/string"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int32_t +blob_size_from_wire(const uint8_t *wire_value, uint32_t value_size) +{ + // Wire value is same as in-memory value. + return (int32_t)(sizeof(blob_mem) + value_size); +} + +int +blob_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + blob_mem *p_blob_mem = (blob_mem *)*pp; + + p_blob_mem->type = wire_type; + p_blob_mem->sz = value_size; + memcpy(p_blob_mem->data, wire_value, p_blob_mem->sz); + + return 0; +} + +int +blob_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size) +{ + blob_mem *p_blob_mem = (blob_mem *)p; + + return (wire_type == p_blob_mem->type && + value_size == p_blob_mem->sz && + memcmp(wire_value, p_blob_mem->data, value_size) == 0) ? 0 : 1; +} + +uint32_t +blob_wire_size(const as_particle *p) +{ + blob_mem *p_blob_mem = (blob_mem *)p; + + return p_blob_mem->sz; +} + +uint32_t +blob_to_wire(const as_particle *p, uint8_t *wire) +{ + blob_mem *p_blob_mem = (blob_mem *)p; + + memcpy(wire, p_blob_mem->data, p_blob_mem->sz); + + return p_blob_mem->sz; +} + +//------------------------------------------------ +// Handle as_val translation. 
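+// (These translate between the particle and an as_bytes value -
+// blob_to_asval() hands the caller a malloc'd copy wrapped with
+// as_bytes_new_wrap(..., true), so destroying the as_val frees it.)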
+// + +uint32_t +blob_size_from_asval(const as_val *val) +{ + return (uint32_t)sizeof(blob_mem) + as_bytes_size(as_bytes_fromval(val)); +} + +void +blob_from_asval(const as_val *val, as_particle **pp) +{ + blob_mem *p_blob_mem = (blob_mem *)*pp; + + as_bytes *bytes = as_bytes_fromval(val); + + p_blob_mem->type = (uint8_t)blob_bytes_type_to_particle_type(bytes->type); + p_blob_mem->sz = as_bytes_size(bytes); + memcpy(p_blob_mem->data, as_bytes_get(bytes), p_blob_mem->sz); +} + +as_val * +blob_to_asval(const as_particle *p) +{ + blob_mem *p_blob_mem = (blob_mem *)p; + + uint8_t *value = cf_malloc(p_blob_mem->sz); + + memcpy(value, p_blob_mem->data, p_blob_mem->sz); + + return (as_val *)as_bytes_new_wrap(value, p_blob_mem->sz, true); +} + +uint32_t +blob_asval_wire_size(const as_val *val) +{ + return as_bytes_size(as_bytes_fromval(val)); +} + +uint32_t +blob_asval_to_wire(const as_val *val, uint8_t *wire) +{ + as_bytes *bytes = as_bytes_fromval(val); + uint32_t size = as_bytes_size(bytes); + + memcpy(wire, as_bytes_get(bytes), size); + + return size; +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +uint32_t +blob_size_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + // Ok to oversize by a few bytes - only used for allocation sizing. + // -1 for blob internal type and -1 for blob header. + return (uint32_t)sizeof(blob_mem) + packed_size - 2; +} + +void +blob_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + as_unpacker pk = { + .buffer = packed, + .offset = 0, + .length = packed_size + }; + + int64_t blob_size = as_unpack_blob_size(&pk); + const uint8_t *ptr = pk.buffer + pk.offset; + + uint8_t type = *ptr; + + // Adjust for type (1 byte). + ptr++; + blob_size--; + + blob_mem *p_blob_mem = (blob_mem *)*pp; + + p_blob_mem->type = (uint8_t)blob_bytes_type_to_particle_type((as_bytes_type)type); + p_blob_mem->sz = blob_size; + memcpy(p_blob_mem->data, ptr, p_blob_mem->sz); +} + +//------------------------------------------------ +// Handle on-device "flat" format. +// + +int32_t +blob_size_from_flat(const uint8_t *flat, uint32_t flat_size) +{ + blob_flat *p_blob_flat = (blob_flat *)flat; + // Assume type is correct, since we got here. + + // Sanity check length. + if (p_blob_flat->size != flat_size - sizeof(blob_flat)) { + cf_warning(AS_PARTICLE, "unexpected flat blob/string: flat size %u, len %u", + flat_size, p_blob_flat->size); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // Flat value is same as in-memory value. + return (int32_t)(sizeof(blob_mem) + p_blob_flat->size); +} + +int +blob_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + // Sizing is only a sanity check. + int32_t mem_size = blob_size_from_flat(flat, flat_size); + + if (mem_size < 0) { + return mem_size; + } + + // We can do this only because the flat and in-memory formats are identical. 
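+	// (blob_mem and blob_flat are both packed as 1-byte type, 32-bit size,
+	// then data - so the flat pointer can simply be reused as the particle.)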
+	*pp = (as_particle *)flat;
+
+	return 0;
+}
+
+int
+blob_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp)
+{
+	int32_t mem_size = blob_size_from_flat(flat, flat_size);
+
+	if (mem_size < 0) {
+		return mem_size;
+	}
+
+	blob_mem *p_blob_mem = (blob_mem *)cf_malloc_ns((size_t)mem_size);
+	const blob_flat *p_blob_flat = (const blob_flat *)flat;
+
+	p_blob_mem->type = p_blob_flat->type;
+	p_blob_mem->sz = p_blob_flat->size;
+	memcpy(p_blob_mem->data, p_blob_flat->data, p_blob_mem->sz);
+
+	*pp = (as_particle *)p_blob_mem;
+
+	return 0;
+}
+
+uint32_t
+blob_flat_size(const as_particle *p)
+{
+	return (uint32_t)(sizeof(blob_flat) + ((blob_mem *)p)->sz);
+}
+
+uint32_t
+blob_to_flat(const as_particle *p, uint8_t *flat)
+{
+	blob_mem *p_blob_mem = (blob_mem *)p;
+	blob_flat *p_blob_flat = (blob_flat *)flat;
+
+	// Already wrote the type.
+	p_blob_flat->size = p_blob_mem->sz;
+	memcpy(p_blob_flat->data, p_blob_mem->data, p_blob_flat->size);
+
+	return blob_flat_size(p);
+}
+
+
+//==========================================================
+// Local helpers.
+//
+
+static inline as_particle_type
+blob_bytes_type_to_particle_type(as_bytes_type type)
+{
+	switch (type) {
+	case AS_BYTES_STRING:
+		return AS_PARTICLE_TYPE_STRING;
+	case AS_BYTES_BLOB:
+		return AS_PARTICLE_TYPE_BLOB;
+	case AS_BYTES_JAVA:
+		return AS_PARTICLE_TYPE_JAVA_BLOB;
+	case AS_BYTES_CSHARP:
+		return AS_PARTICLE_TYPE_CSHARP_BLOB;
+	case AS_BYTES_PYTHON:
+		return AS_PARTICLE_TYPE_PYTHON_BLOB;
+	case AS_BYTES_RUBY:
+		return AS_PARTICLE_TYPE_RUBY_BLOB;
+	case AS_BYTES_PHP:
+		return AS_PARTICLE_TYPE_PHP_BLOB;
+	case AS_BYTES_ERLANG:
+		return AS_PARTICLE_TYPE_ERLANG_BLOB;
+	case AS_BYTES_GEOJSON:
+		return AS_PARTICLE_TYPE_GEOJSON;
+	case AS_BYTES_INTEGER:
+	case AS_BYTES_DOUBLE:
+	case AS_BYTES_MAP:
+	case AS_BYTES_LIST:
+	case AS_BYTES_UNDEF:
+	default:
+		break;
+	}
+
+	// Invalid blob types remain as blobs.
+	return AS_PARTICLE_TYPE_BLOB;
+}
diff --git a/as/src/base/particle_float.c b/as/src/base/particle_float.c
new file mode 100644
index 00000000..203bb3db
--- /dev/null
+++ b/as/src/base/particle_float.c
@@ -0,0 +1,200 @@
+/*
+ * particle_float.c
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "aerospike/as_double.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_byte_order.h"
+
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/particle.h"
+#include "base/particle_integer.h"
+#include "base/proto.h"
+
+
+//==========================================================
+// FLOAT particle interface - function declarations.
+//
+
+// Most FLOAT particle table functions just use the equivalent INTEGER particle
+// functions. Here are the differences...
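+// (Like INTEGER, a FLOAT particle is "embedded" - the double's bit pattern
+// lives directly in the 64-bit particle pointer - so the sizing, wire-size
+// and flat functions can be shared with INTEGER.)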
+ +// Handle "wire" format. +int float_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int float_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int float_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size); + +// Handle as_val translation. +void float_from_asval(const as_val *val, as_particle **pp); +as_val *float_to_asval(const as_particle *p); +uint32_t float_asval_to_wire(const as_val *val, uint8_t *wire); + +// Handle msgpack translation. +void float_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp); + + +//========================================================== +// FLOAT particle interface - vtable. +// + +const as_particle_vtable float_vtable = { + integer_destruct, + integer_size, + + integer_concat_size_from_wire, + integer_append_from_wire, + integer_prepend_from_wire, + float_incr_from_wire, + integer_size_from_wire, + float_from_wire, + float_compare_from_wire, + integer_wire_size, + integer_to_wire, + + integer_size_from_asval, + float_from_asval, + float_to_asval, + integer_asval_wire_size, + float_asval_to_wire, + + integer_size_from_msgpack, + float_from_msgpack, + + integer_size_from_flat, + integer_cast_from_flat, + integer_from_flat, + integer_flat_size, + integer_to_flat +}; + + +//========================================================== +// FLOAT particle interface - function definitions. +// + +// Most FLOAT particle table functions just use the equivalent INTEGER particle +// functions. Here are the differences... + +//------------------------------------------------ +// Handle "wire" format. +// + +int +float_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + // For now we won't allow adding integers (or anything else) to floats. + if (wire_type != AS_PARTICLE_TYPE_FLOAT) { + cf_warning(AS_PARTICLE, "increment with non float type %u", wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + uint64_t i; + + switch (value_size) { + case 8: + i = cf_swap_from_be64(*(uint64_t *)wire_value); + break; + default: + cf_warning(AS_PARTICLE, "unexpected value size %u", value_size); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + (*(double *)pp) += *(double *)&i; + + return 0; +} + +int +float_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + if (value_size != 8) { + cf_warning(AS_PARTICLE, "unexpected value size %u", value_size); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return integer_from_wire(wire_type, wire_value, value_size, pp); +} + +int +float_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size) +{ + if (wire_type != AS_PARTICLE_TYPE_FLOAT) { + return 1; + } + + if (value_size != 8) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return integer_compare_from_wire(p, AS_PARTICLE_TYPE_INTEGER, wire_value, value_size); +} + +//------------------------------------------------ +// Handle as_val translation. 
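+// (The conversions below move the raw 64 bits with pointer casts - no numeric
+// conversion - since the particle "pointer" holds the double's bit pattern.)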
+//
+
+void
+float_from_asval(const as_val *val, as_particle **pp)
+{
+	*(double *)pp = as_double_get(as_double_fromval(val));
+}
+
+as_val *
+float_to_asval(const as_particle *p)
+{
+	return (as_val *)as_double_new(*(double *)&p);
+}
+
+uint32_t
+float_asval_to_wire(const as_val *val, uint8_t *wire)
+{
+	double x = as_double_get(as_double_fromval(val));
+
+	*(uint64_t *)wire = cf_swap_to_be64(*(uint64_t *)&x);
+
+	return (uint32_t)sizeof(uint64_t);
+}
+
+//------------------------------------------------
+// Handle msgpack translation.
+//
+
+void
+float_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp)
+{
+	double x;
+	as_unpacker pk = {
+			.buffer = packed,
+			.offset = 0,
+			.length = packed_size
+	};
+
+	as_unpack_double(&pk, &x);
+
+	*(double *)pp = x;
+}
diff --git a/as/src/base/particle_geojson.c b/as/src/base/particle_geojson.c
new file mode 100644
index 00000000..72cd7b87
--- /dev/null
+++ b/as/src/base/particle_geojson.c
@@ -0,0 +1,600 @@
+/*
+ * particle_geojson.c
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "aerospike/as_geojson.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_byte_order.h"
+
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/particle.h"
+#include "base/particle_blob.h"
+#include "base/proto.h"
+#include "geospatial/geospatial.h"
+
+
+//==========================================================
+// GEOJSON particle interface - function declarations.
+//
+
+// Most GEOJSON particle table functions just use the equivalent BLOB particle
+// functions. Here are the differences...
+
+// Handle "wire" format.
+int32_t geojson_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int geojson_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int geojson_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int geojson_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int32_t geojson_size_from_wire(const uint8_t *wire_value, uint32_t value_size);
+int geojson_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+uint32_t geojson_to_wire(const as_particle *p, uint8_t *wire);
+
+// Handle as_val translation.
+uint32_t geojson_size_from_asval(const as_val *val); +void geojson_from_asval(const as_val *val, as_particle **pp); +as_val *geojson_to_asval(const as_particle *p); +uint32_t geojson_asval_wire_size(const as_val *val); +uint32_t geojson_asval_to_wire(const as_val *val, uint8_t *wire); + +// Handle msgpack translation. +uint32_t geojson_size_from_msgpack(const uint8_t *packed, uint32_t packed_size); +void geojson_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp); + + +//========================================================== +// GEOJSON particle interface - vtable. +// + +const as_particle_vtable geojson_vtable = { + blob_destruct, + blob_size, + + geojson_concat_size_from_wire, + geojson_append_from_wire, + geojson_prepend_from_wire, + geojson_incr_from_wire, + geojson_size_from_wire, + geojson_from_wire, + blob_compare_from_wire, + blob_wire_size, + geojson_to_wire, + + geojson_size_from_asval, + geojson_from_asval, + geojson_to_asval, + geojson_asval_wire_size, + geojson_asval_to_wire, + + geojson_size_from_msgpack, + geojson_from_msgpack, + + blob_size_from_flat, + blob_cast_from_flat, + blob_from_flat, + blob_flat_size, + blob_to_flat +}; + + +//========================================================== +// Typedefs & constants. +// + +// GEOJSON particle flag bit-fields. +#define GEOJSON_ISREGION 0x1 + +// The GEOJSON particle structs overlay the related BLOB structs. + +typedef struct geojson_mem_s { + uint8_t type; // IMPORTANT: overlay blob_mem! + uint32_t sz; // IMPORTANT: overlay blob_mem! + uint8_t flags; + uint16_t ncells; + uint8_t data[]; // (ncells * uint64_t) + jsonstr +} __attribute__ ((__packed__)) geojson_mem; + +typedef struct geojson_flat_s { + uint8_t type; // IMPORTANT: overlay blob_flat! + uint32_t size; // IMPORTANT: overlay blob_flat! + uint8_t flags; + uint16_t ncells; + uint8_t data[]; // (ncells * uint64_t) + jsonstr +} __attribute__ ((__packed__)) geojson_flat; + + +//========================================================== +// Forward declarations. +// + +static bool geojson_match(bool particle_is_region, uint64_t particle_cellid, geo_region_t particle_region, uint64_t query_cellid, geo_region_t query_region, bool is_strict); +static inline uint32_t geojson_size(uint32_t n_cells, size_t string_size); + + +//========================================================== +// GEOJSON particle interface - function definitions. +// + +// Most GEOJSON particle table functions just use the equivalent BLOB particle +// functions. Here are the differences... + +//------------------------------------------------ +// Handle "wire" format. 
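+// (Client wire format for a GEOJSON particle, as read and written below:
+// [1-byte flags][2-byte big-endian ncells][ncells * 8-byte cellids][JSON].)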
+//
+
+int32_t
+geojson_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp)
+{
+	cf_warning(AS_PARTICLE, "invalid operation on geojson particle");
+	return -1;
+}
+
+int
+geojson_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp)
+{
+	cf_warning(AS_PARTICLE, "invalid operation on geojson particle");
+	return -1;
+}
+
+int
+geojson_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp)
+{
+	cf_warning(AS_PARTICLE, "invalid operation on geojson particle");
+	return -1;
+}
+
+int
+geojson_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp)
+{
+	cf_warning(AS_PARTICLE, "invalid operation on geojson particle");
+	return -1;
+}
+
+int32_t
+geojson_size_from_wire(const uint8_t *wire_value, uint32_t value_size)
+{
+	// NOTE - Unfortunately we would need to run the JSON parser and region
+	// coverer to find out exactly how many cells we need to allocate for this
+	// particle.
+	//
+	// For now we always allocate the maximum number of cells (MAX_REGION_CELLS)
+	// for the in-memory particle.
+	//
+	// For now also ignore any incoming cells entirely.
+
+	uint8_t const *incp = (uint8_t const *)wire_value + 1;
+	uint16_t incells = cf_swap_from_be16(*(uint16_t const *)incp);
+	size_t incellsz = incells * sizeof(uint64_t);
+	size_t injsonsz = value_size - sizeof(uint8_t) - sizeof(uint16_t) - incellsz;
+
+	return (int32_t)(sizeof(geojson_mem) + (MAX_REGION_CELLS * sizeof(uint64_t)) + injsonsz);
+}
+
+int
+geojson_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp)
+{
+	uint8_t const *incp = (uint8_t const *)wire_value + 1;
+	uint16_t incells = cf_swap_from_be16(*(uint16_t const *)incp);
+	size_t incellsz = incells * sizeof(uint64_t);
+	char const *injsonptr = (char const *)incp + sizeof(uint16_t) + incellsz;
+	size_t injsonsz = value_size - sizeof(uint8_t) - sizeof(uint16_t) - incellsz;
+
+	// We ignore any incoming cells entirely.
+
+	uint64_t cellid = 0;
+	geo_region_t region = NULL;
+
+	if (! geo_parse(NULL, injsonptr, injsonsz, &cellid, &region)) {
+		cf_warning(AS_PARTICLE, "geo_parse failed");
+		return -AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON;
+	}
+
+	if (cellid && region) {
+		geo_region_destroy(region);
+		cf_warning(AS_PARTICLE, "geo_parse found both point and region");
+		return -AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON;
+	}
+
+	if (! cellid && ! region) {
+		cf_warning(AS_PARTICLE, "geo_parse found neither point nor region");
+		return -AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON;
+	}
+
+	geojson_mem *p_geojson_mem = (geojson_mem *)*pp;
+
+	p_geojson_mem->type = wire_type;
+
+	// We'll come back and set the size at the end.
+	uint64_t *p_outcells = (uint64_t *)p_geojson_mem->data;
+
+	p_geojson_mem->flags = 0;
+
+	if (cellid) {
+		// POINT
+		p_geojson_mem->flags &= ~GEOJSON_ISREGION;
+		p_geojson_mem->ncells = 1;
+		p_outcells[0] = cellid;
+	}
+	else {
+		// REGION
+		p_geojson_mem->flags |= GEOJSON_ISREGION;
+
+		int numcells;
+
+		if (! geo_region_cover(NULL, region, MAX_REGION_CELLS, p_outcells, NULL, NULL, &numcells)) {
+			geo_region_destroy(region);
+			cf_warning(AS_PARTICLE, "geo_region_cover failed");
+			return -AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON;
+		}
+
+		p_geojson_mem->ncells = numcells;
+	}
+
+	if (region) {
+		geo_region_destroy(region);
+	}
+
+	// Copy the JSON into place.
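+	// (The JSON string sits immediately after the cellid array, so its offset
+	// depends on the final ncells value set above.)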
+ char *p_outjson = (char *)&p_outcells[p_geojson_mem->ncells]; + + memcpy(p_outjson, injsonptr, injsonsz); + + // Set the actual size; we will waste some space at the end of the allocated + // particle. + p_geojson_mem->sz = sizeof(uint8_t) + sizeof(uint16_t) + (p_geojson_mem->ncells * sizeof(uint64_t)) + injsonsz; + + return AS_PROTO_RESULT_OK; +} + +uint32_t +geojson_to_wire(const as_particle *p, uint8_t *wire) +{ + // Use blob routine first. + uint32_t sz = blob_to_wire(p, wire); + + // Swap ncells. + uint16_t *p_ncells = (uint16_t *)(wire + sizeof(uint8_t)); + uint16_t ncells = *p_ncells; + + *p_ncells = cf_swap_to_be16(*p_ncells); + ++p_ncells; + + // Swap the cells. + uint64_t *p_cell_begin = (uint64_t *)p_ncells; + uint64_t *p_cell_end = p_cell_begin + ncells; + + for (uint64_t *p_cell = p_cell_begin; p_cell < p_cell_end; ++p_cell) { + *p_cell = cf_swap_to_be64(*p_cell); + } + + return sz; +} + +//------------------------------------------------ +// Handle as_val translation. +// + +uint32_t +geojson_size_from_asval(const as_val *val) +{ + as_geojson *pg = as_geojson_fromval(val); + size_t jsz = as_geojson_len(pg); + + // Compute the size; we won't be writing any cellids ... + return geojson_size(0, jsz); +} + +void +geojson_from_asval(const as_val *val, as_particle **pp) +{ + geojson_mem *p_geojson_mem = (geojson_mem *)*pp; + + as_geojson *pg = as_geojson_fromval(val); + size_t jsz = as_geojson_len(pg); + + p_geojson_mem->type = AS_PARTICLE_TYPE_GEOJSON; + p_geojson_mem->sz = geojson_size(0, jsz); + p_geojson_mem->flags = 0; + p_geojson_mem->ncells = 0; + + uint8_t *p8 = (uint8_t *)p_geojson_mem->data; + memcpy(p8, as_geojson_get(pg), jsz); +} + +as_val * +geojson_to_asval(const as_particle *p) +{ + size_t jsonsz; + char const *jsonptr = as_geojson_mem_jsonstr(p, &jsonsz); + char *buf = cf_malloc(jsonsz + 1); + + memcpy(buf, jsonptr, jsonsz); + buf[jsonsz] = '\0'; + + return (as_val *)as_geojson_new_wlen(buf, jsonsz, true); +} + +uint32_t +geojson_asval_wire_size(const as_val *val) +{ + as_geojson *pg = as_geojson_fromval(val); + size_t jsz = as_geojson_len(pg); + + // We won't be writing any cellids ... + return geojson_size(0, jsz); +} + +uint32_t +geojson_asval_to_wire(const as_val *val, uint8_t *wire) +{ + as_geojson *pg = as_geojson_fromval(val); + size_t jsz = as_geojson_len(pg); + + uint8_t *p8 = wire; + + *p8++ = 0; // flags + + uint16_t *p16 = (uint16_t *)p8; + + *p16++ = cf_swap_to_be16(0); // no cells on output to client + p8 = (uint8_t *)p16; + memcpy(p8, as_geojson_get(pg), jsz); + + return geojson_size(0, jsz); +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +uint32_t +geojson_size_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + // Oversize by a few bytes doing the easy thing. + size_t jsz = (size_t)packed_size; + + // Compute the size; we won't be writing any cellids ... + return geojson_size(0, jsz); +} + +void +geojson_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + geojson_mem *p_geojson_mem = (geojson_mem *)*pp; + + as_unpacker pk = { + .buffer = packed, + .offset = 0, + .length = packed_size + }; + + int64_t blob_size = as_unpack_blob_size(&pk); + const uint8_t *ptr = pk.buffer + pk.offset; + + // *ptr should be AS_BYTES_GEOJSON at this point. + + // Adjust for type (1 byte). 
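+	// (The leading byte of the msgpack blob payload is the as_bytes type -
+	// here AS_BYTES_GEOJSON - and is not kept in the stored JSON string.)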
+ ptr++; + blob_size--; + + size_t jsz = (size_t)blob_size; + + p_geojson_mem->type = AS_PARTICLE_TYPE_GEOJSON; + p_geojson_mem->sz = geojson_size(0, jsz); + p_geojson_mem->flags = 0; + p_geojson_mem->ncells = 0; + + uint8_t *p8 = (uint8_t *)p_geojson_mem->data; + memcpy(p8, ptr, jsz); +} + + +//========================================================== +// Particle functions specific to GEOJSON. +// + +size_t +as_bin_particle_geojson_cellids(const as_bin *b, uint64_t **ppcells) +{ + geojson_mem *gp = (geojson_mem *)b->particle; + + *ppcells = (uint64_t *)gp->data; + + return (size_t)gp->ncells; +} + +bool +as_particle_geojson_match(as_particle *particle, uint64_t query_cellid, geo_region_t query_region, bool is_strict) +{ + // Determine whether the candidate particle geometry is a match + // for the query geometry. + // + // If query_cellid is non-zero this is a regions-containing-point query. + // + // If query_region is non-null this is a points-in-region query. + // + // Candidate geometry can either be a point or a region. Regions + // will have the GEOJSON_ISREGION flag set. + + geojson_mem *gp = (geojson_mem *)particle; + + uint64_t *cells = (uint64_t *)gp->data; + + uint64_t candidate_cellid = cells[0]; + geo_region_t candidate_region = NULL; + + bool candidate_is_region = (gp->flags & GEOJSON_ISREGION) != 0; + + // If we are a strict RCP query on a region candidate we need to + // run the parser to obtain a candidate_region for the matcher. + // + if (query_cellid != 0 && candidate_is_region && is_strict) { + size_t jsonsz; + char const *jsonptr = as_geojson_mem_jsonstr(particle, &jsonsz); + + if (! geo_parse(NULL, jsonptr, jsonsz, &candidate_cellid, + &candidate_region)) { + cf_warning(AS_PARTICLE, "geo_parse() failed - unexpected"); + geo_region_destroy(candidate_region); + return false; + } + } + + bool ismatch = geojson_match( + candidate_is_region, + candidate_cellid, + candidate_region, + query_cellid, + query_region, + is_strict); + + geo_region_destroy(candidate_region); + + return ismatch; +} + +bool +as_particle_geojson_match_asval(const as_val *val, uint64_t query_cellid, geo_region_t query_region, bool is_strict) +{ + as_geojson *pg = as_geojson_fromval(val); + size_t jsonsz = as_geojson_len(pg); + char * jsonptr = as_geojson_get(pg); + + uint64_t candidate_cellid = 0; + geo_region_t candidate_region = NULL; + + if (! geo_parse(NULL, jsonptr, jsonsz, &candidate_cellid, + &candidate_region)) { + cf_warning(AS_PARTICLE, "geo_parse() failed - unexpected"); + geo_region_destroy(candidate_region); + return false; + } + + bool candidate_is_region = candidate_cellid == 0; + + bool ismatch = geojson_match( + candidate_is_region, + candidate_cellid, + candidate_region, + query_cellid, + query_region, + is_strict); + + geo_region_destroy(candidate_region); + + return ismatch; +} + +char const * +as_geojson_mem_jsonstr(as_particle const *particle, size_t *p_jsonsz) +{ + geojson_mem *p_geojson_mem = (geojson_mem *)particle; + + size_t cellsz = p_geojson_mem->ncells * sizeof(uint64_t); + + *p_jsonsz = p_geojson_mem->sz - sizeof(uint8_t) - sizeof(uint16_t) - cellsz; + + return (char const *)p_geojson_mem->data + cellsz; +} + + +//========================================================== +// Local helpers. +// + +static bool +geojson_match(bool candidate_is_region, uint64_t candidate_cellid, geo_region_t candidate_region, uint64_t query_cellid, geo_region_t query_region, bool is_strict) +{ + // Determine whether the candidate geometry is a match for the + // query geometry. 
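+	// (Case summary, as implemented below - strict checks call
+	// geo_point_within(), non-strict checks accept on type alone:
+	//   RCP query: candidate region -> match, candidate point -> no match.
+	//   PIR query: candidate point -> match, candidate region -> no match.)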
+	//
+	// If query_cellid is non-zero this is a regions-containing-point query.
+	//
+	// If query_region is non-null this is a points-in-region query.
+	//
+	// Candidate geometry can either be a point or a region. Regions
+	// will have the GEOJSON_ISREGION flag set.
+
+	// Is this a REGIONS-CONTAINING-POINT query?
+	//
+	if (query_cellid != 0) {
+
+		if (candidate_is_region) {
+			// Candidate is a REGION.
+
+			// Shortcut, if we aren't strict just return true.
+			if (! is_strict) {
+				return true;
+			}
+
+			return geo_point_within(query_cellid, candidate_region);
+		}
+		else {
+			// Candidate is a POINT, skip it.
+			return false;
+		}
+	}
+
+	// Is this a POINTS-IN-REGION query?
+	//
+	if (query_region) {
+
+		if (candidate_is_region) {
+			// Candidate is a REGION, skip it.
+			return false;
+		}
+		else {
+			// Sanity check, make sure this geometry has been processed.
+			if (candidate_cellid == 0) {
+				cf_warning(AS_PARTICLE, "candidate cellid has no value");
+				return false;
+			}
+
+			// Candidate is a POINT.
+			if (is_strict) {
+				return geo_point_within(candidate_cellid, query_region);
+			}
+			else {
+				return true;
+			}
+		}
+	}
+
+	return false;
+}
+
+static inline uint32_t
+geojson_size(uint32_t n_cells, size_t string_size)
+{
+	return (uint32_t)(
+			sizeof(uint8_t) +				// flags
+			sizeof(uint16_t) +				// ncells (always 0 here)
+			(n_cells * sizeof(uint64_t)) +	// cell array
+			string_size);					// json string
+}
diff --git a/as/src/base/particle_integer.c b/as/src/base/particle_integer.c
new file mode 100644
index 00000000..ad7f894a
--- /dev/null
+++ b/as/src/base/particle_integer.c
@@ -0,0 +1,446 @@
+/*
+ * particle_integer.c
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+
+#include "base/particle_integer.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "aerospike/as_boolean.h"
+#include "aerospike/as_integer.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_byte_order.h"
+
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/particle.h"
+#include "base/proto.h"
+
+
+// INTEGER particle interface function declarations are in particle_integer.h
+// since INTEGER functions are used by other particles derived from INTEGER.
+
+
+//==========================================================
+// INTEGER particle interface - vtable.
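+// (INTEGER is an "embedded" particle - the 64-bit value is stored directly in
+// the bin's particle pointer rather than in allocated memory - hence the
+// no-op destructor and zero sizes below.)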
+// + +const as_particle_vtable integer_vtable = { + integer_destruct, + integer_size, + + integer_concat_size_from_wire, + integer_append_from_wire, + integer_prepend_from_wire, + integer_incr_from_wire, + integer_size_from_wire, + integer_from_wire, + integer_compare_from_wire, + integer_wire_size, + integer_to_wire, + + integer_size_from_asval, + integer_from_asval, + integer_to_asval, + integer_asval_wire_size, + integer_asval_to_wire, + + integer_size_from_msgpack, + integer_from_msgpack, + + integer_size_from_flat, + integer_cast_from_flat, + integer_from_flat, + integer_flat_size, + integer_to_flat +}; + + +//========================================================== +// Typedefs & constants. +// + +typedef struct integer_mem_s { + uint8_t do_not_use; // already know it's an int type + uint64_t i; +} __attribute__ ((__packed__)) integer_mem; + +typedef struct integer_flat_s { + uint8_t type; + uint8_t size; + uint64_t i; +} __attribute__ ((__packed__)) integer_flat; + + +//========================================================== +// INTEGER particle interface - function definitions. +// + +//------------------------------------------------ +// Destructor, etc. +// + +void +integer_destruct(as_particle *p) +{ + // Nothing to do - integer values live in the as_bin. +} + +uint32_t +integer_size(const as_particle *p) +{ + // Integer values live in the as_bin instead of a pointer. + return 0; +} + +//------------------------------------------------ +// Handle "wire" format. +// + +int32_t +integer_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "concat size for integer/float"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +integer_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "append to integer/float"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +integer_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "prepend to integer/float"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +integer_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + if (wire_type != AS_PARTICLE_TYPE_INTEGER) { + cf_warning(AS_PARTICLE, "increment with non integer type %u", wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + uint64_t i; + + switch (value_size) { + case 8: + i = cf_swap_from_be64(*(uint64_t *)wire_value); + break; + case 4: + i = (uint64_t)cf_swap_from_be32(*(uint32_t *)wire_value); + break; + case 2: + i = (uint64_t)cf_swap_from_be16(*(uint16_t *)wire_value); + break; + case 1: + i = (uint64_t)*wire_value; + break; + case 16: // memcache increment - it's special + i = cf_swap_from_be64(*(uint64_t *)wire_value); + // For memcache, decrements floor at 0. + if ((int64_t)i < 0 && *(uint64_t *)pp + i > *(uint64_t *)pp) { + *pp = 0; + return 0; + } + break; + default: + cf_warning(AS_PARTICLE, "unexpected value size %u", value_size); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + (*(uint64_t *)pp) += i; + + return 0; +} + +int32_t +integer_size_from_wire(const uint8_t *wire_value, uint32_t value_size) +{ + // Integer values live in the as_bin instead of a pointer. 
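+	// (Returning 0 tells callers there is nothing to allocate - from_wire_fn()
+	// will overwrite the particle pointer with the value itself.)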
+ return 0; +} + +int +integer_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + uint64_t i; + + switch (value_size) { + case 8: + i = cf_swap_from_be64(*(uint64_t *)wire_value); + break; + case 4: + i = (uint64_t)cf_swap_from_be32(*(uint32_t *)wire_value); + break; + case 2: + i = (uint64_t)cf_swap_from_be16(*(uint16_t *)wire_value); + break; + case 1: + i = (uint64_t)*wire_value; + break; + default: + cf_warning(AS_PARTICLE, "unexpected value size %u", value_size); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + *pp = (as_particle *)i; + + return 0; +} + +int +integer_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size) +{ + if (wire_type != AS_PARTICLE_TYPE_INTEGER) { + return 1; + } + + uint64_t i; + + switch (value_size) { + case 8: + i = cf_swap_from_be64(*(uint64_t *)wire_value); + break; + case 4: + i = (uint64_t)cf_swap_from_be32(*(uint32_t *)wire_value); + break; + case 2: + i = (uint64_t)cf_swap_from_be16(*(uint16_t *)wire_value); + break; + case 1: + i = (uint64_t)*wire_value; + break; + default: + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return (uint64_t)p == i ? 0 : 1; +} + +uint32_t +integer_wire_size(const as_particle *p) +{ + return (uint32_t)sizeof(uint64_t); +} + +uint32_t +integer_to_wire(const as_particle *p, uint8_t *wire) +{ + *(uint64_t *)wire = cf_swap_to_be64((uint64_t)p); + + return (uint32_t)sizeof(uint64_t); +} + +//------------------------------------------------ +// Handle as_val translation. +// + +uint32_t +integer_size_from_asval(const as_val *val) +{ + // Integer values live in the as_bin instead of a pointer. + return 0; +} + +void +integer_from_asval(const as_val *val, as_particle **pp) +{ + // Unfortunately AS_BOOLEANs (as well as AS_INTEGERs) become INTEGER + // particles, so we have to check the as_val type here. + + as_val_t vtype = as_val_type(val); + int64_t i; + + switch (vtype) { + case AS_INTEGER: + i = as_integer_get(as_integer_fromval(val)); + break; + case AS_BOOLEAN: + i = as_boolean_get(as_boolean_fromval(val)) ? 1 : 0; + break; + default: + cf_crash(AS_PARTICLE, "unexpected as_val_t %d", vtype); + return; + } + + *pp = (as_particle *)i; +} + +as_val * +integer_to_asval(const as_particle *p) +{ + return (as_val *)as_integer_new((uint64_t)p); +} + +uint32_t +integer_asval_wire_size(const as_val *val) +{ + return (uint32_t)sizeof(uint64_t); +} + +uint32_t +integer_asval_to_wire(const as_val *val, uint8_t *wire) +{ + // Unfortunately AS_BOOLEANs (as well as AS_INTEGERs) become INTEGER + // particles, so we have to check the as_val type here. + + as_val_t vtype = as_val_type(val); + int64_t i; + + switch (vtype) { + case AS_INTEGER: + i = as_integer_get(as_integer_fromval(val)); + break; + case AS_BOOLEAN: + i = as_boolean_get(as_boolean_fromval(val)) ? 1 : 0; + break; + default: + cf_crash(AS_PARTICLE, "unexpected as_val_t %d", vtype); + return 0; + } + + *(uint64_t *)wire = cf_swap_to_be64((uint64_t)i); + + return (uint32_t)sizeof(uint64_t); +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +uint32_t +integer_size_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + // Integer values live in the as_bin instead of a pointer. 
+ return 0; +} + +void +integer_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + int64_t i; + as_unpacker pk = { + .buffer = packed, + .offset = 0, + .length = packed_size + }; + + as_unpack_int64(&pk, &i); + + *pp = (as_particle *)i; +} + +//------------------------------------------------ +// Handle on-device "flat" format. +// + +int32_t +integer_size_from_flat(const uint8_t *flat, uint32_t flat_size) +{ + // Integer values live in the as_bin instead of a pointer. + return 0; +} + +int +integer_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + integer_flat *p_int_flat = (integer_flat *)flat; + // Assume type is correct, since we got here. + + // Sanity check lengths. + if (p_int_flat->size != 8 || flat_size != sizeof(integer_flat)) { + cf_warning(AS_PARTICLE, "unexpected flat integer/float: flat_size %u, len %u", + flat_size, p_int_flat->size); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // Integer values live in an as_bin instead of a pointer. Also, flat + // integers are host order, so no byte swap. + *pp = (as_particle *)p_int_flat->i; + + return 0; +} + +int +integer_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + const integer_flat *p_int_flat = (const integer_flat *)flat; + // Assume type is correct, since we got here. + + // Sanity check lengths. + if (p_int_flat->size != 8 || flat_size != sizeof(integer_flat)) { + cf_warning(AS_PARTICLE, "unexpected flat integer/float: flat_size %u, len %u", + flat_size, p_int_flat->size); + return -1; // TODO - AS_PROTO error code seems inappropriate? + } + + // Integer values live in an as_bin instead of a pointer. Also, flat + // integers are host order, so no byte swap. + *pp = (as_particle *)p_int_flat->i; + + return 0; +} + +uint32_t +integer_flat_size(const as_particle *p) +{ + return sizeof(integer_flat); +} + +uint32_t +integer_to_flat(const as_particle *p, uint8_t *flat) +{ + integer_flat *p_int_flat = (integer_flat *)flat; + + // Already wrote the type. + p_int_flat->size = 8; + p_int_flat->i = (uint64_t)p; + + return integer_flat_size(p); +} + + +//========================================================== +// as_bin particle functions specific to INTEGER. +// + +int64_t +as_bin_particle_integer_value(const as_bin *b) +{ + // Caller must ensure this is called only for INTEGER particles. + return (int64_t)b->particle; +} + +void +as_bin_particle_integer_set(as_bin *b, int64_t i) +{ + b->particle = (as_particle *)i; +} diff --git a/as/src/base/particle_list.c b/as/src/base/particle_list.c new file mode 100644 index 00000000..54e04331 --- /dev/null +++ b/as/src/base/particle_list.c @@ -0,0 +1,4519 @@ +/* + * particle_list.c + * + * Copyright (C) 2015-2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/
+ */
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aerospike/as_buffer.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_serializer.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_byte_order.h"
+
+#include "fault.h"
+
+#include "base/cdt.h"
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/particle.h"
+#include "base/proto.h"
+
+
+//==========================================================
+// LIST particle interface - function declarations.
+//
+
+// Destructor, etc.
+void list_destruct(as_particle *p);
+uint32_t list_size(const as_particle *p);
+
+// Handle "wire" format.
+int32_t list_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int list_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int list_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int list_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int32_t list_size_from_wire(const uint8_t *wire_value, uint32_t value_size);
+int list_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int list_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size);
+uint32_t list_wire_size(const as_particle *p);
+uint32_t list_to_wire(const as_particle *p, uint8_t *wire);
+
+// Handle as_val translation.
+uint32_t list_size_from_asval(const as_val *val);
+void list_from_asval(const as_val *val, as_particle **pp);
+as_val *list_to_asval(const as_particle *p);
+uint32_t list_asval_wire_size(const as_val *val);
+uint32_t list_asval_to_wire(const as_val *val, uint8_t *wire);
+
+// Handle msgpack translation.
+uint32_t list_size_from_msgpack(const uint8_t *packed, uint32_t packed_size);
+void list_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp);
+
+// Handle on-device "flat" format.
+int32_t list_size_from_flat(const uint8_t *flat, uint32_t flat_size);
+int list_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp);
+int list_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp);
+uint32_t list_flat_size(const as_particle *p);
+uint32_t list_to_flat(const as_particle *p, uint8_t *flat);
+
+
+//==========================================================
+// LIST particle interface - vtable.
+//
+
+const as_particle_vtable list_vtable = {
+		list_destruct,
+		list_size,
+
+		list_concat_size_from_wire,
+		list_append_from_wire,
+		list_prepend_from_wire,
+		list_incr_from_wire,
+		list_size_from_wire,
+		list_from_wire,
+		list_compare_from_wire,
+		list_wire_size,
+		list_to_wire,
+
+		list_size_from_asval,
+		list_from_asval,
+		list_to_asval,
+		list_asval_wire_size,
+		list_asval_to_wire,
+
+		list_size_from_msgpack,
+		list_from_msgpack,
+
+		list_size_from_flat,
+		list_cast_from_flat,
+		list_from_flat,
+		list_flat_size,
+		list_to_flat
+};
+
+
+//==========================================================
+// Typedefs & constants.
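+// (An ordered list is flagged by a leading msgpack ext element - see
+// list_ordered_empty below: 0x91 is a 1-element list header, 0xC7 an ext
+// header whose flags byte carries AS_PACKED_LIST_FLAG_ORDERED.)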
+// + +//#define LIST_DEBUG_VERIFY + +#define PACKED_LIST_INDEX_STEP 128 + +#define AS_PACKED_LIST_FLAG_NONE 0x00 +#define AS_PACKED_LIST_FLAG_ORDERED 0x01 + +#define PACKED_LIST_FLAG_OFF_IDX 0x10 +#define PACKED_LIST_FLAG_FULLOFF_IDX 0x20 + +typedef struct packed_list_s { + const uint8_t *packed; + uint32_t packed_sz; + + uint32_t ele_count; // excludes ext ele + // Mutable state member (is considered mutable in const objects). + offset_index offidx; // offset start at contents (excluding ext metadata ele) + // Mutable state member (is considered mutable in const objects). + offset_index full_offidx; // index at every element + uint8_t ext_flags; + + const uint8_t *contents; // where elements start (excludes ext) + uint32_t content_sz; +} packed_list; + +typedef struct packed_list_op_s { + const packed_list *list; + + uint32_t new_ele_count; + uint32_t new_content_sz; + + uint32_t seg1_sz; + uint32_t seg2_offset; + uint32_t seg2_sz; + uint32_t nil_ele_sz; // number of nils we need to insert +} packed_list_op; + +typedef struct list_mem_s { + uint8_t type; + uint32_t sz; + uint8_t data[]; +} __attribute__ ((__packed__)) list_mem; + +typedef struct list_flat_s { + uint8_t type; + uint32_t sz; // host order on device and in memory + uint8_t data[]; +} __attribute__ ((__packed__)) list_flat; + +typedef struct msgpack_list_empty_flagged_s { + uint8_t list_hdr; + uint8_t ext_hdr; + uint8_t ext_sz; + uint8_t ext_flags; +} __attribute__ ((__packed__)) msgpack_list_empty_flagged; + +typedef struct list_mem_empty_flagged_s { + list_mem mem; + msgpack_list_empty_flagged list; +} list_mem_empty_flagged; + +static const list_mem_empty_flagged list_ordered_empty = { + .mem = { + .type = AS_PARTICLE_TYPE_LIST, + .sz = sizeof(msgpack_list_empty_flagged) + }, + .list = { + .list_hdr = 0x91, + .ext_hdr = 0xC7, + .ext_sz = 0, + .ext_flags = AS_PACKED_LIST_FLAG_ORDERED + } +}; +static const list_mem list_mem_empty = { + .type = AS_PARTICLE_TYPE_LIST, + .sz = 1, + .data = {0x90} +}; + +typedef struct { + const offset_index *offsets; + const order_index *order; + as_cdt_sort_flags flags; + bool error; +} list_order_index_sort_userdata; + +#define define_packed_list_op(__name, __list_p) \ + packed_list_op __name; \ + packed_list_op_init(&__name, __list_p) + +#define list_full_offidx_p(__list_p) \ + (offset_index *)(list_is_ordered(__list_p) ? &(__list_p)->offidx : &(__list_p)->full_offidx) + +#define vla_list_full_offidx_if_invalid(__name, __list_p) \ + union { \ + offset_index *offidx; \ + uint8_t mem_temp[sizeof(offset_index *) + (offset_index_is_valid(list_full_offidx_p(__list_p)) ? 0 : offset_index_size(list_full_offidx_p(__list_p)))]; \ + } __name; \ + __name.offidx = list_full_offidx_p(__list_p); \ + if (! __name.offidx->_.ptr) { \ + __name.offidx->_.ptr = __name.mem_temp + sizeof(offset_index *); \ + offset_index_set_filled(__name.offidx, 1); \ + } + +#define define_packed_list_particle(__name, __particle, __ret) \ + packed_list __name; \ + bool __ret = packed_list_init_from_particle(&__name, __particle) + + +//========================================================== +// Forward declarations. 
+// + +static inline bool is_list_type(uint8_t type); +static inline bool flags_is_ordered(uint8_t flags); +static inline bool list_is_ordered(const packed_list *list); +static inline uint8_t get_ext_flags(bool ordered); +static uint32_t list_calc_ext_content_sz(uint32_t ele_count, uint32_t content_sz, bool ordered); + +static uint32_t list_pack_header(uint8_t *buf, uint32_t ele_count); +static void list_pack_empty_index(as_packer *pk, uint32_t ele_count, const uint8_t *contents, uint32_t content_sz, bool is_ordered); + +// as_bin +static inline void as_bin_set_empty_list(as_bin *b, rollback_alloc *alloc_buf, bool is_ordered); +static void as_bin_set_ordered_empty_list(as_bin *b, rollback_alloc *alloc_buf); +static inline void as_bin_set_temp_list_if_notinuse(as_bin *b, uint64_t create_flags); + +// packed_list +static bool packed_list_init(packed_list *list, const uint8_t *buf, uint32_t sz); +static inline bool packed_list_init_from_particle(packed_list *list, const as_particle *p); +static bool packed_list_init_from_bin(packed_list *list, const as_bin *b); +static bool packed_list_unpack_hdridx(packed_list *list); +static void packed_list_partial_offidx_update(const packed_list *list); + +static bool packed_list_find_by_value_ordered(const packed_list *list, const cdt_payload *value, order_index_find *find); +static uint32_t packed_list_find_idx_offset(const packed_list *list, uint32_t index); +static bool packed_list_find_rank_range_by_value_interval_ordered(const packed_list *list, const cdt_payload *value_start, const cdt_payload *value_end, uint32_t *rank_r, uint32_t *count_r, bool is_multi); +static bool packed_list_find_rank_range_by_value_interval_unordered(const packed_list *list, const cdt_payload *value_start, const cdt_payload *value_end, uint32_t *rank, uint32_t *count, uint64_t *mask_val, bool inverted, bool is_multi); + +static uint32_t packed_list_mem_sz(const packed_list *list, bool has_ext, uint32_t *ext_content_sz_r); +static uint32_t packed_list_pack_buf(const packed_list *list, uint8_t *buf, uint32_t sz, uint32_t ext_content_sz, bool strip_flags); +static list_mem *packed_list_pack_mem(const packed_list *list, list_mem *p_list_mem); +static void packed_list_content_pack(const packed_list *list, as_packer *pk); +static int packed_list_remove_by_idx(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const uint64_t rm_idx, uint32_t *rm_sz); +static int packed_list_remove_by_mask(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const uint64_t *rm_mask, uint32_t rm_count, uint32_t *rm_sz); + +static int packed_list_trim(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, uint32_t index, uint32_t count, cdt_result_data *result); +static int packed_list_get_remove_by_index_range(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, int64_t index, uint64_t count, cdt_result_data *result); +static int packed_list_get_remove_by_value_interval(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_start, const cdt_payload *value_end, cdt_result_data *result); +static int packed_list_get_remove_by_rank_range(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, int64_t rank, uint64_t count, cdt_result_data *result); +static int packed_list_get_remove_all_by_value_list(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_list, cdt_result_data *result); + +static int packed_list_insert(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, int64_t 
index, const cdt_payload *payload, bool payload_is_list, uint64_t mod_flags, cdt_result_data *result); +static int packed_list_add_ordered(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *payload, bool unique, cdt_result_data *result); +static int packed_list_add_items_ordered(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *items, bool unique, cdt_result_data *result); +static int packed_list_replace_ordered(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, uint32_t index, const cdt_payload *value, uint64_t mod_flags); + +// packed_list_op +static void packed_list_op_init(packed_list_op *op, const packed_list *list); +static bool packed_list_op_insert(packed_list_op *op, uint32_t index, uint32_t count, uint32_t insert_sz); +static bool packed_list_op_remove(packed_list_op *op, uint32_t index, uint32_t count); + +static uint32_t packed_list_op_write_seg1(const packed_list_op *op, uint8_t *buf); +static uint32_t packed_list_op_write_seg2(const packed_list_op *op, uint8_t *buf); + +static bool packed_list_builder_add_ranks_by_range(const packed_list *list, cdt_container_builder *builder, as_unpacker *start, uint32_t count, bool reverse); + +// list +static list_mem *list_create(rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t content_sz); +static as_particle *list_simple_create_from_buf(rollback_alloc *alloc_buf, uint32_t ele_count, const uint8_t *contents, uint32_t content_sz); +static as_particle *list_simple_create(rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t content_sz, uint8_t **contents_r); + +static int list_set_flags(as_bin *b, rollback_alloc *alloc_buf, uint8_t flags, cdt_result_data *result); +static int list_append(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *payload, bool payload_is_list, uint64_t mod_flags, cdt_result_data *result); +static int list_insert(as_bin *b, rollback_alloc *alloc_buf, int64_t index, const cdt_payload *payload, bool payload_is_list, uint64_t mod_flags, cdt_result_data *result); +static int list_set(as_bin *b, rollback_alloc *alloc_buf, int64_t index, const cdt_payload *value, uint64_t mod_flags); +static int list_increment(as_bin *b, rollback_alloc *alloc_buf, int64_t index, cdt_payload *delta_value, uint64_t mod_flags, cdt_result_data *result); +static int list_sort(as_bin *b, rollback_alloc *alloc_buf, as_cdt_sort_flags sort_flags); + +static int list_remove_by_index_range(as_bin *b, rollback_alloc *alloc_buf, int64_t index, uint64_t count, cdt_result_data *result); +static int list_remove_by_value_interval(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_start, const cdt_payload *value_end, cdt_result_data *result); +static int list_remove_by_rank_range(as_bin *b, rollback_alloc *alloc_buf, int64_t rank, uint64_t count, cdt_result_data *result); +static int list_remove_all_by_value_list(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_list, cdt_result_data *result); + +static uint8_t *list_setup_bin(as_bin *b, rollback_alloc *alloc_buf, uint8_t flags, uint32_t content_sz, uint32_t ele_count, uint32_t idx_trunc, const offset_index *old_offidx, offset_index *new_offidx); + +// list_offset_index +static inline void list_offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, uint32_t ele_count, const uint8_t *contents, uint32_t content_sz); +static void list_offset_index_rm_mask_cpy(offset_index *dst, const offset_index *full_src, const uint64_t *rm_mask, uint32_t rm_count); + +// list_full_offset_index +static 
inline void list_full_offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, uint32_t ele_count, const uint8_t *contents, uint32_t content_sz); +static bool list_full_offset_index_fill_to(offset_index *offidx, uint32_t index); + +// list_order_index +static int list_order_index_sort_cmp_fn(const void *x, const void *y, void *p); +static uint8_t *list_order_index_pack(const order_index *ordidx, const offset_index *full_offidx, uint8_t *buf, offset_index *new_offidx); + +// list_order_heap +static msgpack_compare_t list_order_heap_cmp_fn(const void *udata, uint32_t idx1, uint32_t idx2); + +// list_result_data +static bool list_result_data_set_not_found(cdt_result_data *rd, int64_t index); +static void list_result_data_set_values_by_mask(cdt_result_data *rd, const uint64_t *mask, const offset_index *full_offidx, uint32_t count, uint32_t sz); +static void list_result_data_set_values_by_idxcount(cdt_result_data *rd, const order_index *idxcnt, const offset_index *full_offidx); +static bool list_result_data_set_values_by_ordidx(cdt_result_data *rd, const order_index *ordidx, const offset_index *full_offidx, uint32_t count, uint32_t sz); + +// Debugging support +static void list_print(const packed_list *list, const char *name); +static bool list_verify(const as_bin *b); + + +//========================================================== +// LIST particle interface - function definitions. +// + +//------------------------------------------------ +// Destructor, etc. +// + +void +list_destruct(as_particle *p) +{ + cf_free(p); +} + +uint32_t +list_size(const as_particle *p) +{ + const list_mem *p_list_mem = (const list_mem *)p; + return (uint32_t)sizeof(list_mem) + p_list_mem->sz; +} + +//------------------------------------------------ +// Handle "wire" format. +// + +int32_t +list_concat_size_from_wire(as_particle_type wire_type, + const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "concat size for list"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +list_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, + uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "append to list"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +list_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, + uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "prepend to list"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +list_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, + uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "increment of list"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int32_t +list_size_from_wire(const uint8_t *wire_value, uint32_t value_size) +{ + // TODO - CDT can't determine in memory or not. + packed_list list; + + if (! packed_list_init(&list, wire_value, value_size)) { + cf_warning(AS_PARTICLE, "list_size_from_wire() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return (int32_t)(sizeof(list_mem) + packed_list_mem_sz(&list, true, NULL)); +} + +int +list_from_wire(as_particle_type wire_type, const uint8_t *wire_value, + uint32_t value_size, as_particle **pp) +{ + // TODO - CDT can't determine in memory or not. + // It works for data-not-in-memory but we'll incur a memcpy that could be + // eliminated. + packed_list list; + + if (! 
packed_list_init(&list, wire_value, value_size)) { + cf_warning(AS_PARTICLE, "list_from_wire() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + list_mem *p_list_mem = packed_list_pack_mem(&list, (list_mem *)*pp); + + p_list_mem->type = wire_type; + + return AS_PROTO_RESULT_OK; +} + +int +list_compare_from_wire(const as_particle *p, as_particle_type wire_type, + const uint8_t *wire_value, uint32_t value_size) +{ + // TODO + cf_warning(AS_PARTICLE, "list_compare_from_wire() not implemented"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +uint32_t +list_wire_size(const as_particle *p) +{ + define_packed_list_particle(list, p, success); + cf_assert(success, AS_PARTICLE, "list_wire_size() invalid packed list"); + + return packed_list_mem_sz(&list, false, NULL); +} + +uint32_t +list_to_wire(const as_particle *p, uint8_t *wire) +{ + define_packed_list_particle(list, p, success); + cf_assert(success, AS_PARTICLE, "list_to_wire() invalid packed list"); + + return packed_list_pack_buf(&list, wire, INT_MAX, 0, true); +} + +//------------------------------------------------ +// Handle as_val translation. +// + +uint32_t +list_size_from_asval(const as_val *val) +{ + as_serializer s; + as_msgpack_init(&s); + + uint32_t sz = as_serializer_serialize_getsize(&s, (as_val *)val); + + as_serializer_destroy(&s); + + const as_list *list = (const as_list *)val; + + uint32_t ele_count = as_list_size(list); + uint32_t base_hdr_sz = as_pack_list_header_get_size(ele_count); + uint32_t content_sz = sz - base_hdr_sz; + bool is_ordered = flags_is_ordered((uint8_t)list->flags); + uint32_t ext_content_sz = list_calc_ext_content_sz(ele_count, content_sz, + is_ordered); + uint32_t hdr_sz = (is_ordered || ext_content_sz != 0) ? + as_pack_list_header_get_size(ele_count + 1) : base_hdr_sz; + + return (uint32_t)sizeof(list_mem) + hdr_sz + + as_pack_ext_header_get_size(ext_content_sz) + ext_content_sz + + content_sz; +} + +void +list_from_asval(const as_val *val, as_particle **pp) +{ + as_serializer s; + as_msgpack_init(&s); + + list_mem *p_list_mem = (list_mem *)*pp; + int32_t sz = as_serializer_serialize_presized(&s, val, p_list_mem->data); + + cf_assert(sz >= 0, AS_PARTICLE, "list_from_asval() failed to presize"); + as_serializer_destroy(&s); + + const as_list *list = (const as_list *)val; + + uint32_t ele_count = as_list_size(list); + uint32_t base_hdr_sz = as_pack_list_header_get_size(ele_count); + uint32_t content_sz = (uint32_t)sz - base_hdr_sz; + bool is_ordered = flags_is_ordered((uint8_t)list->flags); + uint32_t ext_content_sz = list_calc_ext_content_sz(ele_count, content_sz, + is_ordered); + + if (is_ordered || ext_content_sz != 0) { + uint32_t hdr_sz = as_pack_list_header_get_size(ele_count + 1); + uint32_t ele_start = hdr_sz + + as_pack_ext_header_get_size(ext_content_sz) + ext_content_sz; + + // Prefer memmove over 2x serialize. 
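+	// Illustrative layout, where ele_start spans the flagged header, the
+	// ext header and the ext content:
+	//   before: [hdr(n)][serialized elements]
+	//   after:  [hdr(n + 1)][ext hdr][empty offset index][serialized elements]
+	// One memmove shifts the already-serialized elements from base_hdr_sz
+	// up to ele_start, which is cheaper than a second full serialize pass.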
+ memmove(p_list_mem->data + ele_start, p_list_mem->data + base_hdr_sz, + content_sz); + + as_packer pk = { + .buffer = p_list_mem->data, + .capacity = ele_start + }; + + as_pack_list_header(&pk, ele_count + 1); + as_pack_ext_header(&pk, ext_content_sz, get_ext_flags(is_ordered)); + list_pack_empty_index(&pk, ele_count, NULL, content_sz, is_ordered); + cf_assert(pk.offset == ele_start, AS_PARTICLE, "size mismatch pk.offset(%d) != ele_start(%u)", pk.offset, ele_start); + p_list_mem->sz = ele_start + content_sz; + } + else { + p_list_mem->sz = (uint32_t)sz; + } + + p_list_mem->type = AS_PARTICLE_TYPE_LIST; +} + +as_val * +list_to_asval(const as_particle *p) +{ + list_mem *p_list_mem = (list_mem *)p; + + as_buffer buf = { + .capacity = p_list_mem->sz, + .size = p_list_mem->sz, + .data = p_list_mem->data + }; + + as_serializer s; + as_msgpack_init(&s); + + as_val *val = NULL; + + as_serializer_deserialize(&s, &buf, &val); + as_serializer_destroy(&s); + + if (! val) { + return (as_val *)as_arraylist_new(0, 1); + } + + return val; +} + +uint32_t +list_asval_wire_size(const as_val *val) +{ + as_serializer s; + as_msgpack_init(&s); + + uint32_t sz = as_serializer_serialize_getsize(&s, (as_val *)val); + + as_serializer_destroy(&s); + + return sz; +} + +uint32_t +list_asval_to_wire(const as_val *val, uint8_t *wire) +{ + as_serializer s; + as_msgpack_init(&s); + + int32_t sz = as_serializer_serialize_presized(&s, val, wire); + + as_serializer_destroy(&s); + cf_assert(sz > 0, AS_PARTICLE, "list_asval_to_wire() sz %d failed to serialize", sz); + + return (uint32_t)sz; +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +uint32_t +list_size_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + return (uint32_t)sizeof(list_mem) + packed_size; +} + +void +list_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + list_mem *p_list_mem = (list_mem *)*pp; + + p_list_mem->type = AS_PARTICLE_TYPE_LIST; + p_list_mem->sz = packed_size; + memcpy(p_list_mem->data, packed, p_list_mem->sz); +} + +//------------------------------------------------ +// Handle on-device "flat" format. +// + +int32_t +list_size_from_flat(const uint8_t *flat, uint32_t flat_size) +{ + // TODO - maybe never used + return -1; +} + +int +list_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + // Cast temp buffer from disk to data-not-in-memory. + list_flat *p_list_flat = (list_flat *)flat; + + // This assumes list_flat is the same as list_mem. + *pp = (as_particle *)p_list_flat; + + return 0; +} + +int +list_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + // Convert temp buffer from disk to data-in-memory. + const list_flat *p_list_flat = (const list_flat *)flat; + packed_list list; + + if (! packed_list_init(&list, p_list_flat->data, p_list_flat->sz)) { + cf_warning(AS_PARTICLE, "list_from_flat() invalid packed list"); + return -1; + } + + list_mem *p_list_mem = packed_list_pack_mem(&list, NULL); + + if (! 
p_list_mem) { + cf_warning(AS_PARTICLE, "list_from_flat() failed to create particle"); + return -1; + } + + p_list_mem->type = p_list_flat->type; + *pp = (as_particle *)p_list_mem; + + return 0; +} + +uint32_t +list_flat_size(const as_particle *p) +{ + define_packed_list_particle(list, p, success); + cf_assert(success, AS_PARTICLE, "list_to_flat() invalid packed list"); + + return sizeof(list_flat) + packed_list_mem_sz(&list, false, NULL); +} + +uint32_t +list_to_flat(const as_particle *p, uint8_t *flat) +{ + define_packed_list_particle(list, p, success); + list_flat *p_list_flat = (list_flat *)flat; + + cf_assert(success, AS_PARTICLE, "list_to_flat() invalid packed list"); + p_list_flat->sz = packed_list_mem_sz(&list, false, NULL); + + uint32_t check = packed_list_pack_buf(&list, p_list_flat->data, + p_list_flat->sz, 0, true); + + cf_assert(check == p_list_flat->sz, AS_PARTICLE, "size mismatch check(%u) != sz(%u), ele_count %u content_sz %u flags 0x%x", check, p_list_flat->sz, list.ele_count, list.content_sz, list.ext_flags); + + // Already wrote the type. + + return sizeof(list_flat) + p_list_flat->sz; +} + + +//========================================================== +// as_bin particle functions specific to LIST. +// + +void +as_bin_particle_list_get_packed_val(const as_bin *b, cdt_payload *packed) +{ + const list_mem *p_list_mem = (const list_mem *)b->particle; + + packed->ptr = (uint8_t *)p_list_mem->data; + packed->sz = p_list_mem->sz; +} + + +//========================================================== +// Local helpers. +// + +static inline bool +is_list_type(uint8_t type) +{ + return type == AS_PARTICLE_TYPE_LIST; +} + +static inline bool +flags_is_ordered(uint8_t flags) +{ + return (flags & AS_PACKED_LIST_FLAG_ORDERED) != 0; +} + +static inline bool +list_is_ordered(const packed_list *list) +{ + return flags_is_ordered(list->ext_flags); +} + +static inline bool +mod_flags_is_unique(uint64_t flags) +{ + return (flags & AS_CDT_LIST_ADD_UNIQUE) != 0; +} + +static inline bool +mod_flags_is_bounded(uint64_t flags) +{ + return (flags & AS_CDT_LIST_INSERT_BOUNDED) != 0; +} + +static inline int +mod_flags_return_exists(uint64_t flags) +{ + // TODO - modify for NOFAIL flag later. + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; +} + +static inline uint8_t +strip_ext_flags(uint8_t flags) +{ + return flags & AS_PACKED_LIST_FLAG_ORDERED; +} + +static inline uint8_t +get_ext_flags(bool ordered) +{ + return ordered ? + (AS_PACKED_LIST_FLAG_ORDERED | PACKED_LIST_FLAG_FULLOFF_IDX) : + PACKED_LIST_FLAG_OFF_IDX; +} + +static uint32_t +list_calc_ext_content_sz(uint32_t ele_count, uint32_t content_sz, bool ordered) +{ + offset_index offidx; + + if (! 
ordered) { + list_offset_index_init(&offidx, NULL, ele_count, NULL, content_sz); + } + else { + list_full_offset_index_init(&offidx, NULL, ele_count, NULL, content_sz); + } + + return offset_index_size(&offidx); +} + +static uint32_t +list_pack_header(uint8_t *buf, uint32_t ele_count) +{ + as_packer pk = { + .buffer = buf, + .capacity = INT_MAX, + }; + + if (as_pack_list_header(&pk, ele_count) != 0) { + cf_crash(AS_PARTICLE, "as_pack_list_header() unexpected failure"); + } + + return pk.offset; +} + +static void +list_pack_empty_index(as_packer *pk, uint32_t ele_count, + const uint8_t *contents, uint32_t content_sz, bool is_ordered) +{ + offset_index offidx; + + if (is_ordered) { + list_full_offset_index_init(&offidx, pk->buffer + pk->offset, ele_count, + contents, content_sz); + } + else { + list_offset_index_init(&offidx, pk->buffer + pk->offset, ele_count, + contents, content_sz); + } + + offset_index_set_filled(&offidx, 1); + pk->offset += offset_index_size(&offidx); +} + +//------------------------------------------------ +// as_bin +// + +static inline void +as_bin_set_empty_list(as_bin *b, rollback_alloc *alloc_buf, bool is_ordered) +{ + if (is_ordered) { + as_bin_set_ordered_empty_list(b, alloc_buf); + } + else { + as_bin_set_unordered_empty_list(b, alloc_buf); + } +} + +void +as_bin_set_unordered_empty_list(as_bin *b, rollback_alloc *alloc_buf) +{ + b->particle = list_simple_create_from_buf(alloc_buf, 0, NULL, 0); + as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_LIST); +} + +static void +as_bin_set_ordered_empty_list(as_bin *b, rollback_alloc *alloc_buf) +{ + b->particle = list_simple_create_from_buf(alloc_buf, 1, + (const uint8_t *)&list_ordered_empty.list.ext_hdr, + sizeof(msgpack_list_empty_flagged) - 1); + as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_LIST); +} + +static inline void +as_bin_set_temp_list_if_notinuse(as_bin *b, uint64_t create_flags) +{ + if (! as_bin_inuse(b)) { + b->particle = (create_flags & AS_PACKED_LIST_FLAG_ORDERED) != 0 ? 
+ (as_particle *)&list_ordered_empty : + (as_particle *)&list_mem_empty; + as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_LIST); + } +} + +//---------------------------------------------------------- +// packed_list +// + +static bool +packed_list_init(packed_list *list, const uint8_t *buf, uint32_t sz) +{ + list->packed = buf; + list->packed_sz = sz; + + list->ele_count = 0; + list->ext_flags = 0; + list->contents = NULL; + + return packed_list_unpack_hdridx(list); +} + +static inline bool +packed_list_init_from_particle(packed_list *list, const as_particle *p) +{ + const list_mem *p_list_mem = (const list_mem *)p; + return packed_list_init(list, p_list_mem->data, p_list_mem->sz); +} + +static bool +packed_list_init_from_bin(packed_list *list, const as_bin *b) +{ + uint8_t type = as_bin_get_particle_type(b); + cf_assert(is_list_type(type), AS_PARTICLE, "packed_list_init_from_bin() invalid type %d", type); + return packed_list_init_from_particle(list, b->particle); +} + +static bool +packed_list_unpack_hdridx(packed_list *list) +{ + if (list->packed_sz == 0) { + list->ext_flags = 0; + return false; + } + + as_unpacker pk = { + .buffer = list->packed, + .length = list->packed_sz + }; + + int64_t ele_count = as_unpack_list_header_element_count(&pk); + + if (ele_count < 0) { + return false; + } + + list->ele_count = (uint32_t)ele_count; + + if (ele_count != 0 && as_unpack_peek_is_ext(&pk)) { + as_msgpack_ext ext; + + if (as_unpack_ext(&pk, &ext) != 0) { + return false; + } + + list->ext_flags = ext.type; + list->ele_count--; + list->contents = list->packed + pk.offset; + list->content_sz = list->packed_sz - pk.offset; + + if (list_is_ordered(list)) { + list_full_offset_index_init(&list->offidx, NULL, list->ele_count, + list->contents, list->content_sz); + } + else { + list_offset_index_init(&list->offidx, NULL, list->ele_count, + list->contents, list->content_sz); + } + + list_full_offset_index_init(&list->full_offidx, NULL, list->ele_count, + list->contents, list->content_sz); + + if (ext.size >= offset_index_size(&list->offidx)) { + offset_index_set_ptr(&list->offidx, (uint8_t *)ext.data, + list->packed + pk.offset); + } + } + else { + list->contents = list->packed + pk.offset; + list->content_sz = list->packed_sz - pk.offset; + list->ext_flags = 0; + + list_offset_index_init(&list->offidx, NULL, list->ele_count, + list->contents, list->content_sz); + list_full_offset_index_init(&list->full_offidx, NULL, list->ele_count, + list->contents, list->content_sz); + } + + return true; +} + +static void +packed_list_partial_offidx_update(const packed_list *list) +{ + if (list_is_ordered(list) || ! offset_index_is_valid(&list->full_offidx) || + ! 
offset_index_is_valid(&list->offidx)) { + return; + } + + offset_index *full = (offset_index *)&list->full_offidx; + offset_index *part = (offset_index *)&list->offidx; + uint32_t filled = offset_index_get_filled(part); + uint32_t max = (offset_index_get_filled(full) / PACKED_LIST_INDEX_STEP) + 1; + + if (filled >= max) { + return; + } + + for (uint32_t j = filled; j < max; j++) { + uint32_t off = offset_index_get_const(full, j * PACKED_LIST_INDEX_STEP); + offset_index_set(part, j, off); + } + + offset_index_set_filled(part, max); +} + +static bool +packed_list_find_by_value_ordered(const packed_list *list, + const cdt_payload *value, order_index_find *find) +{ + if (list->ele_count == 0) { + find->found = false; + find->result = 0; + return true; + } + + offset_index *offidx = list_full_offidx_p(list); + cf_assert(offset_index_is_valid(offidx), AS_PARTICLE, "invalid offidx"); + uint32_t last = offset_index_get_filled(offidx); + + find->count = last - find->start; + + if (! order_index_find_rank_by_value(NULL, value, offidx, find)) { + return false; + } + + if (offset_index_is_full(offidx) || find->result < last - 1 || + (! find->found && find->result < last) || (find->found && + (find->target > list->ele_count || + find->result >= find->target))) { + return true; + } + + if (find->result == list->ele_count || find->result == last || + find->result < find->target) { + as_unpacker pk_start = { + .buffer = value->ptr, + .length = value->sz + }; + + as_unpacker pk_buf = { + .buffer = list->contents, + .offset = offset_index_get_const(offidx, last - 1), + .length = list->content_sz + }; + + if (as_unpack_size(&pk_buf) <= 0) { + return false; + } + + offset_index_set(offidx, last, pk_buf.offset); + find->result = list->ele_count; + + for (uint32_t i = last; i < list->ele_count; i++) { + pk_start.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_start, &pk_buf); + + offset_index_set(offidx, i + 1, pk_buf.offset); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + find->found = true; + + if (i != list->ele_count - 1 && i < find->target && + find->target <= list->ele_count) { + continue; + } + + find->result = i; + offset_index_set_filled(offidx, MIN(i + 2, list->ele_count)); + break; + } + + if (cmp == MSGPACK_COMPARE_LESS) { + find->result = i - (find->found ? 1 : 0); + offset_index_set_filled(offidx, MIN(i + 2, list->ele_count)); + break; + } + + if (cmp == MSGPACK_COMPARE_END || cmp == MSGPACK_COMPARE_ERROR) { + return false; + } + } + + if (find->result == list->ele_count) { + offset_index_set_filled(offidx, list->ele_count); + } + } + + return true; +} + +static uint32_t +packed_list_find_idx_offset(const packed_list *list, uint32_t index) +{ + if (index == 0) { + return 0; + } + + if (list_is_ordered(list)) { + if (offset_index_is_valid(&list->offidx)) { + offset_index *offidx = (offset_index *)&list->offidx; + + if (! list_full_offset_index_fill_to(offidx, index)) { + return 0; + } + + return offset_index_get_const(offidx, index); + } + + define_offset_index(offidx, list->contents, list->content_sz, + list->ele_count); + + if (! 
list_full_offset_index_fill_to(&offidx, index)) { + return 0; + } + + return offset_index_get_const(&offidx, index); + } + else if (offset_index_is_valid(&list->full_offidx) && + index < offset_index_get_filled(&list->full_offidx)) { + return offset_index_get_const(&list->full_offidx, index); + } + + as_unpacker pk = { + .buffer = list->contents, + .length = list->content_sz + }; + + uint32_t steps = index; + + if (offset_index_is_valid(&list->offidx)) { + uint32_t idx = index / PACKED_LIST_INDEX_STEP; + uint32_t filled = offset_index_get_filled(&list->offidx); + + if (idx >= filled) { + cf_assert(filled != 0, AS_PARTICLE, "packed_list_op_find_idx_offset() filled is zero"); + idx = filled - 1; + } + + pk.offset = offset_index_get_const(&list->offidx, idx); + steps -= idx * PACKED_LIST_INDEX_STEP; + + offset_index *offidx = (offset_index *)&list->offidx; // mutable struct variable + uint32_t blocks = steps / PACKED_LIST_INDEX_STEP; + + steps %= PACKED_LIST_INDEX_STEP; + + for (uint32_t i = 0; i < blocks; i++) { + for (uint32_t j = 0; j < PACKED_LIST_INDEX_STEP; j++) { + if (as_unpack_size(&pk) <= 0) { + return 0; + } + } + + idx++; + offset_index_set_next(offidx, idx, pk.offset); + } + } + + for (uint32_t i = 0; i < steps; i++) { + if (as_unpack_size(&pk) <= 0) { + return 0; + } + } + + return pk.offset; +} + +static uint32_t +packed_list_find_idx_offset_continue(const packed_list *list, uint32_t index, + uint32_t index0, uint32_t offset0) +{ + if (list_is_ordered(list)) { + return packed_list_find_idx_offset(list, index); + } + else if (offset_index_is_valid(&list->full_offidx) && + index < offset_index_get_filled(&list->full_offidx)) { + return offset_index_get_const(&list->full_offidx, index); + } + + as_unpacker pk = { + .buffer = list->contents, + .offset = offset0, + .length = list->content_sz + }; + + uint32_t steps = index - index0; + + if (offset_index_is_valid(&list->offidx)) { + uint32_t idx0 = index0 / PACKED_LIST_INDEX_STEP; + uint32_t idx = index / PACKED_LIST_INDEX_STEP; + uint32_t filled = offset_index_get_filled(&list->offidx); + + if (idx0 != idx) { + if (idx0 < filled - 1) { + return packed_list_find_idx_offset(list, index); + } + + uint32_t mod0 = index0 % PACKED_LIST_INDEX_STEP; + offset_index *offidx = (offset_index *)&list->offidx; + + if (mod0 != 0) { + for (uint32_t i = mod0; i < PACKED_LIST_INDEX_STEP; i++) { + if (as_unpack_size(&pk) <= 0) { + return 0; + } + + steps--; + } + + idx0++; + offset_index_set_next(offidx, idx0, pk.offset); + } + + uint32_t blocks = idx - idx0; + + for (uint32_t i = 0; i < blocks; i++) { + for (uint32_t j = 0; j < PACKED_LIST_INDEX_STEP; j++) { + if (as_unpack_size(&pk) <= 0) { + return 0; + } + } + + idx0++; + offset_index_set_next(offidx, idx0, pk.offset); + } + + steps -= blocks * PACKED_LIST_INDEX_STEP; + } + } + + for (uint32_t i = 0; i < steps; i++) { + if (as_unpack_size(&pk) <= 0) { + return 0; + } + } + + return pk.offset; +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: +// [value_start, value_start]. 
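+// Example (illustrative): for the ordered list [1, 3, 3, 7] with
+// value_start = 3: value_end == NULL yields rank 1, count 3 (the 3, 3, 7
+// tail); value_end == value_start with is_multi yields rank 1, count 2.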
+static bool +packed_list_find_rank_range_by_value_interval_ordered(const packed_list *list, + const cdt_payload *value_start, const cdt_payload *value_end, + uint32_t *rank_r, uint32_t *count_r, bool is_multi) +{ + cf_assert(offset_index_is_valid(list_full_offidx_p(list)), AS_PARTICLE, "packed_list_find_rank_range_by_value_interval_ordered() invalid full offset_index"); + cf_assert(value_end, AS_PARTICLE, "value_end == NULL"); + + order_index_find find = { + .target = 0 + }; + + if (! packed_list_find_by_value_ordered(list, value_start, &find)) { + return false; + } + + *rank_r = find.result; + + if (value_end == value_start) { + if (! find.found) { + *count_r = 0; + } + else if (is_multi) { + find.start = find.result + 1; + find.target = list->ele_count; + + if (! packed_list_find_by_value_ordered(list, value_start, &find)) { + return false; + } + + if (find.found) { + *count_r = find.result - *rank_r + 1; + } + else { + *count_r = 1; + } + } + else { + *count_r = 1; + } + + return true; + } + + if (! value_end->ptr) { + *count_r = list->ele_count - *rank_r; + return true; + } + + as_unpacker pk_start = { + .buffer = value_start->ptr, + .length = value_start->sz + }; + + as_unpacker pk_end = { + .buffer = value_end->ptr, + .length = value_end->sz + }; + + msgpack_compare_t cmp = as_unpack_compare(&pk_start, &pk_end); + + if (cmp == MSGPACK_COMPARE_GREATER || cmp == MSGPACK_COMPARE_EQUAL) { + *count_r = 0; + return true; + } + + find.start = find.result; + + if (! packed_list_find_by_value_ordered(list, value_end, &find)) { + return false; + } + + *count_r = find.result - *rank_r; + + return true; +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: +// [value_start, value_start]. +// mask_val is a mask for is_multi case and a uint64_t[1] value for ! is_multi. +static bool +packed_list_find_rank_range_by_value_interval_unordered(const packed_list *list, + const cdt_payload *value_start, const cdt_payload *value_end, + uint32_t *rank, uint32_t *count, uint64_t *mask_val, bool inverted, + bool is_multi) +{ + cf_assert(value_end, AS_PARTICLE, "value_end == NULL"); + + as_unpacker pk_start = { + .buffer = value_start->ptr, + .length = value_start->sz + }; + + as_unpacker pk_end = { + .buffer = value_end->ptr, + .length = value_end->sz + }; + + offset_index *full_offidx = list_full_offidx_p(list); + + if (! offset_index_is_valid(full_offidx)) { + full_offidx = NULL; + } + + // Pre-check parameters. 
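+	// The interval is half-open, e.g. (illustrative) scanning the unordered
+	// list [5, 1, 3, 1] with value_start = 1, value_end = 4, is_multi sets
+	// *rank = 0, *count = 3 and mask bits for indexes 1, 2 and 3; inverted
+	// would instead select index 0, the lone element outside [1, 4).
+	// The checks below reject malformed start/end values before scanning.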
+ if (as_unpack_size(&pk_start) <= 0) { + cf_warning(AS_PARTICLE, "packed_list_op_find_rank_range_by_value_interval_unordered() invalid start value"); + return false; + } + + if (value_end != value_start && value_end->ptr && + as_unpack_size(&pk_end) <= 0) { + cf_warning(AS_PARTICLE, "packed_list_op_find_rank_range_by_value_interval_unordered() invalid end value"); + return false; + } + + *rank = 0; + *count = 0; + + as_unpacker pk = { + .buffer = list->contents, + .length = list->content_sz + }; + + for (uint32_t i = 0; i < list->ele_count; i++) { + uint32_t value_offset = pk.offset; // save for pk_end + + pk_start.offset = 0; // reset + + msgpack_compare_t cmp_start = as_unpack_compare(&pk, &pk_start); + + if (full_offidx) { + offset_index_set(full_offidx, i + 1, pk.offset); + } + + if (cmp_start == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "packed_list_op_find_rank_range_by_value_interval_unordered() invalid packed list at index %u", i); + return false; + } + + if (cmp_start == MSGPACK_COMPARE_LESS) { + (*rank)++; + + if (inverted) { + if (mask_val) { + cdt_idx_mask_set(mask_val, i); + } + + (*count)++; + } + } + else if (value_start != value_end) { + msgpack_compare_t cmp_end = MSGPACK_COMPARE_LESS; + + // NULL value_end means largest possible value. + if (value_end->ptr) { + pk.offset = value_offset; + pk_end.offset = 0; + cmp_end = as_unpack_compare(&pk, &pk_end); + } + + if ((cmp_end == MSGPACK_COMPARE_LESS && ! inverted) || + ((cmp_end == MSGPACK_COMPARE_GREATER || + cmp_end == MSGPACK_COMPARE_EQUAL) && inverted)) { + if (mask_val) { + cdt_idx_mask_set(mask_val, i); + } + + (*count)++; + } + } + // Single value case. + else if (cmp_start == MSGPACK_COMPARE_EQUAL) { + if (is_multi) { + if (! inverted) { + if (mask_val) { + cdt_idx_mask_set(mask_val, i); + } + + (*count)++; + } + } + else if (*count == 0) { + if (mask_val) { + *mask_val = i; + } + + (*count)++; + } + } + else if (inverted && is_multi) { + if (mask_val) { + cdt_idx_mask_set(mask_val, i); + } + + (*count)++; + } + } + + if (full_offidx) { + offset_index_set_filled(full_offidx, list->ele_count); + } + + return true; +} + +static uint32_t +packed_list_mem_sz(const packed_list *list, bool has_ext, + uint32_t *ext_content_sz_r) +{ + bool ordered = list_is_ordered(list); + uint32_t ext_cont_sz = 0; + + if (has_ext) { + ext_cont_sz = list_calc_ext_content_sz(list->ele_count, + list->content_sz, ordered); + + if (ext_content_sz_r) { + *ext_content_sz_r = ext_cont_sz; + } + } + else if (! ordered) { + return as_pack_list_header_get_size(list->ele_count) + list->content_sz; + } + + if (! ordered && ext_cont_sz == 0) { + return as_pack_list_header_get_size(list->ele_count) + list->content_sz; + } + + return as_pack_list_header_get_size(list->ele_count + 1) + + as_pack_ext_header_get_size(ext_cont_sz) + ext_cont_sz + + list->content_sz; +} + +static uint32_t +packed_list_pack_buf(const packed_list *list, uint8_t *buf, uint32_t sz, + uint32_t ext_content_sz, bool strip_flags) +{ + as_packer pk = { + .buffer = buf, + .capacity = sz + }; + + bool ordered = list_is_ordered(list); + + if (ordered || ext_content_sz != 0) { + as_pack_list_header(&pk, list->ele_count + 1); + as_pack_ext_header(&pk, ext_content_sz, strip_flags ? 
+ strip_ext_flags(list->ext_flags) : get_ext_flags(ordered)); + + if (ext_content_sz != 0) { + list_pack_empty_index(&pk, list->ele_count, NULL, list->content_sz, + ordered); + } + } + else { + as_pack_list_header(&pk, list->ele_count); + } + + packed_list_content_pack(list, &pk); + + return pk.offset; +} + +static list_mem * +packed_list_pack_mem(const packed_list *list, list_mem *p_list_mem) +{ + uint32_t ext_content_sz = 0; + uint32_t sz = packed_list_mem_sz(list, true, &ext_content_sz); + + if (! p_list_mem) { + p_list_mem = cf_malloc_ns(sizeof(list_mem) + sz); + } + + p_list_mem->sz = sz; + packed_list_pack_buf(list, p_list_mem->data, sz, ext_content_sz, false); + + return p_list_mem; +} + +static void +packed_list_content_pack(const packed_list *list, as_packer *pk) +{ + uint8_t *ptr = pk->buffer + pk->offset; + + memcpy(ptr, list->contents, list->content_sz); + pk->offset += list->content_sz; +} + +static int +packed_list_remove_by_idx(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, const uint64_t rm_idx, uint32_t *rm_sz) +{ + define_packed_list_op(op, list); + + if (! packed_list_op_remove(&op, rm_idx, 1)) { + cf_warning(AS_PARTICLE, "packed_list_remove_by_idx() as_packed_list_remove failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (op.new_ele_count == 0) { + as_bin_set_empty_list(b, alloc_buf, list_is_ordered(list)); + } + else { + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, + op.new_content_sz, op.new_ele_count, rm_idx, &list->offidx, + NULL); + + ptr += packed_list_op_write_seg1(&op, ptr); + packed_list_op_write_seg2(&op, ptr); + } + + *rm_sz = list->content_sz - op.new_content_sz; + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_remove_by_mask(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, const uint64_t *rm_mask, uint32_t rm_count, + uint32_t *rm_sz) +{ + offset_index *full_offidx = list_full_offidx_p(list); + + *rm_sz = cdt_idx_mask_get_content_sz(rm_mask, rm_count, full_offidx); + + offset_index new_offidx; + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, + list->content_sz - *rm_sz, list->ele_count - rm_count, 0, NULL, + &new_offidx); + + ptr = cdt_idx_mask_write_eles(rm_mask, rm_count, full_offidx, ptr, true); + cf_assert(ptr == ((list_mem *)b->particle)->data + ((list_mem *)b->particle)->sz, AS_PARTICLE, + "packed_list_remove_idx_mask() pack mismatch ptr %p data %p sz %u [%p]", ptr, ((list_mem *)b->particle)->data, ((list_mem *)b->particle)->sz, ((list_mem *)b->particle)->data + ((list_mem *)b->particle)->sz); + + if (offset_index_is_valid(&new_offidx)) { + list_offset_index_rm_mask_cpy(&new_offidx, full_offidx, rm_mask, + rm_count); + } + + return AS_PROTO_RESULT_OK; +} + +// Assumes index/count(non-zero) is surrounded by other elements. 
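+// Example (illustrative): for [a, b, c, d, e], index = 1 and count = 3
+// keep [b, c, d] and remove the two surrounding elements, so a COUNT
+// result reports 2. A sketch of a call site, assuming a multi-result
+// cdt_result_data as required by the assert below:
+//
+//	packed_list list;
+//	packed_list_init_from_bin(&list, b);
+//	packed_list_trim(&list, b, alloc_buf, 1, 3, result);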
+static int +packed_list_trim(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, + uint32_t index, uint32_t count, cdt_result_data *result) +{ + cf_assert(result->is_multi, AS_PARTICLE, "packed_list_trim() required to be a multi op"); + + uint32_t rm_count = list->ele_count - count; + uint32_t index1 = index + count; + uint32_t offset0 = packed_list_find_idx_offset(list, index); + uint32_t offset1 = packed_list_find_idx_offset_continue(list, index1, + index, offset0); + uint32_t content_sz = offset1 - offset0; + + if ((offset0 == 0 && index != 0) || offset1 == 0) { + cf_warning(AS_PARTICLE, "packed_list_trim() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (b) { + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, content_sz, + count, 0, &list->offidx, NULL); + + memcpy(ptr, list->contents + offset0, content_sz); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: { + bool is_rev = (result->type == RESULT_TYPE_REVINDEX); + define_int_list_builder(builder, result->alloc, rm_count); + + cdt_container_builder_add_int_range(&builder, 0, index, + list->ele_count, is_rev); + cdt_container_builder_add_int_range(&builder, index1, + list->ele_count - index1, list->ele_count, is_rev); + cdt_container_builder_set_result(&builder, result); + break; + } + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: { + define_int_list_builder(builder, result->alloc, rm_count); + + if (list_is_ordered(list)) { + cdt_container_builder_add_int_range(&builder, 0, index, + list->ele_count, result->type == RESULT_TYPE_REVRANK); + cdt_container_builder_add_int_range(&builder, index + count, + rm_count - index, list->ele_count, + result->type == RESULT_TYPE_REVRANK); + cdt_container_builder_set_result(&builder, result); + break; + } + + as_unpacker pk = { + .buffer = list->contents, + .length = list->content_sz + }; + + if (! packed_list_builder_add_ranks_by_range(list, &builder, &pk, index, + result->type == RESULT_TYPE_REVRANK)) { + cf_warning(AS_PARTICLE, "packed_list_trim() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + pk.offset = offset1; + + if (! packed_list_builder_add_ranks_by_range(list, &builder, &pk, + rm_count - index, result->type == RESULT_TYPE_REVRANK)) { + cf_warning(AS_PARTICLE, "packed_list_trim() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_container_builder_set_result(&builder, result); + break; + } + case RESULT_TYPE_VALUE: { + uint32_t tail_sz = list->content_sz - offset1; + list_mem *p_list_mem = list_create(result->alloc, rm_count, + offset0 + tail_sz); + + cf_assert(p_list_mem, AS_PARTICLE, "NULL list"); + result->result->particle = (as_particle *)p_list_mem; + + uint8_t *ptr = p_list_mem->data; + uint32_t hdr_sz = list_pack_header(ptr, rm_count); + + ptr += hdr_sz; + memcpy(ptr, list->contents, offset0); + ptr += offset0; + memcpy(ptr, list->contents + offset1, tail_sz); + + as_bin_state_set_from_type(result->result, AS_PARTICLE_TYPE_LIST); + + break; + } + default: + cf_warning(AS_PARTICLE, "packed_list_trim() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_get_remove_by_index_range(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, int64_t index, uint64_t count, + cdt_result_data *result) +{ + uint32_t uindex; + uint32_t count32; + + if (! 
calc_index_count(index, count, list->ele_count, &uindex, &count32, + result->is_multi)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() index %ld out of bounds for ele_count %u", index, list->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result_data_is_inverted(result)) { + if (! result->is_multi) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() INVERTED flag not supported for single result ops"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result_data_is_return_index_range(result) || + result_data_is_return_rank_range(result)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() result_type %d not supported with INVERTED flag", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + if (count32 == 0) { + // Reduce to remove all. + uindex = 0; + count32 = list->ele_count; + } + else if (uindex == 0) { + // Reduce to remove tail section. + uindex = count32; + count32 = list->ele_count - count32; + } + else if (uindex + count32 >= list->ele_count) { + // Reduce to remove head section. + count32 = uindex; + uindex = 0; + } + else { + return packed_list_trim(list, b, alloc_buf, uindex, count32, + result); + } + } + + if (count32 == 0) { + if (! list_result_data_set_not_found(result, uindex)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + define_packed_list_op(op, list); + + if (! packed_list_op_remove(&op, uindex, count32)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() as_packed_list_remove failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (b) { + if (op.new_ele_count == 0) { + as_bin_set_empty_list(b, alloc_buf, list_is_ordered(list)); + } + else { + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, + op.new_content_sz, op.new_ele_count, uindex, &list->offidx, + NULL); + + ptr += packed_list_op_write_seg1(&op, ptr); + packed_list_op_write_seg2(&op, ptr); + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + return result_data_set_index_rank_count(result, uindex, count32, + list->ele_count); + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: { + if (op.new_ele_count == 0) { + return result_data_set_index_rank_count(result, 0, count32, + list->ele_count); + } + + if (! result->is_multi) { + uint32_t rank; + + if (list_is_ordered(list)) { + rank = uindex; + } + else { + uint32_t rcount; + + cdt_payload value = { + .ptr = list->contents + op.seg1_sz, + .sz = list->content_sz - op.new_content_sz + }; + + if (! packed_list_find_rank_range_by_value_interval_unordered( + list, &value, &value, &rank, &rcount, NULL, false, + false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + + if (result->type == RESULT_TYPE_REVRANK) { + rank = list->ele_count - rank - 1; + } + + as_bin_set_int(result->result, (int64_t)rank); + break; + } + + as_unpacker pk = { + .buffer = list->contents + op.seg1_sz, + .length = list->content_sz - op.new_content_sz + }; + + uint32_t rm_count = list->ele_count - op.new_ele_count; + define_int_list_builder(builder, result->alloc, rm_count); + + if (list_is_ordered(list)) { + cdt_container_builder_add_int_range(&builder, uindex, count32, + list->ele_count, result->type == RESULT_TYPE_REVRANK); + } + else if (! 
packed_list_builder_add_ranks_by_range(list, &builder, &pk, + rm_count, result->type == RESULT_TYPE_REVRANK)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_container_builder_set_result(&builder, result); + break; + } + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, list->ele_count - op.new_ele_count); + break; + case RESULT_TYPE_VALUE: { + const uint8_t *result_ptr = list->contents + op.seg1_sz; + uint32_t end = (op.seg2_sz != 0) ? op.seg2_offset : list->content_sz; + uint32_t result_sz = end - op.seg1_sz; + uint32_t result_count = list->ele_count - op.new_ele_count; + + if (result->is_multi) { + result->result->particle = + list_simple_create_from_buf(result->alloc, + result_count, result_ptr, result_sz); + + if (! result->result->particle) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + as_bin_state_set_from_type(result->result, AS_PARTICLE_TYPE_LIST); + } + else if (result_sz != 0) { + cf_assert(count32 <= 1, AS_PARTICLE, "packed_list_get_remove_by_index_range() result must be list for count > 1"); + as_bin_particle_alloc_from_msgpack(result->result, result_ptr, + result_sz); + } + // else - leave result bin empty because result_size is 0. + break; + } + case RESULT_TYPE_REVINDEX_RANGE: + if (result->type == RESULT_TYPE_REVINDEX_RANGE) { + uindex = list->ele_count - uindex - count32; + } + // no break + case RESULT_TYPE_INDEX_RANGE: + result_data_set_list_int2x(result, uindex, count32); + break; + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + if (list_is_ordered(list)) { + return result_data_set_range(result, uindex, count32, + list->ele_count); + } + // no break + default: + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef LIST_DEBUG_VERIFY + if (! list_verify(b)) { + cdt_bin_print(b, "packed_list_get_remove_by_index_range"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: [value_start, value_start]. +static int +packed_list_get_remove_by_value_interval(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *value_start, + const cdt_payload *value_end, cdt_result_data *result) +{ + bool inverted = result_data_is_inverted(result); + + if (inverted && ! result->is_multi) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_value_interval() INVERTED flag not supported for single result ops"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank; + vla_list_full_offidx_if_invalid(u, list); + + if (list_is_ordered(list)) { + uint32_t count; + + if (! packed_list_find_rank_range_by_value_interval_ordered(list, + value_start, value_end, &rank, &count, result->is_multi)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_get_remove_by_index_range(list, b, alloc_buf, + (int64_t)rank, (uint64_t)count, result); + } + + uint32_t rm_count; + define_cdt_idx_mask(rm_mask, result->is_multi ? list->ele_count : 1); + + if (! 
packed_list_find_rank_range_by_value_interval_unordered(list, + value_start, value_end, &rank, &rm_count, rm_mask, inverted, + result->is_multi)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_sz = 0; + + if (b) { + if (rm_count == list->ele_count) { + as_bin_set_unordered_empty_list(b, alloc_buf); + } + else if (rm_count != 0) { + int ret; + + if (result->is_multi) { + ret = packed_list_remove_by_mask(list, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + } + else { + // rm_mask[0] is an idx for single value finds. + ret = packed_list_remove_by_idx(list, b, alloc_buf, rm_mask[0], + &rm_sz); + } + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + packed_list_partial_offidx_update(list); + } + } + else { + packed_list_partial_offidx_update(list); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + case RESULT_TYPE_COUNT: + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK_RANGE: + case RESULT_TYPE_RANK_RANGE: + return result_data_set_range(result, rank, inverted ? + list->ele_count - rm_count : rm_count, list->ele_count); + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + if (result->is_multi) { + result_data_set_int_list_by_mask(result, rm_mask, rm_count, + list->ele_count); + } + else { + result_data_set_index_rank_count(result, rm_mask[0], rm_count, + list->ele_count); + } + break; + case RESULT_TYPE_VALUE: + if (result->is_multi) { + list_result_data_set_values_by_mask(result, rm_mask, + list_full_offidx_p(list), rm_count, rm_sz); + } + else { + define_order_index2(rm_idx, list->ele_count, 1); + + order_index_set(&rm_idx, 0, rm_mask[0]); + list_result_data_set_values_by_ordidx(result, &rm_idx, u.offidx, + rm_count, rm_sz); + } + break; + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + default: + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_value_interval() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_get_remove_by_rank_range(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, int64_t rank, uint64_t count, + cdt_result_data *result) +{ + bool inverted = result_data_is_inverted(result); + + if (inverted && ! result->is_multi) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() INVERTED flag not supported for single result ops"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (list_is_ordered(list)) { + // idx == rank for ordered lists. + return packed_list_get_remove_by_index_range(list, b, alloc_buf, rank, + count, result); + } + + uint32_t urank; + uint32_t count32; + + if (! calc_index_count(rank, count, list->ele_count, &urank, &count32, + result->is_multi)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() rank %u out of bounds for ele_count %u", urank, list->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + vla_list_full_offidx_if_invalid(full, list); + + if (! list_full_offset_index_fill_all(full.offidx)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_build_order_heap_by_range(heap, urank, count32, list->ele_count, + list, list_order_heap_cmp_fn, success); + + if (! success) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_count = inverted ? list->ele_count - count32 : count32; + + if (rm_count == 0) { + if (! 
list_result_data_set_not_found(result, urank)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + packed_list_partial_offidx_update(list); + + return AS_PROTO_RESULT_OK; + } + + define_cdt_idx_mask(rm_mask, list->ele_count); + order_index ret_idx; + + cdt_idx_mask_set_by_ordidx(rm_mask, &heap._, heap.filled, count32, + inverted); + + if (inverted) { + if (result_data_is_return_rank_range(result)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() result_type %d not supported with INVERTED flag", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (! result->is_multi) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() single result type %d not supported with INVERTED flag", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + order_index_init_ref(&ret_idx, &heap._, heap.filled, rm_count); + } + + uint32_t rm_sz = 0; + + if (b) { + int ret = packed_list_remove_by_mask(list, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + packed_list_partial_offidx_update(list); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + case RESULT_TYPE_COUNT: + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + return result_data_set_range(result, rank, count32, list->ele_count); + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + result_data_set_int_list_by_mask(result, rm_mask, rm_count, + list->ele_count); + break; + case RESULT_TYPE_VALUE: + if (inverted) { + list_result_data_set_values_by_mask(result, rm_mask, + &list->full_offidx, rm_count, rm_sz); + } + else if (! list_result_data_set_values_by_ordidx(result, &ret_idx, + &list->full_offidx, rm_count, rm_sz)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + break; + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + default: + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_get_remove_all_by_value_list_ordered(const packed_list *list, + as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, + uint32_t items_count, cdt_result_data *result) +{ + cf_assert(result->is_multi, AS_PARTICLE, "not supported"); + + define_order_index2(rm_rc, list->ele_count, 2 * items_count); + uint32_t rm_count = 0; + + for (uint32_t i = 0; i < items_count; i++) { + cdt_payload value = { items_pk->buffer + items_pk->offset }; + int64_t sz = as_unpack_size(items_pk); + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + value.sz = (uint32_t)sz; + + uint32_t rank; + uint32_t count; + + if (! 
packed_list_find_rank_range_by_value_interval_ordered(list, + &value, &value, &rank, &count, true)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + order_index_set(&rm_rc, 2 * i, rank); + order_index_set(&rm_rc, (2 * i) + 1, count); + rm_count += count; + } + + bool inverted = result_data_is_inverted(result); + uint32_t rm_sz = 0; + bool need_mask = (b || (inverted && + (result_data_is_return_elements(result) || + result_data_is_return_rank(result) || + result_data_is_return_index(result)))); + cond_define_cdt_idx_mask(rm_mask, list->ele_count, need_mask); + + if (inverted) { + if (! list_full_offset_index_fill_all(list_full_offidx_p(list))) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + rm_count = list->ele_count - rm_count; + } + + if (need_mask) { + cdt_idx_mask_set_by_irc(rm_mask, &rm_rc, NULL, inverted); + } + + if (b) { + if (rm_count == list->ele_count) { + as_bin_set_ordered_empty_list(b, alloc_buf); + } + else if (rm_count != 0) { + int ret = packed_list_remove_by_mask(list, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + packed_list_partial_offidx_update(list); + } + } + else { + packed_list_partial_offidx_update(list); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + if (inverted) { + result_data_set_int_list_by_mask(result, rm_mask, rm_count, + list->ele_count); + } + else { + result_data_set_by_irc(result, &rm_rc, NULL, rm_count); + } + break; + case RESULT_TYPE_VALUE: { + if (inverted) { + list_result_data_set_values_by_mask(result, rm_mask, &list->offidx, + rm_count, rm_sz); + } + else { + list_result_data_set_values_by_idxcount(result, &rm_rc, + &list->offidx); + } + break; + } + default: + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list_ordered() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef LIST_DEBUG_VERIFY + if (! list_verify(b)) { + cdt_bin_print(b, "packed_list_get_remove_all_by_value_list_ordered"); + list_print(list, "original"); + cf_crash(AS_PARTICLE, "all_by_value_list_ordered: ele_count %u items_count %u rm_count %u", list->ele_count, items_count, rm_count); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_get_remove_all_by_value_list(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *value_list, + cdt_result_data *result) +{ + if (result_data_is_return_rank_range(result) || + result_data_is_return_index_range(result)) { + cf_warning(AS_PARTICLE, "packed_list_op_get_remove_all_by_value_list() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + as_unpacker items_pk; + uint32_t items_count; + + if (! list_param_parse(value_list, &items_pk, &items_count)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool inverted = result_data_is_inverted(result); + + if (items_count == 0) { + if (! inverted) { + if (! 
list_result_data_set_not_found(result, 0)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list() invalid result type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + return packed_list_get_remove_by_index_range(list, b, alloc_buf, 0, + list->ele_count, result); + } + + vla_list_full_offidx_if_invalid(full, list); + + if (list_is_ordered(list)) { + return packed_list_get_remove_all_by_value_list_ordered(list, b, + alloc_buf, &items_pk, items_count, result); + } + + bool is_ret_rank = result_data_is_return_rank(result); + uint32_t rm_count = 0; + define_order_index(value_list_ordidx, items_count); + define_cdt_idx_mask(rm_mask, list->ele_count); + cond_vla_order_index2(rc, list->ele_count, items_count * 2, is_ret_rank); + + if (! offset_index_find_items(full.offidx, + CDT_FIND_ITEMS_IDXS_FOR_LIST_VALUE, &items_pk, &value_list_ordidx, + inverted, rm_mask, &rm_count, is_ret_rank ? &rc.ordidx : NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_sz = 0; + + if (b) { + if (rm_count == list->ele_count) { + as_bin_set_unordered_empty_list(b, alloc_buf); + } + else if (rm_count != 0) { + int ret = packed_list_remove_by_mask(list, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + packed_list_partial_offidx_update(list); + } + } + else { + packed_list_partial_offidx_update(list); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + result_data_set_int_list_by_mask(result, rm_mask, rm_count, + list->ele_count); + break; + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + result_data_set_by_itemlist_irc(result, &value_list_ordidx, + &rc.ordidx, rm_count); + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_VALUE: { + list_result_data_set_values_by_mask(result, rm_mask, full.offidx, + rm_count, rm_sz); + break; + } + default: + cf_warning(AS_PARTICLE, "packed_list_op_get_remove_all_by_value_list() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_insert(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, int64_t index, const cdt_payload *payload, + bool payload_is_list, uint64_t mod_flags, cdt_result_data *result) +{ + uint32_t param_count = 1; + uint32_t payload_hdr_sz = 0; + + if (payload_is_list) { + int64_t payload_count = + as_unpack_buf_list_element_count(payload->ptr, payload->sz); + + if (payload_count < 0) { + cf_warning(AS_PARTICLE, "packed_list_insert() invalid payload, expected a list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (payload_count == 0) { + result_data_set_int(result, list->ele_count); + return AS_PROTO_RESULT_OK; + } + + param_count = (uint32_t)payload_count; + payload_hdr_sz = as_pack_list_header_get_size((uint32_t)payload_count); + + if (payload_hdr_sz > payload->sz) { + cf_warning(AS_PARTICLE, "packed_list_insert() invalid list header: payload->size=%d", payload->sz); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + + if (index > INT32_MAX || (index = calc_index(index, list->ele_count)) < 0) { + cf_warning(AS_PARTICLE, "packed_list_insert() index %ld out of bounds for ele_count %d", index > 0 ? 
index : index - list->ele_count, list->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (mod_flags_is_bounded(mod_flags) && (uint32_t)index > list->ele_count) { + result_data_set_int(result, list->ele_count); + return AS_PROTO_RESULT_OK; // no-op + } + + uint32_t rm_sz = 0; + uint32_t rm_count = 0; + bool is_unique = mod_flags_is_unique(mod_flags); + cond_define_cdt_idx_mask(rm_mask, param_count, is_unique); + + if (is_unique) { + // Assume only here for the unordered case. + if (payload_is_list) { + as_unpacker pk = { + .buffer = payload->ptr + payload_hdr_sz, + .length = payload->sz - payload_hdr_sz + }; + + for (uint32_t i = 0; i < param_count; i++) { + cdt_payload val = { pk.buffer + pk.offset }; + int64_t sz = as_unpack_size(&pk); + uint32_t rank; + uint32_t count; + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "packed_list_insert() invalid parameters"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + val.sz = (uint32_t)sz; + + if (! packed_list_find_rank_range_by_value_interval_unordered( + list, &val, &val, &rank, &count, NULL, false, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count == 0) { + as_unpacker cmp0 = { + .buffer = val.ptr, + .length = val.sz + }; + + as_unpacker cmp1 = pk; + bool found = false; + + cmp1.offset = 0; + + for (uint32_t j = 0; j < i; j++) { + cmp0.offset = 0; + + msgpack_compare_t cmp = as_unpack_compare(&cmp0, &cmp1); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + rm_sz += val.sz; + rm_count++; + found = true; + break; + } + } + + if (! found) { + cdt_idx_mask_set(rm_mask, i); + } + } + else { + // TODO - support NOFAIL + //rm_sz += val.sz; + //rm_count++; + as_bin_set_int(result->result, list->ele_count); + return mod_flags_return_exists(mod_flags); + } + } + + if (param_count == rm_count) { + as_bin_set_int(result->result, list->ele_count); + return mod_flags_return_exists(mod_flags); + } + } + else { + uint32_t rank; + uint32_t count; + + if (! packed_list_find_rank_range_by_value_interval_unordered(list, + payload, payload, &rank, &count, NULL, false, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count != 0) { + as_bin_set_int(result->result, list->ele_count); + return mod_flags_return_exists(mod_flags); + } + } + } + + uint32_t uindex = (uint32_t)index; + define_packed_list_op(op, list); + uint32_t insert_sz = payload->sz - payload_hdr_sz - rm_sz; + uint32_t add_count = param_count - rm_count; + + if (! 
packed_list_op_insert(&op, uindex, add_count, insert_sz)) { + cf_warning(AS_PARTICLE, "packed_list_insert() packed_list_op_insert failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, + op.new_content_sz, op.new_ele_count, uindex, &list->offidx, NULL); + + ptr += packed_list_op_write_seg1(&op, ptr); + + const uint8_t *p = payload->ptr + payload_hdr_sz; + + if (rm_sz == 0) { + uint32_t sz = payload->sz - payload_hdr_sz; + + memcpy(ptr, p, sz); + ptr += sz; + } + else { + as_unpacker pk = { + .buffer = payload->ptr + payload_hdr_sz, + .length = payload->sz - payload_hdr_sz + }; + + uint32_t idx = 0; + + for (uint32_t i = 0; i < add_count; i++) { + uint32_t next = cdt_idx_mask_find(rm_mask, idx, param_count, false); + uint32_t skip = next - idx; + + for (uint32_t j = 0; j < skip; j++) { + as_unpack_size(&pk); + } + + const uint8_t *begin = pk.buffer + pk.offset; + size_t sz = (size_t)as_unpack_size(&pk); + + memcpy(ptr, begin, sz); + ptr += sz; + idx = next + 1; + } + } + + packed_list_op_write_seg2(&op, ptr); + result_data_set_int(result, op.new_ele_count); + +#ifdef LIST_DEBUG_VERIFY + if (! list_verify(b)) { + cdt_bin_print(b, "packed_list_insert"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_add_ordered(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *payload, bool unique, + cdt_result_data *result) +{ + vla_list_full_offidx_if_invalid(full, list); + + order_index_find find = { + .target = list->ele_count + 1 + }; + + if (! packed_list_find_by_value_ordered(list, payload, &find)) { + cf_warning(AS_PARTICLE, "packed_list_add_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (find.found && unique) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + + return packed_list_insert(list, b, alloc_buf, (int64_t)find.result, payload, + false, AS_CDT_LIST_MODIFY_DEFAULT, result); +} + +static int +packed_list_add_items_ordered(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *items, bool unique, + cdt_result_data *result) +{ + int64_t add_count = as_unpack_buf_list_element_count(items->ptr, items->sz); + + if (add_count < 0) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid payload, expected a list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (add_count == 0) { + result_data_set_int(result, list->ele_count); + return AS_PROTO_RESULT_OK; // no-op + } + + uint32_t val_count = (uint32_t)add_count; + uint32_t hdr_sz = as_pack_list_header_get_size(val_count); + + if (hdr_sz > items->sz) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid list header: payload->size=%d", items->sz); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + // Sort items to add. + define_order_index(val_ord, val_count); + define_offset_index(val_off, items->ptr + hdr_sz, items->sz - hdr_sz, + val_count); + + if (! list_full_offset_index_fill_all(&val_off) || + ! 
list_order_index_sort(&val_ord, &val_off, + AS_CDT_SORT_ASCENDING)) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (unique) { + uint32_t rm_count; + uint32_t rm_sz; + bool success = order_index_sorted_mark_dup_eles(&val_ord, &val_off, + &rm_count, &rm_sz); + cf_assert(success, AS_PARTICLE, "remove dup failed"); + } + + vla_list_full_offidx_if_invalid(full, list); + define_order_index2(insert_idx, list->ele_count, val_count); + uint32_t new_content_sz = list->content_sz; + uint32_t new_ele_count = list->ele_count; + + for (uint32_t i = 0; i < val_count; i++) { + uint32_t val_idx = order_index_get(&val_ord, i); + + if (val_idx == val_count) { + continue; + } + + uint32_t off = offset_index_get_const(&val_off, val_idx); + uint32_t sz = offset_index_get_delta_const(&val_off, val_idx); + + const cdt_payload value = { + .ptr = items->ptr + hdr_sz + off, + .sz = sz + }; + + order_index_find find = { + .target = list->ele_count + 1 + }; + + if (! packed_list_find_by_value_ordered(list, &value, &find)) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (unique && find.found) { + // TODO - order_index_set(&val_ord, i, val_count) for NOFAIL later. + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + else { + order_index_set(&insert_idx, i, find.result); + new_content_sz += sz; + new_ele_count++; + } + } + + if (! list_full_offset_index_fill_all(full.offidx)) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + // Construct new list. + offset_index new_offidx; + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, new_content_sz, + new_ele_count, 0, &list->offidx, &new_offidx); + + uint32_t list_start = 0; + uint32_t new_idx = 0; + uint32_t cpy_delta = 0; + uint32_t cur_offset = 0; + + for (uint32_t i = 0; i < val_count; i++) { + uint32_t val_idx = order_index_get(&val_ord, i); + + if (val_idx == val_count) { + continue; + } + + uint32_t list_idx = order_index_get(&insert_idx, i); + + if (list_idx > list_start) { + uint32_t off0 = offset_index_get_const(&list->offidx, list_start); + uint32_t off1 = offset_index_get_const(&list->offidx, list_idx); + uint32_t seg_count = list_idx - list_start; + uint32_t seg_sz = off1 - off0; + + memcpy(ptr, list->contents + off0, seg_sz); + ptr += seg_sz; + offset_index_copy(&new_offidx, &list->offidx, new_idx, list_start, + seg_count, cpy_delta); + list_start = list_idx; + new_idx += seg_count; + cur_offset = off1 + cpy_delta; + } + + offset_index_set(&new_offidx, new_idx++, cur_offset); + + uint32_t off = offset_index_get_const(&val_off, val_idx); + uint32_t val_sz = offset_index_get_delta_const(&val_off, val_idx); + + memcpy(ptr, items->ptr + hdr_sz + off, val_sz); + ptr += val_sz; + cpy_delta += val_sz; + cur_offset += val_sz; + } + + if (list_start < list->ele_count && list->ele_count != 0) { + uint32_t off = offset_index_get_const(&list->offidx, list_start); + uint32_t seg_count = list->ele_count - list_start; + + memcpy(ptr, list->contents + off, list->content_sz - off); + offset_index_copy(&new_offidx, &list->offidx, new_idx, list_start, + seg_count, cpy_delta); + } + + offset_index_set_filled(&new_offidx, new_ele_count); + result_data_set_int(result, new_ele_count); + +#ifdef LIST_DEBUG_VERIFY + if (! 
list_verify(b)) { + cdt_bin_print(b, "packed_list_add_items_ordered"); + list_print(list, "original"); + cf_crash(AS_PARTICLE, "add_items_ordered: val_count %u", val_count); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_replace_ordered(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, uint32_t index, const cdt_payload *value, + uint64_t mod_flags) +{ + uint32_t rank; + uint32_t count; + vla_list_full_offidx_if_invalid(u, list); + + if (! packed_list_find_rank_range_by_value_interval_ordered(list, + value, value, &rank, &count, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_packed_list_op(op, list); + + if (index > list->ele_count) { + cf_warning(AS_PARTICLE, "packed_list_replace_ordered() index %u > ele_count %u out of bounds not allowed for ORDERED lists", index, list->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (! packed_list_op_remove(&op, index, 1)) { + cf_warning(AS_PARTICLE, "packed_list_replace_ordered() as_packed_list_remove failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (mod_flags_is_unique(mod_flags) && count != 0) { + if (rank == index) { // uniquely replacing element with same value + return AS_PROTO_RESULT_OK; // no-op + } + + return mod_flags_return_exists(mod_flags); + } + + uint32_t new_ele_count = list->ele_count; + + op.new_content_sz += value->sz; + + if (index == list->ele_count) { + new_ele_count++; + } + + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, + op.new_content_sz, new_ele_count, (rank < index) ? rank : index, + &list->offidx, NULL); + uint32_t offset = offset_index_get_const(u.offidx, rank); + + if (rank <= index) { + uint32_t tail_sz = op.seg1_sz - offset; + + memcpy(ptr, list->contents, offset); + ptr += offset; + memcpy(ptr, value->ptr, value->sz); + ptr += value->sz; + memcpy(ptr, list->contents + offset, tail_sz); + ptr += tail_sz; + packed_list_op_write_seg2(&op, ptr); + } + else if (op.seg2_sz == 0) { + ptr += packed_list_op_write_seg1(&op, ptr); + memcpy(ptr, value->ptr, value->sz); + } + else { + uint32_t head_sz = offset - op.seg2_offset; + uint32_t tail_sz = op.seg2_sz - head_sz; + + ptr += packed_list_op_write_seg1(&op, ptr); + memcpy(ptr, list->contents + op.seg2_offset, head_sz); + ptr += head_sz; + memcpy(ptr, value->ptr, value->sz); + ptr += value->sz; + memcpy(ptr, list->contents + offset, tail_sz); + } + + return AS_PROTO_RESULT_OK; +} + +//---------------------------------------------------------- +// packed_list_op +// + +static void +packed_list_op_init(packed_list_op *op, const packed_list *list) +{ + memset(op, 0, sizeof(packed_list_op)); + op->list = list; +} + +// Calculate a packed list split via insert op. +// Return true on success. 
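+// Illustrative note (editor's example, not part of the original change): with
+// a 3-element list of 1-byte elements (content_sz 3), inserting one 1-byte
+// element at index 5 takes the "insert off the end" branch: new_ele_count =
+// index + count = 6, nil_ele_sz = index - ele_count = 2 (indices 3 and 4 are
+// later filled with msgpack nil by packed_list_op_write_seg1), seg1_sz = 3,
+// seg2_sz = 0, and new_content_sz = 3 + 2 + 1 + 0 = 6.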
+static bool +packed_list_op_insert(packed_list_op *op, uint32_t index, uint32_t count, + uint32_t insert_sz) +{ + uint32_t ele_count = op->list->ele_count; + + if (index >= ele_count) { // insert off the end + if (index + count >= INT32_MAX) { + cf_warning(AS_PARTICLE, "as_packed_list_insert() index %u + count %u overflow", index, count); + return false; + } + + op->new_ele_count = index + count; + op->nil_ele_sz = index - ele_count; + + op->seg1_sz = op->list->content_sz; + op->seg2_sz = 0; + } + else { // insert front or middle + op->new_ele_count = ele_count + count; + op->nil_ele_sz = 0; + uint32_t offset = packed_list_find_idx_offset(op->list, index); + + if (index != 0 && offset == 0) { + return false; + } + + op->seg1_sz = offset; + op->seg2_offset = offset; + op->seg2_sz = op->list->content_sz - offset; + } + + op->new_content_sz = op->seg1_sz + op->nil_ele_sz + insert_sz + op->seg2_sz; + + return true; +} + +// Calculate a packed list split via remove op. +// Assume count != 0. +// Return true on success. +static bool +packed_list_op_remove(packed_list_op *op, uint32_t index, uint32_t count) +{ + uint32_t ele_count = op->list->ele_count; + + if (index >= ele_count) { // nothing to remove + op->seg1_sz = op->list->content_sz; + op->seg2_sz = 0; + op->new_ele_count = ele_count; + op->new_content_sz = op->list->content_sz; + + return true; + } + + uint32_t offset = packed_list_find_idx_offset(op->list, index); + + if (index != 0 && offset == 0) { + return false; + } + + if (count >= ele_count - index) { // remove tail elements + op->new_ele_count = index; + + op->seg1_sz = offset; + op->seg2_offset = 0; + op->seg2_sz = 0; + } + else { // remove front or middle + op->new_ele_count = ele_count - count; + op->seg1_sz = offset; + + uint32_t end_off = packed_list_find_idx_offset_continue(op->list, + index + count, index, offset); + + if (end_off == 0) { + return false; + } + + op->seg2_offset = end_off; + op->seg2_sz = op->list->content_sz - end_off; + } + + op->new_content_sz = op->seg1_sz + op->seg2_sz; + + return true; +} + +// Write segment 1 and trailing nils if any. +// Return number of bytes written. +static uint32_t +packed_list_op_write_seg1(const packed_list_op *op, uint8_t *buf) +{ + memcpy(buf, op->list->contents, op->seg1_sz); + + if (op->nil_ele_sz == 0) { + return op->seg1_sz; + } + + buf += op->seg1_sz; + memset(buf, msgpack_nil[0], op->nil_ele_sz); + + return op->seg1_sz + op->nil_ele_sz; +} + +// Write segment 2 if any. +// Return number of bytes written. +static uint32_t +packed_list_op_write_seg2(const packed_list_op *op, uint8_t *buf) +{ + if (op->seg2_sz == 0) { + return 0; + } + + memcpy(buf, op->list->contents + op->seg2_offset, op->seg2_sz); + + return op->seg2_sz; +} + +static bool +packed_list_builder_add_ranks_by_range(const packed_list *list, + cdt_container_builder *builder, as_unpacker *start, uint32_t count, + bool reverse) +{ + for (uint32_t i = 0; i < count; i++) { + cdt_payload value = { + .ptr = start->buffer + start->offset + }; + + int64_t sz = as_unpack_size(start); + uint32_t rank; + uint32_t rcount; + + if (sz <= 0) { + return false; + } + + value.sz = (uint32_t)sz; + + if (! packed_list_find_rank_range_by_value_interval_unordered(list, + &value, &value, &rank, &rcount, NULL, false, false)) { + return false; + } + + cdt_container_builder_add_int64(builder, + reverse ? list->ele_count - rank - 1 : rank); + } + + return true; +} + +//---------------------------------------------------------- +// list +// + +// Create a non-indexed list. 
+// If alloc_buf is NULL, memory is reserved using cf_malloc. +static list_mem * +list_create(rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t content_sz) +{ + uint32_t hdr_sz = as_pack_list_header_get_size(ele_count); + uint32_t sz = hdr_sz + content_sz; + list_mem *p_list_mem = (list_mem *)rollback_alloc_reserve(alloc_buf, + sizeof(list_mem) + sz); + + p_list_mem->type = AS_PARTICLE_TYPE_LIST; + p_list_mem->sz = sz; + + return p_list_mem; +} + +static as_particle * +list_simple_create_from_buf(rollback_alloc *alloc_buf, uint32_t ele_count, + const uint8_t *contents, uint32_t content_sz) +{ + list_mem *p_list_mem = list_create(alloc_buf, ele_count, content_sz); + + if (p_list_mem) { + uint32_t hdr_sz = list_pack_header(p_list_mem->data, ele_count); + + if (content_sz > 0 && contents) { + memcpy(p_list_mem->data + hdr_sz, contents, content_sz); + } + } + + return (as_particle *)p_list_mem; +} + +static as_particle * +list_simple_create(rollback_alloc *alloc_buf, uint32_t ele_count, + uint32_t content_sz, uint8_t **contents_r) +{ + list_mem *p_list_mem = list_create(alloc_buf, ele_count, content_sz); + uint32_t hdr_sz = list_pack_header(p_list_mem->data, ele_count); + + *contents_r = p_list_mem->data + hdr_sz; + + return (as_particle *)p_list_mem; +} + +static int +list_set_flags(as_bin *b, rollback_alloc *alloc_buf, uint8_t set_flags, + cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_set_flags() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool reorder = false; + bool was_ordered = list_is_ordered(&list); + + if (flags_is_ordered(set_flags)) { + if (was_ordered) { + return AS_PROTO_RESULT_OK; // no-op + } + + if (list.ele_count > 1) { + reorder = true; + } + } + else { + if (! was_ordered) { + return AS_PROTO_RESULT_OK; // no-op + } + } + + offset_index new_offidx; + uint8_t * const ptr = list_setup_bin(b, alloc_buf, set_flags, + list.content_sz, list.ele_count, reorder ? 0 : list.ele_count, + &list.offidx, &new_offidx); + + if (! reorder) { + memcpy(ptr, list.contents, list.content_sz); + } + else { + vla_list_full_offidx_if_invalid(full, &list); + + if (! list_full_offset_index_fill_all(full.offidx)) { + cf_warning(AS_PARTICLE, "list_set_flags() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_order_index(ordidx, list.ele_count); + + if (! list_order_index_sort(&ordidx, full.offidx, + AS_CDT_SORT_ASCENDING)) { + cf_warning(AS_PARTICLE, "list_set_flags() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + list_order_index_pack(&ordidx, full.offidx, ptr, &new_offidx); + } + +#ifdef LIST_DEBUG_VERIFY + if (! list_verify(b)) { + cdt_bin_print(b, "set_flags"); + list_print(&list, "original"); + cf_crash(AS_PARTICLE, "set_flags: set_flags %u", set_flags); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +list_append(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *payload, + bool payload_is_list, uint64_t mod_flags, cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_append() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (list_is_ordered(&list)) { + bool add_unique = mod_flags_is_unique(mod_flags); + + if (! 
payload_is_list) { + return packed_list_add_ordered(&list, b, alloc_buf, payload, + add_unique, result); + } + + return packed_list_add_items_ordered(&list, b, alloc_buf, payload, + add_unique, result); + } + + return packed_list_insert(&list, b, alloc_buf, (int64_t)list.ele_count, + payload, payload_is_list, mod_flags, result); +} + +static int +list_insert(as_bin *b, rollback_alloc *alloc_buf, int64_t index, + const cdt_payload *payload, bool payload_is_list, uint64_t mod_flags, + cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_insert() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (list_is_ordered(&list)) { + cf_warning(AS_PARTICLE, "list_insert() invalid op on ORDERED list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_insert(&list, b, alloc_buf, index, payload, + payload_is_list, mod_flags, result); +} + +static int +list_set(as_bin *b, rollback_alloc *alloc_buf, int64_t index, + const cdt_payload *value, uint64_t mod_flags) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_set() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (list_is_ordered(&list)) { + cf_warning(AS_PARTICLE, "list_set() invalid op on ORDERED list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t ele_count = list.ele_count; + + if (index >= ele_count) { + return packed_list_insert(&list, b, alloc_buf, index, value, false, + mod_flags, NULL); + } + + if (index > UINT32_MAX || (index = calc_index(index, ele_count)) < 0) { + cf_warning(AS_PARTICLE, "list_set() index %ld out of bounds for ele_count %d", index > 0 ? index : index - ele_count, ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (mod_flags_is_unique(mod_flags)) { + uint32_t rank; + uint32_t count; + uint64_t idx; + + // Use non-multi-find scan to optimize for 0 or 1 copies of element. + // 2 or more copies will result in an additional multi-find scan below. + if (! packed_list_find_rank_range_by_value_interval_unordered(&list, + value, value, &rank, &count, &idx, false, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count != 0) { + if (idx != (uint64_t)index) { + return mod_flags_return_exists(mod_flags); + } + + // Need second scan since the dup found is at the index being set. + if (! packed_list_find_rank_range_by_value_interval_unordered(&list, + value, value, &rank, &count, NULL, false, true)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count > 1) { + return mod_flags_return_exists(mod_flags); + } + } + } + + uint32_t uindex = (uint32_t)index; + define_packed_list_op(op, &list); + + if (! packed_list_op_remove(&op, uindex, 1)) { + cf_warning(AS_PARTICLE, "list_set() as_packed_list_remove failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + op.new_content_sz += value->sz; + + uint8_t *ptr = list_setup_bin(b, alloc_buf, list.ext_flags, + op.new_content_sz, ele_count, uindex, &list.offidx, NULL); + + ptr += packed_list_op_write_seg1(&op, ptr); + + memcpy(ptr, value->ptr, value->sz); + ptr += value->sz; + + packed_list_op_write_seg2(&op, ptr); + + return AS_PROTO_RESULT_OK; +} + +static int +list_increment(as_bin *b, rollback_alloc *alloc_buf, int64_t index, + cdt_payload *delta_value, uint64_t mod_flags, cdt_result_data *result) +{ + packed_list list; + + if (! 
packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_increment() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (index > INT32_MAX || (index = calc_index(index, list.ele_count)) < 0) { + cf_warning(AS_PARTICLE, "list_increment() index %ld out of bounds for ele_count %d", index > 0 ? index : index - list.ele_count, list.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t uindex = (uint32_t)index; + cdt_calc_delta calc_delta; + + if (! cdt_calc_delta_init(&calc_delta, delta_value, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (uindex < list.ele_count) { + uint32_t offset = packed_list_find_idx_offset(&list, uindex); + + if (uindex != 0 && offset == 0) { + cf_warning(AS_PARTICLE, "list_increment() unable to unpack element at %u", uindex); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + as_unpacker pk = { + .buffer = list.contents + offset, + .length = list.content_sz - offset + }; + + if (! cdt_calc_delta_add(&calc_delta, &pk)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + if (! cdt_calc_delta_add(&calc_delta, NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + + uint8_t value_buf[CDT_MAX_PACKED_INT_SZ]; + cdt_payload value = { value_buf, CDT_MAX_PACKED_INT_SZ }; + + cdt_calc_delta_pack_and_result(&calc_delta, &value, result->result); + + if (list_is_ordered(&list)) { + return packed_list_replace_ordered(&list, b, alloc_buf, uindex, &value, + mod_flags); + } + + return list_set(b, alloc_buf, (int64_t)uindex, &value, mod_flags); +} + +static int +list_sort(as_bin *b, rollback_alloc *alloc_buf, as_cdt_sort_flags sort_flags) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_sort() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (list.ele_count <= 1) { + return AS_PROTO_RESULT_OK; + } + + vla_list_full_offidx_if_invalid(full, &list); + + if (! list_full_offset_index_fill_all(full.offidx)) { + cf_warning(AS_PARTICLE, "list_sort() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_order_index(ordidx, list.ele_count); + + if (list_is_ordered(&list)) { + for (uint32_t i = 0; i < list.ele_count; i++) { + order_index_set(&ordidx, i, i); + } + } + else if (! list_order_index_sort(&ordidx, full.offidx, sort_flags)) { + cf_warning(AS_PARTICLE, "list_sort() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_count = 0; + uint32_t rm_sz = 0; + + if ((sort_flags & AS_CDT_SORT_DROP_DUPLICATES) != 0 && + ! order_index_sorted_mark_dup_eles(&ordidx, full.offidx, + &rm_count, &rm_sz)) { + cf_warning(AS_PARTICLE, "list_sort() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + offset_index new_offidx; + uint8_t *ptr = list_setup_bin(b, alloc_buf, list.ext_flags, + list.content_sz - rm_sz, list.ele_count - rm_count, 0, &list.offidx, + &new_offidx); + + ptr = list_order_index_pack(&ordidx, full.offidx, ptr, &new_offidx); + cf_assert(ptr == ((list_mem *)b->particle)->data + ((list_mem *)b->particle)->sz, AS_PARTICLE, + "list_sort() pack mismatch ptr %p data %p sz %u [%p]", ptr, ((list_mem *)b->particle)->data, ((list_mem *)b->particle)->sz, ((list_mem *)b->particle)->data + ((list_mem *)b->particle)->sz); + + return AS_PROTO_RESULT_OK; +} + +static int +list_remove_by_index_range(as_bin *b, rollback_alloc *alloc_buf, int64_t index, + uint64_t count, cdt_result_data *result) +{ + packed_list list; + + if (! 
packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_remove_by_index_range() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_get_remove_by_index_range(&list, b, alloc_buf, index, + count, result); +} + +static int +list_remove_by_value_interval(as_bin *b, rollback_alloc *alloc_buf, + const cdt_payload *value_start, const cdt_payload *value_end, + cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_remove_by_value_interval() invalid packed list, ele_count=%d", list.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_get_remove_by_value_interval(&list, b, alloc_buf, + value_start, value_end, result); +} + +static int +list_remove_by_rank_range(as_bin *b, rollback_alloc *alloc_buf, int64_t rank, + uint64_t count, cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_remove_by_rank_range() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_get_remove_by_rank_range(&list, b, alloc_buf, rank, + count, result); +} + +static int +list_remove_all_by_value_list(as_bin *b, rollback_alloc *alloc_buf, + const cdt_payload *value_list, cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_remove_all_by_value_list() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_get_remove_all_by_value_list(&list, b, alloc_buf, + value_list, result); +} + +// Return ptr to packed + ele_start. +static uint8_t * +list_setup_bin(as_bin *b, rollback_alloc *alloc_buf, uint8_t flags, + uint32_t content_sz, uint32_t ele_count, uint32_t idx_trunc, + const offset_index *old_offidx, offset_index *new_offidx) +{ + bool set_ordered = flags_is_ordered(flags); + uint32_t ext_content_sz = list_calc_ext_content_sz(ele_count, content_sz, + set_ordered); + uint32_t ext_sz = (ext_content_sz == 0 && ! set_ordered) ? + 0 : as_pack_ext_header_get_size(ext_content_sz) + ext_content_sz; + list_mem *p_list_mem = list_create(alloc_buf, + ele_count + (ext_sz == 0 ? 0 : 1), ext_sz + content_sz); + + cf_assert(p_list_mem, AS_PARTICLE, "p_list_mem NULL"); + b->particle = (as_particle *)p_list_mem; + + as_packer pk = { + .buffer = p_list_mem->data, + .capacity = p_list_mem->sz + }; + + if (ext_sz == 0) { + as_pack_list_header(&pk, ele_count); + + if (new_offidx) { + list_offset_index_init(new_offidx, NULL, ele_count, NULL, + content_sz); + } + + return pk.buffer + pk.offset; + } + + as_pack_list_header(&pk, ele_count + 1); + as_pack_ext_header(&pk, ext_content_sz, get_ext_flags(set_ordered)); + + uint8_t * const ptr = pk.buffer + pk.offset; + offset_index offidx_temp; + uint8_t * const contents = pk.buffer + pk.offset + ext_content_sz; + + if (! new_offidx) { + new_offidx = &offidx_temp; + } + + if (! set_ordered) { + list_offset_index_init(new_offidx, ptr, ele_count, contents, + content_sz); + idx_trunc /= PACKED_LIST_INDEX_STEP; + } + else { + list_full_offset_index_init(new_offidx, ptr, ele_count, contents, + content_sz); + } + + if (idx_trunc == 0 || ! 
old_offidx || offset_index_is_null(old_offidx)) { + offset_index_set_filled(new_offidx, 1); + } + else { + idx_trunc = MIN(idx_trunc, offset_index_get_filled(old_offidx)); + offset_index_copy(new_offidx, old_offidx, 0, 0, idx_trunc, 0); + offset_index_set_filled(new_offidx, idx_trunc); + } + + return contents; +} + + +//========================================================== +// cdt_list_builder +// + +void +cdt_list_builder_start(cdt_container_builder *builder, + rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t max_sz) +{ + uint32_t sz = sizeof(list_mem) + sizeof(uint64_t) + 1 + max_sz; + list_mem *p_list_mem = (list_mem *)rollback_alloc_reserve(alloc_buf, sz); + + p_list_mem->type = AS_PARTICLE_TYPE_LIST; + p_list_mem->sz = list_pack_header(p_list_mem->data, ele_count); + + builder->particle = (as_particle *)p_list_mem; + builder->write_ptr = p_list_mem->data + p_list_mem->sz; + builder->ele_count = 0; + builder->sz = &p_list_mem->sz; +} + + +//========================================================== +// cdt_process_state_packed_list +// + +bool +cdt_process_state_packed_list_modify_optype(cdt_process_state *state, + cdt_modify_data *cdt_udata) +{ + as_bin *b = cdt_udata->b; + as_cdt_optype optype = state->type; + + if (as_bin_inuse(b) && ! is_list_type(as_bin_get_particle_type(b))) { + cf_warning(AS_PARTICLE, "cdt_process_state_packed_list_modify_optype() invalid type %d", as_bin_get_particle_type(b)); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + define_rollback_alloc(alloc_buf, cdt_udata->alloc_buf, 5, true); + // Results always on the heap. + define_rollback_alloc(alloc_result, NULL, 1, false); + int ret = AS_PROTO_RESULT_OK; + + cdt_result_data result = { + .result = cdt_udata->result, + .alloc = alloc_result, + }; + + switch (optype) { + case AS_CDT_OP_LIST_SET_TYPE: { + uint64_t list_type; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &list_type)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, AS_PACKED_LIST_FLAG_NONE); + ret = list_set_flags(b, alloc_buf, (uint8_t)list_type, &result); + break; + } + case AS_CDT_OP_LIST_APPEND: { + cdt_payload value; + uint64_t create_type = AS_PACKED_LIST_FLAG_NONE; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &value, &create_type, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, create_type); + ret = list_append(b, alloc_buf, &value, false, modify, &result); + break; + } + case AS_CDT_OP_LIST_APPEND_ITEMS: { + cdt_payload items; + uint64_t create_type = AS_PACKED_LIST_FLAG_NONE; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &items, &create_type, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, create_type); + ret = list_append(b, alloc_buf, &items, true, modify, &result); + break; + } + case AS_CDT_OP_LIST_INSERT: { + int64_t index; + cdt_payload value; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &index, &value, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, AS_PACKED_LIST_FLAG_NONE); + ret = list_insert(b, alloc_buf, index, &value, false, modify, &result); + break; + } + case AS_CDT_OP_LIST_INSERT_ITEMS: { + int64_t index; + cdt_payload items; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &items, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, AS_PACKED_LIST_FLAG_NONE); + ret = list_insert(b, alloc_buf, index, &items, true, modify, &result); + break; + } + case AS_CDT_OP_LIST_SET: { + int64_t index; + cdt_payload value; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &value, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, AS_PACKED_LIST_FLAG_NONE); + ret = list_set(b, alloc_buf, index, &value, modify); + break; + } + case AS_CDT_OP_LIST_REMOVE: + case AS_CDT_OP_LIST_POP: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, optype == AS_CDT_OP_LIST_REMOVE ? + RESULT_TYPE_COUNT : RESULT_TYPE_VALUE, false); + ret = list_remove_by_index_range(b, alloc_buf, index, 1, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_RANGE: + case AS_CDT_OP_LIST_POP_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, optype == AS_CDT_OP_LIST_REMOVE_RANGE ? + RESULT_TYPE_COUNT : RESULT_TYPE_VALUE, true); + ret = list_remove_by_index_range(b, alloc_buf, index, count, &result); + break; + } + case AS_CDT_OP_LIST_TRIM: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result.type = RESULT_TYPE_COUNT; + result.flags = AS_CDT_OP_FLAG_INVERTED; + result.is_multi = true; + ret = list_remove_by_index_range(b, alloc_buf, index, count, &result); + break; + } + case AS_CDT_OP_LIST_CLEAR: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "LIST_CLEAR: invalid list"); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_empty_list(b, alloc_buf, list_is_ordered(&list)); + break; + } + case AS_CDT_OP_LIST_INCREMENT: { + int64_t index; + cdt_payload delta = { NULL }; + uint64_t create = AS_PACKED_LIST_FLAG_NONE; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &delta, &create, + &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, create); + ret = list_increment(b, alloc_buf, index, &delta, modify, &result); + break; + } + case AS_CDT_OP_LIST_SORT: { + if (! as_bin_inuse(b)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + uint64_t flags = 0; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + ret = list_sort(b, alloc_buf, (as_cdt_sort_flags)flags); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_INDEX: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = list_remove_by_index_range(b, alloc_buf, index, 1, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE: + case AS_CDT_OP_LIST_REMOVE_BY_VALUE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, + optype == AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE); + ret = list_remove_by_value_interval(b, alloc_buf, &value, &value, + &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_RANK: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t rank; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = list_remove_by_rank_range(b, alloc_buf, rank, 1, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE_LIST: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = list_remove_all_by_value_list(b, alloc_buf, &items, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_INDEX_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = list_remove_by_index_range(b, alloc_buf, index, count, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_VALUE_INTERVAL: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value_start; + cdt_payload value_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_start, + &value_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = list_remove_by_value_interval(b, alloc_buf, &value_start, + &value_end, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_RANK_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t rank; + uint64_t count = UINT32_MAX; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = list_remove_by_rank_range(b, alloc_buf, rank, count, &result); + break; + } + default: + cf_warning(AS_PARTICLE, "cdt_process_state_packed_list_modify_optype() invalid cdt op: %d", optype); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + if (ret != AS_PROTO_RESULT_OK) { + cf_warning(AS_PARTICLE, "%s: failed", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = ret; + rollback_alloc_rollback(alloc_result); + rollback_alloc_rollback(alloc_buf); + return false; + } + + // In case of no-op. + if (b->particle == (const as_particle *)&list_mem_empty) { + as_bin_set_unordered_empty_list(b, alloc_buf); + } + else if (b->particle == (const as_particle *)&list_ordered_empty) { + as_bin_set_ordered_empty_list(b, alloc_buf); + } + + return true; +} + +bool +cdt_process_state_packed_list_read_optype(cdt_process_state *state, + cdt_read_data *cdt_udata) +{ + const as_bin *b = cdt_udata->b; + as_cdt_optype optype = state->type; + + if (! is_list_type(as_bin_get_particle_type(b))) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "%s: invalid list", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + // Just one entry needed for results bin. + define_rollback_alloc(alloc_result, NULL, 1, false); + int ret = AS_PROTO_RESULT_OK; + + cdt_result_data result = { + .result = cdt_udata->result, + .alloc = alloc_result, + }; + + switch (optype) { + case AS_CDT_OP_LIST_GET: { + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, RESULT_TYPE_VALUE, false); + ret = packed_list_get_remove_by_index_range(&list, NULL, NULL, index, + 1, &result); + break; + } + case AS_CDT_OP_LIST_GET_RANGE: { + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, RESULT_TYPE_VALUE, true); + ret = packed_list_get_remove_by_index_range(&list, NULL, NULL, index, + count, &result); + break; + } + case AS_CDT_OP_LIST_SIZE: { + as_bin_set_int(result.result, list.ele_count); + break; + } + case AS_CDT_OP_LIST_GET_BY_INDEX: { + uint64_t result_type; + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_list_get_remove_by_index_range(&list, NULL, NULL, index, + 1, &result); + break; + } + case AS_CDT_OP_LIST_GET_ALL_BY_VALUE: + case AS_CDT_OP_LIST_GET_BY_VALUE: { + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, + optype == AS_CDT_OP_LIST_GET_ALL_BY_VALUE); + ret = packed_list_get_remove_by_value_interval(&list, NULL, NULL, + &value, &value, &result); + break; + } + case AS_CDT_OP_LIST_GET_BY_RANK: { + uint64_t result_type; + int64_t rank; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_list_get_remove_by_rank_range(&list, NULL, NULL, rank, 1, + &result); + break; + } + case AS_CDT_OP_LIST_GET_ALL_BY_VALUE_LIST: { + uint64_t result_type; + cdt_payload value_list; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_list)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_list_get_remove_all_by_value_list(&list, NULL, NULL, + &value_list, &result); + break; + } + case AS_CDT_OP_LIST_GET_BY_INDEX_RANGE: { + uint64_t result_type; + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_list_get_remove_by_index_range(&list, NULL, NULL, index, + count, &result); + break; + } + case AS_CDT_OP_LIST_GET_BY_VALUE_INTERVAL: { + uint64_t result_type; + cdt_payload value_start; + cdt_payload value_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_start, + &value_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_list_get_remove_by_value_interval(&list, NULL, NULL, + &value_start, &value_end, &result); + break; + } + case AS_CDT_OP_LIST_GET_BY_RANK_RANGE: { + uint64_t result_type; + int64_t rank; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_list_get_remove_by_rank_range(&list, NULL, NULL, rank, + count, &result); + break; + } + default: + cf_warning(AS_PARTICLE, "cdt_process_state_packed_list_read_optype() invalid cdt op: %d", optype); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + if (ret != AS_PROTO_RESULT_OK) { + cf_warning(AS_PARTICLE, "%s: failed", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = ret; + rollback_alloc_rollback(alloc_result); + return false; + } + + return true; +} + + +//========================================================== +// list_offset_index +// + +static inline void +list_offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, + uint32_t ele_count, const uint8_t *contents, uint32_t content_sz) +{ + ele_count /= PACKED_LIST_INDEX_STEP; + + if (ele_count != 0) { + ele_count++; + } + + offset_index_init(offidx, idx_mem_ptr, ele_count, contents, content_sz); + offidx->is_partial = true; +} + +static void +list_offset_index_rm_mask_cpy(offset_index *dst, const offset_index *full_src, + const uint64_t *rm_mask, uint32_t rm_count) +{ + cf_assert(rm_mask && rm_count != 0, AS_PARTICLE, "list_offset_index_rm_mask_cpy() should not do no-op copy"); + + uint32_t ele_count = full_src->_.ele_count; + + if (! 
dst->is_partial) { + uint32_t delta = 0; + uint32_t prev = 0; + uint32_t idx = 0; + + for (uint32_t i = 0; i < rm_count; i++) { + idx = cdt_idx_mask_find(rm_mask, idx, ele_count, false); + uint32_t sz = offset_index_get_delta_const(full_src, idx); + uint32_t diff = idx - prev; + + for (uint32_t j = 1; j < diff; j++) { + uint32_t offset = offset_index_get_const(full_src, prev + j); + + offset_index_set(dst, prev + j - i, offset - delta); + } + + prev = idx; + delta += sz; + idx++; + } + + uint32_t diff = full_src->_.ele_count - prev; + + for (uint32_t i = 1; i < diff; i++) { + uint32_t offset = offset_index_get_const(full_src, prev + i); + offset_index_set(dst, prev + i - rm_count, offset - delta); + } + + offset_index_set_filled(dst, dst->_.ele_count); + return; + } + + uint32_t delta = 0; + uint32_t prev_par_idx = 0; + uint32_t idx = 0; + + for (uint32_t i = 0; i < rm_count; i++) { + idx = cdt_idx_mask_find(rm_mask, idx, ele_count, false); + uint32_t sz = offset_index_get_delta_const(full_src, idx); + uint32_t par_idx = (idx - i) / PACKED_LIST_INDEX_STEP; + uint32_t diff = par_idx - prev_par_idx + 1; + + for (uint32_t j = 1; j < diff; j++) { + uint32_t offset = offset_index_get_const(full_src, + (prev_par_idx + j) * PACKED_LIST_INDEX_STEP + i); + offset_index_set(dst, prev_par_idx + j, offset - delta); + } + + prev_par_idx = par_idx; + delta += sz; + idx++; + } + + uint32_t par_idx = (full_src->_.ele_count - rm_count) / + PACKED_LIST_INDEX_STEP; + uint32_t diff = par_idx - prev_par_idx + 1; + + for (uint32_t j = 1; j < diff; j++) { + uint32_t offset = offset_index_get_const(full_src, + (prev_par_idx + j) * PACKED_LIST_INDEX_STEP + rm_count); + offset_index_set(dst, prev_par_idx + j, offset - delta); + } + + offset_index_set_filled(dst, par_idx + 1); +} + + +//========================================================== +// list_full_offset_index +// + +static inline void +list_full_offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, + uint32_t ele_count, const uint8_t *contents, uint32_t content_sz) +{ + offset_index_init(offidx, idx_mem_ptr, ele_count, contents, content_sz); +} + +static bool +list_full_offset_index_fill_to(offset_index *offidx, uint32_t index) +{ + uint32_t start = offset_index_get_filled(offidx); + + index = MIN(index + 1, offidx->_.ele_count); + + if (start >= index) { + return true; + } + + as_unpacker pk = { + .buffer = offidx->contents, + .offset = offset_index_get_const(offidx, start - 1), + .length = offidx->content_sz + }; + + for (uint32_t i = start; i < index; i++) { + if (as_unpack_size(&pk) <= 0) { + return false; + } + + offset_index_set(offidx, i, pk.offset); + } + + offset_index_set_filled(offidx, index); + + return true; +} + +bool +list_full_offset_index_fill_all(offset_index *offidx) +{ + return list_full_offset_index_fill_to(offidx, offidx->_.ele_count); +} + + +//========================================================== +// list_order_index +// + +static int +list_order_index_sort_cmp_fn(const void *x, const void *y, void *p) +{ + list_order_index_sort_userdata *udata = p; + + if (udata->error) { + return 0; + } + + const order_index *order = udata->order; + uint32_t a = order_index_ptr2value(order, x); + uint32_t b = order_index_ptr2value(order, y); + + const offset_index *offsets = udata->offsets; + const uint8_t *buf = udata->offsets->contents; + uint32_t len = udata->offsets->content_sz; + uint32_t x_off = offset_index_get_const(offsets, a); + uint32_t y_off = offset_index_get_const(offsets, b); + + as_unpacker x_pk = { + .buffer = buf + 
x_off, + .offset = 0, + .length = len - x_off + }; + + as_unpacker y_pk = { + .buffer = buf + y_off, + .offset = 0, + .length = len - y_off + }; + + msgpack_compare_t cmp = as_unpack_compare(&x_pk, &y_pk); + + switch (cmp) { + case MSGPACK_COMPARE_EQUAL: + return 0; + case MSGPACK_COMPARE_LESS: + if (udata->flags & AS_CDT_SORT_DESCENDING) { + cmp = MSGPACK_COMPARE_GREATER; + } + break; + case MSGPACK_COMPARE_GREATER: + if (udata->flags & AS_CDT_SORT_DESCENDING) { + cmp = MSGPACK_COMPARE_LESS; + } + break; + default: + udata->error = true; + return 0; + } + + return (cmp == MSGPACK_COMPARE_LESS) ? -1 : 1; +} + +bool +list_order_index_sort(order_index *ordidx, const offset_index *full_offidx, + as_cdt_sort_flags flags) +{ + uint32_t ele_count = ordidx->_.ele_count; + list_order_index_sort_userdata udata = { + .order = ordidx, + .offsets = full_offidx, + .flags = flags + }; + + for (uint32_t i = 0; i < ele_count; i++) { + order_index_set(ordidx, i, i); + } + + qsort_r(order_index_get_mem(ordidx, 0), ele_count, ordidx->_.ele_sz, + list_order_index_sort_cmp_fn, (void *)&udata); + + return ! udata.error; +} + +static uint8_t * +list_order_index_pack(const order_index *ordidx, + const offset_index *full_offidx, uint8_t *buf, offset_index *new_offidx) +{ + cf_assert(new_offidx, AS_PARTICLE, "new_offidx null"); + cf_assert(full_offidx->_.ele_count != 0, AS_PARTICLE, "ele_count == 0"); + + const uint8_t *contents = full_offidx->contents; + uint32_t buf_off = 0; + uint32_t write_count = 0; + + for (uint32_t i = 0; i < full_offidx->_.ele_count; i++) { + uint32_t idx = order_index_get(ordidx, i); + + if (idx == full_offidx->_.ele_count) { + continue; + } + + uint32_t off = offset_index_get_const(full_offidx, idx); + uint32_t sz = offset_index_get_delta_const(full_offidx, idx); + + memcpy(buf + buf_off, contents + off, sz); + buf_off += sz; + write_count++; + + if (offset_index_is_null(new_offidx)) { + continue; + } + + if (! new_offidx->is_partial) { + offset_index_set(new_offidx, write_count, buf_off); + } + else if (write_count % PACKED_LIST_INDEX_STEP == 0) { + uint32_t new_idx = write_count / PACKED_LIST_INDEX_STEP; + offset_index_set(new_offidx, new_idx, buf_off); + } + } + + if (offset_index_is_valid(new_offidx)) { + offset_index_set_filled(new_offidx, (new_offidx->is_partial ? + (write_count / PACKED_LIST_INDEX_STEP) + 1 : write_count)); + } + + return buf + buf_off; +} + + +//========================================================== +// list_order_heap +// + +static msgpack_compare_t +list_order_heap_cmp_fn(const void *udata, uint32_t idx1, uint32_t idx2) +{ + const packed_list *list = (const packed_list *)udata; + const offset_index *offidx = &list->full_offidx; + + as_unpacker pk1 = { + .buffer = list->contents, + .offset = offset_index_get_const(offidx, idx1), + .length = list->content_sz + }; + + as_unpacker pk2 = { + .buffer = list->contents, + .offset = offset_index_get_const(offidx, idx2), + .length = list->content_sz + }; + + return as_unpack_compare(&pk1, &pk2); +} + + +//========================================================== +// list_result_data +// + +static bool +list_result_data_set_not_found(cdt_result_data *rd, int64_t index) +{ + switch (rd->type) { + case RESULT_TYPE_KEY: + case RESULT_TYPE_MAP: + return false; + default: + break; + } + + return result_data_set_not_found(rd, index); +} + +// Does not respect inverted flag. 
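+// Illustrative note (editor's example, not part of the original change): for
+// a 5-element list [a,b,c,d,e] with mask bits 1 and 3 set and count 2, the
+// result bin is built as the list [b,d]. Callers apply any inversion before
+// calling here -- e.g. packed_list_get_remove_all_by_value_list_ordered()
+// builds rm_mask via cdt_idx_mask_set_by_irc(..., inverted) first.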
+static void
+list_result_data_set_values_by_mask(cdt_result_data *rd, const uint64_t *mask,
+		const offset_index *full_offidx, uint32_t count, uint32_t sz)
+{
+	if (sz == 0) {
+		sz = cdt_idx_mask_get_content_sz(mask, count, full_offidx);
+	}
+
+	cdt_container_builder builder;
+	cdt_list_builder_start(&builder, rd->alloc, count, sz);
+
+	const uint8_t *end = cdt_idx_mask_write_eles(mask, count, full_offidx,
+			builder.write_ptr, false);
+
+	cf_assert(end - builder.write_ptr == sz, AS_PARTICLE, "size mismatch end - ptr %zu != sz %u", end - builder.write_ptr, sz);
+	cdt_container_builder_add_n(&builder, NULL, count, sz);
+	cdt_container_builder_set_result(&builder, rd);
+}
+
+// Does not respect inverted flag.
+static void
+list_result_data_set_values_by_idxcount(cdt_result_data *rd,
+		const order_index *idxcnt, const offset_index *full_offidx)
+{
+	uint32_t items_count = idxcnt->_.ele_count / 2;
+	uint32_t sz = 0;
+	uint32_t ret_count = 0;
+
+	for (uint32_t i = 0; i < items_count; i++) {
+		uint32_t idx = order_index_get(idxcnt, 2 * i);
+		uint32_t count = order_index_get(idxcnt, (2 * i) + 1);
+
+		ret_count += count;
+
+		for (uint32_t j = 0; j < count; j++) {
+			sz += offset_index_get_delta_const(full_offidx, idx + j);
+		}
+	}
+
+	cdt_container_builder builder;
+	cdt_list_builder_start(&builder, rd->alloc, ret_count, sz);
+
+	for (uint32_t i = 0; i < items_count; i++) {
+		uint32_t idx = order_index_get(idxcnt, 2 * i);
+		uint32_t count = order_index_get(idxcnt, (2 * i) + 1);
+
+		if (count == 0) {
+			continue;
+		}
+
+		uint32_t offset = offset_index_get_const(full_offidx, idx);
+		uint32_t end = offset_index_get_const(full_offidx, idx + count);
+
+		cdt_container_builder_add_n(&builder, full_offidx->contents + offset,
+				count, end - offset);
+	}
+
+	cdt_container_builder_set_result(&builder, rd);
+}
+
+// Does not respect inverted flag.
+static bool
+list_result_data_set_values_by_ordidx(cdt_result_data *rd,
+		const order_index *ordidx, const offset_index *full_offidx,
+		uint32_t count, uint32_t sz)
+{
+	if (! rd->is_multi) {
+		if (count != 0) {
+			uint32_t i = order_index_get(ordidx, 0);
+			uint32_t offset = offset_index_get_const(full_offidx, i);
+			uint32_t sz = offset_index_get_delta_const(full_offidx, i);
+
+			return as_bin_particle_alloc_from_msgpack(rd->result,
+					full_offidx->contents + offset, sz) == AS_PROTO_RESULT_OK;
+		}
+
+		return true;
+	}
+
+	if (sz == 0) {
+		sz = order_index_get_ele_size(ordidx, count, full_offidx);
+	}
+
+	uint8_t *ptr;
+
+	rd->result->particle = list_simple_create(rd->alloc, count, sz,
+			&ptr);
+	order_index_write_eles(ordidx, count, full_offidx, ptr, false);
+	as_bin_state_set_from_type(rd->result, AS_PARTICLE_TYPE_LIST);
+
+	return true;
+}
+
+
+//==========================================================
+// Debugging support.
+//
+
+static void
+list_print(const packed_list *list, const char *name)
+{
+	print_packed(list->packed, list->packed_sz, name);
+}
+
+static bool
+list_verify(const as_bin *b)
+{
+	if (! b) {
+		return true;
+	}
+
+	packed_list list;
+	uint8_t type = as_bin_get_particle_type(b);
+
+	if (type != AS_PARTICLE_TYPE_LIST) {
+		cf_warning(AS_PARTICLE, "list_verify() non-list type: %u", type);
+		return false;
+	}
+
+	// Check header.
+	if (!
packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_verify() invalid packed list"); + return false; + } + + offset_index *offidx = list_full_offidx_p(&list); + bool check_offidx = offset_index_is_valid(offidx); + uint32_t filled = 0; + define_offset_index(temp_offidx, list.contents, list.content_sz, + list.ele_count); + + as_unpacker pk = { + .buffer = list.contents, + .length = list.content_sz + }; + + if (check_offidx) { + filled = offset_index_get_filled(offidx); + + if (list.ele_count != 0) { + offset_index_copy(&temp_offidx, offidx, 0, 0, filled, 0); + } + } + + // Check offsets. + for (uint32_t i = 0; i < list.ele_count; i++) { + uint32_t offset; + + if (check_offidx) { + if (list_is_ordered(&list)) { + if (i < filled) { + offset = offset_index_get_const(offidx, i); + + if (pk.offset != offset) { + cf_warning(AS_PARTICLE, "list_verify() i=%u offset=%u expected=%u", i, offset, pk.offset); + return false; + } + } + else { + offset_index_set(&temp_offidx, i, pk.offset); + } + } + else if ((i % PACKED_LIST_INDEX_STEP) == 0) { + uint32_t step_i = i / PACKED_LIST_INDEX_STEP; + + if (i < filled) { + offset = offset_index_get_const(offidx, i); + + if (pk.offset != offset) { + cf_warning(AS_PARTICLE, "list_verify() i=%u step %u offset=%u expected=%u", i, step_i, offset, pk.offset); + return false; + } + } + } + } + else { + offset_index_set(&temp_offidx, i, pk.offset); + } + + offset = pk.offset; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "list_verify() i=%u offset=%u pk.offset=%u invalid key", i, offset, pk.offset); + return false; + } + } + + // Check packed size. + if (list.content_sz != pk.offset) { + cf_warning(AS_PARTICLE, "list_verify() content_sz=%u expected=%u", list.content_sz, pk.offset); + return false; + } + + pk.offset = 0; + + as_unpacker pk_value = pk; + + // Check ordered list. + if (list_is_ordered(&list) && list.ele_count > 0) { + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "list_verify() pk.offset=%u invalid value", pk.offset); + return false; + } + + for (uint32_t i = 1; i < list.ele_count; i++) { + uint32_t offset = pk.offset; + msgpack_compare_t cmp = as_unpack_compare(&pk_value, &pk); + + if (cmp == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "list_verify() i=%u offset=%u pk.offset=%u invalid key", i, offset, pk.offset); + return false; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + cf_warning(AS_PARTICLE, "list_verify() i=%u offset=%u pk.offset=%u keys not in order", i, offset, pk.offset); + return false; + } + } + } + + return true; +} + +// Quash warnings for debug function. +void +as_cdt_list_debug_dummy() +{ + list_verify(NULL); + list_print(NULL, NULL); +} diff --git a/as/src/base/particle_map.c b/as/src/base/particle_map.c new file mode 100644 index 00000000..86d71c20 --- /dev/null +++ b/as/src/base/particle_map.c @@ -0,0 +1,6886 @@ +/* + * particle_map.c + * + * Copyright (C) 2015-2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include +#include + +#include "aerospike/as_buffer.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_serializer.h" +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_byte_order.h" + +#include "bits.h" +#include "fault.h" + +#include "base/cdt.h" +#include "base/datamodel.h" +#include "base/particle.h" +#include "base/proto.h" + + +//========================================================== +// MAP particle interface - function declarations. +// + +// Destructor, etc. +void map_destruct(as_particle *p); +uint32_t map_size(const as_particle *p); + +// Handle "wire" format. +int32_t map_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int map_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int map_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int map_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int32_t map_size_from_wire(const uint8_t *wire_value, uint32_t value_size); +int map_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int map_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size); +uint32_t map_wire_size(const as_particle *p); +uint32_t map_to_wire(const as_particle *p, uint8_t *wire); + +// Handle as_val translation. +uint32_t map_size_from_asval(const as_val *val); +void map_from_asval(const as_val *val, as_particle **pp); +as_val *map_to_asval(const as_particle *p); +uint32_t map_asval_wire_size(const as_val *val); +uint32_t map_asval_to_wire(const as_val *val, uint8_t *wire); + +// Handle msgpack translation. +uint32_t map_size_from_msgpack(const uint8_t *packed, uint32_t packed_size); +void map_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp); + +// Handle on-device "flat" format. +int32_t map_size_from_flat(const uint8_t *flat, uint32_t flat_size); +int map_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp); +int map_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp); +uint32_t map_flat_size(const as_particle *p); +uint32_t map_to_flat(const as_particle *p, uint8_t *flat); + + +//========================================================== +// MAP particle interface - vtable. +// + +const as_particle_vtable map_vtable = { + map_destruct, + map_size, + + map_concat_size_from_wire, + map_append_from_wire, + map_prepend_from_wire, + map_incr_from_wire, + map_size_from_wire, + map_from_wire, + map_compare_from_wire, + map_wire_size, + map_to_wire, + + map_size_from_asval, + map_from_asval, + map_to_asval, + map_asval_wire_size, + map_asval_to_wire, + + map_size_from_msgpack, + map_from_msgpack, + + map_size_from_flat, + map_cast_from_flat, + map_from_flat, + map_flat_size, + map_to_flat +}; + + +//========================================================== +// Typedefs & constants. 
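+// Particle layout note (derived from map_from_wire() below): a flagged map
+// particle is stored as map_hdr(ele_count + 1) | ext(flags + indexes) | nil |
+// the ele_count key-value pairs - i.e. the index metadata rides in a leading
+// {ext: nil} pair.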
+//
+
+//#define MAP_DEBUG_VERIFY
+
+#define LINEAR_FIND_RANK_MAX_COUNT 16 // switch to linear search when the count drops to this number
+
+#define AS_PACKED_MAP_FLAG_RESERVED_0	0x04 // placeholder for multimap
+#define AS_PACKED_MAP_FLAG_OFF_IDX		0x10 // has offset index
+#define AS_PACKED_MAP_FLAG_ORD_IDX		0x20 // has value order index
+#define AS_PACKED_MAP_FLAG_ON_STACK		0x40 // map on stack
+
+struct packed_map_s;
+
+typedef bool (*packed_map_get_by_idx_func)(const struct packed_map_s *userdata, cdt_payload *contents, uint32_t index);
+
+typedef struct offidx_op_s {
+	offset_index *dest;
+	const offset_index *src;
+	uint32_t d_i;
+	uint32_t s_i;
+	int delta;
+} offidx_op;
+
+typedef struct packed_map_s {
+	const uint8_t *packed;
+	const uint8_t *contents; // where elements start (excludes ext)
+	uint32_t packed_sz;
+	uint32_t content_sz;
+
+	// Mutable field member (considered mutable even in const objects).
+	offset_index offidx; // offsets start at contents (excluding ext metadata pair)
+	uint8_t flags;
+	// Mutable field member.
+	order_index value_idx;
+
+	uint32_t ele_count; // excludes ext pair
+} packed_map;
+
+typedef struct packed_map_op_s {
+	const packed_map *map;
+
+	uint32_t new_ele_count;
+	uint32_t ele_removed;
+
+	uint32_t seg1_sz;
+	uint32_t seg2_offset;
+	uint32_t seg2_sz;
+
+	uint32_t key1_offset;
+	uint32_t key1_sz;
+	uint32_t key2_offset;
+	uint32_t key2_sz;
+} packed_map_op;
+
+typedef struct map_packer_s {
+	uint8_t *write_ptr;
+	const uint8_t *contents;
+
+	offset_index offset_idx; // offsets start at ele_start (excluding ext metadata pair)
+	order_index value_idx;
+
+	uint32_t ele_count;
+	uint32_t content_sz; // does not include map header or ext
+	uint32_t ext_content_sz;
+
+	uint32_t ext_sz;
+	uint32_t ext_header_sz;
+
+	uint8_t flags;
+} map_packer;
+
+typedef struct map_mem_s {
+	uint8_t type;
+	uint32_t sz;
+	uint8_t data[];
+} __attribute__ ((__packed__)) map_mem;
+
+typedef struct map_flat_s {
+	uint8_t type;
+	uint32_t sz;
+	uint8_t data[];
+} __attribute__ ((__packed__)) map_flat;
+
+typedef struct msgpack_map_empty_flagged_s {
+	uint8_t map_hdr;
+	uint8_t ext_hdr;
+	uint8_t ext_sz;
+	uint8_t ext_flags;
+	uint8_t nil;
+} __attribute__ ((__packed__)) msgpack_map_empty_flagged;
+
+typedef struct map_mem_empty_flagged_s {
+	map_mem mem;
+	msgpack_map_empty_flagged map;
+} map_mem_empty_flagged;
+
+#define MSGPACK_MAP_FLAGGED(__flags) { \
+		.map_hdr = 0x81, \
+		.ext_hdr = 0xC7, \
+		.ext_sz = 0, \
+		.ext_flags = __flags, \
+		.nil = 0xC0 \
+}
+
+#define MAP_MEM_EMPTY_FLAGGED_ENTRY(__flag) { \
+	{ \
+			.type = AS_PARTICLE_TYPE_MAP, \
+			.sz = sizeof(msgpack_map_empty_flagged) \
+	}, \
+	MSGPACK_MAP_FLAGGED(__flag) \
+}
+
+static const map_mem_empty_flagged map_mem_empty_flagged_table[] = {
+		MAP_MEM_EMPTY_FLAGGED_ENTRY(AS_PACKED_MAP_FLAG_K_ORDERED | AS_PACKED_MAP_FLAG_OFF_IDX),
+		MAP_MEM_EMPTY_FLAGGED_ENTRY(AS_PACKED_MAP_FLAG_KV_ORDERED | AS_PACKED_MAP_FLAG_OFF_IDX | AS_PACKED_MAP_FLAG_ORD_IDX),
+};
+static const map_mem map_mem_empty = {
+		.type = AS_PARTICLE_TYPE_MAP,
+		.sz = 1,
+		.data = {0x80},
+};
+
+typedef enum sort_by_e {
+	SORT_BY_KEY,
+	SORT_BY_VALUE
+} sort_by_t;
+
+typedef struct index_sort_userdata_s {
+	const offset_index *offsets;
+	order_index *order;
+	const uint8_t *contents;
+	uint32_t content_sz;
+	bool error;
+	sort_by_t sort_by;
+} index_sort_userdata;
+
+typedef struct map_add_control_s {
+	bool allow_overwrite; // if key exists and map is unique-keyed - may overwrite
+	bool allow_create; // if key does not exist - may create
+} map_add_control;
+
+typedef struct map_ele_find_s {
+	bool found_key;
+	bool found_value;
+
+	uint32_t idx;
+	uint32_t rank;
+
+	uint32_t key_offset; // offset from map header
+	uint32_t value_offset; // offset from map header
+	uint32_t sz;
+
+	uint32_t upper;
+	uint32_t lower;
+} map_ele_find;
+
+// TODO - refactor params using this.
+typedef struct map_getrem_s {
+	const packed_map *map;
+	as_bin *b;
+	rollback_alloc *alloc_buf;
+	cdt_result_data *result;
+} map_getrem;
+
+#define as_bin_use_static_map_mem_if_notinuse(__b, __flags) \
+	if (! as_bin_inuse(__b)) { \
+		if (is_kv_ordered(__flags)) { \
+			(__b)->particle = (as_particle *)(map_mem_empty_flagged_table + 1); \
+		} \
+		else if (is_k_ordered(__flags)) { \
+			(__b)->particle = (as_particle *)map_mem_empty_flagged_table; \
+		} \
+		else { \
+			(__b)->particle = (as_particle *)&map_mem_empty; \
+		} \
+		as_bin_state_set_from_type(__b, AS_PARTICLE_TYPE_MAP); \
+	}
+
+#define vla_map_offidx_if_invalid(__name, __map_p) \
+	union { \
+		offset_index *offidx; \
+		uint8_t mem_temp[sizeof(offset_index *) + (offset_index_is_valid(&(__map_p)->offidx) ? 0 : offset_index_size(&(__map_p)->offidx))]; \
+	} __name; \
+	__name.offidx = (offset_index *)&(__map_p)->offidx; \
+	if (offset_index_is_null(__name.offidx)) { \
+		__name.offidx->_.ptr = __name.mem_temp + sizeof(offset_index *); \
+		offset_index_set_filled(__name.offidx, 1); \
+	}
+
+#define vla_map_allidx_if_invalid(__name, __map_p) \
+	union { \
+		struct { \
+			offset_index *offidx; \
+			order_index *ordidx; \
+		}; \
+		uint8_t mem_temp[sizeof(offset_index *) + sizeof(order_index *) + \
+				(offset_index_is_valid(&(__map_p)->offidx) ? 0 : offset_index_size(&(__map_p)->offidx)) + \
+				(order_index_is_valid(&(__map_p)->value_idx) ? 0 : order_index_size(&(__map_p)->value_idx))]; \
+	} __name; \
+	__name.offidx = (offset_index *)&(__map_p)->offidx; \
+	__name.ordidx = (order_index *)&(__map_p)->value_idx; \
+	if (offset_index_is_null(__name.offidx)) { \
+		__name.offidx->_.ptr = __name.mem_temp + sizeof(offset_index *) + sizeof(order_index *); \
+		offset_index_set_filled(__name.offidx, 1); \
+		if (order_index_is_null(__name.ordidx)) { \
+			__name.ordidx->_.ptr = __name.offidx->_.ptr + offset_index_size(__name.offidx); \
+			order_index_set(__name.ordidx, 0, (__map_p)->ele_count); \
+		} \
+	} \
+	else if (order_index_is_null(__name.ordidx)) { \
+		__name.ordidx->_.ptr = __name.mem_temp + sizeof(offset_index *) + sizeof(order_index *); \
+		order_index_set(__name.ordidx, 0, (__map_p)->ele_count); \
+	}
+
+#define define_map_unpacker(__name, __map_ptr) \
+	as_unpacker __name = { \
+			.buffer = (__map_ptr)->contents, \
+			.length = (__map_ptr)->content_sz \
+	}
+
+#define define_map_op(__name, __map_ptr) \
+	packed_map_op __name; \
+	packed_map_op_init(&__name, __map_ptr)
+
+#define define_map_packer(__name, __ele_count, __flags, __content_sz) \
+	map_packer __name; \
+	map_packer_init(&__name, __ele_count, __flags, __content_sz)
+
+
+//==========================================================
+// Forward declarations.
+// + +static inline bool is_map_type(uint8_t type); +static inline bool is_k_ordered(uint8_t flags); +static inline bool is_kv_ordered(uint8_t flags); +static uint32_t map_calc_ext_content_sz(uint8_t flags, uint32_t ele_count, uint32_t content_sz); +static uint8_t map_adjust_incoming_flags(uint8_t flags); + +static inline uint32_t map_ext_content_sz(const packed_map *map); +static inline bool map_is_k_ordered(const packed_map *map); +static inline bool map_is_kv_ordered(const packed_map *map); +static inline bool map_has_offidx(const packed_map *map); +static inline bool map_fill_offidx(const packed_map *map); + +static inline bool skip_map_pair(as_unpacker *pk); + +// map_packer +static as_particle *map_packer_create_particle(map_packer *pk, rollback_alloc *alloc_buf); +static void map_packer_init(map_packer *pk, uint32_t ele_count, uint8_t flags, uint32_t content_sz); +static void map_packer_setup_bin(map_packer *pk, as_bin *b, rollback_alloc *alloc_buf); +static void map_packer_write_hdridx(map_packer *pk); +static bool map_packer_fill_offset_index(map_packer *mpk); +static int map_packer_fill_index_sort_compare(const void *x, const void *y, void *p); +static bool map_packer_fill_ordidx(map_packer *mpk, const uint8_t *contents, uint32_t content_sz); +static bool map_packer_add_op_copy_index(map_packer *mpk, const packed_map_op *add_op, map_ele_find *remove_info, const map_ele_find *add_info, uint32_t kv_sz); +static inline void map_packer_write_seg1(map_packer *pk, const packed_map_op *op); +static inline void map_packer_write_seg2(map_packer *pk, const packed_map_op *op); +static inline void map_packer_write_msgpack_seg(map_packer *pk, const cdt_payload *seg); + +// map +static int map_set_flags(as_bin *b, rollback_alloc *alloc_buf, as_bin *result, uint8_t set_flags); +static int map_increment(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key, const cdt_payload *delta_value, as_bin *result, bool is_decrement); +static int map_add(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key, const cdt_payload *value, as_bin *result, const map_add_control *control); +static int map_add_items(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *items, as_bin *result, const map_add_control *control); + +static int map_remove_by_key_interval(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key_start, const cdt_payload *key_end, cdt_result_data *result); +static int map_remove_by_index_range(as_bin *b, rollback_alloc *alloc_buf, int64_t index, uint64_t count, cdt_result_data *result); +static int map_remove_by_value_interval(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_start, const cdt_payload *value_end, cdt_result_data *result); +static int map_remove_by_rank_range(as_bin *b, rollback_alloc *alloc_buf, int64_t rank, uint64_t count, cdt_result_data *result); + +static int map_remove_all_by_key_list(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key_list, cdt_result_data *result); +static int map_remove_all_by_value_list(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_list, cdt_result_data *result); + +static int map_clear(as_bin *b, rollback_alloc *alloc_buf, as_bin *result); + +// packed_map +static bool packed_map_init(packed_map *map, const uint8_t *buf, uint32_t sz, bool fill_idxs); +static inline bool packed_map_init_from_particle(packed_map *map, const as_particle *p, bool fill_idxs); +static bool packed_map_init_from_bin(packed_map *map, const as_bin *b, bool fill_idxs); +static bool 
packed_map_unpack_hdridx(packed_map *map, bool fill_idxs); + +static void packed_map_init_indexes(const packed_map *map, as_packer *pk); + +static bool packed_map_ensure_ordidx_filled(const packed_map *op); + +static uint32_t packed_map_find_index_by_idx_unordered(const packed_map *map, uint32_t idx); +static uint32_t packed_map_find_index_by_key_unordered(const packed_map *map, const cdt_payload *key); + +static void packed_map_find_rank_indexed_linear(const packed_map *map, map_ele_find *find, uint32_t start, uint32_t len); +static bool packed_map_find_rank_indexed(const packed_map *map, map_ele_find *find); +static bool packed_map_find_rank_by_value_indexed(const packed_map *map, map_ele_find *find, const cdt_payload *value); +static bool packed_map_find_rank_range_by_value_interval_indexed(const packed_map *map, const cdt_payload *value_start, const cdt_payload *value_end, uint32_t *rank, uint32_t *count, bool is_multi); +static bool packed_map_find_rank_range_by_value_interval_unordered(const packed_map *map, const cdt_payload *value_start, const cdt_payload *value_end, uint32_t *rank, uint32_t *count, uint64_t *mask); +static bool packed_map_find_key_indexed(const packed_map *map, map_ele_find *find, const cdt_payload *key); +static bool packed_map_find_key(const packed_map *map, map_ele_find *find, const cdt_payload *key); + +static int packed_map_get_remove_by_key_interval(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key_start, const cdt_payload *key_end, cdt_result_data *result); +static int packed_map_get_remove_by_index_range(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, int64_t index, uint64_t count, cdt_result_data *result); + +static int packed_map_get_remove_by_value_interval(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_start, const cdt_payload *value_end, cdt_result_data *result); +static int packed_map_get_remove_by_rank_range(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, int64_t rank, uint64_t count, cdt_result_data *result); + +static int packed_map_get_remove_all_by_key_list(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key_list, cdt_result_data *result); +static int packed_map_get_remove_all_by_key_list_ordered(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, uint32_t items_count, cdt_result_data *result); +static int packed_map_get_remove_all_by_key_list_unordered(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, uint32_t items_count, cdt_result_data *result); +static int packed_map_get_remove_all_by_value_list(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_list, cdt_result_data *result); +static int packed_map_get_remove_all_by_value_list_ordered(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, uint32_t items_count, cdt_result_data *result); + +static int packed_map_get_remove_all(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, cdt_result_data *result); + +static int packed_map_remove_by_mask(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const uint64_t *rm_mask, uint32_t count, uint32_t *rm_sz_r); +static int packed_map_remove_idx_range(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, uint32_t idx, uint32_t count); + +static bool packed_map_get_range_by_key_interval_unordered(const packed_map *map, const cdt_payload *key_start, const cdt_payload 
*key_end, uint32_t *index, uint32_t *count, uint64_t *mask); +static bool packed_map_get_range_by_key_interval_ordered(const packed_map *map, const cdt_payload *key_start, const cdt_payload *key_end, uint32_t *index, uint32_t *count); +static int packed_map_build_rank_result_by_ele_idx(const packed_map *map, const order_index *ele_idx, uint32_t start, uint32_t count, cdt_result_data *result); +static int packed_map_build_rank_result_by_mask(const packed_map *map, const uint64_t *mask, uint32_t count, cdt_result_data *result); +static int packed_map_build_rank_result_by_index_range(const packed_map *map, uint32_t index, uint32_t count, cdt_result_data *result); + +static bool packed_map_get_key_by_idx(const packed_map *map, cdt_payload *key, uint32_t index); +static bool packed_map_get_value_by_idx(const packed_map *map, cdt_payload *value, uint32_t idx); +static bool packed_map_get_pair_by_idx(const packed_map *map, cdt_payload *value, uint32_t index); + +static int packed_map_build_index_result_by_ele_idx(const packed_map *map, const order_index *ele_idx, uint32_t start, uint32_t count, cdt_result_data *result); +static int packed_map_build_index_result_by_mask(const packed_map *map, const uint64_t *mask, uint32_t count, cdt_result_data *result); +static bool packed_map_build_ele_result_by_idx_range(const packed_map *map, uint32_t start_idx, uint32_t count, cdt_result_data *result); +static bool packed_map_build_ele_result_by_ele_idx(const packed_map *map, const order_index *ele_idx, uint32_t start, uint32_t count, uint32_t rm_sz, cdt_result_data *result); +static bool packed_map_build_ele_result_by_mask(const packed_map *map, const uint64_t *mask, uint32_t count, uint32_t rm_sz, cdt_result_data *result); +static int packed_map_build_result_by_key(const packed_map *map, const cdt_payload *key, uint32_t idx, uint32_t count, cdt_result_data *result); + +static int64_t packed_map_get_rank_by_idx(const packed_map *map, uint32_t idx); +static int packed_map_build_rank_result_by_idx(const packed_map *map, uint32_t idx, cdt_result_data *result); +static int packed_map_build_rank_result_by_idx_range(const packed_map *map, uint32_t idx, uint32_t count, cdt_result_data *result); + +static msgpack_compare_t packed_map_compare_key_by_idx(const void *ptr, uint32_t idx1, uint32_t idx2); +static msgpack_compare_t packed_map_compare_values(as_unpacker *pk1, as_unpacker *pk2); +static msgpack_compare_t packed_map_compare_value_by_idx(const void *ptr, uint32_t idx1, uint32_t idx2); + +static bool packed_map_write_k_ordered(const packed_map *map, uint8_t *write_ptr, offset_index *offsets_new); + +// packed_map_op +static void packed_map_op_init(packed_map_op *op, const packed_map *map); +static int32_t packed_map_op_add(packed_map_op *op, const map_ele_find *found); +static int32_t packed_map_op_remove(packed_map_op *op, const map_ele_find *found, uint32_t count, uint32_t remove_sz); + +static uint8_t *packed_map_op_write_seg1(const packed_map_op *op, uint8_t *buf); +static uint8_t *packed_map_op_write_seg2(const packed_map_op *op, uint8_t *buf); +static bool packed_map_op_write_new_offidx(const packed_map_op *op, const map_ele_find *remove_info, const map_ele_find *add_info, offset_index *new_offidx, uint32_t kv_sz); +static bool packed_map_op_write_new_ordidx(const packed_map_op *op, const map_ele_find *remove_info, const map_ele_find *add_info, order_index *value_idx); + +// map_particle +static as_particle *map_particle_create(rollback_alloc *alloc_buf, uint32_t ele_count, const uint8_t *buf, uint32_t 
content_sz, uint8_t flags); +static int64_t map_particle_strip_indexes(const as_particle *p, uint8_t *dest); + +// map_ele_find +static void map_ele_find_init(map_ele_find *find, const packed_map *map); +static void map_ele_find_continue_from_lower(map_ele_find *find, const map_ele_find *found, uint32_t ele_count); +static void map_ele_find_init_from_idx(map_ele_find *find, const packed_map *map, uint32_t idx); + +// map_offset_index +static bool map_offset_index_fill(offset_index *offidx, uint32_t index); +static int64_t map_offset_index_get(offset_index *offidx, uint32_t index); +static int64_t map_offset_index_get_delta(offset_index *offidx, uint32_t index); + +// offidx_op +static void offidx_op_init(offidx_op *op, offset_index *dest, const offset_index *src); +static void offidx_op_remove(offidx_op *op, uint32_t index); +static void offidx_op_remove_range(offidx_op *op, uint32_t index, uint32_t count); +static void offidx_op_end(offidx_op *op); + +// order_index +static bool order_index_sort(order_index *ordidx, const offset_index *offsets, const uint8_t *contents, uint32_t content_sz, sort_by_t sort_by); +static inline bool order_index_set_sorted(order_index *ordidx, const offset_index *offsets, const uint8_t *ele_start, uint32_t tot_ele_sz, sort_by_t sort_by); +static bool order_index_set_sorted_with_offsets(order_index *ordidx, const offset_index *offsets, sort_by_t sort_by); + +static uint32_t order_index_find_idx(const order_index *ordidx, uint32_t idx, uint32_t start, uint32_t len); + +// order_index_adjust +static uint32_t order_index_adjust_lower(const order_index_adjust *via, uint32_t src); + +// order_index_op +static inline void order_index_op_add(order_index *dest, const order_index *src, uint32_t add_idx, uint32_t add_rank); +static inline void order_index_op_replace1_internal(order_index *dest, const order_index *src, uint32_t add_idx, uint32_t add_rank, uint32_t remove_rank, const order_index_adjust *adjust); +static inline void order_index_op_replace1(order_index *dest, const order_index *src, uint32_t add_rank, uint32_t remove_rank); +static void order_index_op_remove_idx_mask(order_index *dest, const order_index *src, const uint64_t *mask, uint32_t count); + +// result_data +static bool result_data_set_key_not_found(cdt_result_data *rd, int64_t index); +static bool result_data_set_value_not_found(cdt_result_data *rd, int64_t rank); + +// Debugging support +static void map_print(const packed_map *map, const char *name); +static bool map_verify(const as_bin *b); + + +//========================================================== +// MAP particle interface - function definitions. +// + +//------------------------------------------------ +// Destructor, etc. +// + +void +map_destruct(as_particle *p) +{ + cf_free(p); +} + +uint32_t +map_size(const as_particle *p) +{ + const map_mem *p_map_mem = (const map_mem *)p; + return (uint32_t)sizeof(map_mem) + p_map_mem->sz; +} + +//------------------------------------------------ +// Handle "wire" format. 
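+// For flagged maps, map_from_wire() rebuilds the leading {ext: nil} index
+// pair in memory, while map_to_wire() strips the index contents back down to
+// a minimal ext header (see the map_wire_size() sizing below).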
+//
+
+int32_t
+map_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value,
+		uint32_t value_size, as_particle **pp)
+{
+	cf_warning(AS_PARTICLE, "concat size for map");
+	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
+}
+
+int
+map_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value,
+		uint32_t value_size, as_particle **pp)
+{
+	cf_warning(AS_PARTICLE, "append to map");
+	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
+}
+
+int
+map_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value,
+		uint32_t value_size, as_particle **pp)
+{
+	cf_warning(AS_PARTICLE, "prepend to map");
+	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
+}
+
+int
+map_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value,
+		uint32_t value_size, as_particle **pp)
+{
+	cf_warning(AS_PARTICLE, "increment of map");
+	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
+}
+
+int32_t
+map_size_from_wire(const uint8_t *wire_value, uint32_t value_size)
+{
+	// TODO - CDT can't determine whether data is in memory or not.
+	packed_map map;
+
+	if (! packed_map_init(&map, wire_value, value_size, false)) {
+		cf_warning(AS_PARTICLE, "map_size_from_wire() invalid packed map");
+		return -AS_PROTO_RESULT_FAIL_UNKNOWN;
+	}
+
+	if (map.flags == 0) {
+		return (int32_t)(sizeof(map_mem) + value_size);
+	}
+
+	uint32_t extra_sz = map_ext_content_sz(&map);
+
+	// 1 byte for header, 1 byte for type, 1 byte for length for existing ext.
+	extra_sz += as_pack_ext_header_get_size(extra_sz) - 3;
+
+	return (int32_t)(sizeof(map_mem) + value_size + extra_sz);
+}
+
+int
+map_from_wire(as_particle_type wire_type, const uint8_t *wire_value,
+		uint32_t value_size, as_particle **pp)
+{
+	// TODO - CDT can't determine whether data is in memory or not.
+	// It works for data-not-in-memory but we'll incur a memcpy that could be
+	// eliminated.
+	packed_map map;
+
+	if (! packed_map_init(&map, wire_value, value_size, false)) {
+		cf_warning(AS_PARTICLE, "map_from_wire() invalid packed map");
+		return -AS_PROTO_RESULT_FAIL_UNKNOWN;
+	}
+
+	map_mem *p_map_mem = (map_mem *)*pp;
+
+	p_map_mem->type = wire_type;
+
+	if (map.flags == 0) {
+		p_map_mem->sz = value_size;
+		memcpy(p_map_mem->data, wire_value, value_size);
+		return AS_PROTO_RESULT_OK;
+	}
+
+	// TODO - May want to check key order here but for now we'll trust the client/other node.
+	uint32_t ext_content_sz = map_ext_content_sz(&map);
+	// 1 byte for header, 1 byte for type, 1 byte for length for existing ext.
+	uint32_t extra_sz = as_pack_ext_header_get_size(ext_content_sz) - 3;
+
+	as_packer pk = {
+			.buffer = p_map_mem->data,
+			.capacity = value_size + extra_sz
+	};
+
+	as_pack_map_header(&pk, map.ele_count + 1);
+	as_pack_ext_header(&pk, ext_content_sz,
+			map_adjust_incoming_flags(map.flags));
+	packed_map_init_indexes(&map, &pk);
+	as_pack_val(&pk, &as_nil);
+	memcpy(pk.buffer + pk.offset, map.contents, map.content_sz);
+	p_map_mem->sz = value_size + ext_content_sz + extra_sz;
+
+#ifdef MAP_DEBUG_VERIFY
+	{
+		as_bin b;
+		b.particle = *pp;
+		as_bin_state_set_from_type(&b, AS_PARTICLE_TYPE_MAP);
+
+		if (! map_verify(&b)) {
+			offset_index_print(&map.offidx, "verify");
+			cf_warning(AS_PARTICLE, "map_from_wire: pp=%p wire_value=%p", pp, wire_value);
+		}
+	}
+#endif
+
+	return AS_PROTO_RESULT_OK;
+}
+
+int
+map_compare_from_wire(const as_particle *p, as_particle_type wire_type,
+		const uint8_t *wire_value, uint32_t value_size)
+{
+	// TODO
+	cf_warning(AS_PARTICLE, "map_compare_from_wire() not implemented");
+	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
+}
+
+uint32_t
+map_wire_size(const as_particle *p)
+{
+	packed_map map;
+
+	if (! packed_map_init_from_particle(&map, p, false)) {
+		cf_crash(AS_PARTICLE, "map_wire_size() invalid packed map");
+	}
+
+	if (map.flags == 0) {
+		return map.packed_sz;
+	}
+
+	uint32_t sz = map.content_sz;
+	sz += as_pack_list_header_get_size(map.ele_count + 1);
+	sz += 3 + 1; // 3 for min ext hdr and 1 for nil pair
+
+	return sz;
+}
+
+uint32_t
+map_to_wire(const as_particle *p, uint8_t *wire)
+{
+	int64_t sz = map_particle_strip_indexes(p, wire);
+	cf_assert(sz >= 0, AS_PARTICLE, "map_to_wire() strip failed with sz %ld", sz);
+	return (uint32_t)sz;
+}
+
+//------------------------------------------------
+// Handle as_val translation.
+//
+
+uint32_t
+map_size_from_asval(const as_val *val)
+{
+	as_serializer s;
+	as_msgpack_init(&s);
+
+	uint32_t sz = as_serializer_serialize_getsize(&s, (as_val *)val);
+
+	as_serializer_destroy(&s);
+
+	const as_map *map = (const as_map *)val;
+
+	if (map->flags == 0) {
+		return (uint32_t)sizeof(map_mem) + sz;
+	}
+
+	uint32_t ele_count = as_map_size(map);
+	uint32_t map_hdr_sz = as_pack_list_header_get_size(ele_count);
+	uint32_t content_sz = sz - map_hdr_sz;
+	uint32_t ext_content_sz = map_calc_ext_content_sz(map->flags, ele_count,
+			content_sz);
+
+	sz = as_pack_list_header_get_size(ele_count + 1) + content_sz;
+	sz += as_pack_ext_header_get_size(ext_content_sz); // ext header and length field
+	sz += ext_content_sz; // ext content
+	sz++; // nil pair
+
+	return (uint32_t)sizeof(map_mem) + sz;
+}
+
+void
+map_from_asval(const as_val *val, as_particle **pp)
+{
+	map_mem *p_map_mem = (map_mem *)*pp;
+	const as_map *av_map = (const as_map *)val;
+
+	p_map_mem->type = AS_PARTICLE_TYPE_MAP;
+
+	as_serializer s;
+	as_msgpack_init(&s);
+
+	int32_t sz = as_serializer_serialize_presized(&s, val, p_map_mem->data);
+
+	cf_assert(sz >= 0, AS_PARTICLE, "map_from_asval() failed to presize");
+	as_serializer_destroy(&s);
+
+	if (av_map->flags == 0) {
+		p_map_mem->sz = (uint32_t)sz;
+		return;
+	}
+
+	uint8_t *temp_mem = NULL;
+	uint8_t buf[sizeof(packed_map) + (sz < CDT_MAX_STACK_OBJ_SZ ? sz : 0)];
+	packed_map *map = (packed_map *)buf;
+	bool success;
+
+	if (sz < CDT_MAX_STACK_OBJ_SZ) {
+		memcpy(buf + sizeof(packed_map), p_map_mem->data, sz);
+		success = packed_map_init(map, buf + sizeof(packed_map), sz, false);
+	}
+	else {
+		temp_mem = cf_malloc(sz);
+		memcpy(temp_mem, p_map_mem->data, sz);
+		success = packed_map_init(map, temp_mem, sz, false);
+	}
+
+	cf_assert(success, AS_PARTICLE, "map_from_asval() failed to unpack header");
+
+	uint8_t map_flags = map_adjust_incoming_flags(av_map->flags);
+	define_map_packer(mpk, map->ele_count, map_flags, map->content_sz);
+
+	mpk.write_ptr = p_map_mem->data;
+	map_packer_write_hdridx(&mpk);
+
+	if (!
packed_map_write_k_ordered(map, mpk.write_ptr, &mpk.offset_idx)) { + cf_crash(AS_PARTICLE, "map_from_asval() sort on key failed"); + } + + p_map_mem->sz = + (uint32_t)(mpk.contents - p_map_mem->data + map->content_sz); + + if (order_index_is_valid(&mpk.value_idx)) { + order_index_set(&mpk.value_idx, 0, map->ele_count); + } + + cf_free(temp_mem); + +#ifdef MAP_DEBUG_VERIFY + { + as_bin b; + b.particle = (as_particle *)p_map_mem; + as_bin_state_set_from_type(&b, AS_PARTICLE_TYPE_MAP); + if (! map_verify(&b)) { + cdt_bin_print(&b, "map_from_asval"); + } + } +#endif +} + +as_val * +map_to_asval(const as_particle *p) +{ + map_mem *p_map_mem = (map_mem *)p; + + as_buffer buf = { + .capacity = p_map_mem->sz, + .size = p_map_mem->sz, + .data = p_map_mem->data + }; + + as_serializer s; + as_msgpack_init(&s); + + as_val *val = NULL; + + as_serializer_deserialize(&s, &buf, &val); + as_serializer_destroy(&s); + + if (! val) { + return (as_val *)as_hashmap_new(0); + } + + packed_map map; + + packed_map_init_from_particle(&map, p, false); + ((as_map *)val)->flags = (uint32_t)map.flags; + + return val; +} + +uint32_t +map_asval_wire_size(const as_val *val) +{ + as_serializer s; + as_msgpack_init(&s); + + uint32_t sz = as_serializer_serialize_getsize(&s, (as_val *)val); + + as_serializer_destroy(&s); + + return sz; +} + +uint32_t +map_asval_to_wire(const as_val *val, uint8_t *wire) +{ + as_serializer s; + as_msgpack_init(&s); + + int32_t sz = as_serializer_serialize_presized(&s, val, wire); + + as_serializer_destroy(&s); + cf_assert(sz > 0, AS_PARTICLE, "map_asval_to_wire() sz %d failed to serialize", sz); + + return (uint32_t)sz; +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +uint32_t +map_size_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + return (uint32_t)sizeof(map_mem) + packed_size; +} + +void +map_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + map_mem *p_map_mem = (map_mem *)*pp; + + p_map_mem->type = AS_PARTICLE_TYPE_MAP; + p_map_mem->sz = packed_size; + memcpy(p_map_mem->data, packed, p_map_mem->sz); +} + +//------------------------------------------------ +// Handle on-device "flat" format. +// + +int32_t +map_size_from_flat(const uint8_t *flat, uint32_t flat_size) +{ + // TODO - maybe never used + return -1; +} + +int +map_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + // Cast temp buffer from disk to data-not-in-memory. + map_flat *p_map_flat = (map_flat *)flat; + + // This assumes map_flat is the same as map_mem. + *pp = (as_particle *)p_map_flat; + + return 0; +} + +int +map_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + const map_flat *p_map_flat = (const map_flat *)flat; + packed_map map; + + // This path implies disk-backed data-in-memory so fill_idxs -> true. + if (! packed_map_init(&map, p_map_flat->data, p_map_flat->sz, true)) { + cf_warning(AS_PARTICLE, "map_from_flat() invalid packed map"); + return -1; + } + + if (map.flags == 0) { + // Convert temp buffer from disk to data-in-memory. + map_mem *p_map_mem = cf_malloc_ns(sizeof(map_mem) + p_map_flat->sz); + + p_map_mem->type = p_map_flat->type; + p_map_mem->sz = p_map_flat->sz; + memcpy(p_map_mem->data, p_map_flat->data, p_map_mem->sz); + + *pp = (as_particle *)p_map_mem; + + return 0; + } + + uint8_t flags = map_adjust_incoming_flags(map.flags); + define_map_packer(mpk, map.ele_count, flags, map.content_sz); + as_particle *p = map_packer_create_particle(&mpk, NULL); + + if (! 
p) { + return -1; + } + + map_packer_write_hdridx(&mpk); + memcpy(mpk.write_ptr, map.contents, map.content_sz); + + if (! map_packer_fill_offset_index(&mpk)) { + cf_free(p); + return -1; + } + + if (order_index_is_valid(&mpk.value_idx)) { + if (! order_index_set_sorted(&mpk.value_idx, &map.offidx, + map.contents, map.content_sz, SORT_BY_VALUE)) { + cf_free(p); + return -1; + } + } + + *pp = p; + + return 0; +} + +uint32_t +map_flat_size(const as_particle *p) +{ + const map_mem *p_map_mem = (const map_mem *)p; + + packed_map map; + + if (! packed_map_init_from_particle(&map, p, false)) { + const as_bin b = { + .particle = (as_particle *)p + }; + + cdt_bin_print(&b, "map"); + cf_crash(AS_PARTICLE, "map_flat_size() invalid packed map"); + } + + if (map.flags == 0) { + return sizeof(map_flat) + p_map_mem->sz; + } + + uint32_t sz = map.content_sz; + sz += as_pack_list_header_get_size(map.ele_count + 1); + sz += 3 + 1; // 3 for min ext hdr and 1 for nil pair + + return (uint32_t)sizeof(map_flat) + sz; +} + +uint32_t +map_to_flat(const as_particle *p, uint8_t *flat) +{ + map_flat *p_map_flat = (map_flat *)flat; + int64_t sz = map_particle_strip_indexes(p, p_map_flat->data); + + cf_assert(sz >= 0, AS_PARTICLE, "map_to_flat() strip indexes failed with sz %ld", sz); + p_map_flat->sz = (uint32_t)sz; + + // Already wrote the type. + + return sizeof(map_flat) + p_map_flat->sz; +} + + +//========================================================== +// Global API. +// + +void +as_bin_set_empty_packed_map(as_bin *b, rollback_alloc *alloc_buf, uint8_t flags) +{ + b->particle = map_particle_create(alloc_buf, 0, NULL, 0, flags); + as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_MAP); +} + + +//========================================================== +// Local helpers. 
+// + +static inline bool +is_map_type(uint8_t type) +{ + return type == AS_PARTICLE_TYPE_MAP; +} + +static inline bool +is_k_ordered(uint8_t flags) +{ + return flags & AS_PACKED_MAP_FLAG_K_ORDERED; +} + +static inline bool +is_kv_ordered(uint8_t flags) +{ + return (flags & AS_PACKED_MAP_FLAG_KV_ORDERED) == + AS_PACKED_MAP_FLAG_KV_ORDERED; +} + +static uint32_t +map_calc_ext_content_sz(uint8_t flags, uint32_t ele_count, uint32_t content_sz) +{ + uint32_t sz = 0; + + if (is_k_ordered(flags)) { + offset_index offidx; + + offset_index_init(&offidx, NULL, ele_count, NULL, content_sz); + sz += offset_index_size(&offidx); + } + + if (is_kv_ordered(flags)) { + order_index ordidx; + + order_index_init(&ordidx, NULL, ele_count); + sz += order_index_size(&ordidx); + } + + return sz; +} + +static uint8_t +map_adjust_incoming_flags(uint8_t flags) +{ + static const uint8_t mask = AS_PACKED_MAP_FLAG_KV_ORDERED | + AS_PACKED_MAP_FLAG_OFF_IDX | AS_PACKED_MAP_FLAG_ORD_IDX; + + if (is_k_ordered(flags)) { + flags |= AS_PACKED_MAP_FLAG_OFF_IDX; + } + + if (is_kv_ordered(flags)) { + flags |= AS_PACKED_MAP_FLAG_ORD_IDX; + } + + return flags & mask; +} + +static inline uint32_t +map_ext_content_sz(const packed_map *map) +{ + return map_calc_ext_content_sz(map->flags, map->ele_count, map->content_sz); +} + +static inline bool +map_is_k_ordered(const packed_map *map) +{ + return is_k_ordered(map->flags); +} + +static inline bool +map_is_kv_ordered(const packed_map *map) +{ + return is_kv_ordered(map->flags); +} + +static inline bool +map_has_offidx(const packed_map *map) +{ + return offset_index_is_valid(&map->offidx); +} + +static inline bool +map_fill_offidx(const packed_map *map) +{ + offset_index *offidx = (offset_index *)&map->offidx; + return map_offset_index_fill(offidx, map->ele_count); +} + +static inline bool +skip_map_pair(as_unpacker *pk) +{ + if (as_unpack_size(pk) <= 0) { + return false; + } + + if (as_unpack_size(pk) <= 0) { + return false; + } + + return true; +} + +//------------------------------------------------ +// map_packer + +static as_particle * +map_packer_create_particle(map_packer *pk, rollback_alloc *alloc_buf) +{ + uint32_t sz = pk->ext_sz + pk->content_sz + + as_pack_map_header_get_size(pk->ele_count + (pk->flags ? 1 : 0)); + map_mem *p_map_mem = (map_mem *)(alloc_buf + ? 
rollback_alloc_reserve(alloc_buf, sizeof(map_mem) + sz) + : cf_malloc(sizeof(map_mem) + sz)); // response, so not cf_malloc_ns() + + p_map_mem->type = AS_PARTICLE_TYPE_MAP; + p_map_mem->sz = sz; + pk->write_ptr = p_map_mem->data; + + return (as_particle *)p_map_mem; +} + +static void +map_packer_init(map_packer *pk, uint32_t ele_count, uint8_t flags, + uint32_t content_sz) +{ + pk->ele_count = ele_count; + pk->content_sz = content_sz; + pk->ext_content_sz = 0; + + offset_index_init(&pk->offset_idx, NULL, ele_count, NULL, content_sz); + + if (flags & AS_PACKED_MAP_FLAG_OFF_IDX) { + pk->ext_content_sz += offset_index_size(&pk->offset_idx); + } + + order_index_init(&pk->value_idx, NULL, ele_count); + + if (flags & AS_PACKED_MAP_FLAG_ORD_IDX) { + pk->ext_content_sz += order_index_size(&pk->value_idx); + } + + pk->flags = flags; + + if (flags == AS_PACKED_MAP_FLAG_NONE) { + pk->ext_header_sz = 0; + pk->ext_sz = 0; + } + else { + pk->ext_header_sz = as_pack_ext_header_get_size(pk->ext_content_sz); + pk->ext_sz = pk->ext_header_sz + pk->ext_content_sz + 1; // +1 for packed nil + } + + pk->write_ptr = NULL; + pk->contents = NULL; +} + +static void +map_packer_setup_bin(map_packer *pk, as_bin *b, rollback_alloc *alloc_buf) +{ + b->particle = map_packer_create_particle(pk, alloc_buf); +} + +static void +map_packer_write_hdridx(map_packer *pk) +{ + as_packer write = { + .buffer = pk->write_ptr, + .capacity = INT_MAX + }; + + as_pack_map_header(&write, pk->ele_count + + (pk->flags == AS_PACKED_MAP_FLAG_NONE ? 0 : 1)); + + if (pk->flags == AS_PACKED_MAP_FLAG_NONE) { + pk->write_ptr += write.offset; + pk->contents = pk->write_ptr; + + return; + } + + as_pack_ext_header(&write, pk->ext_content_sz, pk->flags); + + if (pk->ext_content_sz > 0) { + uint8_t *ptr = pk->write_ptr + write.offset; + uint32_t index_sz_left = pk->ext_content_sz; + uint32_t sz = offset_index_size(&pk->offset_idx); + + if ((pk->flags & AS_PACKED_MAP_FLAG_OFF_IDX) && index_sz_left >= sz) { + offset_index_set_ptr(&pk->offset_idx, ptr, + ptr + pk->ext_content_sz + 1); // +1 for nil pair + ptr += sz; + index_sz_left -= sz; + } + + sz = order_index_size(&pk->value_idx); + + if ((pk->flags & AS_PACKED_MAP_FLAG_ORD_IDX) && index_sz_left >= sz) { + order_index_set_ptr(&pk->value_idx, ptr); + } + } + + // Pack nil. + write.offset += pk->ext_content_sz; + write.buffer[write.offset++] = msgpack_nil[0]; + + pk->write_ptr += write.offset; + pk->contents = pk->write_ptr; + pk->offset_idx.contents = pk->contents; +} + +static bool +map_packer_fill_offset_index(map_packer *mpk) +{ + if (offset_index_is_null(&mpk->offset_idx)) { + return true; + } + + offset_index_set_filled(&mpk->offset_idx, 1); + + return map_offset_index_fill(&mpk->offset_idx, mpk->ele_count); +} + +// qsort_r callback function. 
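+// Sorts element indexes by key (SORT_BY_KEY, with ties broken by value) or by
+// value (SORT_BY_VALUE, keys skipped). qsort_r() cannot abort, so msgpack
+// errors are flagged via udata->error for the caller to check afterward.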
+static int +map_packer_fill_index_sort_compare(const void *x, const void *y, void *p) +{ + index_sort_userdata *udata = (index_sort_userdata *)p; + + if (udata->error) { + return 0; + } + + order_index *ordidx = udata->order; + uint32_t x_idx = order_index_ptr2value(ordidx, x); + uint32_t y_idx = order_index_ptr2value(ordidx, y); + const offset_index *offidx = udata->offsets; + const uint8_t *contents = udata->contents; + uint32_t content_sz = udata->content_sz; + uint32_t x_off = offset_index_get_const(offidx, x_idx); + uint32_t y_off = offset_index_get_const(offidx, y_idx); + + as_unpacker x_pk = { + .buffer = contents, + .offset = x_off, + .length = content_sz + }; + + as_unpacker y_pk = { + .buffer = contents, + .offset = y_off, + .length = content_sz + }; + + if (udata->sort_by == SORT_BY_VALUE) { + // Skip keys. + if (as_unpack_size(&x_pk) <= 0) { + udata->error = true; + return 0; + } + + if (as_unpack_size(&y_pk) <= 0) { + udata->error = true; + return 0; + } + } + + msgpack_compare_t cmp = as_unpack_compare(&x_pk, &y_pk); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + if (udata->sort_by == SORT_BY_KEY) { + if ((cmp = as_unpack_compare(&x_pk, &y_pk)) == + MSGPACK_COMPARE_EQUAL) { + return 0; + } + } + else { + return 0; + } + } + + if (cmp == MSGPACK_COMPARE_LESS) { + return -1; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + return 1; + } + + udata->error = true; + + return 0; +} + +static bool +map_packer_fill_ordidx(map_packer *mpk, const uint8_t *contents, + uint32_t content_sz) +{ + if (order_index_is_null(&mpk->value_idx)) { + return true; + } + + return order_index_set_sorted(&mpk->value_idx, &mpk->offset_idx, contents, + content_sz, SORT_BY_VALUE); +} + +static bool +map_packer_add_op_copy_index(map_packer *mpk, const packed_map_op *add_op, + map_ele_find *remove_info, const map_ele_find *add_info, uint32_t kv_sz) +{ + // No elements left. + if (add_op->new_ele_count == 0) { + return true; + } + + if (offset_index_is_valid(&mpk->offset_idx)) { + if (! packed_map_op_write_new_offidx(add_op, remove_info, add_info, + &mpk->offset_idx, kv_sz) && + ! map_packer_fill_offset_index(mpk)) { + return false; + } + } + + if (order_index_is_valid(&mpk->value_idx)) { + if (remove_info->found_key && + order_index_is_filled(&add_op->map->value_idx)) { + if (! packed_map_find_rank_indexed(add_op->map, remove_info)) { + cf_warning(AS_PARTICLE, "map_packer_add_op_copy_index() remove_info find rank failed"); + return false; + } + + if (! remove_info->found_value) { + cf_warning(AS_PARTICLE, "map_packer_add_op_copy_index() remove_info rank not found: idx=%u found=%d ele_count=%u", remove_info->idx, remove_info->found_key, add_op->map->ele_count); + return false; + } + } + + if (! packed_map_op_write_new_ordidx( + add_op, remove_info, add_info, &mpk->value_idx) && + ! 
map_packer_fill_ordidx(mpk, mpk->contents, mpk->content_sz)) { + return false; + } + } + + return true; +} + +static inline void +map_packer_write_seg1(map_packer *pk, const packed_map_op *op) +{ + pk->write_ptr = packed_map_op_write_seg1(op, pk->write_ptr); +} + +static inline void +map_packer_write_seg2(map_packer *pk, const packed_map_op *op) +{ + pk->write_ptr = packed_map_op_write_seg2(op, pk->write_ptr); +} + +static inline void +map_packer_write_msgpack_seg(map_packer *pk, const cdt_payload *seg) +{ + memcpy(pk->write_ptr, seg->ptr, seg->sz); + pk->write_ptr += seg->sz; +} + +//------------------------------------------------ +// map + +static int +map_set_flags(as_bin *b, rollback_alloc *alloc_buf, as_bin *result, + uint8_t set_flags) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, false)) { + cf_warning(AS_PARTICLE, "packed_map_set_flags() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint8_t map_flags = map.flags; + uint32_t ele_count = map.ele_count; + bool reorder = false; + + if ((set_flags & AS_PACKED_MAP_FLAG_KV_ORDERED) == + AS_PACKED_MAP_FLAG_V_ORDERED) { + cf_warning(AS_PARTICLE, "packed_map_set_flags() invalid flags 0x%x", set_flags); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (is_kv_ordered(set_flags)) { + if (! is_kv_ordered(map_flags)) { + if (ele_count > 1 && ! is_k_ordered(map_flags)) { + reorder = true; + } + + map_flags |= AS_PACKED_MAP_FLAG_KV_ORDERED; + map_flags |= AS_PACKED_MAP_FLAG_OFF_IDX; + map_flags |= AS_PACKED_MAP_FLAG_ORD_IDX; + } + } + else if (is_k_ordered(set_flags)) { + if (is_kv_ordered(map_flags)) { + map_flags &= ~AS_PACKED_MAP_FLAG_V_ORDERED; + map_flags &= ~AS_PACKED_MAP_FLAG_ORD_IDX; + } + else if (! is_k_ordered(map_flags)) { + if (ele_count > 1) { + reorder = true; + } + + map_flags |= AS_PACKED_MAP_FLAG_K_ORDERED; + map_flags |= AS_PACKED_MAP_FLAG_OFF_IDX; + } + } + else if ((set_flags & AS_PACKED_MAP_FLAG_KV_ORDERED) == 0) { + map_flags &= ~AS_PACKED_MAP_FLAG_KV_ORDERED; + map_flags &= ~AS_PACKED_MAP_FLAG_OFF_IDX; + map_flags &= ~AS_PACKED_MAP_FLAG_ORD_IDX; + } + + define_map_packer(mpk, ele_count, map_flags, map.content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + + if (reorder) { + vla_map_offidx_if_invalid(u, &map); + + if (! packed_map_write_k_ordered(&map, mpk.write_ptr, + &mpk.offset_idx)) { + cf_warning(AS_PARTICLE, "packed_map_set_flags() sort on key failed, set_flags = 0x%x", set_flags); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + memcpy(mpk.write_ptr, map.contents, map.content_sz); + + if (offset_index_is_valid(&mpk.offset_idx)) { + if (offset_index_is_full(&map.offidx)) { + offset_index_copy(&mpk.offset_idx, &map.offidx, 0, 0, + ele_count, 0); + } + else if (! map_packer_fill_offset_index(&mpk)) { + cf_warning(AS_PARTICLE, "packed_map_set_flags() fill index failed"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + } + + if (order_index_is_valid(&mpk.value_idx)) { + if (order_index_is_filled(&map.value_idx)) { + order_index_copy(&mpk.value_idx, &map.value_idx, 0, 0, ele_count, + NULL); + } + else { + map_packer_fill_ordidx(&mpk, mpk.contents, mpk.content_sz); + } + } + +#ifdef MAP_DEBUG_VERIFY + if (! map_verify(b)) { + cdt_bin_print(b, "packed_map_set_flags"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +map_increment(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key, + const cdt_payload *delta_value, as_bin *result, bool is_decrement) +{ + packed_map map; + + if (! 
packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "packed_map_increment() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + map_ele_find find_key; + map_ele_find_init(&find_key, &map); + + if (! packed_map_find_key(&map, &find_key, key)) { + cf_warning(AS_PARTICLE, "packed_map_increment() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_calc_delta calc_delta; + + if (! cdt_calc_delta_init(&calc_delta, delta_value, is_decrement)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (find_key.found_key) { + define_map_unpacker(pk_map_value, &map); + + pk_map_value.offset = find_key.value_offset; + + if (! cdt_calc_delta_add(&calc_delta, &pk_map_value)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + if (! cdt_calc_delta_add(&calc_delta, NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + + uint8_t value_buf[CDT_MAX_PACKED_INT_SZ]; + + cdt_payload value = { + .ptr = value_buf, + .sz = 0 + }; + + cdt_calc_delta_pack_and_result(&calc_delta, &value, result); + + map_add_control control = { + .allow_overwrite = true, + .allow_create = true, + }; + + return map_add(b, alloc_buf, key, &value, NULL, &control); +} + +static int +map_add(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key, + const cdt_payload *value, as_bin *result, + const map_add_control *control) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "map_add() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + map_ele_find find_key_to_remove; + map_ele_find_init(&find_key_to_remove, &map); + + if (! packed_map_find_key(&map, &find_key_to_remove, key)) { + cf_warning(AS_PARTICLE, "map_add() find key failed, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (find_key_to_remove.found_key) { + // ADD for [unique] & [key exist]. + if (! control->allow_overwrite) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + } + else { + // REPLACE for ![key exist]. + if (! control->allow_create) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND; + } + + // Normal cases handled by packed_map_op_add(): + // ADD for (![unique] & [key exist]) or ![key exist] + // PUT for all cases + // REPLACE for ([unique] & [key exist]) + // UPDATE for ([unique] & [key exist]) or ![key exist] + } + + define_map_op(op, &map); + int32_t new_sz = packed_map_op_add(&op, &find_key_to_remove); + + if (new_sz < 0) { + cf_warning(AS_PARTICLE, "map_add() failed with ret=%d, ele_count=%u", new_sz, map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t content_sz = (uint32_t)new_sz + key->sz + value->sz; + define_map_packer(mpk, op.new_ele_count, map.flags, content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + + map_ele_find find_value_to_add; + + map_ele_find_init(&find_value_to_add, &map); + find_value_to_add.idx = find_key_to_remove.idx; // Find closest matching position for multiple same values. + + if (order_index_is_valid(&mpk.value_idx) && + order_index_is_filled(&map.value_idx)) { + if (! packed_map_find_rank_by_value_indexed(&map, + &find_value_to_add, value)) { + cf_warning(AS_PARTICLE, "map_add() find_value_to_add rank failed"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + + map_packer_write_seg1(&mpk, &op); + map_packer_write_msgpack_seg(&mpk, key); + map_packer_write_msgpack_seg(&mpk, value); + map_packer_write_seg2(&mpk, &op); + + if (! 
map_packer_add_op_copy_index(&mpk, &op, &find_key_to_remove, + &find_value_to_add, key->sz + value->sz)) { + cf_warning(AS_PARTICLE, "map_add() copy index failed"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (result) { + as_bin_set_int(result, op.new_ele_count); + } + +#ifdef MAP_DEBUG_VERIFY + if (! map_verify(b)) { + cdt_bin_print(b, "map_add"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +map_add_items_unordered(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const offset_index *val_off, + order_index *val_ord, as_bin *result, const map_add_control *control) +{ + define_cdt_idx_mask(rm_mask, map->ele_count); + uint32_t rm_count = 0; + uint32_t rm_sz = 0; + + for (uint32_t i = 0; i < map->ele_count; i++) { + uint32_t offset = offset_index_get_const(&map->offidx, i); + + cdt_payload value = { + .ptr = map->contents + offset, + .sz = map->content_sz - offset + }; + + order_index_find find = { + .count = val_ord->max_idx, + .target = 0 // find first occurrence of value + }; + + order_index_find_rank_by_value(val_ord, &value, val_off, &find); + + if (find.found) { + // ADD for [unique] & [key exist]. + if (! control->allow_overwrite) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + + cdt_idx_mask_set(rm_mask, i); + rm_count++; + rm_sz += offset_index_get_delta_const(&map->offidx, i); + } + else { + // REPLACE for ![key exist]. + if (! control->allow_create) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND; + } + } + } + + uint32_t dup_count; + uint32_t dup_sz; + + order_index_sorted_mark_dup_eles(val_ord, val_off, &dup_count, &dup_sz); + + uint32_t new_ele_count = map->ele_count - rm_count + + val_ord->max_idx - dup_count; + uint32_t new_content_sz = map->content_sz - rm_sz + + val_off->content_sz - dup_sz; + define_map_packer(mpk, new_ele_count, map->flags, new_content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + mpk.write_ptr = cdt_idx_mask_write_eles(rm_mask, rm_count, &map->offidx, + mpk.write_ptr, true); + mpk.write_ptr = order_index_write_eles(val_ord, val_ord->max_idx, val_off, + mpk.write_ptr, false); + as_bin_set_int(result, new_ele_count); + +#ifdef MAP_DEBUG_VERIFY + if (! 
map_verify(b)) { + cdt_bin_print(b, "map_add_items_unordered"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +map_add_items_ordered(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const offset_index *val_off, + order_index *val_ord, as_bin *result, const map_add_control *control) +{ + uint32_t dup_count; + uint32_t dup_sz; + + order_index_sorted_mark_dup_eles(val_ord, val_off, &dup_count, &dup_sz); + + if (map->ele_count == 0) { + uint32_t new_content_sz = order_index_get_ele_size(val_ord, + val_ord->max_idx, val_off); + uint32_t new_ele_count = val_ord->max_idx - dup_count; + define_map_packer(mpk, new_ele_count, map->flags, new_content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + order_index_write_eles(val_ord, val_ord->max_idx, val_off, + mpk.write_ptr, false); + + if (offset_index_is_valid(&mpk.offset_idx)) { + offset_index_set_filled(&mpk.offset_idx, 1); + + for (uint32_t i = 0; i < val_ord->max_idx; i++) { + uint32_t val_idx = order_index_get(val_ord, i); + + if (val_idx == val_ord->max_idx) { + continue; + } + + uint32_t sz = offset_index_get_delta_const(val_off, val_idx); + + offset_index_append_size(&mpk.offset_idx, sz); + } + } + + if (order_index_is_valid(&mpk.value_idx)) { + order_index_set(&mpk.value_idx, 0, new_ele_count); + } + + as_bin_set_int(result, new_ele_count); + +#ifdef MAP_DEBUG_VERIFY + if (! map_verify(b)) { + cdt_bin_print(b, "map_add_items_ordered"); + map_print(map, "original"); + offset_index_print(val_off, "val_off"); + order_index_print(val_ord, "val_ord"); + cf_crash(AS_PARTICLE, "ele_count 0 dup_count %u dup_sz %u new_ele_count %u new_content_sz %u", dup_count, dup_sz, new_ele_count, new_content_sz); + } +#endif + + return AS_PROTO_RESULT_OK; + } + + define_cdt_idx_mask(rm_mask, map->ele_count); + uint32_t rm_count = 0; + uint32_t rm_sz = 0; + define_order_index2(insert_idx, map->ele_count, val_ord->max_idx); + + for (uint32_t i = 0; i < val_ord->max_idx; i++) { + uint32_t val_idx = order_index_get(val_ord, i); + + if (val_idx == val_ord->max_idx) { + continue; + } + + uint32_t off = offset_index_get_const(val_off, val_idx); + uint32_t sz = offset_index_get_delta_const(val_off, val_idx); + + const cdt_payload value = { + .ptr = val_off->contents + off, + .sz = sz + }; + + map_ele_find find; + map_ele_find_init(&find, map); + + if (! packed_map_find_key_indexed(map, &find, &value)) { + cf_warning(AS_PARTICLE, "map_add_items_ordered() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (find.found_key) { + // ADD for [unique] & [key exist]. + if (! control->allow_overwrite) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + + if (! cdt_idx_mask_is_set(rm_mask, find.idx)) { + cdt_idx_mask_set(rm_mask, find.idx); + rm_count++; + rm_sz += offset_index_get_delta_const(&map->offidx, find.idx); + } + } + else { + // REPLACE for ![key exist]. + if (! 
control->allow_create) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND; + } + } + + cf_assert(find.idx <= map->ele_count, AS_PARTICLE, "Invalid find.idx %u > ele_count %u", find.idx, map->ele_count); + order_index_set(&insert_idx, i, find.idx); + } + + uint32_t new_ele_count = map->ele_count - rm_count + val_ord->max_idx - + dup_count; + uint32_t new_content_sz = map->content_sz - rm_sz + val_off->content_sz - + dup_sz; + define_map_packer(mpk, new_ele_count, map->flags, new_content_sz); + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + uint32_t start_off = 0; + + for (uint32_t i = 0; i < val_ord->max_idx; i++) { + uint32_t val_idx = order_index_get(val_ord, i); + + if (val_idx == val_ord->max_idx) { + continue; + } + + uint32_t index = order_index_get(&insert_idx, i); + uint32_t off = offset_index_get_const(&map->offidx, index); + + if (start_off < off) { + uint32_t sz = off - start_off; + + memcpy(mpk.write_ptr, map->contents + start_off, sz); + mpk.write_ptr += sz; + + if (index == map->ele_count) { + start_off = map->content_sz; + } + else if (cdt_idx_mask_is_set(rm_mask, index)) { + start_off = offset_index_get_const(&map->offidx, index + 1); + } + else { + start_off = off; + } + } + else if (index == map->ele_count) { + start_off = map->content_sz; + } + else if (start_off == off && cdt_idx_mask_is_set(rm_mask, index)) { + start_off = offset_index_get_const(&map->offidx, index + 1); + } + + uint32_t val_offset = offset_index_get_const(val_off, val_idx); + uint32_t val_sz = offset_index_get_delta_const(val_off, val_idx); + + memcpy(mpk.write_ptr, val_off->contents + val_offset, val_sz); + mpk.write_ptr += val_sz; + } + + uint32_t sz = map->content_sz - start_off; + + if (sz != 0) { + memcpy(mpk.write_ptr, map->contents + start_off, sz); + } + + if (offset_index_is_valid(&mpk.offset_idx)) { + uint32_t read_index = 0; + uint32_t write_index = 1; + int delta = 0; + + offset_index_set_filled(&mpk.offset_idx, 1); + + for (uint32_t i = 0; i < val_ord->max_idx; i++) { + uint32_t val_idx = order_index_get(val_ord, i); + + if (val_idx == val_ord->max_idx) { + continue; + } + + uint32_t index = order_index_get(&insert_idx, i); + + if (index > read_index) { + uint32_t count = index - read_index; + + if (read_index + count == map->ele_count) { + count--; + } + + offset_index_copy(&mpk.offset_idx, &map->offidx, write_index, + read_index + 1, count, delta); + write_index += count; + read_index += count; + offset_index_set_filled(&mpk.offset_idx, write_index); + + if (index != map->ele_count && + cdt_idx_mask_is_set(rm_mask, index)) { + read_index++; + delta -= offset_index_get_delta_const(&map->offidx, index); + } + } + else if (index != map->ele_count && index == read_index && + cdt_idx_mask_is_set(rm_mask, index)) { + read_index++; + delta -= offset_index_get_delta_const(&map->offidx, index); + } + + uint32_t sz = offset_index_get_delta_const(val_off, val_idx); + + offset_index_append_size(&mpk.offset_idx, sz); + write_index++; + delta += sz; + } + + if (read_index + 1 < map->ele_count && write_index < new_ele_count) { + offset_index_copy(&mpk.offset_idx, &map->offidx, write_index, + read_index + 1, map->ele_count - read_index - 1, delta); + } + + offset_index_set_filled(&mpk.offset_idx, map->ele_count); + } + + if (order_index_is_valid(&mpk.value_idx)) { + order_index_set(&mpk.value_idx, 0, new_ele_count); + } + + as_bin_set_int(result, new_ele_count); + +#ifdef MAP_DEBUG_VERIFY + if (! 
map_verify(b)) { + cdt_bin_print(b, "map_add_items_ordered"); + map_print(map, "original"); + offset_index_print(val_off, "val_off"); + order_index_print(val_ord, "val_ord"); + cf_crash(AS_PARTICLE, "ele_count %u dup_count %u dup_sz %u new_ele_count %u new_content_sz %u", map->ele_count, dup_count, dup_sz, new_ele_count, new_content_sz); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +map_add_items(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *items, + as_bin *result, const map_add_control *control) +{ + as_unpacker pk = { + .buffer = items->ptr, + .length = items->sz + }; + + int64_t items_count = as_unpack_map_header_element_count(&pk); + + if (items_count < 0) { + cf_warning(AS_PARTICLE, "map_add_items() invalid parameter, expected packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (items_count > 0 && as_unpack_peek_is_ext(&pk)) { + if (! skip_map_pair(&pk)) { + cf_warning(AS_PARTICLE, "map_add_items() invalid parameter"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + items_count--; + } + + if (items_count == 0) { + return AS_PROTO_RESULT_OK; // no-op + } + + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "map_add_items() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + vla_map_offidx_if_invalid(u, &map); + + // Pre-fill index. + if (! map_offset_index_fill(u.offidx, map.ele_count)) { + cf_warning(AS_PARTICLE, "map_add_items() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + const uint8_t *val_contents = pk.buffer + pk.offset; + uint32_t val_content_sz = pk.length - pk.offset; + uint32_t val_count = (uint32_t)items_count; + define_order_index(val_ord, val_count); + define_offset_index(val_off, val_contents, val_content_sz, val_count); + + // Sort items to add. + if (! map_offset_index_fill(&val_off, val_count) || + ! order_index_set_sorted(&val_ord, &val_off, val_contents, + val_content_sz, SORT_BY_KEY)) { + cf_warning(AS_PARTICLE, "map_add_items() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (map_is_k_ordered(&map)) { + return map_add_items_ordered(&map, b, alloc_buf, &val_off, &val_ord, + result, control); + } + + return map_add_items_unordered(&map, b, alloc_buf, &val_off, &val_ord, + result, control); +} + +static int +map_remove_by_key_interval(as_bin *b, rollback_alloc *alloc_buf, + const cdt_payload *key_start, const cdt_payload *key_end, + cdt_result_data *result) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "packed_map_remove_by_key_interval() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_map_get_remove_by_key_interval(&map, b, alloc_buf, key_start, + key_end, result); +} + +static int +map_remove_by_index_range(as_bin *b, rollback_alloc *alloc_buf, + int64_t index, uint64_t count, cdt_result_data *result) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "packed_map_remove_by_index_range() invalid packed map index, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_map_get_remove_by_index_range(&map, b, alloc_buf, index, + count, result); +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: [value_start, value_start]. 
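+// E.g. for stored values {1, 5, 9, 10}, an interval from 5 to 10 selects
+// 5 and 9 (the end value itself is excluded by the compare logic), and
+// value_end == NULL selects 5, 9 and 10.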
+static int
+map_remove_by_value_interval(as_bin *b, rollback_alloc *alloc_buf,
+		const cdt_payload *value_start, const cdt_payload *value_end,
+		cdt_result_data *result)
+{
+	packed_map map;
+
+	if (! packed_map_init_from_bin(&map, b, true)) {
+		cf_warning(AS_PARTICLE, "packed_map_remove_by_value_interval() invalid packed map, ele_count=%u", map.ele_count);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	return packed_map_get_remove_by_value_interval(&map, b, alloc_buf,
+			value_start, value_end, result);
+}
+
+static int
+map_remove_by_rank_range(as_bin *b, rollback_alloc *alloc_buf,
+		int64_t rank, uint64_t count, cdt_result_data *result)
+{
+	packed_map map;
+
+	if (! packed_map_init_from_bin(&map, b, true)) {
+		cf_warning(AS_PARTICLE, "map_remove_by_rank_range() invalid packed map, ele_count=%u", map.ele_count);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	return packed_map_get_remove_by_rank_range(&map, b, alloc_buf, rank, count,
+			result);
+}
+
+static int
+map_remove_all_by_key_list(as_bin *b, rollback_alloc *alloc_buf,
+		const cdt_payload *key_list, cdt_result_data *result)
+{
+	packed_map map;
+
+	if (! packed_map_init_from_bin(&map, b, true)) {
+		cf_warning(AS_PARTICLE, "map_remove_all_by_key_list() invalid packed map, ele_count=%u", map.ele_count);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	return packed_map_get_remove_all_by_key_list(&map, b, alloc_buf, key_list,
+			result);
+}
+
+static int
+map_remove_all_by_value_list(as_bin *b, rollback_alloc *alloc_buf,
+		const cdt_payload *value_list, cdt_result_data *result)
+{
+	packed_map map;
+
+	if (! packed_map_init_from_bin(&map, b, true)) {
+		cf_warning(AS_PARTICLE, "map_remove_all_by_value_list() invalid packed map, ele_count=%u", map.ele_count);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	return packed_map_get_remove_all_by_value_list(&map, b, alloc_buf,
+			value_list, result);
+}
+
+static int
+map_clear(as_bin *b, rollback_alloc *alloc_buf, as_bin *result)
+{
+	packed_map map;
+
+	if (!
packed_map_init_from_bin(&map, b, false)) { + cf_warning(AS_PARTICLE, "packed_map_clear() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_map_packer(mpk, 0, map.flags, 0); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + + return AS_PROTO_RESULT_OK; +} + +//------------------------------------------------ +// packed_map + +static bool +packed_map_init(packed_map *map, const uint8_t *buf, uint32_t sz, + bool fill_idxs) +{ + map->packed = buf; + map->packed_sz = sz; + + map->ele_count = 0; + + return packed_map_unpack_hdridx(map, fill_idxs); +} + +static inline bool +packed_map_init_from_particle(packed_map *map, const as_particle *p, + bool fill_idxs) +{ + const map_mem *p_map_mem = (const map_mem *)p; + return packed_map_init(map, p_map_mem->data, p_map_mem->sz, fill_idxs); +} + +static bool +packed_map_init_from_bin(packed_map *map, const as_bin *b, bool fill_idxs) +{ + uint8_t type = as_bin_get_particle_type(b); + + cf_assert(is_map_type(type), AS_PARTICLE, "as_packed_map_init_from_bin() invalid type %d", type); + + return packed_map_init_from_particle(map, b->particle, fill_idxs); +} + +static bool +packed_map_unpack_hdridx(packed_map *map, bool fill_idxs) +{ + as_unpacker pk = { + .buffer = map->packed, + .length = map->packed_sz + }; + + if (map->packed_sz == 0) { + map->flags = 0; + return false; + } + + int64_t ele_count = as_unpack_map_header_element_count(&pk); + + if (ele_count < 0) { + return false; + } + + map->ele_count = (uint32_t)ele_count; + + if (ele_count != 0 && as_unpack_peek_is_ext(&pk)) { + as_msgpack_ext ext; + + if (as_unpack_ext(&pk, &ext) != 0) { + return false; + } + + if (as_unpack_size(&pk) <= 0) { // skip the packed nil + return false; + } + + map->flags = ext.type; + map->ele_count--; + + map->contents = map->packed + pk.offset; + map->content_sz = map->packed_sz - pk.offset; + offset_index_init(&map->offidx, NULL, map->ele_count, map->contents, + map->content_sz); + order_index_init(&map->value_idx, NULL, map->ele_count); + + uint32_t index_sz_left = ext.size; + uint8_t *ptr = (uint8_t *)ext.data; + uint32_t sz = offset_index_size(&map->offidx); + + if ((map->flags & AS_PACKED_MAP_FLAG_OFF_IDX) && index_sz_left >= sz) { + offset_index_set_ptr(&map->offidx, ptr, map->packed + pk.offset); + ptr += sz; + index_sz_left -= sz; + + if (fill_idxs) { + map_fill_offidx(map); + } + } + + sz = order_index_size(&map->value_idx); + + if ((map->flags & AS_PACKED_MAP_FLAG_ORD_IDX) && index_sz_left >= sz) { + order_index_set_ptr(&map->value_idx, ptr); + } + } + else { + map->contents = map->packed + pk.offset; + map->content_sz = map->packed_sz - pk.offset; + + offset_index_init(&map->offidx, NULL, ele_count, map->contents, + map->content_sz); + order_index_init(&map->value_idx, NULL, ele_count); + map->flags = AS_PACKED_MAP_FLAG_NONE; + } + + return true; +} + +static void +packed_map_init_indexes(const packed_map *map, as_packer *pk) +{ + uint8_t *ptr = pk->buffer + pk->offset; + + if (map_is_k_ordered(map)) { + offset_index offidx; + + offset_index_init(&offidx, ptr, map->ele_count, map->contents, + map->content_sz); + + uint32_t offidx_sz = offset_index_size(&offidx); + + ptr += offidx_sz; + offset_index_set_filled(&offidx, 1); + pk->offset += offidx_sz; + } + + if (map_is_kv_ordered(map)) { + order_index ordidx; + + order_index_init(&ordidx, ptr, map->ele_count); + order_index_set(&ordidx, 0, map->ele_count); + pk->offset += order_index_size(&ordidx); + } +} + +static bool 
+packed_map_ensure_ordidx_filled(const packed_map *op) +{ + order_index *ordidx = (order_index *)&op->value_idx; + + if (! order_index_is_filled(ordidx)) { + if (! map_fill_offidx(op)) { + cf_warning(AS_PARTICLE, "packed_map_ensure_ordidx_filled() failed to fill offset_idx"); + return false; + } + + return order_index_set_sorted(ordidx, &op->offidx, + op->contents, op->content_sz, SORT_BY_VALUE); + } + + return true; +} + +static uint32_t +packed_map_find_index_by_idx_unordered(const packed_map *map, uint32_t idx) +{ + uint32_t pk_offset = offset_index_get_const(&map->offidx, idx); + + cdt_payload key = { + .ptr = map->contents + pk_offset, + .sz = map->content_sz - pk_offset + }; + + return packed_map_find_index_by_key_unordered(map, &key); +} + +static uint32_t +packed_map_find_index_by_key_unordered(const packed_map *map, + const cdt_payload *key) +{ + as_unpacker pk_key = { + .buffer = key->ptr, + .length = key->sz + }; + + uint32_t index = 0; + define_map_unpacker(pk, map); + + for (uint32_t i = 0; i < map->ele_count; i++) { + pk_key.offset = 0; + msgpack_compare_t cmp = as_unpack_compare(&pk, &pk_key); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return map->ele_count; + } + + if (cmp == MSGPACK_COMPARE_LESS) { + index++; + } + + if (as_unpack_size(&pk) <= 0) { + return map->ele_count; + } + } + + return index; +} + +static void +packed_map_find_rank_indexed_linear(const packed_map *map, map_ele_find *find, + uint32_t start, uint32_t len) +{ + uint32_t rank = order_index_find_idx(&map->value_idx, find->idx, start, + len); + + if (rank < start + len) { + find->found_value = true; + find->rank = rank; + } +} + +// Find rank given index (find->idx). +// Return true on success. +static bool +packed_map_find_rank_indexed(const packed_map *map, map_ele_find *find) +{ + uint32_t ele_count = map->ele_count; + + if (ele_count == 0) { + return true; + } + + if (find->idx >= ele_count) { + find->found_value = false; + return true; + } + + const offset_index *offset_idx = &map->offidx; + const order_index *value_idx = &map->value_idx; + + uint32_t rank = ele_count / 2; + uint32_t upper = ele_count; + uint32_t lower = 0; + + as_unpacker pk_value = { + .buffer = map->contents + find->value_offset, + .length = find->key_offset + find->sz - find->value_offset + }; + + find->found_value = false; + + while (true) { + if (upper - lower < LINEAR_FIND_RANK_MAX_COUNT) { + packed_map_find_rank_indexed_linear(map, find, lower, + upper - lower); + return true; + } + + uint32_t idx = order_index_get(value_idx, rank); + + if (find->idx == idx) { + find->found_value = true; + find->rank = rank; + break; + } + + as_unpacker pk_buf = { + .buffer = map->contents, + .offset = offset_index_get_const(offset_idx, idx), + .length = map->content_sz + }; + + if (as_unpack_size(&pk_buf) <= 0) { // skip key + cf_warning(AS_PARTICLE, "packed_map_find_rank_indexed() unpack key failed at rank=%u", rank); + return false; + } + + pk_value.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_value, &pk_buf); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + if (find->idx < idx) { + cmp = MSGPACK_COMPARE_LESS; + } + else if (find->idx > idx) { + cmp = MSGPACK_COMPARE_GREATER; + } + + find->found_value = true; + } + + if (cmp == MSGPACK_COMPARE_EQUAL) { + find->rank = rank; + break; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + if (rank >= upper - 1) { + find->rank = rank + 1; + break; + } + + lower = rank + 1; + rank += upper; + rank /= 2; + } + else if (cmp == MSGPACK_COMPARE_LESS) { + if (rank == lower) { + find->rank 
= rank; + break; + } + + upper = rank; + rank += lower; + rank /= 2; + } + else { + cf_warning(AS_PARTICLE, "packed_map_find_rank_indexed() error=%d lower=%u rank=%u upper=%u", (int)cmp, lower, rank, upper); + return false; + } + } + + return true; +} + +// Find (closest) rank given value. +// Find closest rank for find->idx (0 means first instance of value). +// FIXME - this is mechanically different from order_index_find_rank_by_value() +// where target = ele_count finds the largest rank; here it finds the largest +// rank + 1 in the case that the value exist; fix to conform. +// Return true on success. +static bool +packed_map_find_rank_by_value_indexed(const packed_map *map, map_ele_find *find, + const cdt_payload *value) +{ + const offset_index *offset_idx = &map->offidx; + const order_index *value_idx = &map->value_idx; + + find->found_value = false; + + if (map->ele_count == 0) { + return true; + } + + uint32_t rank = map->ele_count / 2; + + as_unpacker pk_value = { + .buffer = value->ptr, + .length = value->sz + }; + + while (true) { + uint32_t idx = order_index_get(value_idx, rank); + uint32_t pk_offset = offset_index_get_const(offset_idx, idx); + + as_unpacker pk_buf = { + .buffer = map->contents + pk_offset, + .length = map->content_sz - pk_offset + }; + + if (as_unpack_size(&pk_buf) <= 0) { // skip key + return false; + } + + pk_value.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_value, &pk_buf); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + if (find->idx < idx) { + cmp = MSGPACK_COMPARE_LESS; + } + else if (find->idx > idx) { + cmp = MSGPACK_COMPARE_GREATER; + } + + find->found_value = true; + } + + if (cmp == MSGPACK_COMPARE_EQUAL) { + find->found_value = true; + find->rank = rank; + break; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + if (rank >= find->upper - 1) { + find->rank = rank + 1; + break; + } + + find->lower = rank + 1; + rank += find->upper; + rank /= 2; + } + else if (cmp == MSGPACK_COMPARE_LESS) { + if (rank == find->lower) { + find->rank = rank; + break; + } + + find->upper = rank; + rank += find->lower; + rank /= 2; + } + else { + return false; + } + } + + return true; +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: [value_start, value_start]. +static bool +packed_map_find_rank_range_by_value_interval_indexed(const packed_map *map, + const cdt_payload *value_start, const cdt_payload *value_end, + uint32_t *rank, uint32_t *count, bool is_multi) +{ + cf_assert(map_has_offidx(map), AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_indexed() offset_index needs to be valid"); + + map_ele_find find_start; + + map_ele_find_init(&find_start, map); + find_start.idx = 0; // find least ranked entry with value == value_start + + if (! packed_map_find_rank_by_value_indexed(map, &find_start, + value_start)) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_indexed() invalid packed map"); + return false; + } + + *rank = find_start.rank; + *count = 1; + + if (! value_end || ! value_end->ptr) { + *count = map->ele_count - *rank; + } + else { + map_ele_find find_end; + + map_ele_find_init(&find_end, map); + + if (value_end != value_start) { + find_end.idx = 0; + + if (! packed_map_find_rank_by_value_indexed(map, &find_end, + value_end)) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_indexed() invalid packed map"); + return false; + } + + *count = (find_end.rank > find_start.rank) ? 
+ find_end.rank - find_start.rank : 0; + } + else { + if (! find_start.found_value) { + *count = 0; + } + else if (is_multi) { + find_end.idx = map->ele_count; // find highest ranked entry with value == value_start + + if (! packed_map_find_rank_by_value_indexed(map, &find_end, + value_start)) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_indexed() invalid packed map"); + return false; + } + + *count = find_end.rank - find_start.rank; + } + } + } + + return true; +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: [value_start, value_start]. +static bool +packed_map_find_rank_range_by_value_interval_unordered(const packed_map *map, + const cdt_payload *value_start, const cdt_payload *value_end, + uint32_t *rank, uint32_t *count, uint64_t *mask) +{ + cf_assert(map_has_offidx(map), AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() offset_index needs to be valid"); + cf_assert(value_end, AS_PARTICLE, "value_end == NULL"); + + as_unpacker pk_start = { + .buffer = value_start->ptr, + .length = value_start->sz + }; + + as_unpacker pk_end = { + .buffer = value_end->ptr, + .length = value_end->sz + }; + + // Pre-check parameters. + if (as_unpack_size(&pk_start) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() invalid start value"); + return false; + } + + if (value_end != value_start) { + // Pre-check parameters. + if (value_end->ptr && as_unpack_size(&pk_end) < 0) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() invalid end value"); + return false; + } + } + + *rank = 0; + *count = 0; + + offset_index *offidx = (offset_index *)&map->offidx; + define_map_unpacker(pk, map); + + for (uint32_t i = 0; i < map->ele_count; i++) { + offset_index_set(offidx, i, pk.offset); + + if (as_unpack_size(&pk) <= 0) { // skip key + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() invalid packed map at index %u", i); + return false; + } + + uint32_t value_offset = pk.offset; // save for pk_end + + pk_start.offset = 0; // reset + + msgpack_compare_t cmp_start = as_unpack_compare(&pk, &pk_start); + + if (cmp_start == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() invalid packed map at index %u", i); + return false; + } + + if (cmp_start == MSGPACK_COMPARE_LESS) { + (*rank)++; + } + else if (value_start != value_end) { + msgpack_compare_t cmp_end = MSGPACK_COMPARE_LESS; + + // NULL value_end means largest possible value. + if (value_end->ptr) { + pk.offset = value_offset; + pk_end.offset = 0; + cmp_end = as_unpack_compare(&pk, &pk_end); + } + + if (cmp_end == MSGPACK_COMPARE_LESS) { + cdt_idx_mask_set(mask, i); + (*count)++; + } + } + // Single value case. + else if (cmp_start == MSGPACK_COMPARE_EQUAL) { + cdt_idx_mask_set(mask, i); + (*count)++; + } + } + + offset_index_set_filled(offidx, map->ele_count); + + return true; +} + +// Find key given list index. +// Return true on success. 
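+// (Implementation note: a binary search over the key-ordered contents,
+// seating each probe with the offset index and comparing msgpack keys via
+// as_unpack_compare(). On a miss, find->idx is left at the ordered insertion
+// point, possibly ele_count, which map_add_items_ordered() relies on when
+// splicing in new keys.)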
+static bool +packed_map_find_key_indexed(const packed_map *map, map_ele_find *find, + const cdt_payload *key) +{ + const offset_index *offidx = &map->offidx; + uint32_t ele_count = map->ele_count; + + find->lower = 0; + find->upper = ele_count; + + uint32_t idx = (find->lower + find->upper) / 2; + + as_unpacker pk_key = { + .buffer = key->ptr, + .length = key->sz + }; + + find->found_key = false; + + if (ele_count == 0) { + find->idx = 0; + return true; + } + + while (true) { + uint32_t offset = offset_index_get_const(offidx, idx); + uint32_t content_sz = map->content_sz; + uint32_t sz = content_sz - offset; + + as_unpacker pk_buf = { + .buffer = map->contents + offset, + .length = sz + }; + + pk_key.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_key, &pk_buf); + uint32_t key_sz = pk_buf.offset; + + if (cmp == MSGPACK_COMPARE_EQUAL) { + if (! find->found_key) { + find->found_key = true; + find->key_offset = offset; + find->value_offset = offset + key_sz; + find->idx = idx++; + find->sz = (idx >= ele_count) ? + sz : offset_index_get_const(offidx, idx) - offset; + } + + break; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + if (idx >= find->upper - 1) { + if (++idx >= ele_count) { + find->key_offset = content_sz; + find->value_offset = content_sz; + find->idx = idx; + find->sz = 0; + break; + } + + if (! find->found_key) { + uint32_t offset = offset_index_get_const(offidx, idx); + uint32_t tail = content_sz - offset; + + as_unpacker pk = { + .buffer = map->contents + offset, + .length = tail + }; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_find_key_indexed() invalid packed map"); + return false; + } + + find->key_offset = offset; + find->value_offset = offset + pk.offset; + find->idx = idx++; + find->sz = (idx >= ele_count) ? + tail : offset_index_get_const(offidx, idx) - offset; + } + + break; + } + + find->lower = idx + 1; + idx += find->upper; + idx /= 2; + } + else if (cmp == MSGPACK_COMPARE_LESS) { + if (idx == find->lower) { + find->key_offset = offset; + find->value_offset = offset + key_sz; + find->idx = idx++; + find->sz = (idx >= ele_count) ? + sz : offset_index_get_const(offidx, idx) - offset; + break; + } + + find->upper = idx; + idx += find->lower; + idx /= 2; + } + else { + cf_warning(AS_PARTICLE, "packed_map_find_key_indexed() compare error=%d", (int)cmp); + return false; + } + } + + return true; +} + +static bool +packed_map_find_key(const packed_map *map, map_ele_find *find, + const cdt_payload *key) +{ + uint32_t ele_count = map->ele_count; + offset_index *offidx = (offset_index *)&map->offidx; + + if (ele_count == 0) { + return true; + } + + if (map_is_k_ordered(map) && offset_index_is_full(offidx)) { + if (! packed_map_find_key_indexed(map, find, key)) { + cf_warning(AS_PARTICLE, "packed_map_find_key() packed_map_op_find_key_indexed failed"); + return false; + } + + return true; + } + + as_unpacker pk_key = { + .buffer = key->ptr, + .length = key->sz + }; + + find->found_key = false; + + define_map_unpacker(pk, map); + uint32_t content_sz = pk.length; + + if (! offset_index_is_valid(offidx)) { + offidx = NULL; + } + + if (map_is_k_ordered(map)) { + // Ordered compare. + + // Allows for continuation from last search. 
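+		// (A caller that keeps find->lower, find->upper and find->key_offset
+		// from a previous call resumes the scan at that key instead of
+		// re-unpacking from the start of the contents.)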
+ if (find->lower > 0) { + pk.offset = find->key_offset; + } + + for (uint32_t i = find->lower; i < find->upper; i++) { + uint32_t key_offset = pk.offset; + uint32_t sz; + + pk_key.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_key, &pk); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return false; + } + + find->value_offset = pk.offset; + + if (offidx) { + int64_t ret = map_offset_index_get_delta(offidx, i); + + if (ret < 0) { + return false; + } + + pk.offset = (uint32_t)map_offset_index_get(offidx, i + 1); + sz = (uint32_t)ret; + } + else { + // Skip value. + if (as_unpack_size(&pk) <= 0) { + return false; + } + + sz = pk.offset - key_offset; + } + + if (cmp != MSGPACK_COMPARE_GREATER) { + if (cmp == MSGPACK_COMPARE_EQUAL) { + find->found_key = true; + } + + find->idx = i; + find->key_offset = key_offset; + find->sz = sz; + + return true; + } + } + + if (find->upper == ele_count) { + find->key_offset = content_sz; + find->value_offset = content_sz; + find->sz = 0; + } + else { + if (offidx && ! offset_index_set_next(offidx, find->upper, + pk.offset)) { + cf_warning(AS_PARTICLE, "offset mismatch at i=%u offset=%u offidx_offset=%u", find->upper, pk.offset, offset_index_get_const(offidx, find->upper)); + } + + find->key_offset = pk.offset; + + // Skip key. + if (as_unpack_size(&pk) <= 0) { + return false; + } + + find->value_offset = pk.offset; + + // Skip value. + if (as_unpack_size(&pk) <= 0) { + return false; + } + + find->sz = pk.offset - find->key_offset; + } + + find->idx = find->upper; + } + else { + // Unordered compare. + // Assumes same keys are clustered. + for (uint32_t i = 0; i < ele_count; i++) { + uint32_t offset = pk.offset; + + pk_key.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_key, &pk); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return false; + } + + uint32_t value_offset = pk.offset; + + if (cmp == MSGPACK_COMPARE_EQUAL) { + // Skip value. + if (as_unpack_size(&pk) <= 0) { + return false; + } + + if (! find->found_key) { + find->found_key = true; + find->idx = i; + find->key_offset = offset; + find->value_offset = value_offset; + find->sz = pk.offset - offset; + } + + if (offidx && ! offset_index_set_next(offidx, i + 1, + pk.offset)) { + cf_warning(AS_PARTICLE, "offset mismatch at i=%u offset=%u offidx_offset=%u", i + 1, pk.offset, offset_index_get_const(offidx, i + 1)); + } + + return true; + } + else if (find->found_key) { + return true; + } + else if (as_unpack_size(&pk) <= 0) { // skip value + return false; + } + + if (offidx && ! offset_index_set_next(offidx, i + 1, pk.offset)) { + cf_warning(AS_PARTICLE, "offset mismatch at i=%u offset=%u offidx_offset=%u", i + 1, pk.offset, offset_index_get_const(offidx, i + 1)); + } + } + + find->key_offset = content_sz; + find->value_offset = content_sz; + find->sz = 0; + find->idx = ele_count; + } + + return true; +} + +static int +packed_map_get_remove_by_key_interval(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *key_start, + const cdt_payload *key_end, cdt_result_data *result) +{ + if (result_data_is_return_rank_range(result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + vla_map_offidx_if_invalid(u, map); + uint32_t index = 0; + uint32_t count = 0; + + if (map_is_k_ordered(map)) { + if (! 
packed_map_get_range_by_key_interval_ordered(map, key_start,
+			key_end, &index, &count)) {
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		return packed_map_get_remove_by_index_range(map, b, alloc_buf, index,
+				count, result);
+	}
+
+	bool inverted = result_data_is_inverted(result);
+
+	if (inverted && ! result->is_multi) {
+		cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() INVERTED flag not supported for single result ops");
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	if (key_start == key_end) {
+		map_ele_find find_key;
+		map_ele_find_init(&find_key, map);
+
+		if (! packed_map_find_key(map, &find_key, key_start)) {
+			cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() find key failed, ele_count=%u", map->ele_count);
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		if (! find_key.found_key) {
+			if (! result_data_set_key_not_found(result, -1)) {
+				cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() invalid result_type %d", result->type);
+				return -AS_PROTO_RESULT_FAIL_PARAMETER;
+			}
+
+			return AS_PROTO_RESULT_OK;
+		}
+
+		if (b) {
+			define_map_op(op, map);
+			int32_t new_sz = packed_map_op_remove(&op, &find_key, 1,
+					find_key.sz);
+
+			if (new_sz < 0) {
+				cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() packed_map_op_remove failed with ret=%d, ele_count=%u", new_sz, map->ele_count);
+				return -AS_PROTO_RESULT_FAIL_PARAMETER;
+			}
+
+			define_map_packer(mpk, op.new_ele_count, map->flags,
+					(uint32_t)new_sz);
+
+			map_packer_setup_bin(&mpk, b, alloc_buf);
+			map_packer_write_hdridx(&mpk);
+			map_packer_write_seg1(&mpk, &op);
+			map_packer_write_seg2(&mpk, &op);
+		}
+
+#ifdef MAP_DEBUG_VERIFY
+		if (b && ! map_verify(b)) {
+			cdt_bin_print(b, "packed_map_get_remove_by_key_interval");
+			map_print(map, "original");
+			cf_crash(AS_PARTICLE, "ele_count %u index %u count 1 is_multi %d inverted %d", map->ele_count, index, result->is_multi, inverted);
+		}
+#endif
+
+		return packed_map_build_result_by_key(map, key_start, find_key.idx,
+				1, result);
+	}
+
+	define_cdt_idx_mask(rm_mask, map->ele_count);
+
+	if (! packed_map_get_range_by_key_interval_unordered(map, key_start,
+			key_end, &index, &count, rm_mask)) {
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	uint32_t rm_count = count;
+
+	if (inverted) {
+		rm_count = map->ele_count - count;
+		cdt_idx_mask_invert(rm_mask, map->ele_count);
+	}
+
+	int ret = AS_PROTO_RESULT_OK;
+	uint32_t rm_sz = 0;
+
+	if (b) {
+		if ((ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask,
+				rm_count, &rm_sz)) != AS_PROTO_RESULT_OK) {
+			return ret;
+		}
+	}
+
+	if (result_data_is_return_elements(result)) {
+		if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, rm_sz,
+				result)) {
+			return -AS_PROTO_RESULT_FAIL_UNKNOWN;
+		}
+	}
+	else if (result_data_is_return_rank(result)) {
+		ret = packed_map_build_rank_result_by_mask(map, rm_mask, rm_count,
+				result);
+	}
+	else {
+		ret = result_data_set_range(result, index, count, map->ele_count);
+	}
+
+	if (ret != AS_PROTO_RESULT_OK) {
+		return ret;
+	}
+
+#ifdef MAP_DEBUG_VERIFY
+	if (b && !
map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_by_key_interval"); + map_print(map, "original"); + cf_crash(AS_PARTICLE, "ele_count %u index %u count %u rm_count %u inverted %d", map->ele_count, index, count, rm_count, inverted); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_trim_ordered(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, + uint32_t index, uint32_t count, cdt_result_data *result) +{ + cf_assert(result->is_multi, AS_PARTICLE, "packed_map_trim_ordered() required to be a multi op"); + cf_assert(! result_data_is_inverted(result), AS_PARTICLE, "packed_map_trim_ordered() INVERTED flag not supported"); + + vla_map_offidx_if_invalid(u, map); + uint32_t rm_count = map->ele_count - count; + uint32_t index1 = index + count; + + // Pre-fill index. + if (! map_offset_index_fill(u.offidx, index + count)) { + cf_warning(AS_PARTICLE, "packed_map_trim_ordered() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t offset0 = offset_index_get_const(u.offidx, index); + uint32_t offset1 = offset_index_get_const(u.offidx, index1); + uint32_t content_sz = offset1 - offset0; + + if (b) { + define_map_packer(mpk, count, map->flags, content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + memcpy(mpk.write_ptr, map->contents + offset0, content_sz); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: { + bool is_rev = (result->type == RESULT_TYPE_REVINDEX); + define_int_list_builder(builder, result->alloc, rm_count); + + cdt_container_builder_add_int_range(&builder, 0, index, map->ele_count, + is_rev); + cdt_container_builder_add_int_range(&builder, index1, + map->ele_count - index1, map->ele_count, is_rev); + cdt_container_builder_set_result(&builder, result); + break; + } + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + result->flags = AS_CDT_OP_FLAG_INVERTED; + + return packed_map_build_rank_result_by_index_range(map, index, count, + result); + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: + result->flags = AS_CDT_OP_FLAG_INVERTED; + + if (! packed_map_build_ele_result_by_idx_range(map, index, count, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + break; + default: + cf_warning(AS_PARTICLE, "packed_map_trim_ordered() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +// Set b = NULL for get_by_index_range operation. +static int +packed_map_get_remove_by_index_range(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, int64_t index, uint64_t count, + cdt_result_data *result) +{ + uint32_t uindex; + uint32_t count32; + + if (! calc_index_count(index, count, map->ele_count, &uindex, &count32, + result->is_multi)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() index %ld out of bounds for ele_count %u", index, map->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result_data_is_return_rank_range(result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result_data_is_inverted(result)) { + if (! 
result->is_multi) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() INVERTED flag not supported for single result ops"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + if (count32 == 0) { + // Reduce to remove all. + uindex = 0; + count32 = map->ele_count; + } + else if (uindex == 0) { + // Reduce to remove tail section. + uindex = count32; + count32 = map->ele_count - count32; + } + else if (uindex + count32 >= map->ele_count) { + // Reduce to remove head section. + count32 = uindex; + uindex = 0; + } + else if (map_is_k_ordered(map)) { + return packed_map_trim_ordered(map, b, alloc_buf, uindex, count32, + result); + } + else { + result->flags |= AS_CDT_OP_FLAG_INVERTED; + } + } + + if (count32 == 0) { + if (! result_data_set_key_not_found(result, uindex)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid result type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + vla_map_offidx_if_invalid(u, map); + + if (count32 == map->ele_count) { + return packed_map_get_remove_all(map, b, alloc_buf, result); + } + + int ret = AS_PROTO_RESULT_OK; + + if (map_is_k_ordered(map)) { + // Pre-fill index. + if (! map_offset_index_fill(u.offidx, uindex + count32)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (b) { + ret = packed_map_remove_idx_range(map, b, alloc_buf, uindex, + count32); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + if (result_data_is_return_elements(result)) { + if (! packed_map_build_ele_result_by_idx_range(map, uindex, count32, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + else if (result_data_is_return_rank(result)) { + ret = packed_map_build_rank_result_by_index_range(map, uindex, + count32, result); + } + else { + ret = result_data_set_range(result, uindex, count32, + map->ele_count); + } + } + else { + // Pre-fill index. + if (! map_fill_offidx(map)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_build_order_heap_by_range(heap, uindex, count32, map->ele_count, + map, packed_map_compare_key_by_idx, success); + + if (! success) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_sz = 0; + bool inverted = result_data_is_inverted(result); + define_cdt_idx_mask(rm_mask, map->ele_count); + uint32_t rm_count = (inverted ? 
map->ele_count - count32 : count32); + + cdt_idx_mask_set_by_ordidx(rm_mask, &heap._, heap.filled, count32, + inverted); + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + if (inverted) { + ret = packed_map_build_rank_result_by_mask(map, rm_mask, + rm_count, result); + } + else { + if (heap.cmp == MSGPACK_COMPARE_LESS) { + order_heap_reverse_end(&heap, count32); + } + + ret = packed_map_build_rank_result_by_ele_idx(map, &heap._, + heap.filled, count32, result); + } + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + bool success; + + if (inverted) { + success = packed_map_build_ele_result_by_mask(map, rm_mask, + rm_count, rm_sz, result); + } + else { + if (heap.cmp == MSGPACK_COMPARE_LESS) { + order_heap_reverse_end(&heap, count32); + } + + success = packed_map_build_ele_result_by_ele_idx(map, &heap._, + heap.filled, count32, rm_sz, result); + } + + if (! success) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + break; + } + default: + ret = result_data_set_range(result, uindex, count32, + map->ele_count); + break; + } + } + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_by_index_range"); + map_print(map, "original"); + cf_crash(AS_PARTICLE, "ele_count %u uindex %u count32 %u", map->ele_count, uindex, count32); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: [value_start, value_start]. +static int +packed_map_get_remove_by_value_interval(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *value_start, + const cdt_payload *value_end, cdt_result_data *result) +{ + if (result_data_is_return_index_range(result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool inverted = result_data_is_inverted(result); + + if (inverted && ! result->is_multi) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() INVERTED flag not supported for single result ops"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (map->ele_count == 0) { + if (! result_data_set_value_not_found(result, -1)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + vla_map_offidx_if_invalid(u, map); + + // Pre-fill index. + if (! map_fill_offidx(map)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = 0; + uint32_t count = 0; + int ret = AS_PROTO_RESULT_OK; + + if (order_index_is_valid(&map->value_idx)) { + if (! packed_map_ensure_ordidx_filled(map)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (! packed_map_find_rank_range_by_value_interval_indexed(map, + value_start, value_end, &rank, &count, result->is_multi)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_count = (inverted ? 
map->ele_count - count : count); + bool need_mask = b || (inverted && + (result_data_is_return_elements(result) || + result_data_is_return_index(result))); + cond_define_cdt_idx_mask(rm_mask, map->ele_count, need_mask); + uint32_t rm_sz = 0; + + if (need_mask) { + cdt_idx_mask_set_by_ordidx(rm_mask, &map->value_idx, rank, count, + inverted); + } + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + if (result_data_is_return_elements(result)) { + if (inverted) { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, + rm_count, rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else if (! packed_map_build_ele_result_by_ele_idx(map, + &map->value_idx, rank, count, rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else if (result_data_is_return_index(result)) { + if (inverted) { + ret = packed_map_build_index_result_by_mask(map, rm_mask, + rm_count, result); + } + else { + ret = packed_map_build_index_result_by_ele_idx(map, + &map->value_idx, rank, count, result); + } + } + else { + ret = result_data_set_range(result, rank, count, map->ele_count); + } + } + else { + define_cdt_idx_mask(rm_mask, map->ele_count); + + if (! packed_map_find_rank_range_by_value_interval_unordered(map, + value_start, value_end, &rank, &count, rm_mask)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count == 0) { + if (inverted) { + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + return packed_map_get_remove_all(map, b, alloc_buf, result); + } + else if (! result_data_set_value_not_found(result, rank)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + if (! result->is_multi) { + count = 1; + } + + uint32_t rm_sz = 0; + uint32_t rm_count = count; + + if (inverted) { + cdt_idx_mask_invert(rm_mask, map->ele_count); + rm_count = map->ele_count - count; + } + + if (b) { + ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + } + + if (result_data_is_return_elements(result)) { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, + rm_count, rm_sz, result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + else if (result_data_is_return_index(result)) { + ret = packed_map_build_index_result_by_mask(map, rm_mask, + rm_count, result); + } + else { + ret = result_data_set_range(result, rank, count, + map->ele_count); + } + } + } + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_by_value_interval"); + map_print(map, "original"); + cf_crash(AS_PARTICLE, "ele_count %u rank %u count %u inverted %d", map->ele_count, rank, count, inverted); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_by_rank_range(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, int64_t rank, uint64_t count, + cdt_result_data *result) +{ + uint32_t urank; + uint32_t count32; + + if (! 
calc_index_count(rank, count, map->ele_count, &urank, &count32,
+			result->is_multi)) {
+		cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() rank %ld out of bounds for ele_count %u", rank, map->ele_count);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	if (result_data_is_return_index_range(result)) {
+		cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() result_type %d not supported", result->type);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	if (result_data_is_inverted(result)) {
+		if (! result->is_multi) {
+			cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() INVERTED flag not supported for single result ops");
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		result->flags &= ~AS_CDT_OP_FLAG_INVERTED;
+
+		if (count32 == 0) {
+			// Reduce to remove all.
+			urank = 0;
+			count32 = map->ele_count;
+		}
+		else if (urank == 0) {
+			// Reduce to remove tail section.
+			urank = count32;
+			count32 = map->ele_count - count32;
+		}
+		else if (urank + count32 >= map->ele_count) {
+			// Reduce to remove head section.
+			count32 = urank;
+			urank = 0;
+		}
+		else {
+			result->flags |= AS_CDT_OP_FLAG_INVERTED;
+		}
+	}
+
+	if (count32 == 0) {
+		if (! result_data_set_value_not_found(result, urank)) {
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		return AS_PROTO_RESULT_OK;
+	}
+
+	vla_map_offidx_if_invalid(u, map);
+
+	if (! map_fill_offidx(map)) {
+		cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map");
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	bool inverted = result_data_is_inverted(result);
+	uint32_t rm_count = inverted ? map->ele_count - count32 : count32;
+	const order_index *ordidx = &map->value_idx;
+	define_cdt_idx_mask(rm_mask, map->ele_count);
+	order_index ret_idxs;
+
+	if (order_index_is_valid(ordidx)) {
+		if (! packed_map_ensure_ordidx_filled(map)) {
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		cdt_idx_mask_set_by_ordidx(rm_mask, ordidx, urank, count32, inverted);
+		order_index_init_ref(&ret_idxs, ordidx, urank, count32);
+	}
+	else {
+		define_build_order_heap_by_range(heap, urank, count32, map->ele_count,
+				map, packed_map_compare_value_by_idx, success);
+
+		if (! success) {
+			cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map");
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		cdt_idx_mask_set_by_ordidx(rm_mask, &heap._, heap.filled, count32,
+				inverted);
+
+		if (! inverted) {
+			if (heap.cmp == MSGPACK_COMPARE_LESS) {
+				// Reorder results from lowest to highest order.
+				order_heap_reverse_end(&heap, count32);
+			}
+
+			if (result_data_is_return_index(result)) {
+				int ret = packed_map_build_index_result_by_ele_idx(map,
+						&heap._, heap.filled, count32, result);
+
+				if (ret != AS_PROTO_RESULT_OK) {
+					cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() build index result failed");
+					return ret;
+				}
+			}
+			else if (result_data_is_return_elements(result)) {
+				if (!
packed_map_build_ele_result_by_ele_idx(map, &heap._, + heap.filled, count32, 0, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + } + } + + uint32_t rm_sz = 0; + int ret = AS_PROTO_RESULT_OK; + + if (b) { + ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, rm_count, + &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + case RESULT_TYPE_COUNT: + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + ret = result_data_set_index_rank_count(result, urank, count32, + map->ele_count); + break; + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + if (inverted) { + ret = packed_map_build_index_result_by_mask(map, rm_mask, rm_count, + result); + } + else if (! as_bin_inuse(result->result)) { + ret = packed_map_build_index_result_by_ele_idx(map, &ret_idxs, + 0, rm_count, result); + } + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: + if (inverted) { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, + rm_count, rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else if (! as_bin_inuse(result->result) && + ! packed_map_build_ele_result_by_ele_idx(map, &ret_idxs, 0, + rm_count, rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + break; + case RESULT_TYPE_REVRANK_RANGE: + case RESULT_TYPE_RANK_RANGE: + ret = result_data_set_range(result, urank, rm_count, map->ele_count); + break; + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_by_rank_range"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all_by_key_list(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *key_list, + cdt_result_data *result) +{ + as_unpacker items_pk; + uint32_t items_count; + + if (! list_param_parse(key_list, &items_pk, &items_count)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool inverted = result_data_is_inverted(result); + + if (items_count == 0) { + switch (result->type) { + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list() invalid result type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + default: + break; + } + + if (! 
inverted) { + result_data_set_key_not_found(result, 0); + + return AS_PROTO_RESULT_OK; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + return packed_map_get_remove_all(map, b, alloc_buf, result); + } + + vla_map_offidx_if_invalid(u, map); + + if (map_is_k_ordered(map)) { + return packed_map_get_remove_all_by_key_list_ordered(map, b, alloc_buf, + &items_pk, items_count, result); + } + + return packed_map_get_remove_all_by_key_list_unordered(map, b, alloc_buf, + &items_pk, items_count, result); +} + +static int +packed_map_get_remove_all_by_key_list_ordered(const packed_map *map, + as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, + uint32_t items_count, cdt_result_data *result) +{ + define_order_index2(rm_ic, map->ele_count, 2 * items_count); + uint32_t rm_count = 0; + + for (uint32_t i = 0; i < items_count; i++) { + cdt_payload key = { + .ptr = items_pk->buffer + items_pk->offset, + .sz = items_pk->offset + }; + + if (as_unpack_size(items_pk) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() invalid parameter"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + key.sz = items_pk->offset - key.sz; + + map_ele_find find_key; + map_ele_find_init(&find_key, map); + + if (! packed_map_find_key(map, &find_key, &key)) { + if (cdt_payload_is_int(&key)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() find key=%ld failed, ele_count=%u", cdt_payload_get_int64(&key), map->ele_count); + } + else { + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() find key failed, ele_count=%u", map->ele_count); + } + + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t count = find_key.found_key ? 1 : 0; + + order_index_set(&rm_ic, 2 * i, find_key.idx); + order_index_set(&rm_ic, (2 * i) + 1, count); + rm_count += count; + } + + bool inverted = result_data_is_inverted(result); + bool need_mask = b || result_data_is_return_elements(result) || + (inverted && result_data_is_return_index(result)); + cond_define_cdt_idx_mask(rm_mask, map->ele_count, need_mask); + uint32_t rm_sz = 0; + + if (inverted) { + rm_count = map->ele_count - rm_count; + } + + if (need_mask) { + cdt_idx_mask_set_by_irc(rm_mask, &rm_ic, NULL, inverted); + } + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: + if (inverted) { + result_data_set_int_list_by_mask(result, rm_mask, rm_count, + map->ele_count); + } + else { + result_data_set_by_irc(result, &rm_ic, NULL, items_count); + } + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: + if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, + rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + break; + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_RANK: + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! 
map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_all_by_key_list_ordered"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all_by_key_list_unordered(const packed_map *map, + as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, + uint32_t items_count, cdt_result_data *result) +{ + bool inverted = result_data_is_inverted(result); + bool is_ret_index = result_data_is_return_index(result); + uint32_t rm_count; + define_cdt_idx_mask(rm_mask, map->ele_count); + define_order_index(key_list_ordidx, items_count); + cond_vla_order_index2(ic, map->ele_count, items_count * 2, is_ret_index); + + if (! offset_index_find_items((offset_index *)&map->offidx, + CDT_FIND_ITEMS_IDXS_FOR_MAP_KEY, items_pk, &key_list_ordidx, + inverted, rm_mask, &rm_count, is_ret_index ? &ic.ordidx : NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_sz = 0; + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret < 0) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: { + result_data_set_by_itemlist_irc(result, &key_list_ordidx, &ic.ordidx, + rm_count); + break; + } + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, rm_sz, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + break; + } + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_unordered() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_all_by_key_list_unordered"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all_by_value_list(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *value_list, + cdt_result_data *result) +{ + as_unpacker items_pk; + uint32_t items_count; + + if (! list_param_parse(value_list, &items_pk, &items_count)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool inverted = result_data_is_inverted(result); + + if (items_count == 0) { + switch (result->type) { + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_value_list() invalid result type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + default: + break; + } + + if (! inverted) { + result_data_set_not_found(result, 0); + return AS_PROTO_RESULT_OK; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + return packed_map_get_remove_all(map, b, alloc_buf, result); + } + + vla_map_offidx_if_invalid(u, map); + + if (order_index_is_valid(&map->value_idx)) { + return packed_map_get_remove_all_by_value_list_ordered(map, b, + alloc_buf, &items_pk, items_count, result); + } + + bool is_ret_rank = result_data_is_return_rank(result); + define_cdt_idx_mask(rm_mask, map->ele_count); + uint32_t rm_count = 0; + define_order_index(value_list_ordidx, items_count); + cond_vla_order_index2(rc, map->ele_count, items_count * 2, is_ret_rank); + + if (! offset_index_find_items(u.offidx, + CDT_FIND_ITEMS_IDXS_FOR_MAP_VALUE, &items_pk, &value_list_ordidx, + inverted, rm_mask, &rm_count, is_ret_rank ? 
&rc.ordidx : NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_sz = 0; + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret < 0) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: { + int ret = packed_map_build_index_result_by_mask(map, rm_mask, rm_count, + result); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + + break; + } + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_RANK: { + result_data_set_by_itemlist_irc(result, &value_list_ordidx, + &rc.ordidx, rm_count); + break; + } + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, rm_sz, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + break; + } + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_value_list() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_all_by_value_list"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all_by_value_list_ordered(const packed_map *map, + as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, + uint32_t items_count, cdt_result_data *result) +{ + define_order_index2(rm_rc, map->ele_count, 2 * items_count); + + if (! packed_map_ensure_ordidx_filled(map)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_count = 0; + + for (uint32_t i = 0; i < items_count; i++) { + cdt_payload value = { + .ptr = items_pk->buffer + items_pk->offset, + .sz = items_pk->offset + }; + + if (as_unpack_size(items_pk) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_remove_all_value_items_ordered() invalid parameter"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + value.sz = items_pk->offset - value.sz; + + uint32_t rank = 0; + uint32_t count = 0; + + if (! 
packed_map_find_rank_range_by_value_interval_indexed(map, + &value, &value, &rank, &count, result->is_multi)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + order_index_set(&rm_rc, 2 * i, rank); + order_index_set(&rm_rc, (2 * i) + 1, count); + rm_count += count; + } + + bool inverted = result_data_is_inverted(result); + bool need_mask = b || result_data_is_return_elements(result) || + (inverted && (result_data_is_return_index(result) || + result_data_is_return_rank(result))); + cond_define_cdt_idx_mask(rm_mask, map->ele_count, need_mask); + uint32_t rm_sz = 0; + + if (inverted) { + rm_count = map->ele_count - rm_count; + } + + if (need_mask) { + cdt_idx_mask_set_by_irc(rm_mask, &rm_rc, &map->value_idx, inverted); + } + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: { + if (inverted) { + int ret = packed_map_build_index_result_by_mask(map, rm_mask, + rm_count, result); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + result_data_set_by_irc(result, &rm_rc, &map->value_idx, rm_count); + } + break; + } + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_RANK: + if (inverted) { + int ret = packed_map_build_rank_result_by_mask(map, rm_mask, + rm_count, result); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + result_data_set_by_irc(result, &rm_rc, NULL, rm_count); + } + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, rm_sz, + result)) { + cf_warning(AS_PARTICLE, "packed_map_remove_all_value_items_ordered() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + break; + } + default: + cf_warning(AS_PARTICLE, "packed_map_remove_all_value_items_ordered() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_remove_all_value_items_ordered"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, cdt_result_data *result) +{ + cf_assert(! result_data_is_inverted(result), AS_PARTICLE, "packed_map_get_remove_all() INVERTED flag is invalid here"); + + if (b) { + as_bin_set_empty_packed_map(b, alloc_buf, map->flags); + } + + bool is_rev = false; + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_REVRANK: + is_rev = true; + // no break + case RESULT_TYPE_INDEX: + case RESULT_TYPE_RANK: { + define_int_list_builder(builder, result->alloc, map->ele_count); + + cdt_container_builder_add_int_range(&builder, 0, map->ele_count, + map->ele_count, is_rev); + cdt_container_builder_set_result(&builder, result); + break; + } + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, map->ele_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + if (! 
packed_map_build_ele_result_by_idx_range(map, 0, map->ele_count, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + break; + } + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + result_data_set_list_int2x(result, 0, map->ele_count); + break; + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_all"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_remove_by_mask(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const uint64_t *rm_mask, uint32_t count, + uint32_t *rm_sz_r) +{ + if (count == 0) { + return AS_PROTO_RESULT_OK; + } + + const offset_index *offidx = &map->offidx; + uint32_t rm_sz = cdt_idx_mask_get_content_sz(rm_mask, count, offidx); + + if (rm_sz_r) { + *rm_sz_r = rm_sz; + } + + uint32_t new_ele_count = map->ele_count - count; + uint32_t content_sz = map->content_sz - rm_sz; + define_map_packer(mpk, new_ele_count, map->flags, content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + mpk.write_ptr = cdt_idx_mask_write_eles(rm_mask, count, offidx, + mpk.write_ptr, true); + + if (offset_index_is_valid(&mpk.offset_idx)) { + if (offset_index_is_full(offidx)) { + offidx_op off_op; + offidx_op_init(&off_op, &mpk.offset_idx, offidx); + uint32_t rm_idx = 0; + + for (uint32_t i = 0; i < count; i++) { + rm_idx = cdt_idx_mask_find(rm_mask, rm_idx, map->ele_count, + false); + offidx_op_remove(&off_op, rm_idx); + rm_idx++; + } + + offidx_op_end(&off_op); + } + else { + offset_index_set_filled(&mpk.offset_idx, 1); + map_offset_index_fill(&mpk.offset_idx, new_ele_count); + } + } + + if (order_index_is_valid(&mpk.value_idx)) { + if (order_index_is_filled(&map->value_idx)) { + order_index_op_remove_idx_mask(&mpk.value_idx, &map->value_idx, + rm_mask, count); + } + else if (! 
order_index_set_sorted(&mpk.value_idx, &mpk.offset_idx,
+				mpk.contents, mpk.content_sz, SORT_BY_VALUE)) {
+			cf_warning(AS_PARTICLE, "packed_map_remove_by_mask() failed to sort new value_idx");
+			return -AS_PROTO_RESULT_FAIL_UNKNOWN;
+		}
+	}
+
+	return AS_PROTO_RESULT_OK;
+}
+
+static int
+packed_map_remove_idx_range(const packed_map *map, as_bin *b,
+		rollback_alloc *alloc_buf, uint32_t idx, uint32_t count)
+{
+	offset_index *offidx = (offset_index *)&map->offidx;
+	uint32_t offset0 = offset_index_get_const(offidx, idx);
+	uint32_t idx_end = idx + count;
+	uint32_t offset1 = offset_index_get_const(offidx, idx_end);
+	uint32_t content_sz = map->content_sz - offset1 + offset0;
+	uint32_t new_ele_count = map->ele_count - count;
+	define_map_packer(mpk, new_ele_count, map->flags, content_sz);
+
+	map_packer_setup_bin(&mpk, b, alloc_buf);
+	map_packer_write_hdridx(&mpk);
+
+	uint32_t tail_sz = map->content_sz - offset1;
+
+	memcpy(mpk.write_ptr, map->contents, offset0);
+	mpk.write_ptr += offset0;
+	memcpy(mpk.write_ptr, map->contents + offset1, tail_sz);
+
+	if (offset_index_is_valid(&mpk.offset_idx)) {
+		if (offset_index_is_full(offidx)) {
+			offidx_op offop;
+
+			offidx_op_init(&offop, &mpk.offset_idx, offidx);
+			offidx_op_remove_range(&offop, idx, count);
+			offidx_op_end(&offop);
+		}
+		else {
+			offset_index_set_filled(&mpk.offset_idx, 1);
+			map_offset_index_fill(&mpk.offset_idx, new_ele_count);
+		}
+	}
+
+	if (order_index_is_valid(&mpk.value_idx)) {
+		if (order_index_is_filled(&map->value_idx)) {
+			uint32_t count0 = 0;
+
+			for (uint32_t i = 0; i < map->ele_count; i++) {
+				uint32_t idx0 = order_index_get(&map->value_idx, i);
+
+				if (idx0 >= idx && idx0 < idx_end) {
+					continue;
+				}
+
+				if (idx0 >= idx_end) {
+					idx0 -= count;
+				}
+
+				order_index_set(&mpk.value_idx, count0++, idx0);
+			}
+		}
+		else if (! order_index_set_sorted(&mpk.value_idx, &mpk.offset_idx,
+				mpk.contents, mpk.content_sz, SORT_BY_VALUE)) {
+			cf_warning(AS_PARTICLE, "packed_map_remove_idx_range() failed to sort new value_idx");
+			return -AS_PROTO_RESULT_FAIL_UNKNOWN;
+		}
+	}
+
+	return AS_PROTO_RESULT_OK;
+}
+
+static bool
+packed_map_get_range_by_key_interval_unordered(const packed_map *map,
+		const cdt_payload *key_start, const cdt_payload *key_end,
+		uint32_t *index, uint32_t *count, uint64_t *mask)
+{
+	cf_assert(key_end, AS_PARTICLE, "key_end == NULL");
+
+	as_unpacker pk_start = {
+		.buffer = key_start->ptr,
+		.length = key_start->sz
+	};
+
+	as_unpacker pk_end = {
+		.buffer = key_end->ptr,
+		.length = key_end->sz
+	};
+
+	// Pre-check parameters.
+	if (as_unpack_size(&pk_start) <= 0) {
+		cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_unordered() invalid start key");
+		return false;
+	}
+
+	if (key_end->ptr) {
+		// Pre-check parameters.
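+		// A NULL key_end->ptr is not pre-checked here; it denotes an
+		// unbounded upper interval and is handled in the main loop below.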
+ if (as_unpack_size(&pk_end) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_unordered() invalid end key"); + return false; + } + } + + *index = 0; + *count = 0; + + offset_index *offidx = (offset_index *)&map->offidx; + define_map_unpacker(pk, map); + + for (uint32_t i = 0; i < map->ele_count; i++) { + uint32_t key_offset = pk.offset; // start of key + + offset_index_set(offidx, i, key_offset); + + pk_start.offset = 0; + + msgpack_compare_t cmp_start = as_unpack_compare(&pk, &pk_start); + + if (cmp_start == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_unordered() invalid packed map at index %u", i); + return false; + } + + if (cmp_start == MSGPACK_COMPARE_LESS) { + (*index)++; + } + else { + msgpack_compare_t cmp_end = MSGPACK_COMPARE_LESS; + + // NULL key_end->ptr means largest possible value. + if (key_end->ptr) { + pk.offset = key_offset; + pk_end.offset = 0; + cmp_end = as_unpack_compare(&pk, &pk_end); + } + + if (cmp_end == MSGPACK_COMPARE_LESS) { + cdt_idx_mask_set(mask, i); + (*count)++; + } + } + + // Skip value. + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_unordered() invalid packed map at index %u", i); + return false; + } + } + + offset_index_set_filled(offidx, map->ele_count); + + return true; +} + +static bool +packed_map_get_range_by_key_interval_ordered(const packed_map *map, + const cdt_payload *key_start, const cdt_payload *key_end, + uint32_t *index, uint32_t *count) +{ + map_ele_find find_key_start; + map_ele_find_init(&find_key_start, map); + + if (! packed_map_find_key(map, &find_key_start, key_start)) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_ordered() find key failed, ele_count=%u", map->ele_count); + return false; + } + + *index = find_key_start.idx; + + if (key_start == key_end) { + if (find_key_start.found_key) { + *count = 1; + } + else { + *count = 0; + } + } + else if (key_end && key_end->ptr) { + map_ele_find find_key_end; + + map_ele_find_continue_from_lower(&find_key_end, &find_key_start, + map->ele_count); + + if (! packed_map_find_key(map, &find_key_end, key_end)) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_ordered() find key failed, ele_count=%u", map->ele_count); + return false; + } + + if (find_key_end.idx <= find_key_start.idx) { + *count = 0; + } + else { + *count = find_key_end.idx - find_key_start.idx; + } + } + else { + *count = map->ele_count - find_key_start.idx; + } + + return true; +} + +// Does not respect invert flag. +static int +packed_map_build_rank_result_by_ele_idx(const packed_map *map, + const order_index *ele_idx, uint32_t start, uint32_t count, + cdt_result_data *result) +{ + if (! result->is_multi) { + uint32_t idx = order_index_get(ele_idx, start); + + return packed_map_build_rank_result_by_idx(map, idx, result); + } + + define_int_list_builder(builder, result->alloc, count); + bool is_rev = result->type == RESULT_TYPE_REVRANK; + + vla_map_allidx_if_invalid(uv, map); + + if (! packed_map_ensure_ordidx_filled(map)) { + cf_warning(AS_PARTICLE, "packed_map_build_rank_result_by_ele_idx() ordidx fill failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = order_index_get(ele_idx, start + i); + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, idx); + packed_map_find_rank_indexed(map, &find); + + if (! 
find.found_value) { + cf_warning(AS_PARTICLE, "packed_map_build_rank_result_by_ele_idx() idx %u not found find.rank %u", idx, find.rank); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + cdt_container_builder_add_int_range(&builder, rank, 1, map->ele_count, + is_rev); + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +// Does not respect invert flag. +static int +packed_map_build_rank_result_by_mask(const packed_map *map, + const uint64_t *mask, uint32_t count, cdt_result_data *result) +{ + uint32_t idx = 0; + + if (! result->is_multi) { + idx = cdt_idx_mask_find(mask, idx, map->ele_count, false); + + return packed_map_build_rank_result_by_idx(map, idx, result); + } + + define_int_list_builder(builder, result->alloc, count); + bool is_rev = result->type == RESULT_TYPE_REVRANK; + + vla_map_allidx_if_invalid(uv, map); + + if (! packed_map_ensure_ordidx_filled(map)) { + cf_warning(AS_PARTICLE, "packed_map_build_rank_result_by_mask() ordidx fill failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + for (uint32_t i = 0; i < count; i++) { + idx = cdt_idx_mask_find(mask, idx, map->ele_count, false); + + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, idx); + packed_map_find_rank_indexed(map, &find); + + if (! find.found_value) { + cf_warning(AS_PARTICLE, "packed_map_build_rank_result_by_mask() idx %u not found find.rank %u", idx, find.rank); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + cdt_container_builder_add_int_range(&builder, rank, 1, map->ele_count, + is_rev); + idx++; + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_build_rank_result_by_index_range(const packed_map *map, + uint32_t index, uint32_t count, cdt_result_data *result) +{ + if (! result->is_multi) { + return packed_map_build_rank_result_by_idx(map, index, result); + } + + cf_assert(map_is_k_ordered(map), AS_PARTICLE, "map must be K_ORDERED"); + + bool inverted = result_data_is_inverted(result); + uint32_t ret_count = (inverted ? map->ele_count - count : count); + define_int_list_builder(builder, result->alloc, ret_count); + vla_map_allidx_if_invalid(uv, map); + + if (! packed_map_ensure_ordidx_filled(map)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool is_rev = result->type == RESULT_TYPE_REVRANK; + + if (inverted) { + for (uint32_t i = 0; i < index; i++) { + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, i); + packed_map_find_rank_indexed(map, &find); + + if (! find.found_value) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + if (is_rev) { + rank = map->ele_count - rank - 1; + } + + cdt_container_builder_add_int64(&builder, rank); + } + + for (uint32_t i = index + count; i < map->ele_count; i++) { + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, i); + packed_map_find_rank_indexed(map, &find); + + if (! find.found_value) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + if (is_rev) { + rank = map->ele_count - rank - 1; + } + + cdt_container_builder_add_int64(&builder, rank); + } + } + else { + for (uint32_t i = 0; i < count; i++) { + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, index + i); + packed_map_find_rank_indexed(map, &find); + + if (! 
find.found_value) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + if (result->type == RESULT_TYPE_REVRANK) { + rank = map->ele_count - rank - 1; + } + + cdt_container_builder_add_int64(&builder, rank); + } + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +static bool +packed_map_get_key_by_idx(const packed_map *map, cdt_payload *key, + uint32_t index) +{ + uint32_t pk_offset = offset_index_get_const(&map->offidx, index); + + as_unpacker pk = { + .buffer = map->contents + pk_offset, + .length = map->content_sz - pk_offset + }; + + int64_t sz = as_unpack_size(&pk); // read key + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_key_by_idx() read key failed sz %ld", sz); + return false; + } + + key->ptr = pk.buffer; + key->sz = (uint32_t)sz; + + return true; +} + +static bool +packed_map_get_value_by_idx(const packed_map *map, cdt_payload *value, + uint32_t idx) +{ + uint32_t pk_offset = offset_index_get_const(&map->offidx, idx); + uint32_t sz = offset_index_get_delta_const(&map->offidx, idx); + + as_unpacker pk = { + .buffer = map->contents + pk_offset, + .length = map->content_sz - pk_offset + }; + + int64_t key_sz = as_unpack_size(&pk); // read key + + if (key_sz <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_value_by_idx() read key failed key_sz %ld", key_sz); + return false; + } + + value->ptr = pk.buffer + (uint32_t)key_sz; + value->sz = sz - (uint32_t)key_sz; + + return true; +} + +static bool +packed_map_get_pair_by_idx(const packed_map *map, cdt_payload *value, + uint32_t index) +{ + uint32_t pk_offset = offset_index_get_const(&map->offidx, index); + uint32_t sz = offset_index_get_delta_const(&map->offidx, index); + + value->ptr = map->contents + pk_offset; + value->sz = sz; + + return true; +} + +// Does not respect invert flag. +static int +packed_map_build_index_result_by_ele_idx(const packed_map *map, + const order_index *ele_idx, uint32_t start, uint32_t count, + cdt_result_data *result) +{ + if (count == 0) { + if (! result_data_set_not_found(result, start)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + if (! result->is_multi) { + uint32_t index = order_index_get(ele_idx, start); + + if (! map_is_k_ordered(map)) { + index = packed_map_find_index_by_idx_unordered(map, index); + } + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + as_bin_set_int(result->result, index); + + return AS_PROTO_RESULT_OK; + } + + define_int_list_builder(builder, result->alloc, count); + + if (map_is_k_ordered(map)) { + for (uint32_t i = 0; i < count; i++) { + uint32_t index = order_index_get(ele_idx, start + i); + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + cdt_container_builder_add_int64(&builder, index); + } + } + else { + offset_index *offidx = (offset_index *)&map->offidx; + + // Preset offsets if necessary. + if (! map_offset_index_fill(offidx, map->ele_count)) { + cf_warning(AS_PARTICLE, "packed_map_build_index_result_by_ele_idx() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + // Make order index on stack. 
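+		// For a non-k-ordered map, an element's storage index is not its
+		// position by key, so build a key-sorted order index and search it
+		// to translate each idx into its key-order index.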
+ define_order_index(keyordidx, map->ele_count); + bool success = order_index_set_sorted(&keyordidx, offidx, map->contents, + map->content_sz, SORT_BY_KEY); + + cf_assert(success, AS_PARTICLE, "invalid packed map with full offidx"); + + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = order_index_get(ele_idx, start + i); + uint32_t index = order_index_find_idx(&keyordidx, idx, 0, + map->ele_count); + + if (index >= map->ele_count) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + cdt_container_builder_add_int64(&builder, index); + } + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +// Does not respect invert flag. +static int +packed_map_build_index_result_by_mask(const packed_map *map, + const uint64_t *mask, uint32_t count, cdt_result_data *result) +{ + if (count == 0) { + result_data_set_not_found(result, -1); + return AS_PROTO_RESULT_OK; + } + + if (! result->is_multi) { + uint32_t index = cdt_idx_mask_find(mask, 0, map->ele_count, false); + + if (! map_is_k_ordered(map)) { + index = packed_map_find_index_by_idx_unordered(map, index); + } + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + as_bin_set_int(result->result, index); + + return AS_PROTO_RESULT_OK; + } + + define_int_list_builder(builder, result->alloc, count); + + if (map_is_k_ordered(map)) { + uint32_t index = 0; + + for (uint32_t i = 0; i < count; i++) { + index = cdt_idx_mask_find(mask, index, map->ele_count, false); + cdt_container_builder_add_int64(&builder, + result->type == RESULT_TYPE_REVINDEX ? + map->ele_count - index - 1 : index); + index++; + } + } + else { + offset_index *offidx = (offset_index *)&map->offidx; + + // Preset offsets if necessary. + if (! map_offset_index_fill(offidx, map->ele_count)) { + cf_warning(AS_PARTICLE, "packed_map_build_index_result_by_ele_idx() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + // Make order index on stack. + define_order_index(keyordidx, map->ele_count); + bool success = order_index_set_sorted(&keyordidx, offidx, map->contents, + map->content_sz, SORT_BY_KEY); + uint32_t idx = 0; + + cf_assert(success, AS_PARTICLE, "invalid packed map with full offidx"); + + for (uint32_t i = 0; i < count; i++) { + idx = cdt_idx_mask_find(mask, idx, map->ele_count, false); + + uint32_t index = order_index_find_idx(&keyordidx, idx, 0, + map->ele_count); + + if (index >= map->ele_count) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + cdt_container_builder_add_int64(&builder, index); + idx++; + } + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +// Build by map ele_idx range. +static bool +packed_map_build_ele_result_by_idx_range(const packed_map *map, + uint32_t start_idx, uint32_t count, cdt_result_data *result) +{ + offset_index *offidx = (offset_index *)&map->offidx; + + if (! 
map_offset_index_fill(offidx, map->ele_count)) { + cf_warning(AS_PARTICLE, "packed_map_build_ele_result_by_idx_range() invalid packed map"); + return false; + } + + bool inverted = result_data_is_inverted(result); + uint32_t offset0 = offset_index_get_const(offidx, start_idx); + uint32_t offset1 = offset_index_get_const(offidx, start_idx + count); + uint32_t max_sz = offset1 - offset0; + uint32_t ret_count = count; + cdt_container_builder builder; + + if (inverted) { + ret_count = map->ele_count - count; + max_sz = map->content_sz - max_sz; + } + + if (result->type == RESULT_TYPE_MAP) { + cdt_map_builder_start(&builder, result->alloc, ret_count, max_sz, + AS_PACKED_MAP_FLAG_PRESERVE_ORDER); + + if (inverted) { + uint32_t tail_sz = map->content_sz - offset1; + + memcpy(builder.write_ptr, map->contents, offset0); + builder.write_ptr += offset0; + memcpy(builder.write_ptr, map->contents + offset1, tail_sz); + } + else { + memcpy(builder.write_ptr, map->contents + offset0, max_sz); + } + + *builder.sz += max_sz; + cdt_container_builder_set_result(&builder, result); + + return true; + } + + packed_map_get_by_idx_func get_by_idx_func; + + if (result->type == RESULT_TYPE_KEY) { + get_by_idx_func = packed_map_get_key_by_idx; + } + else { + get_by_idx_func = packed_map_get_value_by_idx; + } + + if (result->is_multi) { + cdt_list_builder_start(&builder, result->alloc, ret_count, max_sz); + } + else { + cdt_payload packed; + + if (! get_by_idx_func(map, &packed, start_idx)) { + return false; + } + + return rollback_alloc_from_msgpack(result->alloc, result->result, + &packed); + } + + if (inverted) { + for (uint32_t i = 0; i < start_idx; i++) { + cdt_payload packed; + + if (! get_by_idx_func(map, &packed, i)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + } + + for (uint32_t i = start_idx + count; i < map->ele_count; i++) { + cdt_payload packed; + + if (! get_by_idx_func(map, &packed, i)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + } + } + else { + for (uint32_t i = 0; i < count; i++) { + cdt_payload packed; + + if (! get_by_idx_func(map, &packed, start_idx + i)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + } + } + + cdt_container_builder_set_result(&builder, result); + + return true; +} + +// Does not respect invert flag. +static bool +packed_map_build_ele_result_by_ele_idx(const packed_map *map, + const order_index *ele_idx, uint32_t start, uint32_t count, + uint32_t rm_sz, cdt_result_data *result) +{ + if (rm_sz == 0) { + if (start != 0) { + order_index ref; + + order_index_init_ref(&ref, ele_idx, start, count); + rm_sz = order_index_get_ele_size(&ref, count, &map->offidx); + } + else { + rm_sz = order_index_get_ele_size(ele_idx, count, &map->offidx); + } + } + + packed_map_get_by_idx_func get_by_index_func; + cdt_container_builder builder; + uint32_t max_sz = (count != 0 ? 
rm_sz : 0); + + if (result->type == RESULT_TYPE_MAP) { + get_by_index_func = packed_map_get_pair_by_idx; + + cdt_map_builder_start(&builder, result->alloc, count, max_sz, + AS_PACKED_MAP_FLAG_PRESERVE_ORDER); + } + else { + if (result->type == RESULT_TYPE_KEY) { + get_by_index_func = packed_map_get_key_by_idx; + } + else { + get_by_index_func = packed_map_get_value_by_idx; + } + + if (result->is_multi) { + cdt_list_builder_start(&builder, result->alloc, count, + max_sz - count); + } + else if (count == 0) { + return true; + } + else { + uint32_t index = order_index_get(ele_idx, start); + cdt_payload packed; + + if (! get_by_index_func(map, &packed, index)) { + return false; + } + + return rollback_alloc_from_msgpack(result->alloc, result->result, + &packed); + } + } + + for (uint32_t i = 0; i < count; i++) { + uint32_t index = order_index_get(ele_idx, i + start); + cdt_payload packed; + + if (! get_by_index_func(map, &packed, index)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + } + + cdt_container_builder_set_result(&builder, result); + + return true; +} + +// Does not respect invert flag. +static bool +packed_map_build_ele_result_by_mask(const packed_map *map, const uint64_t *mask, + uint32_t count, uint32_t rm_sz, cdt_result_data *result) +{ + if (! result->is_multi) { + uint32_t idx = cdt_idx_mask_find(mask, 0, map->ele_count, false); + define_order_index2(ele_idx, map->ele_count, 1); + + order_index_set(&ele_idx, 0, idx); + + return packed_map_build_ele_result_by_ele_idx(map, &ele_idx, 0, 1, + rm_sz, result); + } + + if (rm_sz == 0) { + rm_sz = cdt_idx_mask_get_content_sz(mask, count, &map->offidx); + } + + packed_map_get_by_idx_func get_by_index_func; + cdt_container_builder builder; + uint32_t max_sz = (count != 0 ? rm_sz : 0); + + if (result->type == RESULT_TYPE_MAP) { + get_by_index_func = packed_map_get_pair_by_idx; + + cdt_map_builder_start(&builder, result->alloc, count, max_sz, + AS_PACKED_MAP_FLAG_PRESERVE_ORDER); + } + else { + if (result->type == RESULT_TYPE_KEY) { + get_by_index_func = packed_map_get_key_by_idx; + } + else { + get_by_index_func = packed_map_get_value_by_idx; + } + + cdt_list_builder_start(&builder, result->alloc, count, max_sz - count); + } + + uint32_t index = 0; + + for (uint32_t i = 0; i < count; i++) { + cdt_payload packed; + + index = cdt_idx_mask_find(mask, index, map->ele_count, false); + + if (! get_by_index_func(map, &packed, index)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + index++; + } + + cdt_container_builder_set_result(&builder, result); + + return true; +} + +static int +packed_map_build_result_by_key(const packed_map *map, const cdt_payload *key, + uint32_t idx, uint32_t count, cdt_result_data *result) +{ + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: { + uint32_t index = idx; + + if (! 
map_is_k_ordered(map)) { + index = packed_map_find_index_by_key_unordered(map, key); + } + + if (result_data_is_return_index_range(result)) { + if (result->type == RESULT_TYPE_REVINDEX_RANGE) { + index = map->ele_count - index - count; + } + + result_data_set_list_int2x(result, index, count); + } + else { + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - count; + } + + as_bin_set_int(result->result, index); + } + + break; + } + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + if (result->is_multi) { + return packed_map_build_rank_result_by_idx_range(map, idx, count, + result); + } + + return packed_map_build_rank_result_by_idx(map, idx, result); + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: + if (! packed_map_build_ele_result_by_idx_range(map, idx, count, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + break; + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + default: + cf_warning(AS_PARTICLE, "packed_map_build_result_by_key() invalid result_type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +// Return negative codes on error. +static int64_t +packed_map_get_rank_by_idx(const packed_map *map, uint32_t idx) +{ + cf_assert(map_has_offidx(map), AS_PARTICLE, "packed_map_get_rank_by_idx() offset_index needs to be valid"); + + uint32_t rank; + + if (order_index_is_valid(&map->value_idx)) { + if (! packed_map_ensure_ordidx_filled(map)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + map_ele_find find_key; + map_ele_find_init_from_idx(&find_key, map, idx); + + if (! packed_map_find_rank_indexed(map, &find_key)) { + cf_warning(AS_PARTICLE, "packed_map_get_rank_by_idx() packed_map_find_rank_indexed failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (! 
find_key.found_value) { + cf_warning(AS_PARTICLE, "packed_map_get_rank_by_idx() rank not found, idx=%u rank=%u", find_key.idx, find_key.rank); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + rank = find_key.rank; + } + else { + const offset_index *offidx = &map->offidx; + uint32_t pk_offset = offset_index_get_const(offidx, idx); + define_map_unpacker(pk, map); + + as_unpacker pk_entry = { + .buffer = map->contents + pk_offset, + .length = map->content_sz - pk_offset + }; + + rank = 0; + + for (uint32_t i = 0; i < map->ele_count; i++) { + pk_entry.offset = 0; + + msgpack_compare_t cmp = packed_map_compare_values(&pk, &pk_entry); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (cmp == MSGPACK_COMPARE_LESS) { + rank++; + } + } + } + + return (int64_t)rank; +} + +static int +packed_map_build_rank_result_by_idx(const packed_map *map, uint32_t idx, + cdt_result_data *result) +{ + int64_t rank = packed_map_get_rank_by_idx(map, idx); + + if (rank < 0) { + return (int)rank; + } + + if (result->type == RESULT_TYPE_REVRANK) { + as_bin_set_int(result->result, (int64_t)map->ele_count - rank - 1); + } + else { + as_bin_set_int(result->result, rank); + } + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_build_rank_result_by_idx_range(const packed_map *map, uint32_t idx, + uint32_t count, cdt_result_data *result) +{ + define_int_list_builder(builder, result->alloc, count); + + for (uint32_t i = 0; i < count; i++) { + int64_t rank = packed_map_get_rank_by_idx(map, idx); + + if (rank < 0) { + return (int)rank; + } + + if (result->type == RESULT_TYPE_REVRANK) { + rank = (int64_t)map->ele_count - rank - 1; + } + + cdt_container_builder_add_int64(&builder, rank); + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +static msgpack_compare_t +packed_map_compare_key_by_idx(const void *ptr, uint32_t idx1, uint32_t idx2) +{ + const packed_map *map = ptr; + const offset_index *offidx = &map->offidx; + + as_unpacker pk1 = { + .buffer = map->contents, + .offset = offset_index_get_const(offidx, idx1), + .length = map->content_sz + }; + + as_unpacker pk2 = { + .buffer = map->contents, + .offset = offset_index_get_const(offidx, idx2), + .length = map->content_sz + }; + + msgpack_compare_t ret = as_unpack_compare(&pk1, &pk2); + + if (ret == MSGPACK_COMPARE_EQUAL) { + ret = as_unpack_compare(&pk1, &pk2); + } + + return ret; +} + +static msgpack_compare_t +packed_map_compare_values(as_unpacker *pk1, as_unpacker *pk2) +{ + msgpack_compare_t keycmp = as_unpack_compare(pk1, pk2); + + if (keycmp == MSGPACK_COMPARE_ERROR) { + return MSGPACK_COMPARE_ERROR; + } + + msgpack_compare_t ret = as_unpack_compare(pk1, pk2); + + if (ret == MSGPACK_COMPARE_EQUAL) { + return keycmp; + } + + return ret; +} + +static msgpack_compare_t +packed_map_compare_value_by_idx(const void *ptr, uint32_t idx1, uint32_t idx2) +{ + const packed_map *map = ptr; + const offset_index *offidx = &map->offidx; + + as_unpacker pk1 = { + .buffer = map->contents, + .offset = offset_index_get_const(offidx, idx1), + .length = map->content_sz + }; + + as_unpacker pk2 = { + .buffer = map->contents, + .offset = offset_index_get_const(offidx, idx2), + .length = map->content_sz + }; + + return packed_map_compare_values(&pk1, &pk2); +} + +static bool +packed_map_write_k_ordered(const packed_map *map, uint8_t *write_ptr, + offset_index *offsets_new) +{ + uint32_t ele_count = map->ele_count; + define_order_index(key_ordidx, ele_count); + vla_map_offidx_if_invalid(old, map); + 
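+	// Three steps: fill the source map's offset index, sort a temporary
+	// order index by key, then copy each key-value pair out in key order
+	// while appending per-pair sizes to the new offset index.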
+ if (! map_fill_offidx(map)) { + cf_warning(AS_PARTICLE, "packed_map_op_write_k_ordered() offset fill failed"); + return false; + } + + if (! order_index_set_sorted_with_offsets(&key_ordidx, old.offidx, + SORT_BY_KEY)) { + return false; + } + + const uint8_t *ptr = old.offidx->contents; + + offset_index_set_filled(offsets_new, 1); + + for (uint32_t i = 0; i < ele_count; i++) { + uint32_t index = order_index_get(&key_ordidx, i); + uint32_t offset = offset_index_get_const(old.offidx, index); + uint32_t sz = offset_index_get_delta_const(old.offidx, index); + + memcpy(write_ptr, ptr + offset, sz); + write_ptr += sz; + offset_index_append_size(offsets_new, sz); + } + + return true; +} + +//------------------------------------------------ +// packed_map_op + +static void +packed_map_op_init(packed_map_op *op, const packed_map *map) +{ + op->map = map; + + op->new_ele_count = 0; + op->ele_removed = 0; + + op->seg1_sz = 0; + op->seg2_offset = 0; + op->seg2_sz = 0; + + op->key1_offset = 0; + op->key1_sz = 0; + op->key2_offset = 0; + op->key2_sz = 0; +} + +// Return new size of map elements. +static int32_t +packed_map_op_add(packed_map_op *op, const map_ele_find *found) +{ + // Replace at offset. + if (found->found_key) { + op->new_ele_count = op->map->ele_count; + op->seg2_offset = found->key_offset + found->sz; + } + // Insert at offset. + else { + op->new_ele_count = op->map->ele_count + 1; + op->seg2_offset = found->key_offset; + } + + op->seg1_sz = found->key_offset; + op->seg2_sz = op->map->content_sz - op->seg2_offset; + + return (int32_t)(op->seg1_sz + op->seg2_sz); +} + +static int32_t +packed_map_op_remove(packed_map_op *op, const map_ele_find *found, + uint32_t count, uint32_t remove_sz) +{ + op->new_ele_count = op->map->ele_count - count; + op->seg1_sz = found->key_offset; + op->seg2_offset = found->key_offset + remove_sz; + op->seg2_sz = op->map->content_sz - op->seg2_offset; + + op->ele_removed = count; + + return (int32_t)(op->seg1_sz + op->seg2_sz); +} + +static uint8_t * +packed_map_op_write_seg1(const packed_map_op *op, uint8_t *buf) +{ + const uint8_t *src = op->map->contents; + + memcpy(buf, src, op->seg1_sz); + memcpy(buf + op->seg1_sz, src + op->key1_offset, op->key1_sz); + + return buf + op->seg1_sz + op->key1_sz; +} + +static uint8_t * +packed_map_op_write_seg2(const packed_map_op *op, uint8_t *buf) +{ + const uint8_t *src = op->map->contents; + + memcpy(buf, src + op->key2_offset, op->key2_sz); + memcpy(buf + op->key2_sz, src + op->seg2_offset, op->seg2_sz); + + return buf + op->key2_sz + op->seg2_sz; +} + +static bool +packed_map_op_write_new_offidx(const packed_map_op *op, + const map_ele_find *remove_info, const map_ele_find *add_info, + offset_index *new_offidx, uint32_t kv_sz) +{ + const offset_index *offidx = &op->map->offidx; + + if (! offset_index_is_full(offidx)) { + return false; + } + + cf_assert(op->new_ele_count >= op->map->ele_count, AS_PARTICLE, "op->new_ele_count %u < op->map->ele_count %u", op->new_ele_count, op->map->ele_count); + + uint32_t ele_count = op->map->ele_count; + + if (op->new_ele_count - op->map->ele_count != 0) { // add 1 + // Insert at end. + if (remove_info->idx == ele_count) { + offset_index_copy(new_offidx, offidx, 0, 0, ele_count, 0); + offset_index_set(new_offidx, ele_count, op->seg1_sz + op->seg2_sz); + } + // Insert at offset. 
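+		// Copy offsets up to and including the insert point unchanged, then
+		// re-copy from the insert point into slots shifted up by one, with
+		// each offset biased by kv_sz.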
+ else { + offset_index_copy(new_offidx, offidx, 0, 0, + remove_info->idx + 1, 0); + offset_index_copy(new_offidx, offidx, remove_info->idx + 1, + remove_info->idx, (ele_count - remove_info->idx), kv_sz); + } + } + else { // replace 1 + cf_assert(remove_info->idx == add_info->idx, AS_PARTICLE, "remove_info->idx %u != add_info->idx %u", remove_info->idx, add_info->idx); + + offset_index_copy(new_offidx, offidx, 0, 0, remove_info->idx, 0); + offset_index_set(new_offidx, remove_info->idx, remove_info->key_offset); + + int delta = (int)kv_sz - (int)remove_info->sz; + + offset_index_copy(new_offidx, offidx, remove_info->idx + 1, + remove_info->idx + 1, ele_count - remove_info->idx - 1, delta); + } + + offset_index_set_filled(new_offidx, op->new_ele_count); + + return true; +} + +static bool +packed_map_op_write_new_ordidx(const packed_map_op *op, + const map_ele_find *remove_info, const map_ele_find *add_info, + order_index *value_idx) +{ + const order_index *ordidx = &op->map->value_idx; + + if (order_index_is_null(ordidx)) { + return false; + } + + cf_assert(op->new_ele_count >= op->map->ele_count, AS_PARTICLE, "op->new_ele_count %u < op->map->ele_count %u", op->new_ele_count, op->map->ele_count); + + if (op->new_ele_count - op->map->ele_count != 0) { // add 1 + order_index_op_add(value_idx, ordidx, add_info->idx, add_info->rank); + } + else { // replace 1 + cf_assert(remove_info->idx == add_info->idx, AS_PARTICLE, "remove_info->idx %u != add_info->idx %u", remove_info->idx, add_info->idx); + + order_index_op_replace1(value_idx, ordidx, add_info->rank, + remove_info->rank); + } + + return true; +} + +//------------------------------------------------ +// map_particle + +static as_particle * +map_particle_create(rollback_alloc *alloc_buf, uint32_t ele_count, + const uint8_t *buf, uint32_t content_sz, uint8_t flags) +{ + define_map_packer(mpk, ele_count, flags, content_sz); + map_mem *p_map_mem = (map_mem *)map_packer_create_particle(&mpk, alloc_buf); + + if (! p_map_mem) { + return NULL; + } + + map_packer_write_hdridx(&mpk); + + if (buf) { + memcpy(mpk.write_ptr, buf, content_sz); + } + + return (as_particle *)p_map_mem; +} + +// Return new size on success, negative values on failure. +static int64_t +map_particle_strip_indexes(const as_particle *p, uint8_t *dest) +{ + const map_mem *p_map_mem = (const map_mem *)p; + + if (p_map_mem->sz == 0) { + return 0; + } + + as_unpacker upk = { + .buffer = p_map_mem->data, + .length = p_map_mem->sz + }; + + int64_t ele_count = as_unpack_map_header_element_count(&upk); + + if (ele_count < 0) { + return -1; + } + + as_packer pk = { + .buffer = dest, + .capacity = INT_MAX + }; + + if (ele_count > 0 && as_unpack_peek_is_ext(&upk)) { + as_msgpack_ext ext; + + if (as_unpack_ext(&upk, &ext) != 0) { + return -2; + } + + // Skip nil val. + if (as_unpack_size(&upk) <= 0) { + return -3; + } + + uint8_t flags = ext.type; + + if (flags != AS_PACKED_MAP_FLAG_NONE) { + ele_count--; + } + + flags &= ~(AS_PACKED_MAP_FLAG_OFF_IDX | AS_PACKED_MAP_FLAG_ORD_IDX); + + if (flags != AS_PACKED_MAP_FLAG_NONE) { + as_pack_map_header(&pk, (uint32_t)ele_count + 1); + as_pack_ext_header(&pk, 0, flags); + pk.buffer[pk.offset++] = msgpack_nil[0]; + } + else { + as_pack_map_header(&pk, (uint32_t)ele_count); + } + } + else { + // Copy header. + as_pack_map_header(&pk, (uint32_t)ele_count); + } + + // Copy elements. 
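+	// Any ext header (and its nil value) was consumed above, so everything
+	// from upk.offset onward is plain key-value msgpack, copied verbatim.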
+ size_t ele_sz = (size_t)(upk.length - upk.offset); + + memcpy(pk.buffer + pk.offset, upk.buffer + upk.offset, ele_sz); + + return (int64_t)pk.offset + (int64_t)ele_sz; +} + +//------------------------------------------------ +// map_ele_find + +static void +map_ele_find_init(map_ele_find *find, const packed_map *map) +{ + find->found_key = false; + find->found_value = false; + find->idx = map->ele_count; + find->rank = map->ele_count; + + find->key_offset = 0; + find->value_offset = 0; + find->sz = 0; + + find->lower = 0; + find->upper = map->ele_count; +} + +static void +map_ele_find_continue_from_lower(map_ele_find *find, const map_ele_find *found, + uint32_t ele_count) +{ + find->found_key = false; + find->found_value = false; + + find->idx = ele_count + found->idx; + find->idx /= 2; + find->rank = find->idx; + + find->key_offset = found->key_offset; + find->value_offset = found->value_offset; + find->sz = found->sz; + + find->lower = found->idx; + find->upper = ele_count; +} + +static void +map_ele_find_init_from_idx(map_ele_find *find, const packed_map *map, + uint32_t idx) +{ + map_ele_find_init(find, map); + find->found_key = true; + find->idx = idx; + find->key_offset = offset_index_get_const(&map->offidx, idx); + + as_unpacker pk = { + .buffer = map->contents, + .offset = find->key_offset, + .length = map->content_sz + }; + + as_unpack_size(&pk); + find->value_offset = pk.offset; + find->sz = offset_index_get_const(&map->offidx, idx + 1) - find->key_offset; +} + +//------------------------------------------------ +// map_offset_index + +static bool +map_offset_index_fill(offset_index *offidx, uint32_t index) +{ + uint32_t ele_filled = offset_index_get_filled(offidx); + + if (index < ele_filled || offidx->_.ele_count == ele_filled) { + return true; + } + + as_unpacker pk = { + .buffer = offidx->contents, + .length = offidx->content_sz + }; + + pk.offset = offset_index_get_const(offidx, ele_filled - 1); + + for (uint32_t i = ele_filled; i < index; i++) { + if (as_unpack_size(&pk) <= 0) { + return false; + } + + if (as_unpack_size(&pk) <= 0) { + return false; + } + + offset_index_set(offidx, i, pk.offset); + } + + if (as_unpack_size(&pk) <= 0) { + return false; + } + + if (as_unpack_size(&pk) <= 0) { + return false; + } + + // Make sure last iteration is in range for set. + if (index < offidx->_.ele_count) { + offset_index_set(offidx, index, pk.offset); + offset_index_set_filled(offidx, index + 1); + } + // Check if sizes match. + else if (pk.offset != offidx->content_sz) { + cf_warning(AS_PARTICLE, "map_offset_index_fill() offset mismatch %u, expected %u", pk.offset, offidx->content_sz); + return false; + } + else { + offset_index_set_filled(offidx, offidx->_.ele_count); + } + + return true; +} + +static int64_t +map_offset_index_get(offset_index *offidx, uint32_t index) +{ + if (index > offidx->_.ele_count) { + index = offidx->_.ele_count; + } + + if (! 
map_offset_index_fill(offidx, index)) { + return -1; + } + + return (int64_t)offset_index_get_const(offidx, index); +} + +static int64_t +map_offset_index_get_delta(offset_index *offidx, uint32_t index) +{ + int64_t offset = map_offset_index_get(offidx, index); + + if (offset < 0) { + return offset; + } + + if (index == offidx->_.ele_count - 1) { + return (int64_t)offidx->content_sz - offset; + } + + return map_offset_index_get(offidx, index + 1) - offset; +} + +//------------------------------------------------ +// offidx_op + +static void +offidx_op_init(offidx_op *op, offset_index *dest, const offset_index *src) +{ + op->dest = dest; + op->src = src; + op->d_i = 0; + op->s_i = 0; + op->delta = 0; +} + +static void +offidx_op_remove(offidx_op *op, uint32_t index) +{ + uint32_t count = index - op->s_i; + uint32_t mem_sz = offset_index_get_delta_const(op->src, index); + + offset_index_copy(op->dest, op->src, op->d_i, op->s_i, count, op->delta); + + op->delta -= mem_sz; + op->d_i += count; + op->s_i += count + 1; +} + +static void +offidx_op_remove_range(offidx_op *op, uint32_t index, uint32_t count) +{ + uint32_t ele_count = op->src->_.ele_count; + uint32_t delta_count = index - op->s_i; + uint32_t offset = offset_index_get_const(op->src, index); + uint32_t mem_sz; + + if (index + count == ele_count) { + mem_sz = op->src->content_sz - offset; + } + else { + mem_sz = offset_index_get_const(op->src, index + count) - offset; + } + + offset_index_copy(op->dest, op->src, op->d_i, op->s_i, delta_count, + op->delta); + + op->delta -= mem_sz; + op->d_i += delta_count; + op->s_i += delta_count + count; +} + +static void +offidx_op_end(offidx_op *op) +{ + uint32_t ele_count = op->src->_.ele_count; + uint32_t count = ele_count - op->s_i; + + offset_index_copy(op->dest, op->src, op->d_i, op->s_i, count, op->delta); + op->d_i += count; + offset_index_set_filled(op->dest, op->d_i); +} + +//------------------------------------------------ +// order_index + +static bool +order_index_sort(order_index *ordidx, const offset_index *offsets, + const uint8_t *contents, uint32_t content_sz, sort_by_t sort_by) +{ + uint32_t ele_count = ordidx->_.ele_count; + + index_sort_userdata udata = { + .order = ordidx, + .offsets = offsets, + .contents = contents, + .content_sz = content_sz, + .error = false, + .sort_by = sort_by + }; + + qsort_r(order_index_get_mem(ordidx, 0), ele_count, ordidx->_.ele_sz, + map_packer_fill_index_sort_compare, (void *)&udata); + + if (udata.error) { + return false; + } + + return true; +} + +static inline bool +order_index_set_sorted(order_index *ordidx, const offset_index *offsets, + const uint8_t *ele_start, uint32_t tot_ele_sz, sort_by_t sort_by) +{ + uint32_t ele_count = ordidx->_.ele_count; + + for (uint32_t i = 0; i < ele_count; i++) { + order_index_set(ordidx, i, i); + } + + return order_index_sort(ordidx, offsets, ele_start, tot_ele_sz, sort_by); +} + +static bool +order_index_set_sorted_with_offsets(order_index *ordidx, + const offset_index *offsets, sort_by_t sort_by) +{ + return order_index_set_sorted(ordidx, offsets, offsets->contents, + offsets->content_sz, sort_by); +} + +static uint32_t +order_index_find_idx(const order_index *ordidx, uint32_t idx, uint32_t start, + uint32_t len) +{ + for (uint32_t i = start; i < start + len; i++) { + if (order_index_get(ordidx, i) == idx) { + return i; + } + } + + return start + len; +} + +//------------------------------------------------ +// order_index_adjust + +static uint32_t +order_index_adjust_lower(const order_index_adjust *via, 
uint32_t src) +{ + if (src >= via->lower) { + return src + via->delta; + } + + return src; +} + +//------------------------------------------------ +// order_index_op + +static inline void +order_index_op_add(order_index *dest, const order_index *src, uint32_t add_idx, + uint32_t add_rank) +{ + uint32_t ele_count = src->_.ele_count; + + order_index_adjust adjust = { + .f = order_index_adjust_lower, + .lower = add_idx, + .upper = 0, + .delta = 1 + }; + + cf_assert(add_rank <= ele_count, AS_PARTICLE, "order_index_op_add() add_rank(%u) > ele_count(%u)", add_rank, ele_count); + order_index_copy(dest, src, 0, 0, add_rank, &adjust); + order_index_set(dest, add_rank, add_idx); + order_index_copy(dest, src, add_rank + 1, add_rank, ele_count - add_rank, + &adjust); +} + +static inline void +order_index_op_replace1_internal(order_index *dest, const order_index *src, + uint32_t add_idx, uint32_t add_rank, uint32_t remove_rank, + const order_index_adjust *adjust) +{ + uint32_t ele_count = src->_.ele_count; + + if (add_rank == remove_rank) { + order_index_copy(dest, src, 0, 0, ele_count, NULL); + } + else if (add_rank > remove_rank) { + order_index_copy(dest, src, 0, 0, remove_rank, adjust); + order_index_copy(dest, src, remove_rank, remove_rank + 1, + add_rank - remove_rank - 1, adjust); + order_index_set(dest, add_rank - 1, add_idx); + order_index_copy(dest, src, add_rank, add_rank, ele_count - add_rank, + adjust); + } + else { + order_index_copy(dest, src, 0, 0, add_rank, adjust); + order_index_set(dest, add_rank, add_idx); + order_index_copy(dest, src, add_rank + 1, add_rank, + remove_rank - add_rank, adjust); + order_index_copy(dest, src, remove_rank + 1, remove_rank + 1, + ele_count - remove_rank - 1, adjust); + } +} + +// Replace remove_rank with add_rank in dest. 
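+// The moved element keeps its idx; only its position in the value-sorted
+// order changes, so entries strictly between the two ranks shift by one slot.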
+static inline void +order_index_op_replace1(order_index *dest, const order_index *src, + uint32_t add_rank, uint32_t remove_rank) +{ + uint32_t add_idx = order_index_get(src, remove_rank); + + order_index_op_replace1_internal(dest, src, add_idx, add_rank, remove_rank, + NULL); +} + +static void +order_index_op_remove_idx_mask(order_index *dest, const order_index *src, + const uint64_t *mask, uint32_t count) +{ + if (count == 0) { + return; + } + + uint32_t ele_count = src->max_idx; + uint32_t mask_count = cdt_idx_mask_count(ele_count); + define_order_index2(cntidx, ele_count, mask_count); + + order_index_set(&cntidx, 0, cf_bit_count64(mask[0])); + + for (uint32_t i = 1; i < mask_count; i++) { + uint32_t prev = order_index_get(&cntidx, i - 1); + + order_index_set(&cntidx, i, prev + cf_bit_count64(mask[i])); + } + + uint32_t di = 0; + + for (uint32_t i = 0; i < ele_count; i++) { + uint32_t idx = order_index_get(src, i); + + if (idx >= ele_count || cdt_idx_mask_is_set(mask, idx)) { + continue; + } + + uint32_t mask_i = idx / 64; + uint32_t offset = idx % 64; + uint64_t bits = cdt_idx_mask_get(mask, idx) & ((1ULL << offset) - 1); + + if (mask_i == 0) { + idx -= cf_bit_count64(bits); + } + else { + idx -= cf_bit_count64(bits) + order_index_get(&cntidx, mask_i - 1); + } + + order_index_set(dest, di++, idx); + } + + cf_assert(dest->_.ele_count == di, AS_PARTICLE, "count mismatch ele_count %u != di %u", dest->_.ele_count, di); +} + + +//========================================================== +// result_data + +static bool +result_data_set_key_not_found(cdt_result_data *rd, int64_t index) +{ + switch (rd->type) { + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + break; + default: + return result_data_set_not_found(rd, index); + } + + return false; +} + +static bool +result_data_set_value_not_found(cdt_result_data *rd, int64_t rank) +{ + switch (rd->type) { + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_INDEX_RANGE: + return false; + default: + return result_data_set_not_found(rd, rank); + } + + return true; +} + + +//========================================================== +// cdt_map_builder +// + +void +cdt_map_builder_start(cdt_container_builder *builder, rollback_alloc *alloc_buf, + uint32_t ele_count, uint32_t max_sz, uint8_t flags) +{ + uint32_t sz = sizeof(map_mem) + sizeof(uint64_t) + 1 + 3 + max_sz; + map_mem *p_map_mem = (map_mem *)rollback_alloc_reserve(alloc_buf, sz); + + as_packer pk = { + .buffer = p_map_mem->data, + .capacity = INT_MAX + }; + + if (flags != AS_PACKED_MAP_FLAG_NONE) { + as_pack_map_header(&pk, ele_count + 1); + as_pack_ext_header(&pk, 0, flags); + pk.buffer[pk.offset++] = msgpack_nil[0]; + } + else { + as_pack_map_header(&pk, ele_count); + } + + p_map_mem->type = AS_PARTICLE_TYPE_MAP; + p_map_mem->sz = pk.offset; + + builder->particle = (as_particle *)p_map_mem; + builder->write_ptr = p_map_mem->data + p_map_mem->sz; + builder->ele_count = 0; + builder->sz = &p_map_mem->sz; +} + + +//========================================================== +// cdt_process_state_packed_map +// + +bool +cdt_process_state_packed_map_modify_optype(cdt_process_state *state, + cdt_modify_data *cdt_udata) +{ + as_bin *b = cdt_udata->b; + as_cdt_optype optype = state->type; + + if (! 
is_map_type(as_bin_get_particle_type(b)) && as_bin_inuse(b)) { + cf_warning(AS_PARTICLE, "cdt_process_state_packed_map_modify_optype() invalid type %d", as_bin_get_particle_type(b)); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + define_rollback_alloc(alloc_buf, cdt_udata->alloc_buf, 1, true); + // Results always on the heap. + define_rollback_alloc(alloc_result, NULL, 1, false); + int ret = AS_PROTO_RESULT_OK; + + cdt_result_data result = { + .result = cdt_udata->result, + .alloc = alloc_result, + }; + + switch (optype) { + case AS_CDT_OP_MAP_SET_TYPE: { + uint64_t flags; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_use_static_map_mem_if_notinuse(b, 0); + ret = map_set_flags(b, alloc_buf, result.result, (uint8_t)flags); + break; + } + case AS_CDT_OP_MAP_ADD: { + cdt_payload key; + cdt_payload value; + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &key, &value, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = false, + .allow_create = true, + }; + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_add(b, alloc_buf, &key, &value, result.result, &control); + break; + } + case AS_CDT_OP_MAP_ADD_ITEMS: { + cdt_payload items; + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &items, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = false, + .allow_create = true, + }; + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_add_items(b, alloc_buf, &items, result.result, &control); + break; + } + case AS_CDT_OP_MAP_PUT: { + cdt_payload key; + cdt_payload value; + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &key, &value, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = true, + .allow_create = true, + }; + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_add(b, alloc_buf, &key, &value, result.result, &control); + break; + } + case AS_CDT_OP_MAP_PUT_ITEMS: { + cdt_payload items; + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &items, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = true, + .allow_create = true, + }; + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_add_items(b, alloc_buf, &items, result.result, &control); + break; + } + case AS_CDT_OP_MAP_REPLACE: { + cdt_payload key; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &key, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = true, + .allow_create = false, + }; + + as_bin_use_static_map_mem_if_notinuse(b, 0); + ret = map_add(b, alloc_buf, &key, &value, result.result, &control); + break; + } + case AS_CDT_OP_MAP_REPLACE_ITEMS: { + if (! as_bin_inuse(b)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND; + return false; + } + + cdt_payload items; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = true, + .allow_create = false, + }; + + ret = map_add_items(b, alloc_buf, &items, result.result, &control); + break; + } + case AS_CDT_OP_MAP_INCREMENT: + case AS_CDT_OP_MAP_DECREMENT: { + cdt_payload key; + cdt_payload delta_value = { NULL }; + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &key, &delta_value, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_increment(b, alloc_buf, &key, &delta_value, result.result, + optype == AS_CDT_OP_MAP_DECREMENT); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_KEY: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t op_flags; + cdt_payload key; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &op_flags, &key)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, op_flags, false); + ret = map_remove_by_key_interval(b, alloc_buf, &key, &key, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_INDEX: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = map_remove_by_index_range(b, alloc_buf, index, 1, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_VALUE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = map_remove_by_value_interval(b, alloc_buf, &value, &value, + &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_RANK: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = map_remove_by_rank_range(b, alloc_buf, index, 1, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_KEY_LIST: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_all_by_key_list(b, alloc_buf, &items, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_ALL_BY_VALUE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_value_interval(b, alloc_buf, &value, &value, + &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_VALUE_LIST: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload items; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_all_by_value_list(b, alloc_buf, &items, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_KEY_INTERVAL: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload key_start; + cdt_payload key_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &key_start, + &key_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_key_interval(b, alloc_buf, &key_start, &key_end, + &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_INDEX_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_index_range(b, alloc_buf, index, count, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_VALUE_INTERVAL: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value_start; + cdt_payload value_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_start, + &value_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_value_interval(b, alloc_buf, &value_start, + &value_end, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_RANK_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t rank; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_rank_range(b, alloc_buf, rank, count, &result); + break; + } + case AS_CDT_OP_MAP_CLEAR: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + ret = map_clear(b, alloc_buf, result.result); + break; + } + default: + cf_warning(AS_PARTICLE, "cdt_process_state_packed_map_modify_optype() invalid cdt op: %d", optype); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + if (ret != AS_PROTO_RESULT_OK) { + cf_warning(AS_PARTICLE, "%s: failed", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = ret; + rollback_alloc_rollback(alloc_result); + rollback_alloc_rollback(alloc_buf); + return false; + } + + if (b->particle == (const as_particle *)&map_mem_empty) { + as_bin_set_empty_packed_map(b, alloc_buf, 0); + } + else if (b->particle == (const as_particle *)map_mem_empty_flagged_table) { + as_bin_set_empty_packed_map(b, alloc_buf, + map_mem_empty_flagged_table[0].map.ext_flags); + } + else if (b->particle == + (const as_particle *)(map_mem_empty_flagged_table + 1)) { + as_bin_set_empty_packed_map(b, alloc_buf, + map_mem_empty_flagged_table[1].map.ext_flags); + } + + return true; +} + +bool +cdt_process_state_packed_map_read_optype(cdt_process_state *state, + cdt_read_data *cdt_udata) +{ + const as_bin *b = cdt_udata->b; + as_cdt_optype optype = state->type; + + if (! 
is_map_type(as_bin_get_particle_type(b))) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + packed_map map; + + if (! packed_map_init_from_bin(&map, b, false)) { + cf_warning(AS_PARTICLE, "%s: invalid map", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + // Just one entry needed for results bin. + define_rollback_alloc(alloc_result, NULL, 1, false); + int ret = AS_PROTO_RESULT_OK; + + cdt_result_data result = { + .result = cdt_udata->result, + .alloc = alloc_result, + }; + + switch (optype) { + case AS_CDT_OP_MAP_SIZE: { + as_bin_set_int(result.result, map.ele_count); + break; + } + case AS_CDT_OP_MAP_GET_BY_KEY: { + uint64_t op_flags; + cdt_payload key; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &op_flags, &key)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, op_flags, false); + ret = packed_map_get_remove_by_key_interval(&map, NULL, NULL, &key, + &key, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_VALUE: { + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_map_get_remove_by_value_interval(&map, NULL, NULL, + &value, &value, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_INDEX: { + uint64_t result_type; + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_map_get_remove_by_index_range(&map, NULL, NULL, index, 1, + &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_RANK: { + uint64_t result_type; + int64_t rank; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_map_get_remove_by_rank_range(&map, NULL, NULL, rank, 1, + &result); + break; + } + case AS_CDT_OP_MAP_GET_ALL_BY_VALUE: { + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_value_interval(&map, NULL, NULL, + &value, &value, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_KEY_INTERVAL: { + uint64_t result_type; + cdt_payload key_start; + cdt_payload key_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &key_start, + &key_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_key_interval(&map, NULL, NULL, + &key_start, &key_end, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_VALUE_INTERVAL: { + uint64_t result_type; + cdt_payload value_start; + cdt_payload value_end = { NULL }; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_start, + &value_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_value_interval(&map, NULL, NULL, + &value_start, &value_end, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_INDEX_RANGE: { + uint64_t result_type; + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_index_range(&map, NULL, NULL, index, + count, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_RANK_RANGE: { + uint64_t result_type; + int64_t rank; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_rank_range(&map, NULL, NULL, rank, count, + &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_KEY_LIST: { + uint64_t result_type; + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_all_by_key_list(&map, NULL, NULL, &items, + &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_VALUE_LIST: { + uint64_t result_type; + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_all_by_value_list(&map, NULL, NULL, &items, + &result); + break; + } + default: + cf_warning(AS_PARTICLE, "cdt_process_state_packed_map_read_optype() invalid cdt op: %d", optype); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + if (ret != AS_PROTO_RESULT_OK) { + cf_warning(AS_PARTICLE, "%s: failed", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = ret; + rollback_alloc_rollback(alloc_result); + return false; + } + + return true; +} + + +//========================================================== +// Debugging support. +// + +static void +map_print(const packed_map *map, const char *name) +{ + print_packed(map->packed, map->packed_sz, name); +} + +static bool +map_verify(const as_bin *b) +{ + packed_map map; + + uint8_t type = as_bin_get_particle_type(b); + + if (type != AS_PARTICLE_TYPE_MAP) { + cf_warning(AS_PARTICLE, "map_verify() non-map type: %u", type); + return false; + } + + // Check header. + if (! packed_map_init_from_bin(&map, b, false)) { + cf_warning(AS_PARTICLE, "map_verify() invalid packed map"); + return false; + } + + if (map.flags != 0) { + const uint8_t *byte = map.contents - 1; + + if (*byte != 0xC0) { + cf_warning(AS_PARTICLE, "map_verify() invalid ext header, expected C0 for pair.2"); + } + } + + const order_index *ordidx = &map.value_idx; + bool check_offidx = map_has_offidx(&map); + define_map_unpacker(pk, &map); + vla_map_offidx_if_invalid(u, &map); + + uint32_t filled = offset_index_get_filled(u.offidx); + define_offset_index(temp_offidx, u.offidx->contents, u.offidx->content_sz, + u.offidx->_.ele_count); + + if (map.ele_count != 0) { + offset_index_copy(&temp_offidx, u.offidx, 0, 0, filled, 0); + } + + // Check offsets. 
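/*
 * Illustrative aside (not part of this patch): the "Check offsets" loop
 * below cross-checks the cached offset index against a fresh linear walk
 * of the packed elements. A minimal standalone sketch of the same idea,
 * using a toy 1-byte-length encoding in place of msgpack; all names here
 * are illustrative:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
verify_offsets(const uint8_t *buf, uint32_t ele_count, const uint32_t *cached)
{
	uint32_t offset = 0;

	for (uint32_t i = 0; i < ele_count; i++) {
		if (cached[i] != offset) {
			printf("i=%u cached=%u expected=%u\n", i, cached[i], offset);
			return false;
		}

		offset += 1 + buf[offset]; // skip length byte plus payload
	}

	return true;
}

int
main(void)
{
	// Three elements of sizes 2, 1 and 3 start at offsets 0, 3 and 5.
	const uint8_t buf[] = { 2, 'a', 'b', 1, 'c', 3, 'x', 'y', 'z' };
	const uint32_t cached[] = { 0, 3, 5 };

	printf("valid=%d\n", verify_offsets(buf, 3, cached)); // valid=1
	return 0;
}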
+ for (uint32_t i = 0; i < map.ele_count; i++) { + uint32_t offset; + + if (check_offidx) { + if (i < filled) { + offset = offset_index_get_const(u.offidx, i); + + if (pk.offset != offset) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u expected=%d", i, offset, pk.offset); + return false; + } + } + else { + offset_index_set(&temp_offidx, i, pk.offset); + } + } + else { + offset_index_set(u.offidx, i, pk.offset); + } + + offset = pk.offset; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid key", i, offset, pk.offset); + return false; + } + + offset = pk.offset; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid value", i, offset, pk.offset); + return false; + } + } + + if (check_offidx && filled < map.ele_count) { + u.offidx->_.ptr = temp_offidx._.ptr; + } + + // Check packed size. + if (map.content_sz != pk.offset) { + cf_warning(AS_PARTICLE, "map_verify() content_sz=%u expected=%u", map.content_sz, pk.offset); + return false; + } + + // Check key orders. + if (map_is_k_ordered(&map) && map.ele_count > 0) { + pk.offset = 0; + + define_map_unpacker(pk_key, &map); + + for (uint32_t i = 1; i < map.ele_count; i++) { + uint32_t offset = pk.offset; + msgpack_compare_t cmp = as_unpack_compare(&pk_key, &pk); + + if (cmp == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid key", i, offset, pk.offset); + return false; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u keys not in order", i, offset, pk.offset); + return false; + } + + pk_key.offset = offset; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid value", i, offset, pk.offset); + return false; + } + } + } + + // Check value orders. + if (order_index_is_filled(ordidx) && map.ele_count > 0) { + // Compare with freshly sorted. + define_order_index(cmp_order, map.ele_count); + + order_index_set_sorted(&cmp_order, u.offidx, map.contents, + map.content_sz, SORT_BY_VALUE); + + for (uint32_t i = 0; i < map.ele_count; i++) { + uint32_t expected = order_index_get(&cmp_order, i); + uint32_t index = order_index_get(ordidx, i); + + if (index != expected) { + cf_warning(AS_PARTICLE, "map_verify() i=%u index=%u expected=%u invalid order index", i, index, expected); + return false; + } + } + + // Walk index and check value order. 
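/*
 * Illustrative aside (not part of this patch): the value-order check
 * above rebuilds a sorted order index from scratch and compares it entry
 * by entry with the stored one. A standalone sketch of that check over
 * plain integers; all names here are illustrative:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static const int64_t *g_vals;

static int
cmp_by_value(const void *a, const void *b)
{
	int64_t va = g_vals[*(const uint32_t *)a];
	int64_t vb = g_vals[*(const uint32_t *)b];

	return va < vb ? -1 : (va > vb ? 1 : 0);
}

// Return true if 'ordidx' lists element indexes in ascending value order.
static bool
verify_order_index(const int64_t *vals, uint32_t n, const uint32_t *ordidx)
{
	uint32_t fresh[n];

	for (uint32_t i = 0; i < n; i++) {
		fresh[i] = i;
	}

	g_vals = vals;
	qsort(fresh, n, sizeof(uint32_t), cmp_by_value); // freshly sorted

	for (uint32_t i = 0; i < n; i++) {
		if (ordidx[i] != fresh[i]) {
			return false;
		}
	}

	return true;
}

int
main(void)
{
	const int64_t vals[] = { 30, 10, 20 };
	const uint32_t ordidx[] = { 1, 2, 0 }; // indexes sorted by value

	printf("ok=%d\n", verify_order_index(vals, 3, ordidx)); // ok=1
	return 0;
}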
+ pk.offset = 0;
+
+ define_map_unpacker(prev_value, &map);
+ uint32_t index = order_index_get(ordidx, 0);
+
+ prev_value.offset = offset_index_get_const(u.offidx, index);
+
+ if (as_unpack_size(&prev_value) <= 0) {
+ cf_warning(AS_PARTICLE, "map_verify() index=%u pk.offset=%u invalid key", index, pk.offset);
+ return false;
+ }
+
+ for (uint32_t i = 1; i < map.ele_count; i++) {
+ index = order_index_get(ordidx, i);
+ pk.offset = offset_index_get_const(u.offidx, index);
+
+ if (as_unpack_size(&pk) <= 0) {
+ cf_warning(AS_PARTICLE, "map_verify() i=%u index=%u pk.offset=%u invalid key", i, index, pk.offset);
+ return false;
+ }
+
+ uint32_t offset = pk.offset;
+ msgpack_compare_t cmp = as_unpack_compare(&prev_value, &pk);
+
+ if (cmp == MSGPACK_COMPARE_ERROR) {
+ cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid value", i, offset, pk.offset);
+ return false;
+ }
+
+ if (cmp == MSGPACK_COMPARE_GREATER) {
+ cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u value index not in order", i, offset, pk.offset);
+ return false;
+ }
+
+ prev_value.offset = offset;
+ }
+ }
+
+ return true;
+}
+
+// Quash warnings for debug function.
+void
+as_cdt_map_debug_dummy()
+{
+ map_verify(NULL);
+ map_print(NULL, NULL);
+}
diff --git a/as/src/base/particle_string.c b/as/src/base/particle_string.c
new file mode 100644
index 00000000..4f43f623
--- /dev/null
+++ b/as/src/base/particle_string.c
@@ -0,0 +1,173 @@
+/*
+ * particle_string.c
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "aerospike/as_string.h"
+#include "aerospike/as_val.h"
+
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/particle.h"
+#include "base/particle_blob.h"
+
+
+//==========================================================
+// STRING particle interface - function declarations.
+//
+
+// Most STRING particle table functions just use the equivalent BLOB particle
+// functions. Here are the differences...
+
+// Handle as_val translation.
+uint32_t string_size_from_asval(const as_val *val);
+void string_from_asval(const as_val *val, as_particle **pp);
+as_val *string_to_asval(const as_particle *p);
+uint32_t string_asval_wire_size(const as_val *val);
+uint32_t string_asval_to_wire(const as_val *val, uint8_t *wire);
+
+
+//==========================================================
+// STRING particle interface - vtable.
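/*
 * Illustrative aside (not part of this patch): string particles use the
 * same packed layout as blobs - a small header directly followed by the
 * bytes, via a flexible array member (see string_mem below). A minimal
 * standalone sketch of that layout; all names here are illustrative:
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct toy_string_mem_s {
	uint8_t type;
	uint32_t sz;
	uint8_t data[]; // bytes follow the header in one allocation
} __attribute__ ((__packed__)) toy_string_mem;

int
main(void)
{
	const char *src = "hello";
	uint32_t sz = (uint32_t)strlen(src);

	// One allocation holds header plus payload, as in the particle code.
	toy_string_mem *s = malloc(sizeof(toy_string_mem) + sz);

	s->type = 3; // stand-in for AS_PARTICLE_TYPE_STRING
	s->sz = sz;
	memcpy(s->data, src, sz);

	// Particle strings are not NUL-terminated - print with a length.
	printf("%.*s (%u bytes)\n", (int)s->sz, (const char *)s->data, s->sz);

	free(s);
	return 0;
}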
+// + +const as_particle_vtable string_vtable = { + blob_destruct, + blob_size, + + blob_concat_size_from_wire, + blob_append_from_wire, + blob_prepend_from_wire, + blob_incr_from_wire, + blob_size_from_wire, + blob_from_wire, + blob_compare_from_wire, + blob_wire_size, + blob_to_wire, + + string_size_from_asval, + string_from_asval, + string_to_asval, + string_asval_wire_size, + string_asval_to_wire, + + blob_size_from_msgpack, + blob_from_msgpack, + + blob_size_from_flat, + blob_cast_from_flat, + blob_from_flat, + blob_flat_size, + blob_to_flat +}; + + +//========================================================== +// Typedefs & constants. +// + +// Same as related BLOB struct. TODO - just expose BLOB structs? + +typedef struct string_mem_s { + uint8_t type; + uint32_t sz; + uint8_t data[]; +} __attribute__ ((__packed__)) string_mem; + + +//========================================================== +// STRING particle interface - function definitions. +// + +// Most STRING particle table functions just use the equivalent BLOB particle +// functions. Here are the differences... + +//------------------------------------------------ +// Handle as_val translation. +// + +uint32_t +string_size_from_asval(const as_val *val) +{ + return (uint32_t)(sizeof(string_mem) + as_string_len(as_string_fromval(val))); +} + +void +string_from_asval(const as_val *val, as_particle **pp) +{ + string_mem *p_string_mem = (string_mem *)*pp; + + as_string *string = as_string_fromval(val); + + p_string_mem->type = AS_PARTICLE_TYPE_STRING; + p_string_mem->sz = (uint32_t)as_string_len(string); + memcpy(p_string_mem->data, as_string_tostring(string), p_string_mem->sz); +} + +as_val * +string_to_asval(const as_particle *p) +{ + string_mem *p_string_mem = (string_mem *)p; + + uint8_t *value = cf_malloc(p_string_mem->sz + 1); + + memcpy(value, p_string_mem->data, p_string_mem->sz); + value[p_string_mem->sz] = 0; + + return (as_val *)as_string_new_wlen((char *)value, p_string_mem->sz, true); +} + +uint32_t +string_asval_wire_size(const as_val *val) +{ + return as_string_len(as_string_fromval(val)); +} + +uint32_t +string_asval_to_wire(const as_val *val, uint8_t *wire) +{ + as_string *string = as_string_fromval(val); + uint32_t size = (uint32_t)as_string_len(string); + + memcpy(wire, as_string_tostring(string), size); + + return size; +} + + +//========================================================== +// as_bin particle functions specific to STRING. +// + +uint32_t +as_bin_particle_string_ptr(const as_bin *b, char **p_value) +{ + // Caller must ensure this is called only for STRING particles. + string_mem *p_string_mem = (string_mem *)b->particle; + + *p_value = (char *)p_string_mem->data; + + return p_string_mem->sz; +} diff --git a/as/src/base/predexp.c b/as/src/base/predexp.c new file mode 100644 index 00000000..ac1add56 --- /dev/null +++ b/as/src/base/predexp.c @@ -0,0 +1,2149 @@ +/* + * predexp.c + * + * Copyright (C) 2016-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/predexp.h"
+
+#include <regex.h>
+#include <string.h>
+
+#include <aerospike/as_arraylist.h>
+#include <aerospike/as_arraylist_iterator.h>
+#include <aerospike/as_hashmap.h>
+#include <aerospike/as_hashmap_iterator.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_byte_order.h"
+#include "citrusleaf/cf_clock.h"
+
+#include "fault.h"
+
+#include "base/particle.h"
+#include "geospatial/geospatial.h"
+
+typedef enum {
+ PREDEXP_FALSE = 0, // Matching nodes only
+ PREDEXP_TRUE = 1, // Matching nodes only
+ PREDEXP_UNKNOWN = 2, // Matching nodes only
+ PREDEXP_VALUE = 3, // Value nodes only
+ PREDEXP_NOVALUE = 4 // Value nodes only
+} predexp_retval_t;
+
+typedef struct wrapped_as_bin_s {
+ as_bin bin;
+ bool must_free;
+} wrapped_as_bin_t;
+
+// Called to destroy a predexp when no longer needed.
+typedef void (*predexp_eval_dtor_fn)(predexp_eval_t* bp);
+
+typedef predexp_retval_t (*predexp_eval_eval_fn)(predexp_eval_t* bp,
+ predexp_args_t* argsp,
+ wrapped_as_bin_t* wbinp);
+
+// Convenience macro, converts boolean to retval.
+#define PREDEXP_RETVAL(bb) ((bb) ? PREDEXP_TRUE : PREDEXP_FALSE)
+
+#define PREDEXP_VALUE_NODE 0x01 // represents a value
+#define PREDEXP_IMMEDIATE_NODE 0x02 // constant per-query value
+
+struct predexp_eval_base_s {
+ predexp_eval_t* next;
+ predexp_eval_dtor_fn dtor_fn;
+ predexp_eval_eval_fn eval_fn;
+ uint8_t flags;
+ uint8_t type;
+};
+
+struct predexp_var_s {
+ char vname[AS_ID_BIN_SZ];
+ as_bin bin;
+ as_predexp_var_t* next;
+};
+
+// This function can set bin values for all bloblike types (strings)
+
+extern const as_particle_vtable *particle_vtable[];
+
+#if 0
+static void predexp_eval_base_dtor(predexp_eval_t* bp)
+{
+ cf_free(bp);
+}
+#endif
+
+static void predexp_eval_base_init(predexp_eval_t* bp,
+ predexp_eval_dtor_fn dtor_fn,
+ predexp_eval_eval_fn eval_fn,
+ uint8_t flags,
+ uint8_t type)
+{
+ bp->next = NULL;
+ bp->dtor_fn = dtor_fn;
+ bp->eval_fn = eval_fn;
+ bp->flags = flags;
+ bp->type = type;
+}
+
+// ----------------------------------------------------------------
+// Helper Functions
+// ----------------------------------------------------------------
+
+static void
+destroy_list(predexp_eval_t* bp)
+{
+ while (bp != NULL) {
+ predexp_eval_t* next = bp->next;
+ (*bp->dtor_fn)(bp);
+ bp = next;
+ }
+}
+
+// ----------------------------------------------------------------
+// Tag Definitions
+// ----------------------------------------------------------------
+
+// FIXME - these need to be in common w/ the clients
+//
+#define AS_PREDEXP_AND 1
+#define AS_PREDEXP_OR 2
+#define AS_PREDEXP_NOT 3
+
+#define AS_PREDEXP_INTEGER_VALUE 10
+#define AS_PREDEXP_STRING_VALUE 11
+#define AS_PREDEXP_GEOJSON_VALUE 12
+
+#define AS_PREDEXP_INTEGER_BIN 100
+#define AS_PREDEXP_STRING_BIN 101
+#define AS_PREDEXP_GEOJSON_BIN 102
+#define AS_PREDEXP_LIST_BIN 103
+#define AS_PREDEXP_MAP_BIN 104
+
+#define AS_PREDEXP_INTEGER_VAR 120
+#define AS_PREDEXP_STRING_VAR 121
+#define AS_PREDEXP_GEOJSON_VAR 122
+
+#define AS_PREDEXP_REC_DEVICE_SIZE 150
+#define AS_PREDEXP_REC_LAST_UPDATE 151
+#define AS_PREDEXP_REC_VOID_TIME 152
+#define AS_PREDEXP_REC_DIGEST_MODULO 153
+
+#define AS_PREDEXP_INTEGER_EQUAL 200
+#define AS_PREDEXP_INTEGER_UNEQUAL 201
+#define AS_PREDEXP_INTEGER_GREATER 202
+#define AS_PREDEXP_INTEGER_GREATEREQ 203
+#define AS_PREDEXP_INTEGER_LESS 204
+#define AS_PREDEXP_INTEGER_LESSEQ 205
+
+#define AS_PREDEXP_STRING_EQUAL 210
+#define 
AS_PREDEXP_STRING_UNEQUAL 211 +#define AS_PREDEXP_STRING_REGEX 212 + +#define AS_PREDEXP_GEOJSON_WITHIN 220 +#define AS_PREDEXP_GEOJSON_CONTAINS 221 + +#define AS_PREDEXP_LIST_ITERATE_OR 250 +#define AS_PREDEXP_MAPKEY_ITERATE_OR 251 +#define AS_PREDEXP_MAPVAL_ITERATE_OR 252 +#define AS_PREDEXP_LIST_ITERATE_AND 253 +#define AS_PREDEXP_MAPKEY_ITERATE_AND 254 +#define AS_PREDEXP_MAPVAL_ITERATE_AND 255 + +// ---------------------------------------------------------------- +// AS_PREDEXP_AND +// ---------------------------------------------------------------- + +typedef struct { + predexp_eval_t base; + predexp_eval_t* child; +} predexp_eval_and_t; + +static void +destroy_and(predexp_eval_t* bp) +{ + predexp_eval_and_t* dp = (predexp_eval_and_t *) bp; + destroy_list(dp->child); + cf_free(dp); +} + +static predexp_retval_t +eval_and(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + predexp_eval_and_t* dp = (predexp_eval_and_t *) bp; + + // Start optimistically. + predexp_retval_t retval = PREDEXP_TRUE; + + // Scan the children. + for (predexp_eval_t* cp = dp->child; cp != NULL; cp = cp->next) { + + switch ((*cp->eval_fn)(cp, argsp, NULL)) { + case PREDEXP_FALSE: + // Shortcut, skip remaining children. + return PREDEXP_FALSE; + case PREDEXP_UNKNOWN: + // Downgrade our return value, continue scanning children. + retval = PREDEXP_UNKNOWN; + break; + case PREDEXP_TRUE: + // Continue scanning children. + break; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. + cf_crash(AS_PREDEXP, "eval_and child was value node"); + } + } + + return retval; +} + +static bool +build_and(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + if (len != sizeof(uint16_t)) { + cf_warning(AS_PREDEXP, "predexp_and: unexpected size %d", len); + return false; + } + uint16_t nterms = cf_swap_from_be16(* (uint16_t *) pp); + pp += sizeof(uint16_t); + + predexp_eval_and_t* dp = + (predexp_eval_and_t *) cf_malloc(sizeof(predexp_eval_and_t)); + + // Start optimistically. + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_and, + eval_and, + 0, + AS_PARTICLE_TYPE_NULL); + dp->child = NULL; + + for (uint16_t ndx = 0; ndx < nterms; ++ndx) { + // If there is not an available child expr cleanup and fail. + if (! *stackpp) { + cf_warning(AS_PREDEXP, "predexp_and: missing child %d", ndx); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + // Transfer the expr at the top of the stack to our child list. + predexp_eval_t* child; + child = *stackpp; // Child from the top of the stack. + *stackpp = child->next; // Stack points around the child. + child->next = dp->child; // Child now points to prior list head. + dp->child = child; // Child is now the top of our list. + + // Make sure the child is not a value node. + if (dp->child->flags & PREDEXP_VALUE_NODE) { + cf_warning(AS_PREDEXP, "predexp_and: child %d is value node", ndx); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. 
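/*
 * Illustrative aside (not part of this patch): predexp trees arrive on
 * the wire in postfix order, so building is stack-based - children are
 * pushed first, then a parent node pops them off, exactly as in the
 * child-transfer lines of build_and() above. A minimal standalone sketch
 * with a linked-list stack; all names here are illustrative:
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct node_s {
	const char *name;
	struct node_s *next;  // stack link / sibling link
	struct node_s *child; // head of child list
} node_t;

static node_t *
push_leaf(node_t *top, const char *name)
{
	node_t *n = calloc(1, sizeof(node_t));

	n->name = name;
	n->next = top;
	return n;
}

// Pop 'nterms' children off the stack and push an AND node over them.
static node_t *
push_and(node_t *top, int nterms)
{
	node_t *n = calloc(1, sizeof(node_t));

	n->name = "and";

	for (int i = 0; i < nterms; i++) {
		node_t *child = top;    // child from the top of the stack
		top = child->next;      // stack points around the child
		child->next = n->child; // child points to prior list head
		n->child = child;       // child is now the top of our list
	}

	n->next = top;
	return n;
}

int
main(void)
{
	// Postfix stream: leaf A, leaf B, AND(2).
	node_t *top = push_leaf(NULL, "A");

	top = push_leaf(top, "B");
	top = push_and(top, 2);

	// Prints "and child: A" then "and child: B".
	for (node_t *c = top->child; c != NULL; c = c->next) {
		printf("and child: %s\n", c->name);
	}

	return 0;
}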
+ *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_and(%d)", stackpp, nterms); + + return true; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_OR +// ---------------------------------------------------------------- + +typedef struct { + predexp_eval_t base; + predexp_eval_t* child; +} predexp_eval_or_t; + +static void +destroy_or(predexp_eval_t* bp) +{ + predexp_eval_or_t* dp = (predexp_eval_or_t *) bp; + destroy_list(dp->child); + cf_free(dp); +} + +static predexp_retval_t +eval_or(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + predexp_eval_or_t* dp = (predexp_eval_or_t *) bp; + + // Start pessimistically. + predexp_retval_t retval = PREDEXP_FALSE; + + // Scan the children. + for (predexp_eval_t* cp = dp->child; cp != NULL; cp = cp->next) { + switch ((*cp->eval_fn)(cp, argsp, NULL)) { + case PREDEXP_TRUE: + // Shortcut, skip remaining children. + return PREDEXP_TRUE; + case PREDEXP_UNKNOWN: + // Upgrade our return value, continue scanning children. + retval = PREDEXP_UNKNOWN; + break; + case PREDEXP_FALSE: + // Continue scanning children. + break; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. + cf_crash(AS_PREDEXP, "eval_or child was value node"); + } + } + + return retval; +} + +static bool +build_or(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + if (len != sizeof(uint16_t)) { + cf_warning(AS_PREDEXP, "predexp_or: unexpected size %d", len); + return false; + } + uint16_t nterms = cf_swap_from_be16(* (uint16_t *) pp); + pp += sizeof(uint16_t); + + predexp_eval_or_t* dp = + (predexp_eval_or_t *) cf_malloc(sizeof(predexp_eval_or_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_or, + eval_or, + 0, + AS_PARTICLE_TYPE_NULL); + dp->child = NULL; + + for (uint16_t ndx = 0; ndx < nterms; ++ndx) { + // If there is not an available child expr cleanup and fail. + if (! *stackpp) { + cf_warning(AS_PREDEXP, "predexp_or: missing child %d", ndx); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + // Transfer the expr at the top of the stack to our child list. + predexp_eval_t* child; + child = *stackpp; // Child from the top of the stack. + *stackpp = child->next; // Stack points around the child. + child->next = dp->child; // Child now points to prior list head. + dp->child = child; // Child is now the top of our list. + + // Make sure the child is not a value node. + if (dp->child->flags & PREDEXP_VALUE_NODE) { + cf_warning(AS_PREDEXP, "predexp_or: child %d is value node", ndx); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. 
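/*
 * Illustrative aside (not part of this patch): eval_and() and eval_or()
 * above fold their children with three-valued logic - UNKNOWN (possible
 * during the metadata phase, when a bin value is not yet available)
 * weakens the result but never overrides a shortcut. A standalone sketch
 * of the same fold; all names here are illustrative:
 */
#include <stdio.h>

typedef enum { TV_FALSE = 0, TV_TRUE = 1, TV_UNKNOWN = 2 } tv_t;

// AND: FALSE shortcuts; any UNKNOWN downgrades TRUE to UNKNOWN.
static tv_t
tv_and(const tv_t *kids, int n)
{
	tv_t ret = TV_TRUE; // start optimistically

	for (int i = 0; i < n; i++) {
		if (kids[i] == TV_FALSE) {
			return TV_FALSE;
		}

		if (kids[i] == TV_UNKNOWN) {
			ret = TV_UNKNOWN;
		}
	}

	return ret;
}

// OR: TRUE shortcuts; any UNKNOWN upgrades FALSE to UNKNOWN.
static tv_t
tv_or(const tv_t *kids, int n)
{
	tv_t ret = TV_FALSE; // start pessimistically

	for (int i = 0; i < n; i++) {
		if (kids[i] == TV_TRUE) {
			return TV_TRUE;
		}

		if (kids[i] == TV_UNKNOWN) {
			ret = TV_UNKNOWN;
		}
	}

	return ret;
}

int
main(void)
{
	tv_t kids[] = { TV_TRUE, TV_UNKNOWN, TV_TRUE };

	// and=2 (UNKNOWN), or=1 (TRUE).
	printf("and=%d or=%d\n", tv_and(kids, 3), tv_or(kids, 3));
	return 0;
}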
+ *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_or(%d)", stackpp, nterms); + + return true; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_NOT +// ---------------------------------------------------------------- + +typedef struct { + predexp_eval_t base; + predexp_eval_t* child; +} predexp_eval_not_t; + +static void +destroy_not(predexp_eval_t* bp) +{ + predexp_eval_not_t* dp = (predexp_eval_not_t *) bp; + destroy_list(dp->child); + cf_free(dp); +} + +static predexp_retval_t +eval_not(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + predexp_eval_not_t* dp = (predexp_eval_not_t *) bp; + + predexp_eval_t* cp = dp->child; + + switch ((*cp->eval_fn)(cp, argsp, NULL)) { + case PREDEXP_FALSE: + return PREDEXP_TRUE; + case PREDEXP_UNKNOWN: + return PREDEXP_UNKNOWN; + case PREDEXP_TRUE: + return PREDEXP_FALSE; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. + cf_crash(AS_PREDEXP, "eval_not child was value node"); + } + + return PREDEXP_UNKNOWN; // Can't get here, makes compiler happy. +} + +static bool +build_not(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + if (len != 0) { + cf_warning(AS_PREDEXP, "predexp_not: unexpected size %d", len); + return false; + } + + predexp_eval_not_t* dp = + (predexp_eval_not_t *) cf_malloc(sizeof(predexp_eval_not_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_not, + eval_not, + 0, + AS_PARTICLE_TYPE_NULL); + dp->child = NULL; + + // If there is not an available child expr cleanup and fail. + if (! *stackpp) { + cf_warning(AS_PREDEXP, "predexp_not: missing child"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + // Transfer the expr at the top of the stack to our child list. + predexp_eval_t* child; + child = *stackpp; // Child from the top of the stack. + *stackpp = child->next; // Stack points around the child. + child->next = dp->child; // Child now points to prior list head. + dp->child = child; // Child is now the top of our list. + + // Make sure the child is not a value node. + if (dp->child->flags & PREDEXP_VALUE_NODE) { + cf_warning(AS_PREDEXP, "predexp_not: child is value node"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_not", stackpp); + + return true; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_*_COMPARE +// ---------------------------------------------------------------- + +// GEOSPATIAL NOTES: +// +// We want to perform all possible computation on the query region +// once, prior to visiting all the points. The current value +// interface is opaque; it returns a bin particle only; there is no +// way to pass associated precomputed state. So we keep the +// precomputed region query state here in the comparison node instead. +// +// IMPROVEMENTS: +// +// We currently parse the incoming query (IMMEDIATE) region twice; +// once in the from_wire_fn and again explicitly in the build_compare +// routine, this time retaining the region. Maybe we should make an +// exposed as_geojson_from_wire which additionally returns the +// computed region; the particle geojson_from_wire could call this +// routine and then discard the region. 
+// +// We can improve the performance of the comparison by covering the +// region at build time and saving all of the cell min/max ranges. +// Candidate points can first be checked against the list of ranges to +// make sure they are a rough match before performing the more +// expensive strict region match. This change requires a bunch more +// state; probably we'll want a pointer to the +// predexp_eval_geojson_state_t instead of using a union at this +// point. + +typedef struct predexp_eval_geojson_state_s { + uint64_t cellid; + geo_region_t region; +} predexp_eval_geojson_state_t; + +typedef struct predexp_eval_regex_state_s { + regex_t regex; + bool iscompiled; +} predexp_eval_regex_state_t; + +typedef struct predexp_eval_compare_s { + predexp_eval_t base; + uint16_t tag; + uint8_t type; + predexp_eval_t* lchild; + predexp_eval_t* rchild; + union { + predexp_eval_geojson_state_t geojson; + predexp_eval_regex_state_t regex; + } state; +} predexp_eval_compare_t; + +static void +destroy_compare(predexp_eval_t* bp) +{ + predexp_eval_compare_t* dp = (predexp_eval_compare_t *) bp; + if (dp->lchild) { + (*dp->lchild->dtor_fn)(dp->lchild); + } + if (dp->rchild) { + (*dp->rchild->dtor_fn)(dp->rchild); + } + if (dp->type == AS_PARTICLE_TYPE_GEOJSON && dp->state.geojson.region) { + geo_region_destroy(dp->state.geojson.region); + } + if (dp->tag == AS_PREDEXP_STRING_REGEX && dp->state.regex.iscompiled) { + regfree(&dp->state.regex.regex); + } + cf_free(dp); +} + +static predexp_retval_t +eval_compare(predexp_eval_t* bp, + predexp_args_t* argsp, + wrapped_as_bin_t* wbinp) +{ + predexp_eval_compare_t* dp = (predexp_eval_compare_t *) bp; + + predexp_retval_t retval = PREDEXP_UNKNOWN; + + wrapped_as_bin_t lwbin; + wrapped_as_bin_t rwbin; + lwbin.must_free = false; + rwbin.must_free = false; + + // Fetch the child values. Are either of the values unknown? + // During the metadata phase this returns PREDEXP_UNKNOWN. During + // the record phase we consider a comparison with an unknown value + // to be PREDEXP_FALSE (missing bin or bin or wrong type). + + if ((*dp->lchild->eval_fn)(dp->lchild, argsp, &lwbin) == + PREDEXP_NOVALUE) { + retval = argsp->rd ? PREDEXP_FALSE : PREDEXP_UNKNOWN; + goto Cleanup; + } + + if ((*dp->rchild->eval_fn)(dp->rchild, argsp, &rwbin) == + PREDEXP_NOVALUE) { + retval = argsp->rd ? PREDEXP_FALSE : PREDEXP_UNKNOWN; + goto Cleanup; + } + + switch (dp->type) { + case AS_PARTICLE_TYPE_INTEGER: { + int64_t lval = as_bin_particle_integer_value(&lwbin.bin); + int64_t rval = as_bin_particle_integer_value(&rwbin.bin); + switch (dp->tag) { + case AS_PREDEXP_INTEGER_EQUAL: + retval = PREDEXP_RETVAL(lval == rval); + goto Cleanup; + case AS_PREDEXP_INTEGER_UNEQUAL: + retval = PREDEXP_RETVAL(lval != rval); + goto Cleanup; + case AS_PREDEXP_INTEGER_GREATER: + retval = PREDEXP_RETVAL(lval > rval); + goto Cleanup; + case AS_PREDEXP_INTEGER_GREATEREQ: + retval = PREDEXP_RETVAL(lval >= rval); + goto Cleanup; + case AS_PREDEXP_INTEGER_LESS: + retval = PREDEXP_RETVAL(lval < rval); + goto Cleanup; + case AS_PREDEXP_INTEGER_LESSEQ: + retval = PREDEXP_RETVAL(lval <= rval); + goto Cleanup; + default: + cf_crash(AS_PREDEXP, "eval_compare integer unknown tag %d", + dp->tag); + } + } + case AS_PARTICLE_TYPE_STRING: { + // We always need to fetch the left argument. 
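/*
 * Illustrative aside (not part of this patch): particle strings carry an
 * explicit length and are not NUL-terminated, so the string comparisons
 * below are a length check followed by memcmp() - never strcmp(). A
 * standalone sketch; all names here are illustrative:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static bool
counted_str_equal(const char *l, uint32_t llen, const char *r, uint32_t rlen)
{
	// Differing lengths can never be equal; memcmp only runs on llen.
	return llen == rlen && memcmp(l, r, llen) == 0;
}

int
main(void)
{
	// "abc" vs "abcd": a shared prefix is not equality. Prints "0 1".
	printf("%d %d\n", counted_str_equal("abc", 3, "abcd", 4),
			counted_str_equal("abc", 3, "abc", 3));
	return 0;
}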
+ char* lptr; + uint32_t llen = as_bin_particle_string_ptr(&lwbin.bin, &lptr); + char* rptr; + uint32_t rlen; + switch (dp->tag) { + case AS_PREDEXP_STRING_EQUAL: + case AS_PREDEXP_STRING_UNEQUAL: + // These comparisons need the right argument too. + rlen = as_bin_particle_string_ptr(&rwbin.bin, &rptr); + bool isequal = (llen == rlen) && (memcmp(lptr, rptr, llen) == 0); + switch (dp->tag) { + case AS_PREDEXP_STRING_EQUAL: + retval = isequal; + goto Cleanup; + case AS_PREDEXP_STRING_UNEQUAL: + retval = ! isequal; + goto Cleanup; + default: + cf_crash(AS_PREDEXP, "eval_compare string (eq) unknown tag %d", + dp->tag); + } + case AS_PREDEXP_STRING_REGEX: { + char* tmpstr = cf_strndup(lptr, llen); + int rv = regexec(&dp->state.regex.regex, tmpstr, 0, NULL, 0); + cf_free(tmpstr); + retval = rv == 0; + goto Cleanup; + } + default: + cf_crash(AS_PREDEXP, "eval_compare string unknown tag %d", dp->tag); + } + } + case AS_PARTICLE_TYPE_GEOJSON: { + // as_particle* lpart = lbinp->particle; + // as_particle* rpart = rbinp->particle; + + switch (dp->tag) { + case AS_PREDEXP_GEOJSON_WITHIN: + case AS_PREDEXP_GEOJSON_CONTAINS: { + bool isstrict = true; + bool ismatch = as_particle_geojson_match(lwbin.bin.particle, + dp->state.geojson.cellid, + dp->state.geojson.region, + isstrict); + retval = PREDEXP_RETVAL(ismatch); + goto Cleanup; + } + default: + cf_crash(AS_PREDEXP, "eval_compare geojson unknown tag %d", + dp->tag); + } + } + default: + cf_crash(AS_PREDEXP, "eval_compare unknown type %d", dp->type); + } + + Cleanup: + if (lwbin.must_free) { + cf_crash(AS_PREDEXP, "eval_compare need bin cleanup, didn't before"); + } + if (rwbin.must_free) { + cf_crash(AS_PREDEXP, "eval_compare need bin cleanup, didn't before"); + } + return retval; +} + +static bool +build_compare(predexp_eval_t** stackpp, + uint32_t len, + uint8_t* pp, + uint16_t tag) +{ + predexp_eval_compare_t* dp = (predexp_eval_compare_t *) + cf_malloc(sizeof(predexp_eval_compare_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_compare, + eval_compare, + 0, + AS_PARTICLE_TYPE_NULL); + + dp->tag = tag; + dp->lchild = NULL; + dp->rchild = NULL; + + // IMPORTANT - If your state doesn't want to be initialized + // to all 0 rethink this ... + // + memset(&dp->state, 0, sizeof(dp->state)); + + switch (tag) { + case AS_PREDEXP_INTEGER_EQUAL: + case AS_PREDEXP_INTEGER_UNEQUAL: + case AS_PREDEXP_INTEGER_GREATER: + case AS_PREDEXP_INTEGER_GREATEREQ: + case AS_PREDEXP_INTEGER_LESS: + case AS_PREDEXP_INTEGER_LESSEQ: + dp->type = AS_PARTICLE_TYPE_INTEGER; + break; + case AS_PREDEXP_STRING_EQUAL: + case AS_PREDEXP_STRING_UNEQUAL: + case AS_PREDEXP_STRING_REGEX: + dp->type = AS_PARTICLE_TYPE_STRING; + break; + case AS_PREDEXP_GEOJSON_WITHIN: + case AS_PREDEXP_GEOJSON_CONTAINS: + dp->type = AS_PARTICLE_TYPE_GEOJSON; + break; + default: + cf_crash(AS_PREDEXP, "build_compare called with bogus tag: %d", tag); + break; + } + + uint8_t* endp = pp + len; + + uint32_t regex_opts = 0; + if (tag == AS_PREDEXP_STRING_REGEX) { + // This comparison takes a uint32_t opts argument. + if (pp + sizeof(uint32_t) > endp) { + cf_warning(AS_PREDEXP, "build_compare: regex opts past end"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + regex_opts = cf_swap_from_be32(* (uint32_t *) pp); + pp += sizeof(uint32_t); + } + + // No arguments. + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_compare: msg unaligned"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + // ---- Pop the right child off the stack. + + if (! 
*stackpp) { + cf_warning(AS_PREDEXP, "predexp_compare: missing right child"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + dp->rchild = *stackpp; + *stackpp = dp->rchild->next; + dp->rchild->next = NULL; + + if ((dp->rchild->flags & PREDEXP_VALUE_NODE) == 0) { + cf_warning(AS_PREDEXP, + "predexp compare: right child is not value node"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + if (dp->rchild->type != dp->type) { + cf_warning(AS_PREDEXP, "predexp compare: right child is wrong type"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + // ---- Pop the left child off the stack. + + if (! *stackpp) { + cf_warning(AS_PREDEXP, "predexp_compare: missing left child"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + dp->lchild = *stackpp; + *stackpp = dp->lchild->next; + dp->lchild->next = NULL; + + if ((dp->lchild->flags & PREDEXP_VALUE_NODE) == 0) { + cf_warning(AS_PREDEXP, "predexp compare: left child is not value node"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + if (dp->lchild->type != dp->type) { + cf_warning(AS_PREDEXP, "predexp compare: left child is wrong type"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + switch (tag) { + case AS_PREDEXP_GEOJSON_WITHIN: + case AS_PREDEXP_GEOJSON_CONTAINS: + // The right child needs to be an immediate value. + if ((dp->rchild->flags & PREDEXP_IMMEDIATE_NODE) == 0) { + cf_warning(AS_PREDEXP, + "predexp compare: within arg not immediate GeoJSON"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + // Extract the query GeoJSON value. + predexp_args_t* argsp = NULL; // immediate values don't need args + wrapped_as_bin_t rwbin; + rwbin.must_free = false; + if ((*dp->rchild->eval_fn)(dp->rchild, argsp, &rwbin) == + PREDEXP_NOVALUE) { + cf_warning(AS_PREDEXP, + "predexp compare: within arg had unknown value"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + size_t sz; + char const * ptr = as_geojson_mem_jsonstr(rwbin.bin.particle, &sz); + + // Parse the child, save the computed state. + if (!geo_parse(NULL, ptr, sz, + &dp->state.geojson.cellid, + &dp->state.geojson.region)) { + cf_warning(AS_PREDEXP, "predexp compare: failed to parse GeoJSON"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + if (rwbin.must_free) { + cf_crash(AS_PREDEXP, + "predexp compare now needs bin destructor"); + } + return false; + } + if (rwbin.must_free) { + cf_crash(AS_PREDEXP, "predexp compare now needs bin destructor"); + } + break; + case AS_PREDEXP_STRING_REGEX: + // The right child needs to be an immediate value. + if ((dp->rchild->flags & PREDEXP_IMMEDIATE_NODE) == 0) { + cf_warning(AS_PREDEXP, + "predexp compare: regex arg not immediate string"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + // Extract the query regex value. 
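/*
 * Illustrative aside (not part of this patch): as in the regex branch
 * below, the pattern is compiled once at build time and executed once
 * per record at eval time - and because regexec() needs NUL-terminated
 * input, counted particle strings are copied first (cf_strndup() in
 * eval_compare()). A standalone POSIX-regex sketch; the pattern and
 * flags are illustrative:
 */
#include <regex.h>
#include <stdio.h>

int
main(void)
{
	regex_t re;

	// Compile once ("build time"); in predexp the flags arrive on the wire.
	if (regcomp(&re, "^a.*z$", REG_EXTENDED | REG_NOSUB) != 0) {
		return 1;
	}

	const char *candidates[] = { "abz", "abc", "az" };

	// Execute many times ("eval time"), one candidate per record.
	// Prints: abz -> match, abc -> no match, az -> match.
	for (int i = 0; i < 3; i++) {
		int rv = regexec(&re, candidates[i], 0, NULL, 0);

		printf("%s -> %s\n", candidates[i], rv == 0 ? "match" : "no match");
	}

	regfree(&re); // mirrors destroy_compare() freeing compiled state
	return 0;
}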
+ predexp_args_t* argsp2 = NULL; // immediate values don't need args + wrapped_as_bin_t rwbin2; + rwbin2.must_free = false; + if ((*dp->rchild->eval_fn)(dp->rchild, argsp2, &rwbin2) == + PREDEXP_NOVALUE) { + cf_warning(AS_PREDEXP, + "predexp compare: regex arg had unknown value"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + char* rptr; + uint32_t rlen = as_bin_particle_string_ptr(&rwbin2.bin, &rptr); + char* tmpregexp = cf_strndup(rptr, rlen); + int rv = regcomp(&dp->state.regex.regex, tmpregexp, regex_opts); + cf_free(tmpregexp); + if (rv != 0) { + char errbuf[1024]; + regerror(rv, &dp->state.regex.regex, errbuf, sizeof(errbuf)); + cf_warning(AS_PREDEXP, "predexp compare: regex compile failed: %s", + errbuf); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + if (rwbin2.must_free) { + cf_crash(AS_PREDEXP, + "predexp compare now needs bin destructor"); + } + return false; + } + dp->state.regex.iscompiled = true; + if (rwbin2.must_free) { + cf_crash(AS_PREDEXP, "predexp compare now needs bin destructor"); + } + break; + + default: + // Don't do anything for the others ... + break; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + switch (tag) { + case AS_PREDEXP_INTEGER_EQUAL: + cf_debug(AS_PREDEXP, "%p: predexp_integer_equal", stackpp); + break; + case AS_PREDEXP_INTEGER_UNEQUAL: + cf_debug(AS_PREDEXP, "%p: predexp_integer_unequal", stackpp); + break; + case AS_PREDEXP_INTEGER_GREATER: + cf_debug(AS_PREDEXP, "%p: predexp_integer_greater", stackpp); + break; + case AS_PREDEXP_INTEGER_GREATEREQ: + cf_debug(AS_PREDEXP, "%p: predexp_integer_greatereq", stackpp); + break; + case AS_PREDEXP_INTEGER_LESS: + cf_debug(AS_PREDEXP, "%p: predexp_integer_less", stackpp); + break; + case AS_PREDEXP_INTEGER_LESSEQ: + cf_debug(AS_PREDEXP, "%p: predexp_integer_lesseq", stackpp); + break; + case AS_PREDEXP_STRING_EQUAL: + cf_debug(AS_PREDEXP, "%p: predexp_string_equal", stackpp); + break; + case AS_PREDEXP_STRING_UNEQUAL: + cf_debug(AS_PREDEXP, "%p: predexp_string_unequal", stackpp); + break; + case AS_PREDEXP_STRING_REGEX: + cf_debug(AS_PREDEXP, "%p: predexp_string_regex(%d)", stackpp, + regex_opts); + break; + case AS_PREDEXP_GEOJSON_WITHIN: + cf_debug(AS_PREDEXP, "%p: predexp_geojson_within", stackpp); + break; + case AS_PREDEXP_GEOJSON_CONTAINS: + cf_debug(AS_PREDEXP, "%p: predexp_geojson_contains", stackpp); + break; + default: + cf_crash(AS_PREDEXP, "build_compare called with bogus tag: %d", tag); + break; + } + + return true; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_*_VALUE +// ---------------------------------------------------------------- + +typedef struct predexp_eval_value_s { + predexp_eval_t base; + as_bin bin; + uint8_t type; +} predexp_eval_value_t; + +static void +destroy_value(predexp_eval_t* bp) +{ + predexp_eval_value_t* dp = (predexp_eval_value_t *) bp; + as_bin_particle_destroy(&dp->bin, true); + cf_free(dp); +} + +static predexp_retval_t +eval_value(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, "eval_value called outside value context"); + } + + predexp_eval_value_t* dp = (predexp_eval_value_t *) bp; + // We don't have a ns in this context. But the source bin doesn't + // have any name index stuff anyway ... 
+ as_single_bin_copy(&wbinp->bin, &dp->bin); + wbinp->must_free = false; // bin is constant, destroyed after query above + return PREDEXP_VALUE; +} + +static bool +build_value(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp, uint16_t tag) +{ + predexp_eval_value_t* dp = (predexp_eval_value_t *) + cf_malloc(sizeof(predexp_eval_value_t)); + + uint8_t type; + switch (tag) { + case AS_PREDEXP_INTEGER_VALUE: type = AS_PARTICLE_TYPE_INTEGER; break; + case AS_PREDEXP_STRING_VALUE: type = AS_PARTICLE_TYPE_STRING; break; + case AS_PREDEXP_GEOJSON_VALUE: type = AS_PARTICLE_TYPE_GEOJSON; break; + default: + cf_crash(AS_PREDEXP, "build_value called with bogus tag: %d", tag); + return false; + } + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_value, + eval_value, + PREDEXP_VALUE_NODE | PREDEXP_IMMEDIATE_NODE, + type); + + as_bin_set_empty(&dp->bin); + dp->bin.particle = NULL; + + uint8_t* endp = pp + len; + + size_t vallen = len; + void* valptr = (char*) pp; + pp += vallen; + + if (pp != endp) { + cf_warning(AS_PREDEXP, "predexp value: msg unaligned"); + goto Failed; + } + + int32_t mem_size = particle_vtable[type]->size_from_wire_fn(valptr, vallen); + + if (mem_size != 0) { + dp->bin.particle = cf_malloc((size_t)mem_size); + } + + int result = particle_vtable[type]->from_wire_fn(type, + valptr, + vallen, + &dp->bin.particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(&dp->bin, type); + } + else { + cf_warning(AS_PREDEXP, "failed to build predexp value with err %d", + result); + if (mem_size != 0) { + cf_free(dp->bin.particle); + } + as_bin_set_empty(&dp->bin); + dp->bin.particle = NULL; + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + switch (tag) { + case AS_PREDEXP_INTEGER_VALUE: + cf_debug(AS_PREDEXP, "%p: predexp_integer_value(%"PRId64")", stackpp, + (int64_t) dp->bin.particle); + break; + case AS_PREDEXP_STRING_VALUE: { + cf_debug(AS_PREDEXP, "%p: predexp_string_value(\"%s\")", stackpp, + CF_ZSTR1K(valptr, vallen)); + break; + } + case AS_PREDEXP_GEOJSON_VALUE: { + size_t jsonsz; + char const * jsonptr = + as_geojson_mem_jsonstr(dp->bin.particle, &jsonsz); + cf_debug(AS_PREDEXP, "%p: predexp_geojson_value(%s)", stackpp, + CF_ZSTR1K(jsonptr, jsonsz)); + break; + } + default: + cf_crash(AS_PREDEXP, "build_value called with bogus tag: %d", tag); + break; + } + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_*_BIN +// ---------------------------------------------------------------- + +typedef struct predexp_eval_bin_s { + predexp_eval_t base; + char bname[AS_ID_BIN_SZ]; + uint8_t type; +} predexp_eval_bin_t; + +static void +destroy_bin(predexp_eval_t* bp) +{ + predexp_eval_bin_t* dp = (predexp_eval_bin_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_bin(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, "eval_bin called outside value context"); + } + + predexp_eval_bin_t* dp = (predexp_eval_bin_t *) bp; + + // We require record data to operate. + if (! argsp->rd) { + return PREDEXP_NOVALUE; + } + + as_bin* bb = as_bin_get(argsp->rd, dp->bname); + if (! 
bb) { + return PREDEXP_NOVALUE; + } + + if (as_bin_get_particle_type(bb) != dp->type) { + return PREDEXP_NOVALUE; + } + + as_bin_copy(argsp->ns, &wbinp->bin, bb); + wbinp->must_free = false; // bin is owned by record, in caller + return PREDEXP_VALUE; +} + +static bool +build_bin(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp, uint16_t tag) +{ + predexp_eval_bin_t* dp = (predexp_eval_bin_t *) + cf_malloc(sizeof(predexp_eval_bin_t)); + + switch (tag) { + case AS_PREDEXP_INTEGER_BIN: + dp->type = AS_PARTICLE_TYPE_INTEGER; + break; + case AS_PREDEXP_STRING_BIN: + dp->type = AS_PARTICLE_TYPE_STRING; + break; + case AS_PREDEXP_GEOJSON_BIN: + dp->type = AS_PARTICLE_TYPE_GEOJSON; + break; + case AS_PREDEXP_LIST_BIN: + dp->type = AS_PARTICLE_TYPE_LIST; + break; + case AS_PREDEXP_MAP_BIN: + dp->type = AS_PARTICLE_TYPE_MAP; + break; + default: + cf_crash(AS_PREDEXP, "build_bin called with bogus tag: %d", tag); + break; + } + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_bin, + eval_bin, + PREDEXP_VALUE_NODE, + dp->type); + + uint8_t* endp = pp + len; + + if (len >= sizeof(dp->bname)) { + cf_warning(AS_PREDEXP, "build_bin: binname too long"); + goto Failed; + } + uint8_t bnlen = (uint8_t) len; + memcpy(dp->bname, pp, bnlen); + dp->bname[bnlen] = '\0'; + pp += bnlen; + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_bin: msg unaligned"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + switch (tag) { + case AS_PREDEXP_INTEGER_BIN: + cf_debug(AS_PREDEXP, "%p: predexp_integer_bin(\"%s\")", stackpp, + dp->bname); + break; + case AS_PREDEXP_STRING_BIN: + cf_debug(AS_PREDEXP, "%p: predexp_string_bin(\"%s\")", stackpp, + dp->bname); + break; + case AS_PREDEXP_GEOJSON_BIN: + cf_debug(AS_PREDEXP, "%p: predexp_geojson_bin(\"%s\")", stackpp, + dp->bname); + break; + case AS_PREDEXP_LIST_BIN: + cf_debug(AS_PREDEXP, "%p: predexp_list_bin(\"%s\")", stackpp, + dp->bname); + break; + case AS_PREDEXP_MAP_BIN: + cf_debug(AS_PREDEXP, "%p: predexp_map_bin(\"%s\")", stackpp, + dp->bname); + break; + default: + cf_crash(AS_PREDEXP, "build_bin called with bogus tag: %d", tag); + break; + } + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_*_VAR +// ---------------------------------------------------------------- + +typedef struct predexp_eval_var_s { + predexp_eval_t base; + char vname[AS_ID_BIN_SZ]; + uint8_t type; +} predexp_eval_var_t; + +static void +destroy_var(predexp_eval_t* bp) +{ + predexp_eval_var_t* dp = (predexp_eval_var_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_var(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, "eval_var called outside value context"); + } + + predexp_eval_var_t* dp = (predexp_eval_var_t *) bp; + + for (as_predexp_var_t* vp = argsp->vl; vp != NULL; vp = vp->next) { + if (strcmp(dp->vname, vp->vname) == 0) { + // Is it the correct type? + if (as_bin_get_particle_type(&vp->bin) != dp->type) { + return PREDEXP_NOVALUE; + } + + // Return it. + as_bin_copy(argsp->ns, &wbinp->bin, &vp->bin); + wbinp->must_free = false; // bin is owned by iterator + return PREDEXP_VALUE; + } + } + + // If we get here we didn't find the named variable in the list. 
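/*
 * Illustrative aside (not part of this patch): eval_var() above walks
 * the variable list that iterator nodes prepend onto argsp->vl, so the
 * most recently prepended binding wins (innermost scope shadows outer).
 * A standalone sketch of that lookup; all names here are illustrative:
 */
#include <stdio.h>
#include <string.h>

typedef struct var_s {
	const char *vname;
	long value;
	struct var_s *next;
} var_t;

// Return the first (innermost) binding with the given name, or NULL.
static const var_t *
var_lookup(const var_t *vl, const char *vname)
{
	for (const var_t *vp = vl; vp != NULL; vp = vp->next) {
		if (strcmp(vp->vname, vname) == 0) {
			return vp;
		}
	}

	return NULL; // caller treats this as PREDEXP_NOVALUE
}

int
main(void)
{
	var_t outer = { .vname = "x", .value = 1, .next = NULL };
	var_t inner = { .vname = "x", .value = 2, .next = &outer }; // shadows

	printf("x=%ld\n", var_lookup(&inner, "x")->value); // x=2
	return 0;
}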
+ return PREDEXP_NOVALUE; +} + +static bool +build_var(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp, uint16_t tag) +{ + predexp_eval_var_t* dp = (predexp_eval_var_t *) + cf_malloc(sizeof(predexp_eval_var_t)); + + switch (tag) { + case AS_PREDEXP_INTEGER_VAR: + dp->type = AS_PARTICLE_TYPE_INTEGER; + break; + case AS_PREDEXP_STRING_VAR: + dp->type = AS_PARTICLE_TYPE_STRING; + break; + case AS_PREDEXP_GEOJSON_VAR: + dp->type = AS_PARTICLE_TYPE_GEOJSON; + break; + default: + cf_crash(AS_PREDEXP, "build_var called with bogus tag: %d", tag); + break; + } + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_var, + eval_var, + PREDEXP_VALUE_NODE, + dp->type); + + uint8_t* endp = pp + len; + + if (len >= sizeof(dp->vname)) { + cf_warning(AS_PREDEXP, "build_var: varname too long"); + goto Failed; + } + uint8_t bnlen = (uint8_t) len; + memcpy(dp->vname, pp, bnlen); + dp->vname[bnlen] = '\0'; + pp += bnlen; + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_var: msg unaligned"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + switch (tag) { + case AS_PREDEXP_INTEGER_VAR: + cf_debug(AS_PREDEXP, "%p: predexp_integer_var(\"%s\")", stackpp, + dp->vname); + break; + case AS_PREDEXP_STRING_VAR: + cf_debug(AS_PREDEXP, "%p: predexp_string_var(\"%s\")", stackpp, + dp->vname); + break; + case AS_PREDEXP_GEOJSON_VAR: + cf_debug(AS_PREDEXP, "%p: predexp_geojson_var(\"%s\")", stackpp, + dp->vname); + break; + default: + cf_crash(AS_PREDEXP, "build_var called with bogus tag: %d", tag); + break; + } + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_REC_DEVICE_SIZE +// ---------------------------------------------------------------- + +typedef struct predexp_eval_rec_device_size_s { + predexp_eval_t base; +} predexp_eval_rec_device_size_t; + +static void +destroy_rec_device_size(predexp_eval_t* bp) +{ + predexp_eval_rec_device_size_t* dp = (predexp_eval_rec_device_size_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_rec_device_size(predexp_eval_t* bp, + predexp_args_t* argsp, + wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, + "eval_rec_device_size called outside value context"); + } + + // predexp_eval_rec_device_size_t* dp = + // (predexp_eval_rec_device_size_t *) bp; + + int64_t rec_device_size = argsp->md->n_rblocks * 128; + + as_bin_state_set_from_type(&wbinp->bin, AS_PARTICLE_TYPE_INTEGER); + as_bin_particle_integer_set(&wbinp->bin, rec_device_size); + return PREDEXP_VALUE; +} + +static bool +build_rec_device_size(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + predexp_eval_rec_device_size_t* dp = (predexp_eval_rec_device_size_t *) + cf_malloc(sizeof(predexp_eval_rec_device_size_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_rec_device_size, + eval_rec_device_size, + PREDEXP_VALUE_NODE, + AS_PARTICLE_TYPE_INTEGER); + + uint8_t* endp = pp + len; + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_rec_device_size: msg unaligned"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. 
+ *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_rec_device_size()", stackpp); + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_REC_LAST_UPDATE +// ---------------------------------------------------------------- + +typedef struct predexp_eval_rec_last_update_s { + predexp_eval_t base; + as_bin bin; +} predexp_eval_rec_last_update_t; + +static void +destroy_rec_last_update(predexp_eval_t* bp) +{ + predexp_eval_rec_last_update_t* dp = (predexp_eval_rec_last_update_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_rec_last_update(predexp_eval_t* bp, + predexp_args_t* argsp, + wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, + "eval_rec_last_update called outside value context"); + } + + // predexp_eval_rec_last_update_t* dp = + // (predexp_eval_rec_last_update_t *) bp; + + int64_t rec_last_update_ns = + (int64_t) cf_utc_ns_from_clepoch_ms(argsp->md->last_update_time); + + as_bin_state_set_from_type(&wbinp->bin, AS_PARTICLE_TYPE_INTEGER); + as_bin_particle_integer_set(&wbinp->bin, rec_last_update_ns); + return PREDEXP_VALUE; +} + +static bool +build_rec_last_update(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + predexp_eval_rec_last_update_t* dp = (predexp_eval_rec_last_update_t *) + cf_malloc(sizeof(predexp_eval_rec_last_update_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_rec_last_update, + eval_rec_last_update, + PREDEXP_VALUE_NODE, + AS_PARTICLE_TYPE_INTEGER); + + uint8_t* endp = pp + len; + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_rec_last_update: msg unaligned"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_rec_last_update()", stackpp); + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_REC_VOID_TIME +// ---------------------------------------------------------------- + +typedef struct predexp_eval_rec_void_time_s { + predexp_eval_t base; + as_bin bin; +} predexp_eval_rec_void_time_t; + +static void +destroy_rec_void_time(predexp_eval_t* bp) +{ + predexp_eval_rec_void_time_t* dp = (predexp_eval_rec_void_time_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_rec_void_time(predexp_eval_t* bp, + predexp_args_t* argsp, + wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, "eval_rec_void_time called outside value context"); + } + + // predexp_eval_rec_void_time_t* dp = (predexp_eval_rec_void_time_t *) bp; + + int64_t rec_void_time_ns = + (int64_t) cf_utc_ns_from_clepoch_sec(argsp->md->void_time); + + // SPECIAL CASE - if the argsp->md->rec_void_time == 0 set the + // rec_void_time_ns to 0 as well. 
+ // + if (argsp->md->void_time == 0) { + rec_void_time_ns = 0; + } + + as_bin_state_set_from_type(&wbinp->bin, AS_PARTICLE_TYPE_INTEGER); + as_bin_particle_integer_set(&wbinp->bin, rec_void_time_ns); + return PREDEXP_VALUE; +} + +static bool +build_rec_void_time(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + predexp_eval_rec_void_time_t* dp = (predexp_eval_rec_void_time_t *) + cf_malloc(sizeof(predexp_eval_rec_void_time_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_rec_void_time, + eval_rec_void_time, + PREDEXP_VALUE_NODE, + AS_PARTICLE_TYPE_INTEGER); + + uint8_t* endp = pp + len; + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_rec_void_time: msg unaligned"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_rec_void_time()", stackpp); + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_REC_DIGEST_MODULO +// ---------------------------------------------------------------- + +typedef struct predexp_eval_rec_digest_modulo_s { + predexp_eval_t base; + int32_t mod; +} predexp_eval_rec_digest_modulo_t; + +static void +destroy_rec_digest_modulo(predexp_eval_t* bp) +{ + predexp_eval_rec_digest_modulo_t* dp = + (predexp_eval_rec_digest_modulo_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_rec_digest_modulo(predexp_eval_t* bp, + predexp_args_t* argsp, + wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, + "eval_rec_digest_modulo called outside value context"); + } + + predexp_eval_rec_digest_modulo_t* dp = + (predexp_eval_rec_digest_modulo_t *) bp; + + // We point at the last 4 bytes of the digest. + uint32_t* valp = (uint32_t*) &argsp->md->keyd.digest[16]; + int64_t digest_modulo = *valp % dp->mod; + + as_bin_state_set_from_type(&wbinp->bin, AS_PARTICLE_TYPE_INTEGER); + as_bin_particle_integer_set(&wbinp->bin, digest_modulo); + return PREDEXP_VALUE; +} + +static bool +build_rec_digest_modulo(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + predexp_eval_rec_digest_modulo_t* dp = (predexp_eval_rec_digest_modulo_t *) + cf_malloc(sizeof(predexp_eval_rec_digest_modulo_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_rec_digest_modulo, + eval_rec_digest_modulo, + PREDEXP_VALUE_NODE, + AS_PARTICLE_TYPE_INTEGER); + + uint8_t* endp = pp + len; + + if (pp + sizeof(int32_t) > endp) { + cf_warning(AS_PREDEXP, "build_rec_digest_modulo: msg too short"); + goto Failed; + } + + dp->mod = cf_swap_from_be32(* (int32_t*) pp); + pp += sizeof(int32_t); + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_rec_digest_modulo: msg unaligned"); + goto Failed; + } + + if (dp->mod == 0) { + cf_warning(AS_PREDEXP, "build_rec_digest_modulo: zero modulo invalid"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. 
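/*
 * Illustrative aside (not part of this patch): rec_digest_modulo above
 * reads the last four bytes of the 20-byte record digest as a uint32 and
 * reduces it modulo a client-chosen value - handy for splitting a scan
 * into k disjoint passes. A standalone sketch; names and the sample
 * digest are illustrative (the raw read is byte-order dependent, as in
 * the server code):
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
digest_modulo(const uint8_t digest[20], uint32_t mod)
{
	uint32_t tail;

	// Same idea as pointing at &digest[16] - read the last 4 bytes.
	memcpy(&tail, &digest[16], sizeof(tail));

	return tail % mod;
}

int
main(void)
{
	uint8_t digest[20] = { 0 };

	digest[16] = 0x2a; // low byte on little-endian -> value 42

	// 42 % 4 == 2: this record lands in pass 2 of 4 (prints bucket=2).
	printf("bucket=%u\n", digest_modulo(digest, 4));
	return 0;
}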
+ *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_rec_digest_modulo(%d)", stackpp, dp->mod); + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_*_ITERATE_* +// ---------------------------------------------------------------- + +typedef struct predexp_eval_iter_s { + predexp_eval_t base; + uint16_t tag; + uint8_t type; + predexp_eval_t* lchild; // per-element expr + predexp_eval_t* rchild; // collection + char vname[AS_ID_BIN_SZ]; +} predexp_eval_iter_t; + +static void +destroy_iter(predexp_eval_t* bp) +{ + predexp_eval_iter_t* dp = (predexp_eval_iter_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_list_iter(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + predexp_eval_iter_t* dp = (predexp_eval_iter_t *) bp; + + predexp_retval_t retval = PREDEXP_UNKNOWN; // init makes compiler happy + switch (dp->tag) { + case AS_PREDEXP_LIST_ITERATE_OR: + // Start pessimistically. + retval = PREDEXP_FALSE; + break; + case AS_PREDEXP_LIST_ITERATE_AND: + // Start optimistically. + retval = PREDEXP_TRUE; + break; + default: + cf_crash(AS_PREDEXP, + "eval_list_iter called with bogus tag: %d", dp->tag); + } + + wrapped_as_bin_t lwbin; + lwbin.must_free = false; + if ((*dp->rchild->eval_fn)(dp->rchild, argsp, &lwbin) == + PREDEXP_NOVALUE) { + return argsp->rd ? PREDEXP_FALSE : PREDEXP_UNKNOWN; + } + + as_predexp_var_t var; + memcpy(var.vname, dp->vname, sizeof(var.vname)); + + // Make sure our var starts out empty. + as_bin_set_empty(&var.bin); + var.bin.particle = NULL; + + // Prepend our var to the list. + var.next = argsp->vl; + argsp->vl = &var; + + // Traverse the collection. + as_val* lval = as_bin_particle_to_asval(&lwbin.bin); + as_arraylist* list = (as_arraylist*) as_list_fromval(lval); + as_arraylist_iterator it; + as_arraylist_iterator_init(&it, list); + while (as_arraylist_iterator_has_next(&it)) { + // Set our var to the element's value. + as_val* val = (as_val*) as_arraylist_iterator_next(&it); + int old_arena = cf_alloc_clear_ns_arena(); + int rv = as_bin_particle_replace_from_asval(&var.bin, val); + cf_alloc_restore_ns_arena(old_arena); + if (rv != 0) { + cf_warning(AS_PREDEXP, + "eval_list_iter: particle from asval failed"); + continue; + } + + switch (dp->tag) { + case AS_PREDEXP_LIST_ITERATE_OR: + switch ((*dp->lchild->eval_fn)(dp->lchild, argsp, NULL)) { + case PREDEXP_TRUE: + // Shortcut, skip remaining children. + retval = PREDEXP_TRUE; + goto Done; + case PREDEXP_UNKNOWN: + // Upgrade our return value, continue scanning children. + retval = PREDEXP_UNKNOWN; + break; + case PREDEXP_FALSE: + // Continue scanning children. + break; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. + cf_crash(AS_PREDEXP, "eval_list_iter child was value node"); + } + break; + case AS_PREDEXP_LIST_ITERATE_AND: + switch ((*dp->lchild->eval_fn)(dp->lchild, argsp, NULL)) { + case PREDEXP_FALSE: + // Shortcut, skip remaining children. + retval = PREDEXP_FALSE; + goto Done; + case PREDEXP_UNKNOWN: + // Downgrade our return value, continue scanning children. + retval = PREDEXP_UNKNOWN; + break; + case PREDEXP_TRUE: + // Continue scanning children. + break; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. 
+ cf_crash(AS_PREDEXP, "eval_list_iter child was value node"); + } + break; + default: + cf_crash(AS_PREDEXP, "eval_list_iter called with bogus tag: %d", + dp->tag); + } + + } + + Done: + as_bin_particle_destroy(&var.bin, true); + as_bin_set_empty(&var.bin); + var.bin.particle = NULL; + + as_arraylist_iterator_destroy(&it); + + as_val_destroy(lval); + + // Remove our var from the list. + argsp->vl = var.next; + + if (lwbin.must_free) { + cf_crash(AS_PREDEXP, "eval_list_iter need bin cleanup, didn't before"); + } + + return retval; +} + +static predexp_retval_t +eval_map_iter(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + predexp_eval_iter_t* dp = (predexp_eval_iter_t *) bp; + + predexp_retval_t retval = PREDEXP_UNKNOWN; // init makes compiler happy + switch (dp->tag) { + case AS_PREDEXP_MAPKEY_ITERATE_OR: + case AS_PREDEXP_MAPVAL_ITERATE_OR: + // Start pessimistically. + retval = PREDEXP_FALSE; + break; + case AS_PREDEXP_MAPKEY_ITERATE_AND: + case AS_PREDEXP_MAPVAL_ITERATE_AND: + // Start optimistically. + retval = PREDEXP_TRUE; + break; + default: + cf_crash(AS_PREDEXP, "eval_map_iter called with bogus tag: %d", + dp->tag); + } + + wrapped_as_bin_t lwbin; + lwbin.must_free = false; + if ((*dp->rchild->eval_fn)(dp->rchild, argsp, &lwbin) == + PREDEXP_NOVALUE) { + return argsp->rd ? PREDEXP_FALSE : PREDEXP_UNKNOWN; + } + + as_predexp_var_t var; + memcpy(var.vname, dp->vname, sizeof(var.vname)); + + // Make sure our var starts out empty. + as_bin_set_empty(&var.bin); + var.bin.particle = NULL; + + // Prepend our var to the list. + var.next = argsp->vl; + argsp->vl = &var; + + // Traverse the collection. + as_val* mval = as_bin_particle_to_asval(&lwbin.bin); + as_hashmap* map = (as_hashmap*) as_map_fromval(mval); + as_hashmap_iterator it; + as_hashmap_iterator_init(&it, map); + while (as_hashmap_iterator_has_next(&it)) { + // Set our var to the element's value. + as_pair* pair = (as_pair*) as_hashmap_iterator_next(&it); + as_val* val = NULL; // init makes compiler happy + switch (dp->tag) { + case AS_PREDEXP_MAPKEY_ITERATE_OR: + case AS_PREDEXP_MAPKEY_ITERATE_AND: + val = as_pair_1(pair); + break; + case AS_PREDEXP_MAPVAL_ITERATE_OR: + case AS_PREDEXP_MAPVAL_ITERATE_AND: + val = as_pair_2(pair); + break; + default: + cf_crash(AS_PREDEXP, "eval_map_iter called with bogus tag (2): %d", + dp->tag); + } + + int old_arena = cf_alloc_clear_ns_arena(); + int rv = as_bin_particle_replace_from_asval(&var.bin, val); + cf_alloc_restore_ns_arena(old_arena); + if (rv != 0) { + cf_warning(AS_PREDEXP, "eval_map_iter: particle from asval failed"); + continue; + } + + switch (dp->tag) { + case AS_PREDEXP_MAPKEY_ITERATE_OR: + case AS_PREDEXP_MAPVAL_ITERATE_OR: + switch ((*dp->lchild->eval_fn)(dp->lchild, argsp, NULL)) { + case PREDEXP_TRUE: + // Shortcut, skip remaining children. + retval = PREDEXP_TRUE; + goto Done; + case PREDEXP_UNKNOWN: + // Upgrade our return value, continue scanning children. + retval = PREDEXP_UNKNOWN; + break; + case PREDEXP_FALSE: + // Continue scanning children. + break; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. + cf_crash(AS_PREDEXP, "eval_map_iter child was value node"); + } + break; + case AS_PREDEXP_MAPKEY_ITERATE_AND: + case AS_PREDEXP_MAPVAL_ITERATE_AND: + switch ((*dp->lchild->eval_fn)(dp->lchild, argsp, NULL)) { + case PREDEXP_FALSE: + // Shortcut, skip remaining children. 
+				retval = PREDEXP_FALSE;
+				goto Done;
+			case PREDEXP_UNKNOWN:
+				// Downgrade our return value, continue scanning children.
+				retval = PREDEXP_UNKNOWN;
+				break;
+			case PREDEXP_TRUE:
+				// Continue scanning children.
+				break;
+			case PREDEXP_VALUE:
+			case PREDEXP_NOVALUE:
+				// Child can't be value node; shouldn't ever happen.
+				cf_crash(AS_PREDEXP, "eval_map_iter child was value node");
+			}
+			break;
+		default:
+			cf_crash(AS_PREDEXP, "eval_map_iter called with bogus tag: %d",
+					dp->tag);
+		}
+
+	}
+
+ Done:
+	as_bin_particle_destroy(&var.bin, true);
+	as_bin_set_empty(&var.bin);
+	var.bin.particle = NULL;
+
+	as_hashmap_iterator_destroy(&it);
+
+	as_val_destroy(mval);
+
+	// Remove our var from the list.
+	argsp->vl = var.next;
+
+	if (lwbin.must_free) {
+		cf_crash(AS_PREDEXP, "eval_map_iter need bin cleanup, didn't before");
+	}
+	return retval;
+}
+
+static bool
+build_iter(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp, uint16_t tag)
+{
+	predexp_eval_iter_t* dp = (predexp_eval_iter_t *)
+			cf_malloc(sizeof(predexp_eval_iter_t));
+
+	switch (tag) {
+	case AS_PREDEXP_LIST_ITERATE_OR:
+	case AS_PREDEXP_LIST_ITERATE_AND:
+		predexp_eval_base_init((predexp_eval_t *) dp,
+				destroy_iter,
+				eval_list_iter,
+				0,
+				AS_PARTICLE_TYPE_NULL);
+		dp->type = AS_PARTICLE_TYPE_LIST;
+		break;
+	case AS_PREDEXP_MAPKEY_ITERATE_OR:
+	case AS_PREDEXP_MAPVAL_ITERATE_OR:
+	case AS_PREDEXP_MAPKEY_ITERATE_AND:
+	case AS_PREDEXP_MAPVAL_ITERATE_AND:
+		predexp_eval_base_init((predexp_eval_t *) dp,
+				destroy_iter,
+				eval_map_iter,
+				0,
+				AS_PARTICLE_TYPE_NULL);
+		dp->type = AS_PARTICLE_TYPE_MAP;
+		break;
+	default:
+		cf_crash(AS_PREDEXP, "build_iter called with bogus tag: %d", tag);
+	}
+
+	dp->tag = tag;
+	dp->lchild = NULL;
+	dp->rchild = NULL;
+
+	uint8_t* endp = pp + len;
+
+	if (len >= sizeof(dp->vname)) {
+		cf_warning(AS_PREDEXP, "build_iter: varname too long");
+		goto Failed;
+	}
+	uint8_t vnlen = (uint8_t) len;
+	memcpy(dp->vname, pp, vnlen);
+	dp->vname[vnlen] = '\0';
+	pp += vnlen;
+
+	// ---- Pop the right child (collection) off the stack.
+
+	if (! *stackpp) {
+		cf_warning(AS_PREDEXP, "predexp_iterate: missing right child");
+		(*dp->base.dtor_fn)((predexp_eval_t *) dp);
+		return false;
+	}
+
+	dp->rchild = *stackpp;
+	*stackpp = dp->rchild->next;
+	dp->rchild->next = NULL;
+
+	if ((dp->rchild->flags & PREDEXP_VALUE_NODE) == 0) {
+		cf_warning(AS_PREDEXP,
+				"predexp iterate: right child is not value node");
+		(*dp->base.dtor_fn)((predexp_eval_t *) dp);
+		return false;
+	}
+
+	if (dp->rchild->type != dp->type) {
+		cf_warning(AS_PREDEXP, "predexp iterate: right child is wrong type");
+		(*dp->base.dtor_fn)((predexp_eval_t *) dp);
+		return false;
+	}
+
+	// ---- Pop the left child (per-element expr) off the stack.
+
+	if (! *stackpp) {
+		cf_warning(AS_PREDEXP, "predexp_iterate: missing left child");
+		(*dp->base.dtor_fn)((predexp_eval_t *) dp);
+		return false;
+	}
+
+	dp->lchild = *stackpp;
+	*stackpp = dp->lchild->next;
+	dp->lchild->next = NULL;
+
+	if ((dp->lchild->flags & PREDEXP_VALUE_NODE) != 0) {
+		cf_warning(AS_PREDEXP, "predexp iterate: left child is value node");
+		(*dp->base.dtor_fn)((predexp_eval_t *) dp);
+		return false;
+	}
+
+	if (dp->lchild->type != AS_PARTICLE_TYPE_NULL) {
+		cf_warning(AS_PREDEXP, "predexp iterate: left child is wrong type");
+		(*dp->base.dtor_fn)((predexp_eval_t *) dp);
+		return false;
+	}
+
+	if (pp != endp) {
+		cf_warning(AS_PREDEXP, "build_iter: msg unaligned");
+		goto Failed;
+	}
+
+	// Success, push ourself onto the stack.
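+	// (Both children were popped above, so on success this iterate node
+	// replaces them at the top of the stack.)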
+	dp->base.next = *stackpp;	// We point next at the old top.
+	*stackpp = (predexp_eval_t *) dp;	// We're the new top
+
+	switch (tag) {
+	case AS_PREDEXP_LIST_ITERATE_OR:
+		cf_debug(AS_PREDEXP, "%p: predexp_list_iterate_or()", stackpp);
+		break;
+	case AS_PREDEXP_LIST_ITERATE_AND:
+		cf_debug(AS_PREDEXP, "%p: predexp_list_iterate_and()", stackpp);
+		break;
+	case AS_PREDEXP_MAPKEY_ITERATE_OR:
+		cf_debug(AS_PREDEXP, "%p: predexp_mapkey_iterate_or()", stackpp);
+		break;
+	case AS_PREDEXP_MAPVAL_ITERATE_OR:
+		cf_debug(AS_PREDEXP, "%p: predexp_mapval_iterate_or()", stackpp);
+		break;
+	case AS_PREDEXP_MAPKEY_ITERATE_AND:
+		cf_debug(AS_PREDEXP, "%p: predexp_mapkey_iterate_and()", stackpp);
+		break;
+	case AS_PREDEXP_MAPVAL_ITERATE_AND:
+		cf_debug(AS_PREDEXP, "%p: predexp_mapval_iterate_and()", stackpp);
+		break;
+	default:
+		cf_crash(AS_PREDEXP, "build_iter called with bogus tag: %d", tag);
+	}
+
+	return true;
+
+ Failed:
+	(*dp->base.dtor_fn)((predexp_eval_t *) dp);
+	return false;
+}
+
+// ----------------------------------------------------------------
+// External Interface
+// ----------------------------------------------------------------
+
+
+static bool
+build(predexp_eval_t** stackpp, uint16_t tag, uint32_t len, uint8_t* pp)
+{
+	switch (tag) {
+	case AS_PREDEXP_AND:
+		return build_and(stackpp, len, pp);
+	case AS_PREDEXP_OR:
+		return build_or(stackpp, len, pp);
+	case AS_PREDEXP_NOT:
+		return build_not(stackpp, len, pp);
+	case AS_PREDEXP_INTEGER_EQUAL:
+	case AS_PREDEXP_INTEGER_UNEQUAL:
+	case AS_PREDEXP_INTEGER_GREATER:
+	case AS_PREDEXP_INTEGER_GREATEREQ:
+	case AS_PREDEXP_INTEGER_LESS:
+	case AS_PREDEXP_INTEGER_LESSEQ:
+	case AS_PREDEXP_STRING_EQUAL:
+	case AS_PREDEXP_STRING_UNEQUAL:
+	case AS_PREDEXP_STRING_REGEX:
+	case AS_PREDEXP_GEOJSON_WITHIN:
+	case AS_PREDEXP_GEOJSON_CONTAINS:
+		return build_compare(stackpp, len, pp, tag);
+	case AS_PREDEXP_INTEGER_VALUE:
+	case AS_PREDEXP_STRING_VALUE:
+	case AS_PREDEXP_GEOJSON_VALUE:
+		return build_value(stackpp, len, pp, tag);
+	case AS_PREDEXP_INTEGER_BIN:
+	case AS_PREDEXP_STRING_BIN:
+	case AS_PREDEXP_GEOJSON_BIN:
+	case AS_PREDEXP_LIST_BIN:
+	case AS_PREDEXP_MAP_BIN:
+		return build_bin(stackpp, len, pp, tag);
+	case AS_PREDEXP_INTEGER_VAR:
+	case AS_PREDEXP_STRING_VAR:
+	case AS_PREDEXP_GEOJSON_VAR:
+		return build_var(stackpp, len, pp, tag);
+	case AS_PREDEXP_REC_DEVICE_SIZE:
+		return build_rec_device_size(stackpp, len, pp);
+	case AS_PREDEXP_REC_LAST_UPDATE:
+		return build_rec_last_update(stackpp, len, pp);
+	case AS_PREDEXP_REC_VOID_TIME:
+		return build_rec_void_time(stackpp, len, pp);
+	case AS_PREDEXP_REC_DIGEST_MODULO:
+		return build_rec_digest_modulo(stackpp, len, pp);
+	case AS_PREDEXP_LIST_ITERATE_OR:
+	case AS_PREDEXP_LIST_ITERATE_AND:
+	case AS_PREDEXP_MAPKEY_ITERATE_OR:
+	case AS_PREDEXP_MAPKEY_ITERATE_AND:
+	case AS_PREDEXP_MAPVAL_ITERATE_OR:
+	case AS_PREDEXP_MAPVAL_ITERATE_AND:
+		return build_iter(stackpp, len, pp, tag);
+	default:
+		cf_warning(AS_PREDEXP, "unexpected predexp tag: %d", tag);
+		return false;
+	}
+}
+
+predexp_eval_t*
+predexp_build(as_msg_field* pfp)
+{
+	predexp_eval_t* stackp = NULL;
+
+	cf_debug(AS_PREDEXP, "%p: predexp_build starting", &stackp);
+
+	uint8_t* pp = pfp->data;
+	uint32_t pdsize = as_msg_field_get_value_sz(pfp);
+	uint8_t* endp = pp + pdsize;
+
+	// Minimum possible TLV token is 6 bytes.
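+	// As a sketch (inferred from the loop below - illustration only, not a
+	// struct from the original source), each token is laid out as:
+	//
+	//   uint16_t tag;    // big-endian, e.g. AS_PREDEXP_AND
+	//   uint32_t len;    // big-endian, payload size in bytes
+	//   uint8_t  data[]; // len bytes handed to the tag's build function
+	//
+	// Hence the 6-byte minimum: a tag and a length with an empty payload.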
+	while (pp + 6 <= endp) {
+		uint16_t tag = cf_swap_from_be16(* (uint16_t *) pp);
+		pp += sizeof(uint16_t);
+
+		uint32_t len = cf_swap_from_be32(* (uint32_t *) pp);
+		pp += sizeof(uint32_t);
+
+		if (pp + len > endp) {
+			cf_warning(AS_PREDEXP, "malformed predexp field");
+			goto FAILED;
+		}
+
+		if (!build(&stackp, tag, len, pp)) {
+			// Warning should already have happened.
+			goto FAILED;
+		}
+		pp += len;
+	}
+
+	// The cursor needs to neatly point at the end pointer.
+	if (pp != endp) {
+		cf_warning(AS_PREDEXP, "malformed predexp field");
+		goto FAILED;
+	}
+
+	// We'd better have exactly one node on the stack now.
+	if (!stackp) {
+		cf_warning(AS_PREDEXP, "no top level predexp");
+		goto FAILED;
+	}
+	if (stackp->next) {
+		cf_warning(AS_PREDEXP, "multiple top-level predexp");
+		goto FAILED;
+	}
+
+	// The top node needs to be a matching node, not a value node.
+	if (stackp->flags & PREDEXP_VALUE_NODE) {
+		cf_warning(AS_PREDEXP, "top-level predexp is value node");
+		goto FAILED;
+	}
+
+	cf_debug(AS_PREDEXP, "%p: predexp_build finished", &stackp);
+
+	// Return the root of the predicate expression tree.
+	return stackp;
+
+ FAILED:
+	cf_debug(AS_PREDEXP, "%p: predexp_build failed", &stackp);
+	destroy_list(stackp);
+	return NULL;
+}
+
+bool
+predexp_matches_metadata(predexp_eval_t* bp, predexp_args_t* argsp)
+{
+	if (! bp) {
+		return true;
+	}
+
+	return ((*bp->eval_fn)(bp, argsp, NULL) != PREDEXP_FALSE);
+}
+
+bool
+predexp_matches_record(predexp_eval_t* bp, predexp_args_t* argsp)
+{
+	if (! bp) {
+		return true;
+	}
+
+	switch ((*bp->eval_fn)(bp, argsp, NULL)) {
+	case PREDEXP_TRUE:
+		return true;
+	case PREDEXP_FALSE:
+		return false;
+	default:
+		cf_crash(AS_PREDEXP, "predexp eval returned other than true/false "
+				"with record data present");
+		return false; // makes compiler happy
+	}
+}
+
+void
+predexp_destroy(predexp_eval_t* bp)
+{
+	(*bp->dtor_fn)(bp);
+}
diff --git a/as/src/base/probes.d b/as/src/base/probes.d
new file mode 100644
index 00000000..87b3f6d5
--- /dev/null
+++ b/as/src/base/probes.d
@@ -0,0 +1,25 @@
+provider asd {
+	probe trans__demarshal(uint64_t, uint64_t, uint64_t);
+	probe query__starting(uint64_t, uint64_t);
+	probe query__qtrsetup_starting(uint64_t, uint64_t);
+	probe query__qtrsetup_finished(uint64_t, uint64_t);
+	probe query__init(uint64_t, uint64_t);
+	probe query__done(uint64_t, uint64_t, uint64_t);
+	probe query__trans_done(uint64_t, uint64_t, uint64_t);
+	probe query__qtr_alloc(uint64_t, uint64_t, uint64_t);
+	probe query__qtr_free(uint64_t, uint64_t, uint64_t);
+	probe query__ioreq_starting(uint64_t, uint64_t);
+	probe query__ioreq_finished(uint64_t, uint64_t);
+	probe query__io_starting(uint64_t, uint64_t);
+	probe query__io_notmatch(uint64_t, uint64_t);
+	probe query__io_error(uint64_t, uint64_t);
+	probe query__io_finished(uint64_t, uint64_t);
+	probe query__netio_starting(uint64_t, uint64_t);
+	probe query__netio_finished(uint64_t, uint64_t);
+	probe query__addfin(uint64_t, uint64_t);
+	probe query__sendpacket_starting(uint64_t, uint32_t, uint32_t);
+	probe query__sendpacket_continue(uint64_t, uint32_t);
+	probe query__sendpacket_finished(uint64_t);
+	probe sindex__msgrange_starting(uint64_t, uint64_t);
+	probe sindex__msgrange_finished(uint64_t, uint64_t);
+};
diff --git a/as/src/base/proto.c b/as/src/base/proto.c
new file mode 100644
index 00000000..65db1709
--- /dev/null
+++ b/as/src/base/proto.c
@@ -0,0 +1,885 @@
+/*
+ * proto.c
+ *
+ * Copyright (C) 2008-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "base/proto.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "aerospike/as_val.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_byte_order.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+#include "citrusleaf/cf_vector.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+#include "socket.h"
+
+#include "base/as_stap.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction.h"
+#include "storage/storage.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+#define MSG_STACK_BUFFER_SZ (1024 * 16)
+#define NETIO_MAX_IO_RETRY 5
+
+static const char SUCCESS_BIN_NAME[] = "SUCCESS";
+static const char FAILURE_BIN_NAME[] = "FAILURE";
+
+
+//==========================================================
+// Globals.
+//
+
+static cf_queue g_netio_queue;
+static cf_queue g_netio_slow_queue;
+
+
+//==========================================================
+// Forward declarations.
+//
+
+static int send_reply_buf(as_file_handle *fd_h, uint8_t *msgp, size_t msg_sz);
+static void *run_netio(void *q_to_wait_on);
+static int netio_send_packet(as_file_handle *fd_h, cf_buf_builder *bb_r, uint32_t *offset, bool blocking);
+
+
+//==========================================================
+// Public API - byte swapping.
+//
+
+void
+as_proto_swap(as_proto *proto)
+{
+	uint8_t version = proto->version;
+	uint8_t type = proto->type;
+
+	proto->version = proto->type = 0;
+	proto->sz = cf_swap_from_be64(*(uint64_t *)proto);
+	proto->version = version;
+	proto->type = type;
+}
+
+void
+as_msg_swap_header(as_msg *m)
+{
+	m->generation = cf_swap_from_be32(m->generation);
+	m->record_ttl = cf_swap_from_be32(m->record_ttl);
+	m->transaction_ttl = cf_swap_from_be32(m->transaction_ttl);
+	m->n_fields = cf_swap_from_be16(m->n_fields);
+	m->n_ops = cf_swap_from_be16(m->n_ops);
+}
+
+void
+as_msg_swap_field(as_msg_field *mf)
+{
+	mf->field_sz = cf_swap_from_be32(mf->field_sz);
+}
+
+void
+as_msg_swap_op(as_msg_op *op)
+{
+	op->op_sz = cf_swap_from_be32(op->op_sz);
+}
+
+
+//==========================================================
+// Public API - generating internal transactions.
+//
+
+// Allocates cl_msg returned - caller must free it. Everything is host-ordered.
+// Will add more parameters (e.g. for set name) only as they become necessary.
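+// A sketch of the layout built below (field sizes taken from this code, not
+// authoritative wire-protocol documentation):
+//
+//   cl_msg: as_proto header | as_msg header
+//   as_msg_field: AS_MSG_FIELD_TYPE_NAMESPACE, data = ns_name (no null)
+//   as_msg_field: AS_MSG_FIELD_TYPE_DIGEST_RIPE, data = 20-byte cf_digest
+//
+// Note each field_sz counts the one-byte type plus the data bytes.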
+cl_msg * +as_msg_create_internal(const char *ns_name, const cf_digest *keyd, + uint8_t info1, uint8_t info2, uint8_t info3) +{ + size_t ns_name_len = strlen(ns_name); + + size_t msg_sz = sizeof(cl_msg) + + sizeof(as_msg_field) + ns_name_len + + sizeof(as_msg_field) + sizeof(cf_digest); + + cl_msg *msgp = (cl_msg *)cf_malloc(msg_sz); + + msgp->proto.version = PROTO_VERSION; + msgp->proto.type = PROTO_TYPE_AS_MSG; + msgp->proto.sz = msg_sz - sizeof(as_proto); + + as_msg *m = &msgp->msg; + + m->header_sz = sizeof(as_msg); + m->info1 = info1; + m->info2 = info2; + m->info3 = info3; + m->unused = 0; + m->result_code = 0; + m->generation = 0; + m->record_ttl = 0; + m->transaction_ttl = 0; + m->n_fields = 2; + m->n_ops = 0; + + as_msg_field *mf = (as_msg_field *)(m->data); + + mf->type = AS_MSG_FIELD_TYPE_NAMESPACE; + mf->field_sz = (uint32_t)ns_name_len + 1; + memcpy(mf->data, ns_name, ns_name_len); + + mf = as_msg_field_get_next(mf); + + mf->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE; + mf->field_sz = sizeof(cf_digest) + 1; + *(cf_digest *)mf->data = *keyd; + + return msgp; +} + + +//========================================================== +// Public API - packing responses. +// + +// Allocates cl_msg returned - caller must free it. +cl_msg * +as_msg_make_response_msg(uint32_t result_code, uint32_t generation, + uint32_t void_time, as_msg_op **ops, as_bin **bins, uint16_t bin_count, + as_namespace *ns, cl_msg *msgp_in, size_t *msg_sz_in, uint64_t trid) +{ + uint16_t n_fields = 0; + size_t msg_sz = sizeof(cl_msg); + + if (trid != 0) { + n_fields++; + msg_sz += sizeof(as_msg_field) + sizeof(trid); + } + + msg_sz += sizeof(as_msg_op) * bin_count; + + for (uint16_t i = 0; i < bin_count; i++) { + if (ops) { + msg_sz += ops[i]->name_sz; + } + else if (bins[i]) { + msg_sz += ns->single_bin ? + 0 : strlen(as_bin_get_name_from_id(ns, bins[i]->id)); + } + else { + cf_crash(AS_PROTO, "making response message with null bin and op"); + } + + if (bins[i]) { + msg_sz += as_bin_particle_client_value_size(bins[i]); + } + } + + uint8_t *buf; + + if (! msgp_in || *msg_sz_in < msg_sz) { + buf = cf_malloc(msg_sz); + } + else { + buf = (uint8_t *)msgp_in; + } + + *msg_sz_in = msg_sz; + + cl_msg *msgp = (cl_msg *)buf; + + msgp->proto.version = PROTO_VERSION; + msgp->proto.type = PROTO_TYPE_AS_MSG; + msgp->proto.sz = msg_sz - sizeof(as_proto); + + as_proto_swap(&msgp->proto); + + as_msg *m = &msgp->msg; + + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = result_code; + m->generation = generation == 0 ? 
0 : plain_generation(generation, ns); + m->record_ttl = void_time; + m->transaction_ttl = 0; + m->n_fields = n_fields; + m->n_ops = bin_count; + + as_msg_swap_header(m); + + buf = m->data; + + if (trid != 0) { + as_msg_field *mf = (as_msg_field *)buf; + + mf->field_sz = 1 + sizeof(uint64_t); + mf->type = AS_MSG_FIELD_TYPE_TRID; + *(uint64_t *)mf->data = cf_swap_to_be64(trid); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + sizeof(uint64_t); + } + + for (uint16_t i = 0; i < bin_count; i++) { + as_msg_op *op = (as_msg_op *)buf; + + op->version = 0; + + if (ops) { + op->op = ops[i]->op; + memcpy(op->name, ops[i]->name, ops[i]->name_sz); + op->name_sz = ops[i]->name_sz; + } + else { + op->op = AS_MSG_OP_READ; + op->name_sz = as_bin_memcpy_name(ns, op->name, bins[i]); + } + + op->op_sz = 4 + op->name_sz; + + buf += sizeof(as_msg_op) + op->name_sz; + buf += as_bin_particle_to_client(bins[i], op); + + as_msg_swap_op(op); + } + + return msgp; +} + +// FIXME - only old batch sets include_key false - remove parameter ??? +// FIXME - only old batch sets skip_empty_records false - remove parameter ??? +// Pass NULL bb_r for sizing only. Return value is size if >= 0, error if < 0. +int32_t +as_msg_make_response_bufbuilder(cf_buf_builder **bb_r, as_storage_rd *rd, + bool no_bin_data, bool include_key, bool skip_empty_records, + cf_vector *select_bins) +{ + as_namespace *ns = rd->ns; + as_record *r = rd->r; + + size_t ns_len = strlen(ns->name); + const char *set_name = as_index_get_set_name(r, ns); + size_t set_name_len = set_name ? strlen(set_name) : 0; + + uint8_t* key = NULL; + uint32_t key_size = 0; + + if (include_key && r->key_stored == 1) { + if (! as_storage_record_get_key(rd)) { + cf_warning(AS_PROTO, "can't get key - skipping record"); + return -1; + } + + key = rd->key; + key_size = rd->key_size; + } + + uint16_t n_fields = 2; // always add namespace and digest + size_t msg_sz = sizeof(as_msg) + + sizeof(as_msg_field) + ns_len + + sizeof(as_msg_field) + sizeof(cf_digest); + + if (set_name) { + n_fields++; + msg_sz += sizeof(as_msg_field) + set_name_len; + } + + if (key) { + n_fields++; + msg_sz += sizeof(as_msg_field) + key_size; + } + + uint32_t n_select_bins = 0; + uint16_t n_bins_matched = 0; + uint16_t n_record_bins = 0; + + if (! no_bin_data) { + if (select_bins) { + n_select_bins = cf_vector_size(select_bins); + + for (uint32_t i = 0; i < n_select_bins; i++) { + char bin_name[AS_ID_BIN_SZ]; + + cf_vector_get(select_bins, i, (void*)&bin_name); + + as_bin *b = as_bin_get(rd, bin_name); + + if (! b) { + continue; + } + + msg_sz += sizeof(as_msg_op); + msg_sz += ns->single_bin ? 0 : strlen(bin_name); + msg_sz += as_bin_particle_client_value_size(b); + + n_bins_matched++; + } + + // Don't return an empty record. + if (skip_empty_records && n_bins_matched == 0) { + return 0; + } + } + else { + n_record_bins = as_bin_inuse_count(rd); + + msg_sz += sizeof(as_msg_op) * n_record_bins; + + for (uint16_t i = 0; i < n_record_bins; i++) { + as_bin *b = &rd->bins[i]; + + msg_sz += ns->single_bin ? + 0 : strlen(as_bin_get_name_from_id(ns, b->id)); + msg_sz += (int)as_bin_particle_client_value_size(b); + } + } + } + + // NULL buf-builder means just return size. + if (! bb_r) { + return (int32_t)msg_sz; + } + + uint8_t *buf; + + cf_buf_builder_reserve(bb_r, (int)msg_sz, &buf); + + as_msg *m = (as_msg *)buf; + + m->header_sz = sizeof(as_msg); + m->info1 = no_bin_data ? 
AS_MSG_INFO1_GET_NO_BINS : 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = AS_PROTO_RESULT_OK; + m->generation = plain_generation(r->generation, ns); + m->record_ttl = r->void_time; + m->transaction_ttl = 0; + m->n_fields = n_fields; + + if (no_bin_data) { + m->n_ops = 0; + } + else { + m->n_ops = select_bins ? n_bins_matched : n_record_bins; + } + + as_msg_swap_header(m); + + buf = m->data; + + as_msg_field *mf = (as_msg_field *)buf; + + mf->field_sz = ns_len + 1; + mf->type = AS_MSG_FIELD_TYPE_NAMESPACE; + memcpy(mf->data, ns->name, ns_len); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + ns_len; + + mf = (as_msg_field *)buf; + mf->field_sz = sizeof(cf_digest) + 1; + mf->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE; + memcpy(mf->data, &r->keyd, sizeof(cf_digest)); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + sizeof(cf_digest); + + if (set_name) { + mf = (as_msg_field *)buf; + mf->field_sz = set_name_len + 1; + mf->type = AS_MSG_FIELD_TYPE_SET; + memcpy(mf->data, set_name, set_name_len); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + set_name_len; + } + + if (key) { + mf = (as_msg_field *)buf; + mf->field_sz = key_size + 1; + mf->type = AS_MSG_FIELD_TYPE_KEY; + memcpy(mf->data, key, key_size); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + key_size; + } + + if (no_bin_data) { + return (int32_t)msg_sz; + } + + if (select_bins) { + for (uint32_t i = 0; i < n_select_bins; i++) { + char bin_name[AS_ID_BIN_SZ]; + + cf_vector_get(select_bins, i, (void*)&bin_name); + + as_bin *b = as_bin_get(rd, bin_name); + + if (! b) { + continue; + } + + as_msg_op *op = (as_msg_op *)buf; + + op->op = AS_MSG_OP_READ; + op->version = 0; + op->name_sz = as_bin_memcpy_name(ns, op->name, b); + op->op_sz = 4 + op->name_sz; + + buf += sizeof(as_msg_op) + op->name_sz; + buf += as_bin_particle_to_client(b, op); + + as_msg_swap_op(op); + } + } + else { + for (uint16_t i = 0; i < n_record_bins; i++) { + as_msg_op *op = (as_msg_op *)buf; + + op->op = AS_MSG_OP_READ; + op->version = 0; + op->name_sz = as_bin_memcpy_name(ns, op->name, &rd->bins[i]); + op->op_sz = 4 + op->name_sz; + + buf += sizeof(as_msg_op) + op->name_sz; + buf += as_bin_particle_to_client(&rd->bins[i], op); + + as_msg_swap_op(op); + } + } + + return (int32_t)msg_sz; +} + +cl_msg * +as_msg_make_val_response(bool success, const as_val *val, uint32_t result_code, + uint32_t generation, uint32_t void_time, uint64_t trid, + size_t *p_msg_sz) +{ + const char *bin_name; + size_t bin_name_len; + + if (success) { + bin_name = SUCCESS_BIN_NAME; + bin_name_len = sizeof(SUCCESS_BIN_NAME) - 1; + } + else { + bin_name = FAILURE_BIN_NAME; + bin_name_len = sizeof(FAILURE_BIN_NAME) - 1; + } + + uint16_t n_fields = 0; + size_t msg_sz = sizeof(cl_msg); + + if (trid != 0) { + n_fields++; + msg_sz += sizeof(as_msg_field) + sizeof(trid); + } + + msg_sz += sizeof(as_msg_op) + bin_name_len + + as_particle_asval_client_value_size(val); + + uint8_t *buf = cf_malloc(msg_sz); + cl_msg *msgp = (cl_msg *)buf; + + msgp->proto.version = PROTO_VERSION; + msgp->proto.type = PROTO_TYPE_AS_MSG; + msgp->proto.sz = msg_sz - sizeof(as_proto); + + as_proto_swap(&msgp->proto); + + as_msg *m = &msgp->msg; + + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = result_code; + m->generation = generation; + m->record_ttl = void_time; + m->transaction_ttl = 0; + m->n_fields = n_fields; + m->n_ops = 1; // only the one special bin + + as_msg_swap_header(m); + + buf = m->data; + + if 
(trid != 0) { + as_msg_field *mf = (as_msg_field *)buf; + + mf->field_sz = 1 + sizeof(uint64_t); + mf->type = AS_MSG_FIELD_TYPE_TRID; + *(uint64_t *)mf->data = cf_swap_to_be64(trid); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + sizeof(uint64_t); + } + + as_msg_op *op = (as_msg_op *)buf; + + op->op = AS_MSG_OP_READ; + op->name_sz = (uint8_t)bin_name_len; + memcpy(op->name, bin_name, op->name_sz); + op->op_sz = 4 + op->name_sz; + op->version = 0; + + as_particle_asval_to_client(val, op); + + as_msg_swap_op(op); + + *p_msg_sz = msg_sz; + + return msgp; +} + +// Caller-provided val_sz must be the result of calling +// as_particle_asval_client_value_size() for same val. +void +as_msg_make_val_response_bufbuilder(const as_val *val, cf_buf_builder **bb_r, + uint32_t val_sz, bool success) +{ + const char *bin_name; + size_t bin_name_len; + + if (success) { + bin_name = SUCCESS_BIN_NAME; + bin_name_len = sizeof(SUCCESS_BIN_NAME) - 1; + } + else { + bin_name = FAILURE_BIN_NAME; + bin_name_len = sizeof(FAILURE_BIN_NAME) - 1; + } + + size_t msg_sz = sizeof(as_msg) + sizeof(as_msg_op) + bin_name_len + val_sz; + + uint8_t *buf; + + cf_buf_builder_reserve(bb_r, (int)msg_sz, &buf); + + as_msg *m = (as_msg *)buf; + + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = AS_PROTO_RESULT_OK; + m->generation = 0; + m->record_ttl = 0; + m->transaction_ttl = 0; + m->n_fields = 0; + m->n_ops = 1; // only the one special bin + + as_msg_swap_header(m); + + as_msg_op *op = (as_msg_op *)m->data; + + op->op = AS_MSG_OP_READ; + op->name_sz = (uint8_t)bin_name_len; + memcpy(op->name, bin_name, op->name_sz); + op->op_sz = 4 + op->name_sz; + op->version = 0; + + as_particle_asval_to_client(val, op); + + as_msg_swap_op(op); +} + + +//========================================================== +// Public API - sending responses to client. +// + +// Make an individual transaction response and send it. +int +as_msg_send_reply(as_file_handle *fd_h, uint32_t result_code, + uint32_t generation, uint32_t void_time, as_msg_op **ops, as_bin **bins, + uint16_t bin_count, as_namespace *ns, uint64_t trid) +{ + uint8_t stack_buf[MSG_STACK_BUFFER_SZ]; + size_t msg_sz = sizeof(stack_buf); + uint8_t *msgp = (uint8_t *)as_msg_make_response_msg(result_code, generation, + void_time, ops, bins, bin_count, ns, (cl_msg *)stack_buf, &msg_sz, + trid); + + int rv = send_reply_buf(fd_h, msgp, msg_sz); + + if (msgp != stack_buf) { + cf_free(msgp); + } + + return rv; +} + +// Send a pre-made response saved in a dyn-buf. +int +as_msg_send_ops_reply(as_file_handle *fd_h, cf_dyn_buf *db) +{ + return send_reply_buf(fd_h, db->buf, db->used_sz); +} + +// Send a blocking "fin" message with default timeout. +bool +as_msg_send_fin(cf_socket *sock, uint32_t result_code) +{ + return as_msg_send_fin_timeout(sock, result_code, CF_SOCKET_TIMEOUT) != 0; +} + +// Send a blocking "fin" message with a specified timeout. 
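+// Returns sizeof(cl_msg) on success and 0 on failure, so callers such as
+// as_msg_send_fin() above can treat the result as a boolean.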
+size_t
+as_msg_send_fin_timeout(cf_socket *sock, uint32_t result_code, int32_t timeout)
+{
+	cl_msg msgp;
+
+	msgp.proto.version = PROTO_VERSION;
+	msgp.proto.type = PROTO_TYPE_AS_MSG;
+	msgp.proto.sz = sizeof(as_msg);
+
+	as_proto_swap(&msgp.proto);
+
+	as_msg *m = &msgp.msg;
+
+	m->header_sz = sizeof(as_msg);
+	m->info1 = 0;
+	m->info2 = 0;
+	m->info3 = AS_MSG_INFO3_LAST;
+	m->unused = 0;
+	m->result_code = result_code;
+	m->generation = 0;
+	m->record_ttl = 0;
+	m->transaction_ttl = 0;
+	m->n_fields = 0;
+	m->n_ops = 0;
+
+	as_msg_swap_header(m);
+
+	if (cf_socket_send_all(sock, (uint8_t*)&msgp, sizeof(msgp), MSG_NOSIGNAL,
+			timeout) < 0) {
+		cf_warning(AS_PROTO, "send error - fd %d %s", CSFD(sock),
+				cf_strerror(errno));
+		return 0;
+	}
+
+	return sizeof(cl_msg);
+}
+
+
+//==========================================================
+// Public API - query "net-IO" responses.
+//
+
+void
+as_netio_init()
+{
+	cf_queue_init(&g_netio_queue, sizeof(as_netio), 64, true);
+	cf_queue_init(&g_netio_slow_queue, sizeof(as_netio), 64, true);
+
+	pthread_t thread;
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	if (pthread_create(&thread, &attrs, run_netio,
+			(void *)&g_netio_queue) != 0) {
+		cf_crash(AS_PROTO, "failed to create netio thread");
+	}
+
+	if (pthread_create(&thread, &attrs, run_netio,
+			(void *)&g_netio_slow_queue) != 0) {
+		cf_crash(AS_PROTO, "failed to create netio slow thread");
+	}
+}
+
+// Based on io object, send buffer to the network, or queue for retry.
+//
+// start_cb: Callback to the module before the real IO is started. Returns:
+//     AS_NETIO_OK: Everything ok, go ahead with IO.
+//     AS_NETIO_ERR: There was an issue - e.g. abort, error, or timeout.
+//
+// finish_cb: Callback to module with status code of the IO call. Returns:
+//     AS_NETIO_OK: Everything ok.
+//     AS_NETIO_CONTINUE: The IO was requeued.
+//     AS_NETIO_ERR: The IO failed.
+//
+// finish_cb should do any needed cleanup, e.g. release its ref to user data.
+//
+// Returns:
+//     AS_NETIO_OK: Everything is fine, both start_cb & finish_cb were called.
+//     AS_NETIO_ERR: Something failed, either in start_cb or during network IO;
+//     finish_cb is called.
+//
+// This function consumes the qtr reference. It calls finish_cb, which releases
+// the ref to qtr. In case of AS_NETIO_CONTINUE this function also consumes bb_r
+// and the ref for fd_h. The background thread is responsible for freeing bb_r
+// and releasing the ref to fd_h.
+int
+as_netio_send(as_netio *io, bool slow, bool blocking)
+{
+	int ret = io->start_cb(io, io->seq);
+
+	if (ret == AS_NETIO_OK) {
+		ret = io->finish_cb(io, netio_send_packet(io->fd_h, io->bb_r,
+				&io->offset, blocking));
+	}
+	else {
+		ret = io->finish_cb(io, ret);
+	}
+
+	// Requeue if needed.
+	switch (ret) {
+	case AS_NETIO_CONTINUE:
+		if (slow) {
+			io->slow = true;
+			cf_queue_push(&g_netio_slow_queue, io);
+		}
+		else {
+			cf_queue_push(&g_netio_queue, io);
+		}
+		break;
+	default:
+		ret = AS_NETIO_OK;
+		break;
+	}
+
+	return ret;
+}
+
+
+//==========================================================
+// Local helpers.
+//
+
+static int
+send_reply_buf(as_file_handle *fd_h, uint8_t *msgp, size_t msg_sz)
+{
+	cf_assert(cf_socket_exists(&fd_h->sock), AS_PROTO, "fd is invalid");
+
+	if (cf_socket_send_all(&fd_h->sock, msgp, msg_sz, MSG_NOSIGNAL,
+			CF_SOCKET_TIMEOUT) < 0) {
+		// Common when a client aborts.
+ cf_debug(AS_PROTO, "protocol write fail: fd %d sz %zu errno %d", + CSFD(&fd_h->sock), msg_sz, errno); + + as_end_of_transaction_force_close(fd_h); + return -1; + } + + as_end_of_transaction_ok(fd_h); + return 0; +} + +static void * +run_netio(void *q_to_wait_on) +{ + cf_queue *q = (cf_queue*)q_to_wait_on; + + while (true) { + as_netio io; + + if (cf_queue_pop(q, &io, CF_QUEUE_FOREVER) != 0) { + cf_crash(AS_PROTO, "failed to pop from IO worker queue."); + } + + if (io.slow) { + usleep(g_config.proto_slow_netio_sleep_ms * 1000); + } + + as_netio_send(&io, true, false); + } + + return NULL; +} + +static int +netio_send_packet(as_file_handle *fd_h, cf_buf_builder *bb_r, uint32_t *offset, + bool blocking) +{ +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + + uint32_t len = bb_r->used_sz; + uint8_t *buf = bb_r->buf; + + as_proto proto; + + proto.version = PROTO_VERSION; + proto.type = PROTO_TYPE_AS_MSG; + proto.sz = len - 8; + as_proto_swap(&proto); + + memcpy(bb_r->buf, &proto, 8); + + uint32_t pos = *offset; + + ASD_QUERY_SENDPACKET_STARTING(nodeid, pos, len); + + int retry = 0; + + cf_detail(AS_PROTO," start at %p %d %d", buf, pos, len); + + while (pos < len) { + int rv = cf_socket_send(&fd_h->sock, buf + pos, len - pos, + MSG_NOSIGNAL); + + if (rv <= 0) { + if (errno != EAGAIN) { + cf_debug(AS_PROTO, "packet send response error returned %d errno %d fd %d", + rv, errno, CSFD(&fd_h->sock)); + return AS_NETIO_IO_ERR; + } + + if (! blocking && (retry > NETIO_MAX_IO_RETRY)) { + *offset = pos; + cf_detail(AS_PROTO," end at %p %d %d", buf, pos, len); + ASD_QUERY_SENDPACKET_CONTINUE(nodeid, pos); + return AS_NETIO_CONTINUE; + } + + retry++; + // bigger packets so try few extra times + usleep(100); + } + else { + pos += rv; + } + } + + ASD_QUERY_SENDPACKET_FINISHED(nodeid); + return AS_NETIO_OK; +} diff --git a/as/src/base/rec_props.c b/as/src/base/rec_props.c new file mode 100644 index 00000000..e26016f7 --- /dev/null +++ b/as/src/base/rec_props.c @@ -0,0 +1,230 @@ +/* + * rec_props.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * A list of record properties. 
+ *
+ */
+
+//==========================================================
+// Includes
+//
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+
+#include "base/rec_props.h"
+
+
+//==========================================================
+// Private "Class Members"
+//
+
+//------------------------------------------------
+// Function Declarations
+//
+
+//------------------------------------------------
+// Data
+//
+
+//------------------------------------------------
+// Constants
+//
+
+
+//==========================================================
+// Typedefs
+//
+
+typedef struct as_rec_prop_field_s {
+	as_rec_props_field_id id;
+	uint32_t value_size;
+	uint8_t value[];
+} __attribute__ ((__packed__)) as_rec_prop_field;
+
+
+//==========================================================
+// Public API
+//
+
+//------------------------------------------------
+// Clear the object.
+//
+void
+as_rec_props_clear(as_rec_props *this)
+{
+	this->p_data = NULL;
+	this->size = 0;
+}
+
+//------------------------------------------------
+// Parse a specific field.
+//
+int
+as_rec_props_get_value(const as_rec_props *this,
+		as_rec_props_field_id id, uint32_t *p_value_size, uint8_t **pp_value)
+{
+	const uint8_t *p_read = this->p_data;
+	const uint8_t *p_end = p_read + this->size - sizeof(as_rec_prop_field);
+
+	while (p_read < p_end) {
+		as_rec_prop_field* p_field = (as_rec_prop_field*)p_read;
+
+		if (p_field->id == id) {
+			if (p_value_size) {
+				*p_value_size = p_field->value_size;
+			}
+
+			if (pp_value) {
+				*pp_value = p_field->value;
+			}
+
+			return 0;
+		}
+
+		p_read += sizeof(as_rec_prop_field) + p_field->value_size;
+	}
+
+	return -1;
+}
+
+//------------------------------------------------
+// Get packed size of field, given value size.
+//
+uint32_t
+as_rec_props_sizeof_field(uint32_t value_size)
+{
+	return sizeof(as_rec_prop_field) + value_size;
+}
+
+//------------------------------------------------
+// Set p_data member to external buffer. (The size
+// member will be used like a write pointer in add
+// methods, so it starts at 0 here.)
+//
+void
+as_rec_props_init(as_rec_props *this, uint8_t *p_data)
+{
+	this->p_data = p_data;
+	this->size = 0;
+}
+
+//------------------------------------------------
+// Allocate memory for data. (The size member will
+// be used like a write pointer in add methods, so
+// it starts at 0 here.)
+//
+void
+as_rec_props_init_malloc(as_rec_props *this, uint32_t malloc_size)
+{
+	this->p_data = cf_malloc(malloc_size);
+	this->size = 0;
+}
+
+//------------------------------------------------
+// Append a field, trusting that:
+// - this->p_data has been allocated big enough
+// - this->size is the size added so far
+//
+void
+as_rec_props_add_field(as_rec_props *this,
+		as_rec_props_field_id id, uint32_t value_size, const uint8_t *p_value)
+{
+	as_rec_prop_field* p_field =
+			(as_rec_prop_field*)(this->p_data + this->size);
+
+	p_field->id = id;
+	p_field->value_size = value_size;
+	memcpy(p_field->value, p_value, value_size);
+
+	this->size += as_rec_props_sizeof_field(value_size);
+}
+
+//------------------------------------------------
+// Same as as_rec_props_add_field(), but where
+// p_value is to be a null-terminated string.
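+// For example (hypothetical values, illustration only): a set name "users"
+// with value_len 5 is stored with value_size 6 and value "users\0", so
+// readers can safely treat the value as a C string.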
+//
+void
+as_rec_props_add_field_null_terminate(as_rec_props *this,
+		as_rec_props_field_id id, uint32_t value_len, const uint8_t *p_value)
+{
+	as_rec_prop_field* p_field =
+			(as_rec_prop_field*)(this->p_data + this->size);
+
+	p_field->id = id;
+	p_field->value_size = value_len + 1;
+	memcpy(p_field->value, p_value, value_len);
+	p_field->value[value_len] = 0;
+
+	this->size += as_rec_props_sizeof_field(p_field->value_size);
+}
+
+//------------------------------------------------
+// Returns size required for as_rec_props p_data
+// buffer for specified fields.
+//
+size_t
+as_rec_props_size_all(const uint8_t *set_name, size_t set_name_len,
+		const uint8_t *key, size_t key_size)
+{
+	size_t rec_props_data_size = 0;
+
+	if (set_name) {
+		rec_props_data_size += as_rec_props_sizeof_field(set_name_len + 1);
+	}
+
+	if (key) {
+		rec_props_data_size += as_rec_props_sizeof_field(key_size);
+	}
+
+	return rec_props_data_size;
+}
+
+//------------------------------------------------
+// Add all specified fields, trusting that:
+// - this->p_data has been allocated big enough
+//
+void
+as_rec_props_fill_all(as_rec_props *this, uint8_t *p_data,
+		const uint8_t *set_name, size_t set_name_len, const uint8_t *key,
+		size_t key_size)
+{
+	as_rec_props_init(this, p_data);
+
+	if (set_name) {
+		as_rec_props_add_field_null_terminate(this, CL_REC_PROPS_FIELD_SET_NAME,
+				set_name_len, set_name);
+	}
+
+	if (key) {
+		as_rec_props_add_field(this, CL_REC_PROPS_FIELD_KEY, key_size, key);
+	}
+}
+
+
+//==========================================================
+// Private Functions
+//
diff --git a/as/src/base/record.c b/as/src/base/record.c
new file mode 100644
index 00000000..4366bfad
--- /dev/null
+++ b/as/src/base/record.c
@@ -0,0 +1,958 @@
+/*
+ * record.c
+ *
+ * Copyright (C) 2012-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_byte_order.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "arenax.h"
+#include "dynbuf.h"
+#include "fault.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/rec_props.h"
+#include "base/secondary_index.h"
+#include "base/truncate.h"
+#include "base/xdr_serverside.h"
+#include "storage/storage.h"
+#include "transaction/rw_utils.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+#define STACK_PARTICLES_SIZE (1024 * 1024)
+
+
+//==========================================================
+// Forward declarations.
+// + +void record_replace_failed(as_remote_record *rr, as_index_ref* r_ref, as_storage_rd* rd, bool is_create); + +int record_apply_dim_single_bin(as_remote_record *rr, as_storage_rd *rd, bool *is_delete); +int record_apply_dim(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex, bool *is_delete); +int record_apply_ssd_single_bin(as_remote_record *rr, as_storage_rd *rd, bool *is_delete); +int record_apply_ssd(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex, bool *is_delete); + +void update_index_metadata(as_remote_record *rr, index_metadata *old, as_record *r); +void unwind_index_metadata(const index_metadata *old, as_record *r); +void unwind_dim_single_bin(as_bin* old_bin, as_bin* new_bin); + +int unpickle_bins(as_remote_record *rr, as_storage_rd *rd, cf_ll_buf *particles_llb); + +void xdr_write_replica(as_remote_record *rr, bool is_delete, uint32_t set_id); + + +//========================================================== +// Inlines & macros. +// + +static inline int +resolve_generation_direct(uint16_t left, uint16_t right) +{ + return left == right ? 0 : (right > left ? 1 : -1); +} + +static inline int +resolve_generation(uint16_t left, uint16_t right) +{ + return left == right ? 0 : (as_gen_less_than(left, right) ? 1 : -1); +} + +// Assumes remote generation is not 0. (Local may be 0 if creating record.) +static inline bool +next_generation(uint16_t local, uint16_t remote, as_namespace* ns) +{ + local = plain_generation(local, ns); + remote = plain_generation(remote, ns); + + return local == 0xFFFF ? remote == 1 : remote - local == 1; +} + + +//========================================================== +// Public API - record lock lifecycle. +// + +// Returns: +// 1 - created new record +// 0 - found existing record +// -1 - failure - could not allocate arena stage +int +as_record_get_create(as_index_tree *tree, cf_digest *keyd, as_index_ref *r_ref, + as_namespace *ns) +{ + int rv; + + while ((rv = as_index_get_insert_vlock(tree, keyd, r_ref)) == -2) { + // rv = -2 - found "half created" or deleted record, wait for other + // thread to finish, and try again. + usleep(50); + } + + if (rv == 1) { + cf_atomic64_incr(&ns->n_objects); + } + + return rv; +} + + +// Returns: +// 0 - found +// -1 - not found +int +as_record_get(as_index_tree *tree, cf_digest *keyd, as_index_ref *r_ref) +{ + return as_index_get_vlock(tree, keyd, r_ref); +} + + +// Done with record - unlock, release, and if ref-count hits 0, destroy record +// and free arena element. +void +as_record_done(as_index_ref *r_ref, as_namespace *ns) +{ + if (! r_ref->skip_lock) { + cf_mutex_unlock(r_ref->olock); + } + + int rc = as_index_release(r_ref->r); + + if (rc > 0) { + return; + } + + cf_assert(rc == 0, AS_RECORD, "index ref-count %d", rc); + + as_record_destroy(r_ref->r, ns); + cf_arenax_free(ns->arena, r_ref->r_h); +} + + +//========================================================== +// Public API - record lifecycle utilities. +// + +// Returns: +// 0 - found +// -1 - not found +int +as_record_exists(as_index_tree *tree, cf_digest *keyd) +{ + return as_index_exists(tree, keyd); +} + + +// TODO - inline this, if/when we unravel header files. +bool +as_record_is_expired(const as_record *r) +{ + return r->void_time != 0 && r->void_time < as_record_void_time_get(); +} + + +// Called when writes encounter a "doomed" record, to delete the doomed record +// and create a new one in place without giving up the record lock. 
+void +as_record_rescue(as_index_ref *r_ref, as_namespace *ns) +{ + record_delete_adjust_sindex(r_ref->r, ns); + as_record_destroy(r_ref->r, ns); + as_index_clear_record_info(r_ref->r); + cf_atomic64_incr(&ns->n_objects); +} + + +// Called only after last reference is released. Called by as_record_done(), +// also given to index trees to be called when tree releases record reference. +void +as_record_destroy(as_record *r, as_namespace *ns) +{ + if (ns->storage_data_in_memory) { + // Note - rd is a limited container here - not calling + // as_storage_record_create(), _open(), _close(). + as_storage_rd rd; + + rd.r = r; + rd.ns = ns; + as_storage_rd_load_n_bins(&rd); + as_storage_rd_load_bins(&rd, NULL); + + as_storage_record_drop_from_mem_stats(&rd); + + as_record_destroy_bins(&rd); + + if (! ns->single_bin) { + as_record_free_bin_space(r); + + if (r->dim) { + cf_free(r->dim); // frees the key + } + } + } + + as_record_drop_stats(r, ns); + + // Dereference record's storage used-size. + as_storage_record_destroy(ns, r); + + return; +} + + +// Called only if data-in-memory, and not single-bin. +void +as_record_free_bin_space(as_record *r) +{ + as_bin_space *bin_space = as_index_get_bin_space(r); + + if (bin_space) { + cf_free((void*)bin_space); + as_index_set_bin_space(r, NULL); + } +} + + +// Destroy all particles in all bins. +void +as_record_destroy_bins(as_storage_rd *rd) +{ + as_record_destroy_bins_from(rd, 0); +} + + +// Destroy particles in specified bins. +void +as_record_destroy_bins_from(as_storage_rd *rd, uint16_t from) +{ + for (uint16_t i = from; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + return; // no more used bins - there are never unused bin gaps + } + + as_bin_particle_destroy(b, rd->ns->storage_data_in_memory); + as_bin_set_empty(b); + } +} + + +// Called only for data-in-memory multi-bin, with no key currently stored. +// Note - have to modify if/when other metadata joins key in as_rec_space. +void +as_record_allocate_key(as_record *r, const uint8_t *key, uint32_t key_size) +{ + as_rec_space *rec_space = (as_rec_space *) + cf_malloc_ns(sizeof(as_rec_space) + key_size); + + rec_space->bin_space = (as_bin_space *)r->dim; + rec_space->key_size = key_size; + memcpy((void*)rec_space->key, (const void*)key, key_size); + + r->dim = (void*)rec_space; +} + + +// Called only for data-in-memory multi-bin, with a key currently stored. +// Note - have to modify if/when other metadata joins key in as_rec_space. +void +as_record_remove_key(as_record *r) +{ + as_bin_space *p_bin_space = ((as_rec_space *)r->dim)->bin_space; + + cf_free(r->dim); + r->dim = (void *)p_bin_space; +} + + +//========================================================== +// Public API - pickled record utilities. +// + +// Flatten record's bins into "pickle" format for fabric. +uint8_t * +as_record_pickle(as_storage_rd *rd, size_t *len_r) +{ + as_namespace *ns = rd->ns; + + uint32_t sz = 2; // always 2 bytes for number of bins + uint16_t n_bins_in_use; + + for (n_bins_in_use = 0; n_bins_in_use < rd->n_bins; n_bins_in_use++) { + as_bin *b = &rd->bins[n_bins_in_use]; + + if (! as_bin_inuse(b)) { + break; + } + + sz += 1; // for bin name length + sz += ns->single_bin ? 
+ 0 : strlen(as_bin_get_name_from_id(ns, b->id)); // for bin name + sz += 1; // was for version - currently not used + + sz += as_bin_particle_pickled_size(b); + } + + uint8_t *pickle = cf_malloc(sz); + uint8_t *buf = pickle; + + (*(uint16_t *)buf) = cf_swap_to_be16(n_bins_in_use); // number of bins + buf += 2; + + for (uint16_t i = 0; i < n_bins_in_use; i++) { + as_bin *b = &rd->bins[i]; + + // Copy bin name, skipping a byte for name length. + uint8_t name_len = (uint8_t)as_bin_memcpy_name(ns, buf + 1, b); + + *buf++ = name_len; // fill in bin name length + buf += name_len; // skip past bin name + *buf++ = 0; // was version - currently not used + + buf += as_bin_particle_to_pickled(b, buf); + } + + *len_r = sz; + + return pickle; +} + + +// If remote record is better than local record, replace local with remote. +int +as_record_replace_if_better(as_remote_record *rr, bool is_repl_write, + bool skip_sindex, bool do_xdr_write) +{ + as_namespace *ns = rr->rsv->ns; + + if (! as_storage_has_space(ns)) { + cf_warning(AS_RECORD, "{%s} record replace: drives full", ns->name); + return AS_PROTO_RESULT_FAIL_OUT_OF_SPACE; + } + + CF_ALLOC_SET_NS_ARENA(ns); + + as_index_tree *tree = rr->rsv->tree; + + as_index_ref r_ref; + r_ref.skip_lock = false; + + int rv = as_record_get_create(tree, rr->keyd, &r_ref, ns); + + if (rv < 0) { + return AS_PROTO_RESULT_FAIL_OUT_OF_SPACE; + } + + bool is_create = rv == 1; + as_index *r = r_ref.r; + + int result; + + conflict_resolution_pol policy = ns->conflict_resolution_policy; + + if (is_repl_write) { + bool from_replica; + + if ((result = as_partition_check_source(ns, rr->rsv->p, rr->src, + &from_replica)) != AS_PROTO_RESULT_OK) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return result; + } + + repl_write_init_repl_state(rr, from_replica); + policy = repl_write_conflict_resolution_policy(ns); + } + + if (! is_create && record_replace_check(r, ns) < 0) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return AS_PROTO_RESULT_FAIL_FORBIDDEN; + } + + // If local record is better, no-op or fail. + if (! is_create && (result = as_record_resolve_conflict(policy, + r->generation, r->last_update_time, (uint16_t)rr->generation, + rr->last_update_time)) <= 0) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return result == 0 ? + AS_PROTO_RESULT_FAIL_RECORD_EXISTS : + AS_PROTO_RESULT_FAIL_GENERATION; + } + // else - remote winner - apply it. + + // If creating record, write set-ID into index. + if (is_create) { + if (rr->set_name && (result = as_index_set_set_w_len(r, ns, + rr->set_name, rr->set_name_len, false)) < 0) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return -result; + } + + r->last_update_time = rr->last_update_time; + + // Don't write record if it would be truncated. + if (as_truncate_record_is_truncated(r, ns)) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return AS_PROTO_RESULT_OK; + } + } + // else - not bothering to check that sets match. + + as_storage_rd rd; + + if (is_create) { + as_storage_record_create(ns, r, &rd); + } + else { + as_storage_record_open(ns, r, &rd); + } + + // Assemble rec-props. + size_t rec_props_data_size = as_rec_props_size_all( + (const uint8_t *)rr->set_name, rr->set_name_len, rr->key, + rr->key_size); + uint8_t rec_props_data[rec_props_data_size]; + + as_rec_props_fill_all(&rd.rec_props, rec_props_data, + (const uint8_t *)rr->set_name, rr->set_name_len, rr->key, + rr->key_size); + + // Split according to configuration to replace local record. 
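+	// A sketch of the dispatch below (helpers defined later in this file):
+	//
+	//                     single-bin                    multi-bin
+	//   data-in-memory    record_apply_dim_single_bin   record_apply_dim
+	//   ssd               record_apply_ssd_single_bin   record_apply_ssd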
+ bool is_delete = false; + + if (ns->storage_data_in_memory) { + if (ns->single_bin) { + result = record_apply_dim_single_bin(rr, &rd, &is_delete); + } + else { + result = record_apply_dim(rr, &rd, skip_sindex, &is_delete); + } + } + else { + if (ns->single_bin) { + result = record_apply_ssd_single_bin(rr, &rd, &is_delete); + } + else { + result = record_apply_ssd(rr, &rd, skip_sindex, &is_delete); + } + } + + if (result != 0) { + record_replace_failed(rr, &r_ref, &rd, is_create); + return result; + } + + uint16_t set_id = as_index_get_set_id(r); // save for XDR write + + record_replaced(r, rr); + + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + + if (do_xdr_write) { + xdr_write_replica(rr, is_delete, set_id); + } + + return AS_PROTO_RESULT_OK; +} + + +//========================================================== +// Public API - conflict resolution. +// + +// Returns -1 if left wins, 1 if right wins, and 0 for tie. +int +as_record_resolve_conflict(conflict_resolution_pol policy, uint16_t left_gen, + uint64_t left_lut, uint16_t right_gen, uint64_t right_lut) +{ + int result = 0; + + switch (policy) { + case AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION: + // Doesn't use resolve_generation() - direct comparison gives much + // better odds of picking the record with more history after a split + // brain where one side starts the record from scratch. + result = resolve_generation_direct(left_gen, right_gen); + if (result == 0) { + result = resolve_last_update_time(left_lut, right_lut); + } + break; + case AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME: + result = resolve_last_update_time(left_lut, right_lut); + if (result == 0) { + result = resolve_generation(left_gen, right_gen); + } + break; + case AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_CP: + result = record_resolve_conflict_cp(left_gen, left_lut, right_gen, + right_lut); + break; + default: + cf_crash(AS_RECORD, "invalid conflict resolution policy"); + break; + } + + return result; +} + + +//========================================================== +// Local helpers. +// + +void +record_replace_failed(as_remote_record *rr, as_index_ref* r_ref, + as_storage_rd* rd, bool is_create) +{ + if (is_create) { + as_index_delete(rr->rsv->tree, rr->keyd); + } + + if (rd) { + as_storage_record_close(rd); + } + + as_record_done(r_ref, rr->rsv->ns); +} + + +// TODO - as_storage_record_get_n_bytes_memory() could check bins in use. +int +record_apply_dim_single_bin(as_remote_record *rr, as_storage_rd *rd, + bool *is_delete) +{ + as_namespace* ns = rr->rsv->ns; + as_record* r = rd->r; + + rd->n_bins = 1; + + // Set rd->bins! + as_storage_rd_load_bins(rd, NULL); + + // For memory accounting, note current usage. + uint64_t memory_bytes = 0; + + if (as_bin_inuse(rd->bins)) { + memory_bytes = as_storage_record_get_n_bytes_memory(rd); + } + + uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->record_buf); + + if (n_new_bins > 1) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // Keep old bin intact for unwinding, clear record bin for incoming. + as_bin old_bin; + + as_single_bin_copy(&old_bin, rd->bins); + as_bin_set_empty(rd->bins); + + int result; + + // Fill the new bins and particles. 
+	if (n_new_bins == 1 &&
+			(result = unpickle_bins(rr, rd, NULL)) != 0) {
+		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name);
+		unwind_dim_single_bin(&old_bin, rd->bins);
+		return result;
+	}
+
+	// Apply changes to metadata in as_index needed for writing.
+	index_metadata old_metadata;
+
+	update_index_metadata(rr, &old_metadata, r);
+
+	// Write the record to storage.
+	if ((result = as_record_write_from_pickle(rd)) < 0) {
+		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
+		unwind_index_metadata(&old_metadata, r);
+		unwind_dim_single_bin(&old_bin, rd->bins);
+		return -result;
+	}
+
+	// Cleanup - destroy old bin, can't unwind after.
+	as_bin_particle_destroy(&old_bin, true);
+
+	as_storage_record_adjust_mem_stats(rd, memory_bytes);
+	*is_delete = n_new_bins == 0;
+
+	return AS_PROTO_RESULT_OK;
+}
+
+
+int
+record_apply_dim(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex,
+		bool *is_delete)
+{
+	as_namespace* ns = rr->rsv->ns;
+	as_record* r = rd->r;
+
+	// Set rd->n_bins!
+	as_storage_rd_load_n_bins(rd);
+
+	// Set rd->bins!
+	as_storage_rd_load_bins(rd, NULL);
+
+	// For memory accounting, note current usage.
+	uint64_t memory_bytes = as_storage_record_get_n_bytes_memory(rd);
+
+	// Keep old bins intact for sindex adjustment and unwinding.
+	uint16_t n_old_bins = rd->n_bins;
+	as_bin* old_bins = rd->bins;
+
+	uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->record_buf);
+	as_bin new_bins[n_new_bins];
+
+	memset(new_bins, 0, sizeof(new_bins));
+	rd->n_bins = n_new_bins;
+	rd->bins = new_bins;
+
+	// Fill the new bins and particles.
+	int result = unpickle_bins(rr, rd, NULL);
+
+	if (result != 0) {
+		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name);
+		destroy_stack_bins(new_bins, n_new_bins);
+		return result;
+	}
+
+	// Apply changes to metadata in as_index needed for writing.
+	index_metadata old_metadata;
+
+	update_index_metadata(rr, &old_metadata, r);
+
+	// Write the record to storage.
+	if ((result = as_record_write_from_pickle(rd)) < 0) {
+		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
+		unwind_index_metadata(&old_metadata, r);
+		destroy_stack_bins(new_bins, n_new_bins);
+		return -result;
+	}
+
+	// Success - adjust sindex, looking at old and new bins.
+	if (! (skip_sindex &&
+			next_generation(r->generation, (uint16_t)rr->generation, ns)) &&
+					record_has_sindex(r, ns)) {
+		write_sindex_update(ns, as_index_get_set_name(r, ns), rr->keyd,
+				old_bins, n_old_bins, new_bins, n_new_bins);
+	}
+
+	// Cleanup - destroy relevant bins, can't unwind after.
+	destroy_stack_bins(old_bins, n_old_bins);
+
+	// Fill out new_bin_space.
+	as_bin_space* new_bin_space = NULL;
+
+	if (n_new_bins != 0) {
+		new_bin_space = (as_bin_space*)
+				cf_malloc_ns(sizeof(as_bin_space) + sizeof(new_bins));
+
+		new_bin_space->n_bins = rd->n_bins;
+		memcpy((void*)new_bin_space->bins, new_bins, sizeof(new_bins));
+	}
+
+	// Swizzle the index element's as_bin_space pointer.
+	as_bin_space* old_bin_space = as_index_get_bin_space(r);
+
+	if (old_bin_space) {
+		cf_free(old_bin_space);
+	}
+
+	as_index_set_bin_space(r, new_bin_space);
+
+	// Accommodate a new stored key - wasn't needed for pickling and writing.
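+	// (Only if a key arrived with the remote record and none is stored yet.)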
+	if (r->key_stored == 0 && rd->key) {
+		as_record_allocate_key(r, rd->key, rd->key_size);
+		r->key_stored = 1;
+	}
+
+	as_storage_record_adjust_mem_stats(rd, memory_bytes);
+	*is_delete = n_new_bins == 0;
+
+	return AS_PROTO_RESULT_OK;
+}
+
+
+int
+record_apply_ssd_single_bin(as_remote_record *rr, as_storage_rd *rd,
+		bool *is_delete)
+{
+	as_namespace* ns = rr->rsv->ns;
+	as_record* r = rd->r;
+
+	uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->record_buf);
+
+	if (n_new_bins > 1) {
+		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins);
+		return AS_PROTO_RESULT_FAIL_UNKNOWN;
+	}
+
+	as_bin stack_bin = { { 0 } };
+
+	rd->n_bins = 1;
+	rd->bins = &stack_bin;
+
+	// Fill the new bin and particle.
+	cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE);
+
+	int result;
+
+	if (n_new_bins == 1 &&
+			(result = unpickle_bins(rr, rd, &particles_llb)) != 0) {
+		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name);
+		cf_ll_buf_free(&particles_llb);
+		return result;
+	}
+
+	// Apply changes to metadata in as_index needed for writing.
+	index_metadata old_metadata;
+
+	update_index_metadata(rr, &old_metadata, r);
+
+	// Write the record to storage.
+	if ((result = as_record_write_from_pickle(rd)) < 0) {
+		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
+		unwind_index_metadata(&old_metadata, r);
+		cf_ll_buf_free(&particles_llb);
+		return -result;
+	}
+
+	// Accommodate a new stored key - wasn't needed for writing.
+	if (r->key_stored == 0 && rr->key) {
+		r->key_stored = 1;
+	}
+
+	*is_delete = n_new_bins == 0;
+
+	return AS_PROTO_RESULT_OK;
+}
+
+
+int
+record_apply_ssd(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex,
+		bool *is_delete)
+{
+	as_namespace* ns = rr->rsv->ns;
+	as_record* r = rd->r;
+	bool has_sindex = ! (skip_sindex &&
+			next_generation(r->generation, (uint16_t)rr->generation, ns)) &&
+					record_has_sindex(r, ns);
+
+	uint16_t n_old_bins = 0;
+	int result;
+
+	if (has_sindex) {
+		// Set rd->n_bins!
+		if ((result = as_storage_rd_load_n_bins(rd)) < 0) {
+			cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load n-bins ", ns->name);
+			return -result;
+		}
+
+		n_old_bins = rd->n_bins;
+	}
+
+	as_bin old_bins[n_old_bins];
+
+	if (has_sindex) {
+		// Set rd->bins!
+		if ((result = as_storage_rd_load_bins(rd, old_bins)) < 0) {
+			cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load bins ", ns->name);
+			return -result;
+		}
+	}
+
+	// Stack space for resulting record's bins.
+	uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->record_buf);
+	as_bin new_bins[n_new_bins];
+
+	memset(new_bins, 0, sizeof(new_bins));
+	rd->n_bins = n_new_bins;
+	rd->bins = new_bins;
+
+	// Fill the new bins and particles.
+	cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE);
+
+	if ((result = unpickle_bins(rr, rd, &particles_llb)) != 0) {
+		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name);
+		cf_ll_buf_free(&particles_llb);
+		return result;
+	}
+
+	// Apply changes to metadata in as_index needed for writing.
+	index_metadata old_metadata;
+
+	update_index_metadata(rr, &old_metadata, r);
+
+	// Write the record to storage.
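+	// (A negative result is a negated AS_PROTO error code - hence the
+	// -result returned below.)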
+ if ((result = as_record_write_from_pickle(rd)) < 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name); + unwind_index_metadata(&old_metadata, r); + cf_ll_buf_free(&particles_llb); + return -result; + } + + // Success - adjust sindex, looking at old and new bins. + if (has_sindex) { + write_sindex_update(ns, as_index_get_set_name(r, ns), rr->keyd, + old_bins, n_old_bins, new_bins, n_new_bins); + } + + // Accommodate a new stored key - wasn't needed for writing. + if (r->key_stored == 0 && rr->key) { + r->key_stored = 1; + } + + *is_delete = n_new_bins == 0; + + return 0; +} + + +void +update_index_metadata(as_remote_record *rr, index_metadata *old, as_record *r) +{ + old->void_time = r->void_time; + old->last_update_time = r->last_update_time; + old->generation = r->generation; + + r->generation = (uint16_t)rr->generation; + r->void_time = truncate_void_time(rr->rsv->ns, rr->void_time); + r->last_update_time = rr->last_update_time; +} + + +void +unwind_index_metadata(const index_metadata *old, as_record *r) +{ + r->void_time = old->void_time; + r->last_update_time = old->last_update_time; + r->generation = old->generation; +} + + +void +unwind_dim_single_bin(as_bin* old_bin, as_bin* new_bin) +{ + if (as_bin_inuse(new_bin)) { + as_bin_particle_destroy(new_bin, true); + } + + as_single_bin_copy(new_bin, old_bin); +} + + +int +unpickle_bins(as_remote_record *rr, as_storage_rd *rd, cf_ll_buf *particles_llb) +{ + as_namespace *ns = rd->ns; + + const uint8_t *end = rr->record_buf + rr->record_buf_sz; + const uint8_t *buf = rr->record_buf + 2; + + for (uint16_t i = 0; i < rd->n_bins; i++) { + if (buf >= end) { + cf_warning(AS_RECORD, "incomplete pickled record"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + uint8_t name_sz = *buf++; + const uint8_t *name = buf; + + buf += name_sz; + buf++; // skipped byte was version + + if (buf > end) { + cf_warning(AS_RECORD, "incomplete pickled record"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + int result; + as_bin *b = as_bin_create_from_buf(rd, name, name_sz, &result); + + if (! b) { + return result; + } + + if (ns->storage_data_in_memory) { + if ((result = as_bin_particle_alloc_from_pickled(b, + &buf, end)) < 0) { + return -result; + } + } + else { + if ((result = as_bin_particle_stack_from_pickled(b, particles_llb, + &buf, end)) < 0) { + return -result; + } + } + } + + if (buf != end) { + cf_warning(AS_RECORD, "extra bytes on pickled record"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return AS_PROTO_RESULT_OK; +} + + +void +xdr_write_replica(as_remote_record *rr, bool is_delete, uint32_t set_id) +{ + uint16_t generation = (uint16_t)rr->generation; + xdr_op_type op_type = XDR_OP_TYPE_WRITE; + + // Note - in this code path, only durable deletes get here. + if (is_delete) { + generation = 0; + op_type = XDR_OP_TYPE_DURABLE_DELETE; + } + + // Don't send an XDR delete if it's disallowed. + if (is_delete && ! is_xdr_delete_shipping_enabled()) { + // TODO - should we also not ship if there was no record here before? + return; + } + + xdr_write(rr->rsv->ns, rr->keyd, generation, rr->src, op_type, set_id, + NULL); +} diff --git a/as/src/base/record_ce.c b/as/src/base/record_ce.c new file mode 100644 index 00000000..f0f1f5f1 --- /dev/null +++ b/as/src/base/record_ce.c @@ -0,0 +1,136 @@ +/* + * record_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "storage/storage.h"
+
+
+//==========================================================
+// Public API.
+//
+
+uint32_t
+clock_skew_stop_writes_sec()
+{
+	return 0;
+}
+
+
+void
+handle_clock_skew(as_namespace* ns, uint64_t skew_ms)
+{
+}
+
+
+uint16_t
+plain_generation(uint16_t regime_generation, const as_namespace* ns)
+{
+	return regime_generation;
+}
+
+
+void
+as_record_set_lut(as_record *r, uint32_t regime, uint64_t now_ms,
+		const as_namespace* ns)
+{
+	// Note - last-update-time is not allowed to go backwards!
+	if (r->last_update_time < now_ms) {
+		r->last_update_time = now_ms;
+	}
+}
+
+
+void
+as_record_increment_generation(as_record *r, const as_namespace* ns)
+{
+	// The generation might wrap - 0 is reserved as "uninitialized".
+	if (++r->generation == 0) {
+		r->generation = 1;
+	}
+}
+
+
+bool
+as_record_is_live(const as_record* r)
+{
+	return true;
+}
+
+
+int
+as_record_get_live(as_index_tree* tree, cf_digest* keyd, as_index_ref* r_ref,
+		as_namespace* ns)
+{
+	return as_index_get_vlock(tree, keyd, r_ref);
+}
+
+
+int
+as_record_exists_live(as_index_tree* tree, cf_digest* keyd, as_namespace* ns)
+{
+	return as_record_exists(tree, keyd);
+}
+
+
+void
+as_record_drop_stats(as_record* r, as_namespace* ns)
+{
+	as_namespace_release_set_id(ns, as_index_get_set_id(r));
+
+	cf_atomic64_decr(&ns->n_objects);
+}
+
+
+int
+as_record_write_from_pickle(as_storage_rd* rd)
+{
+	cf_assert(as_bin_inuse_has(rd), AS_RECORD, "unexpected binless pickle");
+
+	return as_storage_record_write(rd);
+}
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+//
+
+int
+record_resolve_conflict_cp(uint16_t left_gen, uint64_t left_lut,
+		uint16_t right_gen, uint64_t right_lut)
+{
+	cf_crash(AS_RECORD, "CE code called record_resolve_conflict_cp()");
+
+	return 0;
+}
diff --git a/as/src/base/scan.c b/as/src/base/scan.c
new file mode 100644
index 00000000..0602d7b3
--- /dev/null
+++ b/as/src/base/scan.c
@@ -0,0 +1,1409 @@
+/*
+ * scan.c
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==============================================================================
+// Includes.
+//
+
+#include "base/scan.h"
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "aerospike/as_list.h"
+#include "aerospike/as_module.h"
+#include "aerospike/as_string.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_ll.h"
+#include "citrusleaf/cf_vector.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+#include "socket.h"
+
+#include "base/aggr.h"
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/job_manager.h"
+#include "base/monitor.h"
+#include "base/predexp.h"
+#include "base/proto.h"
+#include "base/secondary_index.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction.h"
+#include "base/udf_memtracker.h"
+#include "fabric/exchange.h"
+#include "fabric/partition.h"
+#include "transaction/udf.h"
+
+
+
+//==============================================================================
+// Typedefs and forward declarations.
+//
+
+//----------------------------------------------------------
+// Scan types.
+//
+
+typedef enum {
+	SCAN_TYPE_BASIC = 0,
+	SCAN_TYPE_AGGR = 1,
+	SCAN_TYPE_UDF_BG = 2,
+
+	SCAN_TYPE_UNKNOWN = -1
+} scan_type;
+
+static inline const char*
+scan_type_str(scan_type type)
+{
+	switch (type) {
+	case SCAN_TYPE_BASIC:
+		return "basic";
+	case SCAN_TYPE_AGGR:
+		return "aggregation";
+	case SCAN_TYPE_UDF_BG:
+		return "background-udf";
+	default:
+		return "?";
+	}
+}
+
+//----------------------------------------------------------
+// scan_job - derived classes' public methods.
+//
+
+int basic_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id);
+int aggr_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id);
+int udf_bg_scan_job_start(as_transaction* tr, as_namespace* ns,
+		uint16_t set_id);
+
+//----------------------------------------------------------
+// Non-class-specific utilities.
+//
+
+typedef struct scan_options_s {
+	int priority;
+	bool fail_on_cluster_change;
+	uint32_t sample_pct;
+} scan_options;
+
+int get_scan_set_id(as_transaction* tr, as_namespace* ns, uint16_t* p_set_id);
+scan_type get_scan_type(as_transaction* tr);
+bool get_scan_options(as_transaction* tr, scan_options* options);
+bool get_scan_socket_timeout(as_transaction* tr, uint32_t* timeout);
+bool get_scan_predexp(as_transaction* tr, predexp_eval_t** p_predexp);
+size_t send_blocking_response_chunk(as_file_handle* fd_h, uint8_t* buf,
+		size_t size, int32_t timeout);
+static inline bool excluded_set(as_index* r, uint16_t set_id);
+
+
+
+//==============================================================================
+// Constants.
+//
+
+const size_t INIT_BUF_BUILDER_SIZE = 1024 * 1024 * 2;
+const size_t SCAN_CHUNK_LIMIT = 1024 * 1024;
+
+
+
+//==============================================================================
+// Globals.
+// + +static as_job_manager g_scan_manager; + + + +//============================================================================== +// Public API. +// + +void +as_scan_init() +{ + as_job_manager_init(&g_scan_manager, g_config.scan_max_active, + g_config.scan_max_done, g_config.scan_threads); +} + +int +as_scan(as_transaction* tr, as_namespace* ns) +{ + int result; + uint16_t set_id = INVALID_SET_ID; + + if ((result = get_scan_set_id(tr, ns, &set_id)) != AS_PROTO_RESULT_OK) { + return result; + } + + switch (get_scan_type(tr)) { + case SCAN_TYPE_BASIC: + result = basic_scan_job_start(tr, ns, set_id); + break; + case SCAN_TYPE_AGGR: + result = aggr_scan_job_start(tr, ns, set_id); + break; + case SCAN_TYPE_UDF_BG: + result = udf_bg_scan_job_start(tr, ns, set_id); + break; + default: + cf_warning(AS_SCAN, "can't identify scan type"); + result = AS_PROTO_RESULT_FAIL_PARAMETER; + break; + } + + return result; +} + +void +as_scan_limit_active_jobs(uint32_t max_active) +{ + as_job_manager_limit_active_jobs(&g_scan_manager, max_active); +} + +void +as_scan_limit_finished_jobs(uint32_t max_done) +{ + as_job_manager_limit_finished_jobs(&g_scan_manager, max_done); +} + +void +as_scan_resize_thread_pool(uint32_t n_threads) +{ + as_job_manager_resize_thread_pool(&g_scan_manager, n_threads); +} + +int +as_scan_get_active_job_count() +{ + return as_job_manager_get_active_job_count(&g_scan_manager); +} + +int +as_scan_list(char* name, cf_dyn_buf* db) +{ + as_mon_info_cmd(AS_MON_MODULES[SCAN_MOD], NULL, 0, 0, db); + return 0; +} + +as_mon_jobstat* +as_scan_get_jobstat(uint64_t trid) +{ + return as_job_manager_get_job_info(&g_scan_manager, trid); +} + +as_mon_jobstat* +as_scan_get_jobstat_all(int* size) +{ + return as_job_manager_get_info(&g_scan_manager, size); +} + +int +as_scan_abort(uint64_t trid) +{ + return as_job_manager_abort_job(&g_scan_manager, trid) ? 0 : -1; +} + +int +as_scan_abort_all() +{ + return as_job_manager_abort_all_jobs(&g_scan_manager); +} + +int +as_scan_change_job_priority(uint64_t trid, uint32_t priority) +{ + return as_job_manager_change_job_priority(&g_scan_manager, trid, + (int)priority) ? 0 : -1; +} + + +//============================================================================== +// Non-class-specific utilities. +// + +int +get_scan_set_id(as_transaction* tr, as_namespace* ns, uint16_t* p_set_id) +{ + uint16_t set_id = INVALID_SET_ID; + as_msg_field* f = as_transaction_has_set(tr) ? + as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_SET) : NULL; + + if (f && as_msg_field_get_value_sz(f) != 0) { + uint32_t set_name_len = as_msg_field_get_value_sz(f); + char set_name[set_name_len + 1]; + + memcpy(set_name, f->data, set_name_len); + set_name[set_name_len] = '\0'; + set_id = as_namespace_get_set_id(ns, set_name); + + if (set_id == INVALID_SET_ID) { + cf_warning(AS_SCAN, "scan msg from %s has unrecognized set %s", + tr->from.proto_fd_h->client, set_name); + return AS_PROTO_RESULT_FAIL_NOT_FOUND; + } + } + + *p_set_id = set_id; + + return AS_PROTO_RESULT_OK; +} + +scan_type +get_scan_type(as_transaction* tr) +{ + if (! as_transaction_is_udf(tr)) { + return SCAN_TYPE_BASIC; + } + + as_msg_field* udf_op_f = as_msg_field_get(&tr->msgp->msg, + AS_MSG_FIELD_TYPE_UDF_OP); + + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_AGGREGATE) { + return SCAN_TYPE_AGGR; + } + + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_BACKGROUND) { + return SCAN_TYPE_UDF_BG; + } + + return SCAN_TYPE_UNKNOWN; +} + +bool +get_scan_options(as_transaction* tr, scan_options* options) +{ + if (! 
as_transaction_has_scan_options(tr)) { + return true; + } + + as_msg_field* f = as_msg_field_get(&tr->msgp->msg, + AS_MSG_FIELD_TYPE_SCAN_OPTIONS); + + if (as_msg_field_get_value_sz(f) != 2) { + cf_warning(AS_SCAN, "scan msg options field size not 2"); + return false; + } + + options->priority = AS_MSG_FIELD_SCAN_PRIORITY(f->data[0]); + options->fail_on_cluster_change = + (AS_MSG_FIELD_SCAN_FAIL_ON_CLUSTER_CHANGE & f->data[0]) != 0; + options->sample_pct = f->data[1]; + + return true; +} + +bool +get_scan_socket_timeout(as_transaction* tr, uint32_t* timeout) +{ + if (! as_transaction_has_socket_timeout(tr)) { + return true; + } + + as_msg_field* f = as_msg_field_get(&tr->msgp->msg, + AS_MSG_FIELD_TYPE_SOCKET_TIMEOUT); + + if (as_msg_field_get_value_sz(f) != 4) { + cf_warning(AS_SCAN, "scan socket timeout field size not 4"); + return false; + } + + *timeout = cf_swap_from_be32(*(uint32_t*)f->data); + + return true; +} + +bool +get_scan_predexp(as_transaction* tr, predexp_eval_t** p_predexp) +{ + if (! as_transaction_has_predexp(tr)) { + return true; + } + + as_msg_field* f = as_msg_field_get(&tr->msgp->msg, + AS_MSG_FIELD_TYPE_PREDEXP); + + *p_predexp = predexp_build(f); + + return *p_predexp != NULL; +} + +size_t +send_blocking_response_chunk(as_file_handle* fd_h, uint8_t* buf, size_t size, + int32_t timeout) +{ + cf_socket* sock = &fd_h->sock; + as_proto proto; + + proto.version = PROTO_VERSION; + proto.type = PROTO_TYPE_AS_MSG; + proto.sz = size; + as_proto_swap(&proto); + + if (cf_socket_send_all(sock, (uint8_t*)&proto, sizeof(as_proto), + MSG_NOSIGNAL | MSG_MORE, timeout) < 0) { + cf_warning(AS_SCAN, "error sending to %s - fd %d %s", fd_h->client, + CSFD(sock), cf_strerror(errno)); + return 0; + } + + if (cf_socket_send_all(sock, buf, size, MSG_NOSIGNAL, timeout) < 0) { + cf_warning(AS_SCAN, "error sending to %s - fd %d sz %lu %s", + fd_h->client, CSFD(sock), size, cf_strerror(errno)); + return 0; + } + + return sizeof(as_proto) + size; +} + +static inline bool +excluded_set(as_index* r, uint16_t set_id) +{ + return set_id != INVALID_SET_ID && set_id != as_index_get_set_id(r); +} + + + +//============================================================================== +// conn_scan_job derived class implementation - not final class. +// + +//---------------------------------------------------------- +// conn_scan_job typedefs and forward declarations. +// + +typedef struct conn_scan_job_s { + // Base object must be first: + as_job _base; + + // Derived class data: + pthread_mutex_t fd_lock; + as_file_handle* fd_h; + int32_t fd_timeout; + + uint64_t net_io_bytes; +} conn_scan_job; + +void conn_scan_job_own_fd(conn_scan_job* job, as_file_handle* fd_h, uint32_t timeout); +void conn_scan_job_disown_fd(conn_scan_job* job); +void conn_scan_job_finish(conn_scan_job* job); +bool conn_scan_job_send_response(conn_scan_job* job, uint8_t* buf, size_t size); +void conn_scan_job_release_fd(conn_scan_job* job, bool force_close); +void conn_scan_job_info(conn_scan_job* job, as_mon_jobstat* stat); + +//---------------------------------------------------------- +// conn_scan_job API. +// + +void +conn_scan_job_own_fd(conn_scan_job* job, as_file_handle* fd_h, uint32_t timeout) +{ + pthread_mutex_init(&job->fd_lock, NULL); + + job->fd_h = fd_h; + job->fd_h->fh_info |= FH_INFO_DONOT_REAP; + job->fd_timeout = timeout == 0 ? -1 : (int32_t)timeout; + + job->net_io_bytes = 0; +} + +void +conn_scan_job_disown_fd(conn_scan_job* job) +{ + // Just undo conn_scan_job_own_fd(), nothing more. 
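+	// (The fd is not closed here - responsibility for it simply returns to
+	// the caller.)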
+ + job->fd_h->fh_info &= ~FH_INFO_DONOT_REAP; + + pthread_mutex_destroy(&job->fd_lock); +} + +void +conn_scan_job_finish(conn_scan_job* job) +{ + as_job* _job = (as_job*)job; + + if (job->fd_h) { + // TODO - perhaps reflect in monitor if send fails? + size_t size_sent = as_msg_send_fin_timeout(&job->fd_h->sock, + _job->abandoned, job->fd_timeout); + + job->net_io_bytes += size_sent; + conn_scan_job_release_fd(job, size_sent == 0); + } + + pthread_mutex_destroy(&job->fd_lock); +} + +bool +conn_scan_job_send_response(conn_scan_job* job, uint8_t* buf, size_t size) +{ + as_job* _job = (as_job*)job; + + pthread_mutex_lock(&job->fd_lock); + + if (! job->fd_h) { + pthread_mutex_unlock(&job->fd_lock); + // Job already abandoned. + return false; + } + + size_t size_sent = send_blocking_response_chunk(job->fd_h, buf, size, + job->fd_timeout); + + if (size_sent == 0) { + int reason = errno == ETIMEDOUT ? + AS_JOB_FAIL_RESPONSE_TIMEOUT : AS_JOB_FAIL_RESPONSE_ERROR; + + conn_scan_job_release_fd(job, true); + pthread_mutex_unlock(&job->fd_lock); + as_job_manager_abandon_job(_job->mgr, _job, reason); + return false; + } + + job->net_io_bytes += size_sent; + + pthread_mutex_unlock(&job->fd_lock); + return true; +} + +void +conn_scan_job_release_fd(conn_scan_job* job, bool force_close) +{ + job->fd_h->fh_info &= ~FH_INFO_DONOT_REAP; + job->fd_h->last_used = cf_getms(); + as_end_of_transaction(job->fd_h, force_close); + job->fd_h = NULL; +} + +void +conn_scan_job_info(conn_scan_job* job, as_mon_jobstat* stat) +{ + stat->net_io_bytes = job->net_io_bytes; +} + + + +//============================================================================== +// basic_scan_job derived class implementation. +// + +//---------------------------------------------------------- +// basic_scan_job typedefs and forward declarations. +// + +typedef struct basic_scan_job_s { + // Base object must be first: + conn_scan_job _base; + + // Derived class data: + uint64_t cluster_key; + bool fail_on_cluster_change; + bool no_bin_data; + uint32_t sample_pct; + predexp_eval_t* predexp; + cf_vector* bin_names; +} basic_scan_job; + +void basic_scan_job_slice(as_job* _job, as_partition_reservation* rsv); +void basic_scan_job_finish(as_job* _job); +void basic_scan_job_destroy(as_job* _job); +void basic_scan_job_info(as_job* _job, as_mon_jobstat* stat); + +const as_job_vtable basic_scan_job_vtable = { + basic_scan_job_slice, + basic_scan_job_finish, + basic_scan_job_destroy, + basic_scan_job_info +}; + +typedef struct basic_scan_slice_s { + basic_scan_job* job; + cf_buf_builder** bb_r; +} basic_scan_slice; + +void basic_scan_job_reduce_cb(as_index_ref* r_ref, void* udata); +cf_vector* bin_names_from_op(as_msg* m, int* result); + +//---------------------------------------------------------- +// basic_scan_job public API. +// + +int +basic_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id) +{ + basic_scan_job* job = cf_malloc(sizeof(basic_scan_job)); + as_job* _job = (as_job*)job; + + scan_options options = { .sample_pct = 100 }; + uint32_t timeout = CF_SOCKET_TIMEOUT; + predexp_eval_t* predexp = NULL; + + if (! get_scan_options(tr, &options) || + ! get_scan_socket_timeout(tr, &timeout) || + ! 
get_scan_predexp(tr, &predexp)) { + cf_warning(AS_SCAN, "basic scan job failed msg field processing"); + cf_free(job); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + as_job_init(_job, &basic_scan_job_vtable, &g_scan_manager, RSV_WRITE, + as_transaction_trid(tr), ns, set_id, options.priority); + + job->cluster_key = as_exchange_cluster_key(); + job->fail_on_cluster_change = options.fail_on_cluster_change; + job->no_bin_data = (tr->msgp->msg.info1 & AS_MSG_INFO1_GET_NO_BINS) != 0; + job->sample_pct = options.sample_pct; + job->predexp = predexp; + + int result; + + job->bin_names = bin_names_from_op(&tr->msgp->msg, &result); + + if (! job->bin_names && result != AS_PROTO_RESULT_OK) { + as_job_destroy(_job); + return result; + } + + if (job->fail_on_cluster_change && + (cf_atomic_int_get(ns->migrate_tx_partitions_remaining) != 0 || + cf_atomic_int_get(ns->migrate_rx_partitions_remaining) != 0)) { + // TODO - was AS_PROTO_RESULT_FAIL_UNAVAILABLE - ok? + cf_warning(AS_SCAN, "basic scan job not started - migration"); + as_job_destroy(_job); + return AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH; + } + + // Take ownership of socket from transaction. + conn_scan_job_own_fd((conn_scan_job*)job, tr->from.proto_fd_h, timeout); + + cf_info(AS_SCAN, "starting basic scan job %lu {%s:%s} priority %u, sample-pct %u%s%s", + _job->trid, ns->name, as_namespace_get_set_name(ns, set_id), + _job->priority, job->sample_pct, + job->no_bin_data ? ", metadata-only" : "", + job->fail_on_cluster_change ? ", fail-on-cluster-change" : ""); + + if ((result = as_job_manager_start_job(_job->mgr, _job)) != 0) { + cf_warning(AS_SCAN, "basic scan job %lu failed to start (%d)", + _job->trid, result); + conn_scan_job_disown_fd((conn_scan_job*)job); + as_job_destroy(_job); + return result; + } + + return AS_PROTO_RESULT_OK; +} + +//---------------------------------------------------------- +// basic_scan_job mandatory scan_job interface. +// + +void +basic_scan_job_slice(as_job* _job, as_partition_reservation* rsv) +{ + basic_scan_job* job = (basic_scan_job*)_job; + as_index_tree* tree = rsv->tree; + cf_buf_builder* bb = cf_buf_builder_create_size(INIT_BUF_BUILDER_SIZE); + + if (! bb) { + as_job_manager_abandon_job(_job->mgr, _job, + AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + uint64_t slice_start = cf_getms(); + basic_scan_slice slice = { job, &bb }; + + if (job->sample_pct == 100) { + as_index_reduce_live(tree, basic_scan_job_reduce_cb, (void*)&slice); + } + else { + uint64_t sample_count = + ((as_index_tree_size(tree) * job->sample_pct) / 100); + + as_index_reduce_partial_live(tree, sample_count, + basic_scan_job_reduce_cb, (void*)&slice); + } + + if (bb->used_sz != 0) { + conn_scan_job_send_response((conn_scan_job*)job, bb->buf, bb->used_sz); + } + + // TODO - guts don't check buf_builder realloc failures rigorously. 
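+	// (Each slice allocates and frees its own buf-builder - any final
+	// partial chunk was already flushed above.)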
+ cf_buf_builder_free(bb); + + cf_detail(AS_SCAN, "%s:%u basic scan job %lu in thread %lu took %lu ms", + rsv->ns->name, rsv->p->id, _job->trid, pthread_self(), + cf_getms() - slice_start); +} + +void +basic_scan_job_finish(as_job* _job) +{ + conn_scan_job_finish((conn_scan_job*)_job); + + switch (_job->abandoned) { + case 0: + cf_atomic_int_incr(&_job->ns->n_scan_basic_complete); + break; + case AS_JOB_FAIL_USER_ABORT: + cf_atomic_int_incr(&_job->ns->n_scan_basic_abort); + break; + case AS_JOB_FAIL_UNKNOWN: + case AS_JOB_FAIL_CLUSTER_KEY: + case AS_JOB_FAIL_RESPONSE_ERROR: + case AS_JOB_FAIL_RESPONSE_TIMEOUT: + default: + cf_atomic_int_incr(&_job->ns->n_scan_basic_error); + break; + } + + cf_info(AS_SCAN, "finished basic scan job %lu (%d)", _job->trid, + _job->abandoned); +} + +void +basic_scan_job_destroy(as_job* _job) +{ + basic_scan_job* job = (basic_scan_job*)_job; + + if (job->bin_names) { + cf_vector_destroy(job->bin_names); + } + + if (job->predexp) { + predexp_destroy(job->predexp); + } +} + +void +basic_scan_job_info(as_job* _job, as_mon_jobstat* stat) +{ + strcpy(stat->job_type, scan_type_str(SCAN_TYPE_BASIC)); + conn_scan_job_info((conn_scan_job*)_job, stat); +} + +//---------------------------------------------------------- +// basic_scan_job utilities. +// + +void +basic_scan_job_reduce_cb(as_index_ref* r_ref, void* udata) +{ + basic_scan_slice* slice = (basic_scan_slice*)udata; + basic_scan_job* job = slice->job; + as_job* _job = (as_job*)job; + as_namespace* ns = _job->ns; + + if (_job->abandoned != 0) { + as_record_done(r_ref, ns); + return; + } + + if (job->fail_on_cluster_change && + job->cluster_key != as_exchange_cluster_key()) { + as_record_done(r_ref, ns); + as_job_manager_abandon_job(_job->mgr, _job, + AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH); + return; + } + + as_index* r = r_ref->r; + + if (excluded_set(r, _job->set_id) || as_record_is_doomed(r, ns)) { + as_record_done(r_ref, ns); + return; + } + + predexp_args_t predargs = { .ns = ns, .md = r, .vl = NULL, .rd = NULL }; + + if (job->predexp && ! predexp_matches_metadata(job->predexp, &predargs)) { + as_record_done(r_ref, ns); + return; + } + + as_storage_rd rd; + + as_storage_record_open(ns, r, &rd); + + if (job->no_bin_data) { + // TODO - suppose the predexp needs bin values??? + + as_msg_make_response_bufbuilder(slice->bb_r, &rd, true, true, true, + NULL); + } + else { + as_storage_rd_load_n_bins(&rd); // TODO - handle error returned + + as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins]; + + as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned + + predargs.rd = &rd; + + if (job->predexp && ! predexp_matches_record(job->predexp, &predargs)) { + as_storage_record_close(&rd); + as_record_done(r_ref, ns); + return; + } + + as_msg_make_response_bufbuilder(slice->bb_r, &rd, false, true, true, + job->bin_names); + } + + as_storage_record_close(&rd); + as_record_done(r_ref, ns); + + cf_atomic64_incr(&_job->n_records_read); + + cf_buf_builder* bb = *slice->bb_r; + + // If we exceed the proto size limit, send accumulated data back to client + // and reset the buf-builder to start a new proto. + if (bb->used_sz > SCAN_CHUNK_LIMIT) { + if (! 
conn_scan_job_send_response((conn_scan_job*)job, bb->buf, + bb->used_sz)) { + return; + } + + cf_buf_builder_reset(bb); + } +} + +cf_vector* +bin_names_from_op(as_msg* m, int* result) +{ + *result = AS_PROTO_RESULT_OK; + + if (m->n_ops == 0) { + return NULL; + } + + cf_vector* v = cf_vector_create(AS_ID_BIN_SZ, m->n_ops, 0); + + as_msg_op* op = NULL; + int n = 0; + + while ((op = as_msg_op_iterate(m, op, &n)) != NULL) { + if (op->name_sz >= AS_ID_BIN_SZ) { + cf_warning(AS_SCAN, "basic scan job bin name too long"); + cf_vector_destroy(v); + *result = AS_PROTO_RESULT_FAIL_BIN_NAME; + return NULL; + } + + char bin_name[AS_ID_BIN_SZ]; + + memcpy(bin_name, op->name, op->name_sz); + bin_name[op->name_sz] = 0; + cf_vector_append_unique(v, (void*)bin_name); + } + + return v; +} + + + +//============================================================================== +// aggr_scan_job derived class implementation. +// + +//---------------------------------------------------------- +// aggr_scan_job typedefs and forward declarations. +// + +typedef struct aggr_scan_job_s { + // Base object must be first: + conn_scan_job _base; + + // Derived class data: + as_aggr_call aggr_call; +} aggr_scan_job; + +void aggr_scan_job_slice(as_job* _job, as_partition_reservation* rsv); +void aggr_scan_job_finish(as_job* _job); +void aggr_scan_job_destroy(as_job* _job); +void aggr_scan_job_info(as_job* _job, as_mon_jobstat* stat); + +const as_job_vtable aggr_scan_job_vtable = { + aggr_scan_job_slice, + aggr_scan_job_finish, + aggr_scan_job_destroy, + aggr_scan_job_info +}; + +typedef struct aggr_scan_slice_s { + aggr_scan_job* job; + cf_ll* ll; + cf_buf_builder** bb_r; + as_partition_reservation* rsv; +} aggr_scan_slice; + +bool aggr_scan_init(as_aggr_call* call, const as_transaction* tr); +void aggr_scan_job_reduce_cb(as_index_ref* r_ref, void* udata); +bool aggr_scan_add_digest(cf_ll* ll, cf_digest* keyd); +as_partition_reservation* aggr_scan_ptn_reserve(void* udata, as_namespace* ns, + uint32_t pid, as_partition_reservation* rsv); +as_stream_status aggr_scan_ostream_write(void* udata, as_val* val); + +const as_aggr_hooks scan_aggr_hooks = { + .ostream_write = aggr_scan_ostream_write, + .set_error = NULL, + .ptn_reserve = aggr_scan_ptn_reserve, + .ptn_release = NULL, + .pre_check = NULL +}; + +void aggr_scan_add_val_response(aggr_scan_slice* slice, const as_val* val, + bool success); + +//---------------------------------------------------------- +// aggr_scan_job public API. +// + +int +aggr_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id) +{ + aggr_scan_job* job = cf_malloc(sizeof(aggr_scan_job)); + as_job* _job = (as_job*)job; + + scan_options options = { .sample_pct = 100 }; + uint32_t timeout = CF_SOCKET_TIMEOUT; + + if (! get_scan_options(tr, &options) || + ! get_scan_socket_timeout(tr, &timeout)) { + cf_warning(AS_SCAN, "aggregation scan job failed msg field processing"); + cf_free(job); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (as_transaction_has_predexp(tr)) { + cf_warning(AS_SCAN, "aggregation scans do not support predexp filters"); + cf_free(job); + return AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE; + } + + as_job_init(_job, &aggr_scan_job_vtable, &g_scan_manager, RSV_WRITE, + as_transaction_trid(tr), ns, set_id, options.priority); + + if (! aggr_scan_init(&job->aggr_call, tr)) { + cf_warning(AS_SCAN, "aggregation scan job failed call init"); + as_job_destroy(_job); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + // Take ownership of socket from transaction. 
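+	// (From here on, the job streams results directly to the client on this
+	// socket.)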
+ conn_scan_job_own_fd((conn_scan_job*)job, tr->from.proto_fd_h, timeout); + + cf_info(AS_SCAN, "starting aggregation scan job %lu {%s:%s} priority %u", + _job->trid, ns->name, as_namespace_get_set_name(ns, set_id), + _job->priority); + + int result = as_job_manager_start_job(_job->mgr, _job); + + if (result != 0) { + cf_warning(AS_SCAN, "aggregation scan job %lu failed to start (%d)", + _job->trid, result); + conn_scan_job_disown_fd((conn_scan_job*)job); + as_job_destroy(_job); + return result; + } + + return AS_PROTO_RESULT_OK; +} + +//---------------------------------------------------------- +// aggr_scan_job mandatory scan_job interface. +// + +void +aggr_scan_job_slice(as_job* _job, as_partition_reservation* rsv) +{ + aggr_scan_job* job = (aggr_scan_job*)_job; + cf_ll ll; + cf_buf_builder* bb = cf_buf_builder_create_size(INIT_BUF_BUILDER_SIZE); + + if (! bb) { + as_job_manager_abandon_job(_job->mgr, _job, + AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + cf_ll_init(&ll, as_index_keys_ll_destroy_fn, false); + + aggr_scan_slice slice = { job, &ll, &bb, rsv }; + + as_index_reduce_live(rsv->tree, aggr_scan_job_reduce_cb, (void*)&slice); + + if (cf_ll_size(&ll) != 0) { + as_result result; + as_result_init(&result); + + int ret = as_aggr_process(_job->ns, &job->aggr_call, &ll, (void*)&slice, + &result); + + if (ret != 0) { + char* rs = as_module_err_string(ret); + + if (result.value) { + as_string* lua_s = as_string_fromval(result.value); + char* lua_err = (char*)as_string_tostring(lua_s); + + if (lua_err) { + int l_rs_len = strlen(rs); + + rs = cf_realloc(rs, l_rs_len + strlen(lua_err) + 4); + sprintf(&rs[l_rs_len], " : %s", lua_err); + } + } + + const as_val* v = (as_val*)as_string_new(rs, false); + + aggr_scan_add_val_response(&slice, v, false); + as_val_destroy(v); + cf_free(rs); + as_job_manager_abandon_job(_job->mgr, _job, + AS_PROTO_RESULT_FAIL_UNKNOWN); + } + + as_result_destroy(&result); + } + + cf_ll_reduce(&ll, true, as_index_keys_ll_reduce_fn, NULL); + + if (bb->used_sz != 0) { + conn_scan_job_send_response((conn_scan_job*)job, bb->buf, bb->used_sz); + } + + // TODO - guts don't check buf_builder realloc failures rigorously. + cf_buf_builder_free(bb); +} + +void +aggr_scan_job_finish(as_job* _job) +{ + aggr_scan_job* job = (aggr_scan_job*)_job; + + conn_scan_job_finish((conn_scan_job*)job); + + if (job->aggr_call.def.arglist) { + as_list_destroy(job->aggr_call.def.arglist); + job->aggr_call.def.arglist = NULL; + } + + switch (_job->abandoned) { + case 0: + cf_atomic_int_incr(&_job->ns->n_scan_aggr_complete); + break; + case AS_JOB_FAIL_USER_ABORT: + cf_atomic_int_incr(&_job->ns->n_scan_aggr_abort); + break; + case AS_JOB_FAIL_UNKNOWN: + case AS_JOB_FAIL_CLUSTER_KEY: + case AS_JOB_FAIL_RESPONSE_ERROR: + case AS_JOB_FAIL_RESPONSE_TIMEOUT: + default: + cf_atomic_int_incr(&_job->ns->n_scan_aggr_error); + break; + } + + cf_info(AS_SCAN, "finished aggregation scan job %lu (%d)", _job->trid, + _job->abandoned); +} + +void +aggr_scan_job_destroy(as_job* _job) +{ + aggr_scan_job* job = (aggr_scan_job*)_job; + + if (job->aggr_call.def.arglist) { + as_list_destroy(job->aggr_call.def.arglist); + } +} + +void +aggr_scan_job_info(as_job* _job, as_mon_jobstat* stat) +{ + strcpy(stat->job_type, scan_type_str(SCAN_TYPE_AGGR)); + conn_scan_job_info((conn_scan_job*)_job, stat); +} + +//---------------------------------------------------------- +// aggr_scan_job utilities. +// + +bool +aggr_scan_init(as_aggr_call* call, const as_transaction* tr) +{ + if (! 
udf_def_init_from_msg(&call->def, tr)) { + return false; + } + + call->aggr_hooks = &scan_aggr_hooks; + + return true; +} + +void +aggr_scan_job_reduce_cb(as_index_ref* r_ref, void* udata) +{ + aggr_scan_slice* slice = (aggr_scan_slice*)udata; + aggr_scan_job* job = slice->job; + as_job* _job = (as_job*)job; + as_namespace* ns = _job->ns; + + if (_job->abandoned != 0) { + as_record_done(r_ref, ns); + return; + } + + as_index* r = r_ref->r; + + if (excluded_set(r, _job->set_id) || as_record_is_doomed(r, ns)) { + as_record_done(r_ref, ns); + return; + } + + if (! aggr_scan_add_digest(slice->ll, &r->keyd)) { + as_record_done(r_ref, ns); + as_job_manager_abandon_job(_job->mgr, _job, + AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + cf_atomic64_incr(&_job->n_records_read); + as_record_done(r_ref, ns); +} + +bool +aggr_scan_add_digest(cf_ll* ll, cf_digest* keyd) +{ + as_index_keys_ll_element* tail_e = (as_index_keys_ll_element*)ll->tail; + as_index_keys_arr* keys_arr; + + if (tail_e) { + keys_arr = tail_e->keys_arr; + + if (keys_arr->num == AS_INDEX_KEYS_PER_ARR) { + tail_e = NULL; + } + } + + if (! tail_e) { + if (! (keys_arr = as_index_get_keys_arr())) { + return false; + } + + tail_e = cf_malloc(sizeof(as_index_keys_ll_element)); + + tail_e->keys_arr = keys_arr; + cf_ll_append(ll, (cf_ll_element*)tail_e); + } + + keys_arr->pindex_digs[keys_arr->num] = *keyd; + keys_arr->num++; + + return true; +} + +as_partition_reservation* +aggr_scan_ptn_reserve(void* udata, as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv) +{ + aggr_scan_slice* slice = (aggr_scan_slice*)udata; + + return slice->rsv; +} + +as_stream_status +aggr_scan_ostream_write(void* udata, as_val* val) +{ + aggr_scan_slice* slice = (aggr_scan_slice*)udata; + + if (val) { + aggr_scan_add_val_response(slice, val, true); + as_val_destroy(val); + } + + return AS_STREAM_OK; +} + +void +aggr_scan_add_val_response(aggr_scan_slice* slice, const as_val* val, + bool success) +{ + uint32_t size = as_particle_asval_client_value_size(val); + + as_msg_make_val_response_bufbuilder(val, slice->bb_r, size, success); + + cf_buf_builder* bb = *slice->bb_r; + conn_scan_job* conn_job = (conn_scan_job*)slice->job; + + // If we exceed the proto size limit, send accumulated data back to client + // and reset the buf-builder to start a new proto. + if (bb->used_sz > SCAN_CHUNK_LIMIT) { + if (! conn_scan_job_send_response(conn_job, bb->buf, bb->used_sz)) { + return; + } + + cf_buf_builder_reset(bb); + } +} + + + +//============================================================================== +// udf_bg_scan_job derived class implementation. +// + +//---------------------------------------------------------- +// udf_bg_scan_job typedefs and forward declarations. 
+// + +typedef struct udf_bg_scan_job_s { + // Base object must be first: + as_job _base; + + // Derived class data: + iudf_origin origin; + bool is_durable_delete; // enterprise only + cf_atomic32 n_active_tr; + + cf_atomic64 n_successful_tr; + cf_atomic64 n_failed_tr; +} udf_bg_scan_job; + +void udf_bg_scan_job_slice(as_job* _job, as_partition_reservation* rsv); +void udf_bg_scan_job_finish(as_job* _job); +void udf_bg_scan_job_destroy(as_job* _job); +void udf_bg_scan_job_info(as_job* _job, as_mon_jobstat* stat); + +const as_job_vtable udf_bg_scan_job_vtable = { + udf_bg_scan_job_slice, + udf_bg_scan_job_finish, + udf_bg_scan_job_destroy, + udf_bg_scan_job_info +}; + +void udf_bg_scan_job_reduce_cb(as_index_ref* r_ref, void* udata); +int udf_bg_scan_tr_complete(void* udata, int retcode); + +//---------------------------------------------------------- +// udf_bg_scan_job public API. +// + +int +udf_bg_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id) +{ + udf_bg_scan_job* job = cf_malloc(sizeof(udf_bg_scan_job)); + as_job* _job = (as_job*)job; + + scan_options options = { .sample_pct = 100 }; + predexp_eval_t* predexp = NULL; + + if (! get_scan_options(tr, &options) || ! get_scan_predexp(tr, &predexp)) { + cf_warning(AS_SCAN, "udf-bg scan job failed msg field processing"); + cf_free(job); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + as_job_init(_job, &udf_bg_scan_job_vtable, &g_scan_manager, RSV_WRITE, + as_transaction_trid(tr), ns, set_id, options.priority); + + job->origin.predexp = predexp; + job->is_durable_delete = as_transaction_is_durable_delete(tr); + job->n_active_tr = 0; + job->n_successful_tr = 0; + job->n_failed_tr = 0; + + if (! udf_def_init_from_msg(&job->origin.def, tr)) { + cf_warning(AS_SCAN, "udf-bg scan job failed def init"); + as_job_destroy(_job); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + job->origin.cb = udf_bg_scan_tr_complete; + job->origin.udata = (void*)job; + + cf_info(AS_SCAN, "starting udf-bg scan job %lu {%s:%s} priority %u", + _job->trid, ns->name, as_namespace_get_set_name(ns, set_id), + _job->priority); + + int result = as_job_manager_start_job(_job->mgr, _job); + + if (result != 0) { + cf_warning(AS_SCAN, "udf-bg scan job %lu failed to start (%d)", + _job->trid, result); + as_job_destroy(_job); + return result; + } + + if (as_msg_send_fin(&tr->from.proto_fd_h->sock, AS_PROTO_RESULT_OK)) { + tr->from.proto_fd_h->last_used = cf_getms(); + as_end_of_transaction_ok(tr->from.proto_fd_h); + } + else { + cf_warning(AS_SCAN, "udf-bg scan job error sending fin"); + as_end_of_transaction_force_close(tr->from.proto_fd_h); + // No point returning an error - it can't be reported on this socket. + } + + tr->from.proto_fd_h = NULL; + + return AS_PROTO_RESULT_OK; +} + +//---------------------------------------------------------- +// udf_bg_scan_job mandatory scan_job interface. 
+// + +void +udf_bg_scan_job_slice(as_job* _job, as_partition_reservation* rsv) +{ + as_index_reduce_live(rsv->tree, udf_bg_scan_job_reduce_cb, (void*)_job); +} + +void +udf_bg_scan_job_finish(as_job* _job) +{ + udf_bg_scan_job* job = (udf_bg_scan_job*)_job; + + while (cf_atomic32_get(job->n_active_tr) != 0) { + usleep(100); + } + + switch (_job->abandoned) { + case 0: + cf_atomic_int_incr(&_job->ns->n_scan_udf_bg_complete); + break; + case AS_JOB_FAIL_USER_ABORT: + cf_atomic_int_incr(&_job->ns->n_scan_udf_bg_abort); + break; + case AS_JOB_FAIL_UNKNOWN: + case AS_JOB_FAIL_CLUSTER_KEY: + default: + cf_atomic_int_incr(&_job->ns->n_scan_udf_bg_error); + break; + } + + cf_info(AS_SCAN, "finished udf-bg scan job %lu (%d)", _job->trid, + _job->abandoned); +} + +void +udf_bg_scan_job_destroy(as_job* _job) +{ + udf_bg_scan_job* job = (udf_bg_scan_job*)_job; + + iudf_origin_destroy(&job->origin); +} + +void +udf_bg_scan_job_info(as_job* _job, as_mon_jobstat* stat) +{ + strcpy(stat->job_type, scan_type_str(SCAN_TYPE_UDF_BG)); + stat->net_io_bytes = sizeof(cl_msg); // size of original synchronous fin + + udf_bg_scan_job* job = (udf_bg_scan_job*)_job; + char* extra = stat->jdata + strlen(stat->jdata); + + sprintf(extra, ":udf-filename=%s:udf-function=%s:udf-active=%u:udf-success=%lu:udf-failed=%lu", + job->origin.def.filename, job->origin.def.function, + cf_atomic32_get(job->n_active_tr), + cf_atomic64_get(job->n_successful_tr), + cf_atomic64_get(job->n_failed_tr)); +} + +//---------------------------------------------------------- +// udf_bg_scan_job utilities. +// + +void +udf_bg_scan_job_reduce_cb(as_index_ref* r_ref, void* udata) +{ + as_job* _job = (as_job*)udata; + udf_bg_scan_job* job = (udf_bg_scan_job*)_job; + as_namespace* ns = _job->ns; + + if (_job->abandoned != 0) { + as_record_done(r_ref, ns); + return; + } + + as_index* r = r_ref->r; + + if (excluded_set(r, _job->set_id) || as_record_is_doomed(r, ns)) { + as_record_done(r_ref, ns); + return; + } + + predexp_args_t predargs = { .ns = ns, .md = r, .vl = NULL, .rd = NULL }; + + if (job->origin.predexp && + ! predexp_matches_metadata(job->origin.predexp, &predargs)) { + as_record_done(r_ref, ns); + return; + } + + // Save this before releasing record. + cf_digest d = r->keyd; + + // Release record lock before enqueuing transaction. + as_record_done(r_ref, ns); + + // TODO - replace this mechanism with signal-based counter? + while (cf_atomic32_get(job->n_active_tr) > + g_config.scan_max_udf_transactions) { + usleep(50); + } + + as_transaction tr; + + as_transaction_init_iudf(&tr, ns, &d, &job->origin, job->is_durable_delete); + + cf_atomic64_incr(&_job->n_records_read); + cf_atomic32_incr(&job->n_active_tr); + + as_tsvc_enqueue(&tr); +} + +int +udf_bg_scan_tr_complete(void* udata, int retcode) +{ + udf_bg_scan_job* job = (udf_bg_scan_job*)udata; + + cf_atomic32_decr(&job->n_active_tr); + cf_atomic64_incr(retcode == 0 ? &job->n_successful_tr : &job->n_failed_tr); + + return 0; +} diff --git a/as/src/base/secondary_index.c b/as/src/base/secondary_index.c new file mode 100644 index 00000000..d698702c --- /dev/null +++ b/as/src/base/secondary_index.c @@ -0,0 +1,4539 @@ +/* + * secondary_index.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+/*
+ * SYNOPSIS
+ * Abstraction to support secondary indexes with multiple implementations.
+ * Currently there are two variants of secondary indexes supported.
+ *
+ * - Aerospike Index B-tree - a full-fledged index implementation that
+ *   maintains its own metadata and data structures for the list of those
+ *   indexes.
+ *
+ * - Citrusleaf foundation indexes - a bare-bones tree implementation with
+ *   the ability to insert, delete and update indexes. For these, the current
+ *   code manages all the data structures for the different trees. [Will be
+ *   implemented when required]
+ *
+ * This file implements all the translation functions that can be called from
+ * citrusleaf to prepare for operations on a secondary index. It also
+ * implements locking to make the (single-threaded) Aerospike Index code
+ * multi-threaded.
+ *
+ */
+
+/* Code flow --
+ *
+ * DDLs
+ *
+ * as_sindex_create --> ai_btree_create
+ *
+ * as_sindex_destroy --> Releases the si and changes the state to AS_SINDEX_DESTROY
+ *
+ * BOOT INDEX
+ *
+ * as_sindex_boot_populateall --> If fast restart or data in memory and load at start up --> as_sbld_build_all
+ *
+ * SBIN creation
+ *
+ * as_sindex_sbins_from_rd --> (For every bin in the record) as_sindex_sbins_from_bin
+ *
+ * as_sindex_sbins_from_bin --> as_sindex_sbins_from_bin_buf
+ *
+ * as_sindex_sbins_from_bin_buf --> (For every matching sindex) --> as_sindex_sbin_from_sindex
+ *
+ * as_sindex_sbin_from_sindex --> (If bin value matches the sindex defn) --> as_sindex_add_asval_to_itype_sindex
+ *
+ * SBIN updates
+ *
+ * as_sindex_update_by_sbin --> For every sbin --> as_sindex__op_by_sbin
+ *
+ * as_sindex__op_by_sbin --> If op == AS_SINDEX_OP_INSERT --> ai_btree_put
+ *                       |
+ *                       --> If op == AS_SINDEX_OP_DELETE --> ai_btree_delete
+ *
+ * DMLs using RECORD
+ *
+ * as_sindex_put_rd --> For each bin in the record --> as_sindex_sbin_from_sindex
+ *
+ * as_sindex_putall_rd --> For each sindex --> as_sindex_put_rd
+ *
+ */
+
+#include "base/secondary_index.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "aerospike/as_arraylist.h"
+#include "aerospike/as_arraylist_iterator.h"
+#include "aerospike/as_buffer.h"
+#include "aerospike/as_hashmap.h"
+#include "aerospike/as_hashmap_iterator.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_pair.h"
+#include "aerospike/as_serializer.h"
+#include "aerospike/as_val.h"
+
+#include "ai_btree.h"
+#include "bt_iterator.h"
+#include "cf_str.h"
+#include "fault.h"
+#include "shash.h"
+
+#include "base/cdt.h"
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/stats.h"
+#include "base/system_metadata.h"
+#include "base/thr_sindex.h"
+#include "base/thr_info.h"
+#include "fabric/partition.h"
+#include "geospatial/geospatial.h" +#include "transaction/udf.h" + + +#define SINDEX_CRASH(str, ...) \ + cf_crash(AS_SINDEX, "SINDEX_ASSERT: "str, ##__VA_ARGS__); + +#define AS_SINDEX_PROP_KEY_SIZE (AS_SET_NAME_MAX_SIZE + 20) // setname_binid_typeid + + +// ************************************************************************************************ +// BINID HAS SINDEX +// Maintains a bit array where binid'th bit represents the existence of atleast one index over the +// bin with bin id as binid. +// Set, reset should be called under SINDEX_GWLOCK +// get should be called under SINDEX_GRLOCK + +void +as_sindex_set_binid_has_sindex(as_namespace *ns, int binid) +{ + int index = binid / 32; + uint32_t temp = ns->binid_has_sindex[index]; + temp |= (1 << (binid % 32)); + ns->binid_has_sindex[index] = temp; +} + +void +as_sindex_reset_binid_has_sindex(as_namespace *ns, int binid) +{ + int i = 0; + int j = 0; + as_sindex * si = NULL; + + while (i < AS_SINDEX_MAX && j < ns->sindex_cnt) { + si = &ns->sindex[i]; + if (si != NULL) { + if (si->state == AS_SINDEX_ACTIVE) { + j++; + if (si->imd->binid == binid) { + return; + } + } + } + i++; + } + + int index = binid / 32; + uint32_t temp = ns->binid_has_sindex[index]; + temp &= ~(1 << (binid % 32)); + ns->binid_has_sindex[index] = temp; +} + +bool +as_sindex_binid_has_sindex(as_namespace *ns, int binid) +{ + int index = binid / 32; + uint32_t temp = ns->binid_has_sindex[index]; + return (temp & (1 << (binid % 32))) ? true : false; +} +// END - BINID HAS SINDEX +// ************************************************************************************************ +// ************************************************************************************************ +// UTILITY +// Translation from sindex error code to string. In alphabetic order +const char *as_sindex_err_str(int op_code) { + switch (op_code) { + case AS_SINDEX_ERR: return "ERR GENERIC"; + case AS_SINDEX_ERR_BIN_NOTFOUND: return "BIN NOT FOUND"; + case AS_SINDEX_ERR_FOUND: return "INDEX FOUND"; + case AS_SINDEX_ERR_INAME_MAXLEN: return "INDEX NAME EXCEED MAX LIMIT"; + case AS_SINDEX_ERR_MAXCOUNT: return "INDEX COUNT EXCEEDS MAX LIMIT"; + case AS_SINDEX_ERR_NOTFOUND: return "NO INDEX"; + case AS_SINDEX_ERR_NOT_READABLE: return "INDEX NOT READABLE"; + case AS_SINDEX_ERR_NO_MEMORY: return "NO MEMORY"; + case AS_SINDEX_ERR_PARAM: return "ERR PARAM"; + case AS_SINDEX_ERR_SET_MISMATCH: return "SET MISMATCH"; + case AS_SINDEX_ERR_TYPE_MISMATCH: return "KEY TYPE MISMATCH"; + case AS_SINDEX_ERR_UNKNOWN_KEYTYPE: return "UNKNOWN KEYTYPE"; + case AS_SINDEX_OK: return "OK"; + default: return "Unknown Code"; + } +} + +inline bool as_sindex_isactive(as_sindex *si) +{ + if (! 
+		cf_warning(AS_SINDEX, "si is null in as_sindex_isactive");
+		return false;
+	}
+
+	return si->state == AS_SINDEX_ACTIVE;
+}
+
+// Translation from sindex internal error code to generic client-visible Aerospike error code.
+uint8_t as_sindex_err_to_clienterr(int err, char *fname, int lineno) {
+	switch (err) {
+		case AS_SINDEX_ERR_FOUND:        return AS_PROTO_RESULT_FAIL_INDEX_FOUND;
+		case AS_SINDEX_ERR_INAME_MAXLEN: return AS_PROTO_RESULT_FAIL_INDEX_NAME_MAXLEN;
+		case AS_SINDEX_ERR_MAXCOUNT:     return AS_PROTO_RESULT_FAIL_INDEX_MAXCOUNT;
+		case AS_SINDEX_ERR_NOTFOUND:     return AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND;
+		case AS_SINDEX_ERR_NOT_READABLE: return AS_PROTO_RESULT_FAIL_INDEX_NOTREADABLE;
+		case AS_SINDEX_ERR_NO_MEMORY:    return AS_PROTO_RESULT_FAIL_INDEX_OOM;
+		case AS_SINDEX_ERR_PARAM:        return AS_PROTO_RESULT_FAIL_PARAMETER;
+		case AS_SINDEX_OK:               return AS_PROTO_RESULT_OK;
+
+		// Defensive internal error
+		case AS_SINDEX_ERR:
+		case AS_SINDEX_ERR_BIN_NOTFOUND:
+		case AS_SINDEX_ERR_SET_MISMATCH:
+		case AS_SINDEX_ERR_TYPE_MISMATCH:
+		case AS_SINDEX_ERR_UNKNOWN_KEYTYPE:
+		default: cf_warning(AS_SINDEX, "%s %d Error at %s,%d",
+				as_sindex_err_str(err), err, fname, lineno);
+			return AS_PROTO_RESULT_FAIL_INDEX_GENERIC;
+	}
+}
+
+bool
+as_sindex__setname_match(as_sindex_metadata *imd, const char *setname)
+{
+	// NULL set being a valid set, the logic is a bit complex.
+	if (setname && ((!imd->set) || strcmp(imd->set, setname))) {
+		goto Fail;
+	}
+	else if (!setname && imd->set) {
+		goto Fail;
+	}
+	return true;
+Fail:
+	cf_debug(AS_SINDEX, "Index Mismatch %s %s", imd->set, setname);
+	return false;
+}
+
+/* Returns
+ *	AS_SINDEX_GC_ERROR if cannot defrag
+ *	AS_SINDEX_GC_OK if can defrag
+ *	AS_SINDEX_GC_SKIP_ITERATION if partition lock timed out
+ */
+as_sindex_gc_status
+as_sindex_can_defrag_record(as_namespace *ns, cf_digest *keyd)
+{
+	as_partition_reservation rsv;
+	uint32_t pid = as_partition_getid(keyd);
+
+	int timeout_ms = 2;
+	if (as_partition_reserve_timeout(ns, pid, &rsv, timeout_ms) != 0) {
+		cf_atomic64_incr(&g_stats.sindex_gc_timedout);
+		return AS_SINDEX_GC_SKIP_ITERATION;
+	}
+
+	int rv = AS_SINDEX_GC_ERROR;
+	if (as_record_exists_live(rsv.tree, keyd, rsv.ns) != 0) {
+		rv = AS_SINDEX_GC_OK;
+	}
+	as_partition_release(&rsv);
+	return rv;
+}
+
+/*
+ * Function as_sindex_pktype
+ * Returns the type of particle indexed
+ *
+ * Returns -
+ * On failure - AS_SINDEX_ERR_UNKNOWN_KEYTYPE
+ */
+as_particle_type
+as_sindex_pktype(as_sindex_metadata * imd)
+{
+	switch (imd->sktype) {
+		case COL_TYPE_LONG: {
+			return AS_PARTICLE_TYPE_INTEGER;
+		}
+		case COL_TYPE_DIGEST: {
+			return AS_PARTICLE_TYPE_STRING;
+		}
+		case COL_TYPE_GEOJSON: {
+			return AS_PARTICLE_TYPE_GEOJSON;
+		}
+		default: {
+			cf_warning(AS_SINDEX, "UNKNOWN KEY TYPE FOUND. VERY BAD STATE");
+		}
+	}
+	return AS_SINDEX_ERR_UNKNOWN_KEYTYPE;
+}
+
+/*
+ * Function as_sindex_ktype_str
+ * Returns a static string representing the key type
+ *
+ */
+char const *
+as_sindex_ktype_str(as_sindex_ktype type)
+{
+	switch (type) {
+	case COL_TYPE_LONG:    return "NUMERIC";
+	case COL_TYPE_DIGEST:  return "STRING";
+	case COL_TYPE_GEOJSON: return "GEOJSON";
+	default:
+		cf_warning(AS_SINDEX, "UNSUPPORTED KEY TYPE %d", type);
+		return "??????";
+	}
+}
+
+as_sindex_ktype
+as_sindex_ktype_from_string(char const * type_str)
+{
+	if (! type_str) {
type_str) { + cf_warning(AS_SINDEX, "missing secondary index key type"); + return COL_TYPE_INVALID; + } + else if (strncasecmp(type_str, "string", 6) == 0) { + return COL_TYPE_DIGEST; + } + else if (strncasecmp(type_str, "numeric", 7) == 0) { + return COL_TYPE_LONG; + } + else if (strncasecmp(type_str, "geo2dsphere", 11) == 0) { + return COL_TYPE_GEOJSON; + } + else { + cf_warning(AS_SINDEX, "UNRECOGNIZED KEY TYPE %s", type_str); + return COL_TYPE_INVALID; + } +} + +as_sindex_ktype +as_sindex_sktype_from_pktype(as_particle_type t) +{ + switch (t) { + case AS_PARTICLE_TYPE_INTEGER : return COL_TYPE_LONG; + case AS_PARTICLE_TYPE_STRING : return COL_TYPE_DIGEST; + case AS_PARTICLE_TYPE_GEOJSON : return COL_TYPE_GEOJSON; + default : return COL_TYPE_INVALID; + } + return COL_TYPE_INVALID; +} + +/* + * Client API to check if there is secondary index on given namespace + */ +int +as_sindex_ns_has_sindex(as_namespace *ns) +{ + return (ns->sindex_cnt > 0); +} + +char *as_sindex_type_defs[] = +{ "NONE", "LIST", "MAPKEYS", "MAPVALUES" +}; + +bool +as_sindex_can_query(as_sindex *si) +{ + // Still building. Do not allow reads + return (si->flag & AS_SINDEX_FLAG_RACTIVE) ? true : false; +} + +/* + * Create duplicate copy of sindex metadata. New lock is created + * used by index create by user at runtime or index creation at the boot time + */ +void +as_sindex__dup_meta(as_sindex_metadata *imd, as_sindex_metadata **qimd) +{ + if (!imd) return; + + as_sindex_metadata *qimdp = cf_rc_alloc(sizeof(as_sindex_metadata)); + + memset(qimdp, 0, sizeof(as_sindex_metadata)); + + qimdp->ns_name = cf_strdup(imd->ns_name); + + // Set name is optional for create + if (imd->set) { + qimdp->set = cf_strdup(imd->set); + } else { + qimdp->set = NULL; + } + + qimdp->iname = cf_strdup(imd->iname); + qimdp->itype = imd->itype; + qimdp->nprts = imd->nprts; + qimdp->path_str = cf_strdup(imd->path_str); + qimdp->path_length = imd->path_length; + memcpy(qimdp->path, imd->path, AS_SINDEX_MAX_DEPTH*sizeof(as_sindex_path)); + qimdp->bname = cf_strdup(imd->bname); + qimdp->sktype = imd->sktype; + qimdp->binid = imd->binid; + + *qimd = qimdp; +} + +/* + * Function to perform validation check on the return type and increment + * decrement all the statistics. 
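+ * For example (hypothetical run): a successful AS_SINDEX_OP_INSERT increments
+ * n_writes and n_objects, while a failed one increments n_writes and
+ * write_errs only.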
+ */ +void +as_sindex__process_ret(as_sindex *si, int ret, as_sindex_op op, + uint64_t starttime, int pos) +{ + switch (op) { + case AS_SINDEX_OP_INSERT: + if (ret && ret != AS_SINDEX_KEY_FOUND) { + cf_debug(AS_SINDEX, + "SINDEX_FAIL: Insert into %s failed at %d with %d", + si->imd->iname, pos, ret); + cf_atomic64_incr(&si->stats.write_errs); + } else if (!ret) { + cf_atomic64_incr(&si->stats.n_objects); + } + cf_atomic64_incr(&si->stats.n_writes); + SINDEX_HIST_INSERT_DATA_POINT(si, write_hist, starttime); + break; + case AS_SINDEX_OP_DELETE: + if (ret && ret != AS_SINDEX_KEY_NOTFOUND) { + cf_debug(AS_SINDEX, + "SINDEX_FAIL: Delete from %s failed at %d with %d", + si->imd->iname, pos, ret); + cf_atomic64_incr(&si->stats.delete_errs); + } else if (!ret) { + cf_atomic64_decr(&si->stats.n_objects); + } + cf_atomic64_incr(&si->stats.n_deletes); + SINDEX_HIST_INSERT_DATA_POINT(si, delete_hist, starttime); + break; + case AS_SINDEX_OP_READ: + if (ret < 0) { // AS_SINDEX_CONTINUE(1) also OK + cf_debug(AS_SINDEX, + "SINDEX_FAIL: Read from %s failed at %d with %d", + si->imd->iname, pos, ret); + cf_atomic64_incr(&si->stats.read_errs); + } + cf_atomic64_incr(&si->stats.n_reads); + break; + default: + cf_crash(AS_SINDEX, "Invalid op"); + } +} + +// Bin id should be around +// if not create it +// TODO is it not needed +int +as_sindex__populate_binid(as_namespace *ns, as_sindex_metadata *imd) +{ + int len = strlen(imd->bname); + if (len >= AS_ID_BIN_SZ) { + cf_warning(AS_SINDEX, "bin name %s of size %d too big. Max size allowed is %d", + imd->bname, len, AS_ID_BIN_SZ-1); + return AS_SINDEX_ERR; + } + + if(!as_bin_name_within_quota(ns, imd->bname)) { + cf_warning(AS_SINDEX, "Bin %s not added. Quota is full", imd->bname); + return AS_SINDEX_ERR; + } + + // An extra strncpy to remove valgrind warning + char bname[AS_ID_BIN_SZ]; + strncpy(bname, imd->bname, AS_ID_BIN_SZ); + imd->binid = as_bin_get_or_assign_id(ns, bname); + cf_debug(AS_SINDEX, " Assigned %d for %s", imd->binid, imd->bname); + + return AS_SINDEX_OK; +} + +// Free if IMD has allocated the info in it +int +as_sindex_imd_free(as_sindex_metadata *imd) +{ + if (!imd) { + cf_warning(AS_SINDEX, "imd is null in as_sindex_imd_free"); + return AS_SINDEX_ERR; + } + + if (imd->ns_name) { + cf_free(imd->ns_name); + imd->ns_name = NULL; + } + + if (imd->iname) { + cf_free(imd->iname); + imd->iname = NULL; + } + + if (imd->set) { + cf_free(imd->set); + imd->set = NULL; + } + + if (imd->path_str) { + cf_free(imd->path_str); + imd->path_str = NULL; + } + + if (imd->bname) { + cf_free(imd->bname); + imd->bname = NULL; + } + + return AS_SINDEX_OK; +} +// END - UTILITY +// ************************************************************************************************ +// ************************************************************************************************ +// METADATA +typedef struct sindex_set_binid_hash_ele_s { + cf_ll_element ele; + int simatch; +} sindex_set_binid_hash_ele; + +void +as_sindex__set_binid_hash_destroy(cf_ll_element * ele) { + cf_free((sindex_set_binid_hash_ele * ) ele); +} + +/* + * Should happen under SINDEX_GWLOCK + */ +as_sindex_status +as_sindex__put_in_set_binid_hash(as_namespace * ns, char * set, int binid, int chosen_id) +{ + // Create fixed size key for hash + // Get the linked list from the hash + // If linked list does not exist then make one and put it in the hash + // Append the chosen id in the linked list + + if (chosen_id < 0 || chosen_id > AS_SINDEX_MAX) { + cf_debug(AS_SINDEX, "Put in set_binid hash got invalid 
simatch %d", chosen_id); + return AS_SINDEX_ERR; + } + cf_ll * simatch_ll = NULL; + // Create fixed size key for hash + char si_prop[AS_SINDEX_PROP_KEY_SIZE]; + memset(si_prop, 0, AS_SINDEX_PROP_KEY_SIZE); + + if (set == NULL ) { + sprintf(si_prop, "_%d", binid); + } + else { + sprintf(si_prop, "%s_%d", set, binid); + } + + // Get the linked list from the hash + int rv = cf_shash_get(ns->sindex_set_binid_hash, (void *)si_prop, (void *)&simatch_ll); + + // If linked list does not exist then make one and put it in the hash + if (rv && rv != CF_SHASH_ERR_NOT_FOUND) { + cf_debug(AS_SINDEX, "shash get failed with error %d", rv); + return AS_SINDEX_ERR; + }; + if (rv == CF_SHASH_ERR_NOT_FOUND) { + simatch_ll = cf_malloc(sizeof(cf_ll)); + cf_ll_init(simatch_ll, as_sindex__set_binid_hash_destroy, false); + cf_shash_put(ns->sindex_set_binid_hash, (void *)si_prop, (void *)&simatch_ll); + } + if (!simatch_ll) { + return AS_SINDEX_ERR; + } + + // Append the chosen id in the linked list + sindex_set_binid_hash_ele * ele = cf_malloc(sizeof(sindex_set_binid_hash_ele)); + ele->simatch = chosen_id; + cf_ll_append(simatch_ll, (cf_ll_element*)ele); + return AS_SINDEX_OK; +} + +/* + * Should happen under SINDEX_GWLOCK + */ +as_sindex_status +as_sindex__delete_from_set_binid_hash(as_namespace * ns, as_sindex_metadata * imd) +{ + // Make a key + // Get the sindex list corresponding to key + // If the list does not exist, return does not exist + // If the list exist + // match the path and type of incoming si to the existing sindexes in the list + // If any element matches + // Delete from the list + // If the list size becomes 0 + // Delete the entry from the hash + // If none of the element matches, return does not exist. + // + + // Make a key + char si_prop[AS_SINDEX_PROP_KEY_SIZE]; + memset(si_prop, 0, AS_SINDEX_PROP_KEY_SIZE); + if (imd->set == NULL ) { + sprintf(si_prop, "_%d", imd->binid); + } + else { + sprintf(si_prop, "%s_%d", imd->set, imd->binid); + } + + // Get the sindex list corresponding to key + cf_ll * simatch_ll = NULL; + int rv = cf_shash_get(ns->sindex_set_binid_hash, (void *)si_prop, (void *)&simatch_ll); + + // If the list does not exist, return does not exist + if (rv && rv != CF_SHASH_ERR_NOT_FOUND) { + cf_debug(AS_SINDEX, "shash get failed with error %d", rv); + return AS_SINDEX_ERR_NOTFOUND; + }; + if (rv == CF_SHASH_ERR_NOT_FOUND) { + return AS_SINDEX_ERR_NOTFOUND; + } + + // If the list exist + // match the path and type of incoming si to the existing sindexes in the list + bool to_delete = false; + cf_ll_element * ele = NULL; + sindex_set_binid_hash_ele * prop_ele = NULL; + if (simatch_ll) { + ele = cf_ll_get_head(simatch_ll); + while (ele) { + prop_ele = ( sindex_set_binid_hash_ele * ) ele; + as_sindex * si = &(ns->sindex[prop_ele->simatch]); + if (strcmp(si->imd->path_str, imd->path_str) == 0 && + si->imd->sktype == imd->sktype && si->imd->itype == imd->itype) { + to_delete = true; + break; + } + ele = ele->next; + } + } + else { + return AS_SINDEX_ERR_NOTFOUND; + } + + // If any element matches + // Delete from the list + if (to_delete && ele) { + cf_ll_delete(simatch_ll, ele); + } + + // If the list size becomes 0 + // Delete the entry from the hash + if (cf_ll_size(simatch_ll) == 0) { + rv = cf_shash_delete(ns->sindex_set_binid_hash, si_prop); + if (rv) { + cf_debug(AS_SINDEX, "shash_delete fails with error %d", rv); + } + } + + // If none of the element matches, return does not exist. 
+ if (!to_delete) { + return AS_SINDEX_ERR_NOTFOUND; + } + return AS_SINDEX_OK; +} + + +// END - METADATA +// ************************************************************************************************ +// ************************************************************************************************ +// LOOKUP +/* + * Should happen under SINDEX_GRLOCK if called directly. + */ +as_sindex_status +as_sindex__simatch_list_by_set_binid(as_namespace * ns, const char *set, int binid, cf_ll ** simatch_ll) +{ + // Make the fixed size key (set_binid) + // Look for the key in set_binid_hash + // If found return the value (list of simatches) + // Else return NULL + + // Make the fixed size key (set_binid) + char si_prop[AS_SINDEX_PROP_KEY_SIZE]; + memset(si_prop, 0, AS_SINDEX_PROP_KEY_SIZE); + if (!set) { + sprintf(si_prop, "_%d", binid); + } + else { + sprintf(si_prop, "%s_%d", set, binid); + } + + // Look for the key in set_binid_hash + int rv = cf_shash_get(ns->sindex_set_binid_hash, (void *)si_prop, (void *)simatch_ll); + + // If not found return NULL + if (rv || !(*simatch_ll)) { + cf_debug(AS_SINDEX, "shash get failed with error %d", rv); + return AS_SINDEX_ERR_NOTFOUND; + }; + + // Else return simatch_ll + return AS_SINDEX_OK; +} + +/* + * Should happen under SINDEX_GRLOCK + */ +int +as_sindex__simatch_by_set_binid(as_namespace *ns, char * set, int binid, as_sindex_ktype type, as_sindex_type itype, char * path) +{ + // get the list corresponding to the list from the hash + // if list does not exist return -1 + // If list exist + // Iterate through all the elements in the list and match the path and type + // If matches + // return the simatch + // If none of the si matches + // return -1 + + cf_ll * simatch_ll = NULL; + as_sindex__simatch_list_by_set_binid(ns, set, binid, &simatch_ll); + + // If list exist + // Iterate through all the elements in the list and match the path and type + int simatch = -1; + sindex_set_binid_hash_ele * prop_ele = NULL; + cf_ll_element * ele = NULL; + if (simatch_ll) { + ele = cf_ll_get_head(simatch_ll); + while (ele) { + prop_ele = ( sindex_set_binid_hash_ele * ) ele; + as_sindex * si = &(ns->sindex[prop_ele->simatch]); + if (strcmp(si->imd->path_str, path) == 0 && + si->imd->sktype == type && si->imd->itype == itype) { + simatch = prop_ele->simatch; + break; + } + ele = ele->next; + } + } + else { + return -1; + } + + // If matches + // return the simatch + // If none of the si matches + // return -1 + return simatch; +} + +// Populates the si_arr with all the sindexes which matches set and binid +// Each sindex is reserved as well. 
Enough space is provided by the caller in si_arr.
+// Currently only 8 sindexes can be created on one combination of set and binid,
+// i.e. number_of_sindex_types * number_of_sindex_data_type (4 * 2).
+int
+as_sindex_arr_lookup_by_set_binid_lockfree(as_namespace * ns, const char *set, int binid, as_sindex ** si_arr)
+{
+	cf_ll * simatch_ll = NULL;
+
+	int sindex_count = 0;
+	if (!as_sindex_binid_has_sindex(ns, binid) ) {
+		return sindex_count;
+	}
+
+	as_sindex__simatch_list_by_set_binid(ns, set, binid, &simatch_ll);
+	if (!simatch_ll) {
+		return sindex_count;
+	}
+
+	cf_ll_element * ele = cf_ll_get_head(simatch_ll);
+	sindex_set_binid_hash_ele * si_ele = NULL;
+	int simatch = -1;
+	as_sindex * si = NULL;
+	while (ele) {
+		si_ele = (sindex_set_binid_hash_ele *) ele;
+		simatch = si_ele->simatch;
+
+		if (simatch == -1) {
+			cf_warning(AS_SINDEX, "A matching simatch came out to be -1.");
+			ele = ele->next;
+			continue;
+		}
+
+		si = &ns->sindex[simatch];
+		// Reserve only active sindexes.
+		// Do not break this rule.
+		if (!as_sindex_isactive(si)) {
+			ele = ele->next;
+			continue;
+		}
+
+		if (simatch != si->simatch) {
+			cf_warning(AS_SINDEX, "Inconsistent simatch reference between simatch stored in "
+					"si and simatch stored in hash");
+			ele = ele->next;
+			continue;
+		}
+
+		AS_SINDEX_RESERVE(si);
+
+		si_arr[sindex_count++] = si;
+		ele = ele->next;
+	}
+	return sindex_count;
+}
+
+// Populates the si_arr with all the sindexes which match the setname.
+// Each sindex is reserved as well. Enough space is provided by the caller in si_arr.
+int
+as_sindex_arr_lookup_by_setname_lockfree(as_namespace * ns, const char *setname, as_sindex ** si_arr)
+{
+	int sindex_count = 0;
+	as_sindex * si = NULL;
+
+	for (int i=0; i<AS_SINDEX_MAX; i++) {
+		if (sindex_count >= ns->sindex_cnt) {
+			break;
+		}
+		si = &ns->sindex[i];
+		// Reserve only active sindexes.
+		// Do not break this rule.
+		if (!as_sindex_isactive(si)) {
+			continue;
+		}
+
+		if (!as_sindex__setname_match(si->imd, setname)) {
+			continue;
+		}
+
+		AS_SINDEX_RESERVE(si);
+
+		si_arr[sindex_count++] = si;
+	}
+
+	return sindex_count;
+}
+
+int
+as_sindex__simatch_by_iname(as_namespace *ns, char *idx_name)
+{
+	if (strlen(idx_name) >= AS_ID_INAME_SZ) {
+		return -1;
+	}
+
+	char iname[AS_ID_INAME_SZ] = { 0 }; // must pad key
+	strcpy(iname, idx_name);
+
+	int simatch = -1;
+	int rv = cf_shash_get(ns->sindex_iname_hash, (void *)iname, (void *)&simatch);
+	cf_detail(AS_SINDEX, "Found iname simatch %s->%d rv=%d", iname, simatch, rv);
+
+	if (rv) {
+		return -1;
+	}
+	return simatch;
+}
+
+/*
+ * Single consolidated interface for lookup. iname takes precedence over binid,
+ * i.e. if both are specified the search is done by iname.
+ */
+#define AS_SINDEX_LOOKUP_FLAG_SETCHECK  0x01
+#define AS_SINDEX_LOOKUP_FLAG_ISACTIVE  0x02
+#define AS_SINDEX_LOOKUP_FLAG_NORESERVE 0x04
+as_sindex *
+as_sindex__lookup_lockfree(as_namespace *ns, char *iname, char *set, int binid,
+		as_sindex_ktype type, as_sindex_type itype, char * path, char flag)
+{
+	// If iname is not null, search the iname hash and store the simatch.
+	// Else
+	//     Check the possible existence of a sindex over the bin in the bit array.
+	//     If there is no possibility, return NULL.
+	//     Search the set_binid hash using setname, binid, type, itype and path.
+	//     If found, store the simatch; if not found, return NULL.
+	// Get the sindex corresponding to the simatch.
+	// Apply the flags passed by the caller.
+ // Validate the simatch + + int simatch = -1; + as_sindex *si = NULL; + // If iname is not null then search in iname hash and store the simatch + if (iname) { + simatch = as_sindex__simatch_by_iname(ns, iname); + } + // Else then + // Check the possible existence of sindex over bin in the bit array + else { + if (!as_sindex_binid_has_sindex(ns, binid) ) { + // If no possibility return NULL + goto END; + } + // Search in the set_binid hash using setname, binid, itype and binid + // If found store simatch + simatch = as_sindex__simatch_by_set_binid(ns, set, binid, type, itype, path); + } + // If not found return NULL + // Get the sindex corresponding to the simatch. + if (simatch != -1) { + si = &ns->sindex[simatch]; + // Apply the flags applied by caller. + if ((flag & AS_SINDEX_LOOKUP_FLAG_ISACTIVE) + && !as_sindex_isactive(si)) { + si = NULL; + goto END; + } + // Validate the simatch + if (simatch != si->simatch) { + cf_warning(AS_SINDEX, "Inconsistent simatch reference between simatch stored in" + "si and simatch stored in hash"); + } + if (!(flag & AS_SINDEX_LOOKUP_FLAG_NORESERVE)) + AS_SINDEX_RESERVE(si); + } +END: + return si; +} + +as_sindex * +as_sindex__lookup(as_namespace *ns, char *iname, char *set, int binid, as_sindex_ktype type, + as_sindex_type itype, char * path, char flag) +{ + SINDEX_GRLOCK(); + as_sindex *si = as_sindex__lookup_lockfree(ns, iname, set, binid, type, itype, path, flag); + SINDEX_GRUNLOCK(); + return si; +} + +as_sindex * +as_sindex_lookup_by_iname(as_namespace *ns, char * iname, char flag) +{ + return as_sindex__lookup(ns, iname, NULL, -1, 0, 0, NULL, flag); +} + +as_sindex * +as_sindex_lookup_by_defns(as_namespace *ns, char *set, int binid, as_sindex_ktype type, as_sindex_type itype, char * path, char flag) +{ + return as_sindex__lookup(ns, NULL, set, binid, type, itype, path, flag); +} + +as_sindex * +as_sindex_lookup_by_iname_lockfree(as_namespace *ns, char * iname, char flag) +{ + return as_sindex__lookup_lockfree(ns, iname, NULL, -1, 0, 0, NULL, flag); +} + +as_sindex * +as_sindex_lookup_by_defns_lockfree(as_namespace *ns, char *set, int binid, as_sindex_ktype type, as_sindex_type itype, char * path, char flag) +{ + return as_sindex__lookup_lockfree(ns, NULL, set, binid, type, itype, path, flag); +} + + +// END LOOKUP +// ************************************************************************************************ +// ************************************************************************************************ +// STAT/CONFIG/HISTOGRAM +void +as_sindex__stats_clear(as_sindex *si) { + as_sindex_stat *s = &si->stats; + + s->n_objects = 0; + + s->n_reads = 0; + s->read_errs = 0; + + s->n_writes = 0; + s->write_errs = 0; + + s->n_deletes = 0; + s->delete_errs = 0; + + s->loadtime = 0; + s->recs_pending = 0; + + s->n_defrag_records = 0; + s->defrag_time = 0; + + // Aggregation stat + s->n_aggregation = 0; + s->agg_response_size = 0; + s->agg_num_records = 0; + s->agg_errs = 0; + // Lookup stats + s->n_lookup = 0; + s->lookup_response_size = 0; + s->lookup_num_records = 0; + s->lookup_errs = 0; + + si->enable_histogram = false; + if (s->_write_hist) { + histogram_clear(s->_write_hist); + } + if (s->_si_prep_hist) { + histogram_clear(s->_si_prep_hist); + } + if (s->_delete_hist) { + histogram_clear(s->_delete_hist); + } + if (s->_query_hist) { + histogram_clear(s->_query_hist); + } + if (s->_query_batch_io) { + histogram_clear(s->_query_batch_io); + } + if (s->_query_batch_lookup) { + histogram_clear(s->_query_batch_lookup); + } + if 
(s->_query_rcnt_hist) { + histogram_clear(s->_query_rcnt_hist); + } + if (s->_query_diff_hist) { + histogram_clear(s->_query_diff_hist); + } +} + +void +as_sindex_gconfig_default(as_config *c) +{ + c->sindex_builder_threads = 4; + c->sindex_gc_max_rate = 50000; // 50,000 per second + c->sindex_gc_period = 10; // every 10 seconds +} + +void +as_sindex__config_default(as_sindex *si) +{ + si->config.flag = AS_SINDEX_FLAG_WACTIVE; +} + +void +as_sindex__setup_histogram(as_sindex *si) +{ + char hist_name[AS_ID_INAME_SZ + 64]; + + sprintf(hist_name, "%s_write_us", si->imd->iname); + si->stats._write_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "%s_si_prep_us", si->imd->iname); + si->stats._si_prep_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "%s_delete_us", si->imd->iname); + si->stats._delete_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "%s_query", si->imd->iname); + si->stats._query_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "%s_query_batch_lookup_us", si->imd->iname); + si->stats._query_batch_lookup = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "%s_query_batch_io_us", si->imd->iname); + si->stats._query_batch_io = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "%s_query_row_count", si->imd->iname); + si->stats._query_rcnt_hist = histogram_create(hist_name, HIST_COUNT); + + sprintf(hist_name, "%s_query_diff_count", si->imd->iname); + si->stats._query_diff_hist = histogram_create(hist_name, HIST_COUNT); +} + +int +as_sindex__destroy_histogram(as_sindex *si) +{ + if (si->stats._write_hist) cf_free(si->stats._write_hist); + if (si->stats._si_prep_hist) cf_free(si->stats._si_prep_hist); + if (si->stats._delete_hist) cf_free(si->stats._delete_hist); + if (si->stats._query_hist) cf_free(si->stats._query_hist); + if (si->stats._query_batch_lookup) cf_free(si->stats._query_batch_lookup); + if (si->stats._query_batch_io) cf_free(si->stats._query_batch_io); + if (si->stats._query_rcnt_hist) cf_free(si->stats._query_rcnt_hist); + if (si->stats._query_diff_hist) cf_free(si->stats._query_diff_hist); + return 0; +} + +int +as_sindex_stats_str(as_namespace *ns, char * iname, cf_dyn_buf *db) +{ + as_sindex *si = as_sindex_lookup_by_iname(ns, iname, AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + + if (!si) { + cf_warning(AS_SINDEX, "SINDEX STAT : sindex %s not found", iname); + return AS_SINDEX_ERR_NOTFOUND; + } + + // A good thing to cache the stats first. + uint64_t ns_objects = ns->n_objects; + uint64_t si_objects = cf_atomic64_get(si->stats.n_objects); + uint64_t pending = cf_atomic64_get(si->stats.recs_pending); + + uint64_t n_keys = ai_btree_get_numkeys(si->imd); + uint64_t i_size = ai_btree_get_isize(si->imd); + uint64_t n_size = ai_btree_get_nsize(si->imd); + + info_append_uint64(db, "keys", n_keys); + info_append_uint64(db, "entries", si_objects); + info_append_uint64(db, "ibtr_memory_used", i_size); + info_append_uint64(db, "nbtr_memory_used", n_size); + info_append_uint64(db, "si_accounted_memory", i_size + n_size); + if (si->flag & AS_SINDEX_FLAG_RACTIVE) { + info_append_string(db, "load_pct", "100"); + } else { + if (pending > ns_objects) { + info_append_uint64(db, "load_pct", 100); + } else { + info_append_uint64(db, "load_pct", (ns_objects == 0) ? 
100 : 100 - ((100 * pending) / ns_objects)); + } + } + + info_append_uint64(db, "loadtime", cf_atomic64_get(si->stats.loadtime)); + // writes + info_append_uint64(db, "write_success", cf_atomic64_get(si->stats.n_writes) - cf_atomic64_get(si->stats.write_errs)); + info_append_uint64(db, "write_error", cf_atomic64_get(si->stats.write_errs)); + // delete + info_append_uint64(db, "delete_success", cf_atomic64_get(si->stats.n_deletes) - cf_atomic64_get(si->stats.delete_errs)); + info_append_uint64(db, "delete_error", cf_atomic64_get(si->stats.delete_errs)); + // defrag + info_append_uint64(db, "stat_gc_recs", cf_atomic64_get(si->stats.n_defrag_records)); + info_append_uint64(db, "stat_gc_time", cf_atomic64_get(si->stats.defrag_time)); + + // Cache values + uint64_t agg = cf_atomic64_get(si->stats.n_aggregation); + uint64_t agg_rec = cf_atomic64_get(si->stats.agg_num_records); + uint64_t agg_size = cf_atomic64_get(si->stats.agg_response_size); + uint64_t lkup = cf_atomic64_get(si->stats.n_lookup); + uint64_t lkup_rec = cf_atomic64_get(si->stats.lookup_num_records); + uint64_t lkup_size = cf_atomic64_get(si->stats.lookup_response_size); + uint64_t query = agg + lkup; + uint64_t query_rec = agg_rec + lkup_rec; + uint64_t query_size = agg_size + lkup_size; + + // Query + info_append_uint64(db, "query_reqs", query); + info_append_uint64(db, "query_avg_rec_count", query ? query_rec / query : 0); + info_append_uint64(db, "query_avg_record_size", query_rec ? query_size / query_rec : 0); + // Aggregation + info_append_uint64(db, "query_agg", agg); + info_append_uint64(db, "query_agg_avg_rec_count", agg ? agg_rec / agg : 0); + info_append_uint64(db, "query_agg_avg_record_size", agg_rec ? agg_size / agg_rec : 0); + //Lookup + info_append_uint64(db, "query_lookups", lkup); + info_append_uint64(db, "query_lookup_avg_rec_count", lkup ? lkup_rec / lkup : 0); + info_append_uint64(db, "query_lookup_avg_record_size", lkup_rec ? 
lkup_size / lkup_rec : 0); + + info_append_bool(db, "histogram", si->enable_histogram); + + cf_dyn_buf_chomp(db); + + AS_SINDEX_RELEASE(si); + // Release reference + return AS_SINDEX_OK; +} + +int +as_sindex_histogram_dumpall(as_namespace *ns) +{ + if (!ns) + return AS_SINDEX_ERR_PARAM; + SINDEX_GRLOCK(); + + for (int i = 0; i < ns->sindex_cnt; i++) { + if (ns->sindex[i].state != AS_SINDEX_ACTIVE) continue; + if (!ns->sindex[i].enable_histogram) continue; + as_sindex *si = &ns->sindex[i]; + if (si->stats._write_hist) + histogram_dump(si->stats._write_hist); + if (si->stats._si_prep_hist) + histogram_dump(si->stats._si_prep_hist); + if (si->stats._delete_hist) + histogram_dump(si->stats._delete_hist); + if (si->stats._query_hist) + histogram_dump(si->stats._query_hist); + if (si->stats._query_batch_lookup) + histogram_dump(si->stats._query_batch_lookup); + if (si->stats._query_batch_io) + histogram_dump(si->stats._query_batch_io); + if (si->stats._query_rcnt_hist) + histogram_dump(si->stats._query_rcnt_hist); + if (si->stats._query_diff_hist) + histogram_dump(si->stats._query_diff_hist); + } + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; +} + +int +as_sindex_histogram_enable(as_namespace *ns, char * iname, bool enable) +{ + as_sindex *si = as_sindex_lookup_by_iname(ns, iname, AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + if (!si) { + cf_warning(AS_SINDEX, "SINDEX HISTOGRAM : sindex %s not found", iname); + return AS_SINDEX_ERR_NOTFOUND; + } + + si->enable_histogram = enable; + AS_SINDEX_RELEASE(si); + return AS_SINDEX_OK; +} + +/* + * Client API to list all the indexes in a namespace, returns list of imd with + * index information, Caller should free it up + */ +int +as_sindex_list_str(as_namespace *ns, cf_dyn_buf *db) +{ + SINDEX_GRLOCK(); + for (int i = 0; i < AS_SINDEX_MAX; i++) { + if (&(ns->sindex[i]) && (ns->sindex[i].imd)) { + as_sindex si = ns->sindex[i]; + + cf_dyn_buf_append_string(db, "ns="); + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_string(db, ":set="); + cf_dyn_buf_append_string(db, (si.imd->set) ? si.imd->set : "NULL"); + cf_dyn_buf_append_string(db, ":indexname="); + cf_dyn_buf_append_string(db, si.imd->iname); + cf_dyn_buf_append_string(db, ":bin="); + cf_dyn_buf_append_buf(db, (uint8_t *)si.imd->bname, strlen(si.imd->bname)); + cf_dyn_buf_append_string(db, ":type="); + cf_dyn_buf_append_string(db, as_sindex_ktype_str(si.imd->sktype)); + cf_dyn_buf_append_string(db, ":indextype="); + cf_dyn_buf_append_string(db, as_sindex_type_defs[si.imd->itype]); + + cf_dyn_buf_append_string(db, ":path="); + cf_dyn_buf_append_string(db, si.imd->path_str); + + // Index State + if (si.state == AS_SINDEX_ACTIVE) { + if (si.flag & AS_SINDEX_FLAG_RACTIVE) { + cf_dyn_buf_append_string(db, ":state=RW;"); + } + else if (si.flag & AS_SINDEX_FLAG_WACTIVE) { + cf_dyn_buf_append_string(db, ":state=WO;"); + } + else { + // should never come here. + cf_dyn_buf_append_string(db, ":state=A;"); + } + } + else if (si.state == AS_SINDEX_INACTIVE) { + cf_dyn_buf_append_string(db, ":state=I;"); + } + else { + cf_dyn_buf_append_string(db, ":state=D;"); + } + } + } + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; +} +// END - STAT/CONFIG/HISTOGRAM +// ************************************************************************************************ +// ************************************************************************************************ +// SI REFERENCE +// Reserve the sindex so it does not get deleted under the hood +int +as_sindex_reserve(as_sindex *si, char *fname, int lineno) +{ + if (! 
as_sindex_isactive(si)) {
+		cf_warning(AS_SINDEX, "Trying to reserve sindex %s in a state other than active. State is %d",
+				si->imd->iname, si->state);
+	}
+
+	if (si->imd) {
+		cf_rc_reserve(si->imd);
+	}
+
+	return AS_SINDEX_OK;
+}
+
+/*
+ * Release - queues up the request for the destroy, to be cleaned up by the
+ * Aerospike Index thread. Not done inline because the main write thread could
+ * release the last reference.
+ */
+void
+as_sindex_release(as_sindex *si, char *fname, int lineno)
+{
+	if (! si) {
+		return;
+	}
+
+	uint64_t val = cf_rc_release(si->imd);
+
+	if (val == 0) {
+		si->flag |= AS_SINDEX_FLAG_DESTROY_CLEANUP;
+		cf_queue_push(g_sindex_destroy_q, &si);
+	}
+}
+
+as_sindex_status
+as_sindex_populator_reserve_all(as_namespace * ns)
+{
+	if (!ns) {
+		cf_warning(AS_SINDEX, "namespace found NULL");
+		return AS_SINDEX_ERR;
+	}
+
+	int count = 0;
+	int valid = 0;
+	SINDEX_GRLOCK();
+	while (valid < ns->sindex_cnt && count < AS_SINDEX_MAX) {
+		as_sindex * si = &ns->sindex[count];
+		if (as_sindex_isactive(si)) {
+			AS_SINDEX_RESERVE(si);
+			valid++;
+		}
+		count++;
+	}
+	SINDEX_GRUNLOCK();
+	return AS_SINDEX_OK;
+}
+
+as_sindex_status
+as_sindex_populator_release_all(as_namespace * ns)
+{
+	if (!ns) {
+		cf_warning(AS_SINDEX, "namespace found NULL");
+		return AS_SINDEX_ERR;
+	}
+
+	int count = 0;
+	int valid = 0;
+	SINDEX_GRLOCK();
+	while (valid < ns->sindex_cnt && count < AS_SINDEX_MAX) {
+		as_sindex * si = &ns->sindex[count];
+		if (as_sindex_isactive(si)) {
+			AS_SINDEX_RELEASE(si);
+			valid++;
+		}
+		count++;
+	}
+	SINDEX_GRUNLOCK();
+	return AS_SINDEX_OK;
+}
+
+// Complementary function of as_sindex_arr_lookup_by_set_binid.
+void
+as_sindex_release_arr(as_sindex *si_arr[], int si_arr_sz)
+{
+	for (int i=0; i<si_arr_sz; i++) {
+		if (si_arr[i]) {
+			AS_SINDEX_RELEASE(si_arr[i]);
+		}
+	}
+}
+
+// END - SI REFERENCE
+// ************************************************************************************************
+// ************************************************************************************************
+// SINDEX CREATE
+void
+as_sindex__create_pmeta(as_sindex *si, int simatch, int nptr)
+{
+	si->imd->pimd = cf_malloc(nptr * sizeof(as_sindex_pmetadata));
+	memset(si->imd->pimd, 0, nptr * sizeof(as_sindex_pmetadata));
+
+	pthread_rwlockattr_t rwattr;
+	if (pthread_rwlockattr_init(&rwattr))
+		cf_crash(AS_AS,
+				"pthread_rwlockattr_init: %s", cf_strerror(errno));
+	if (pthread_rwlockattr_setkind_np(&rwattr,
+				PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP))
+		cf_crash(AS_TSVC,
+				"pthread_rwlockattr_setkind_np: %s", cf_strerror(errno));
+
+	for (int i = 0; i < nptr; i++) {
+		as_sindex_pmetadata *pimd = &si->imd->pimd[i];
+		if (pthread_rwlock_init(&pimd->slock, &rwattr)) {
+			cf_crash(AS_SINDEX,
+					"Could not create secondary index dml mutex");
+		}
+	}
+}
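+// The writer-preferring attribute above is a glibc extension; without it,
+// glibc's default reader preference can starve writers of pimd->slock behind
+// a steady stream of query readers. A minimal standalone sketch of the same
+// setup (variable names here are illustrative only):
+//
+//   pthread_rwlockattr_t attr;
+//   pthread_rwlockattr_init(&attr);
+//   pthread_rwlockattr_setkind_np(&attr,
+//           PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+//   pthread_rwlock_t lk;
+//   pthread_rwlock_init(&lk, &attr);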
+/*
+ * Description :
+ *     Checks the parameters passed to the as_sindex_create function.
+ *
+ * Parameters:
+ *     namespace, index metadata
+ *
+ * Returns:
+ *     AS_SINDEX_OK            - for valid parameters.
+ *     Appropriate error codes - otherwise.
+ *
+ * Synchronization:
+ *     This function does not explicitly acquire any lock.
+ * TODO : Check if exists_by_defn can be used instead of this.
+ */
+int
+as_sindex_create_check_params(as_namespace* ns, as_sindex_metadata* imd)
+{
+	SINDEX_GRLOCK();
+
+	int ret = AS_SINDEX_OK;
+	if (ns->sindex_cnt >= AS_SINDEX_MAX) {
+		ret = AS_SINDEX_ERR_MAXCOUNT;
+		goto END;
+	}
+
+	int simatch = as_sindex__simatch_by_iname(ns, imd->iname);
+	if (simatch != -1) {
+		ret = AS_SINDEX_ERR_FOUND;
+	} else {
+		int16_t binid = as_bin_get_id(ns, imd->bname);
+		if (binid != -1)
+		{
+			int simatch = as_sindex__simatch_by_set_binid(ns, imd->set, binid, imd->sktype, imd->itype, imd->path_str);
+			if (simatch != -1) {
+				ret = AS_SINDEX_ERR_FOUND;
+				goto END;
+			}
+		}
+	}
+
+END:
+	SINDEX_GRUNLOCK();
+	return ret;
+}
+
+static int
+sindex_create_lockless(as_namespace *ns, as_sindex_metadata *imd)
+{
+	int chosen_id = AS_SINDEX_MAX;
+	as_sindex *si = NULL;
+	for (int i = 0; i < AS_SINDEX_MAX; i++) {
+		if (ns->sindex[i].state == AS_SINDEX_INACTIVE) {
+			si = &ns->sindex[i];
+			chosen_id = i;
+			break;
+		}
+	}
+
+	if (! si || (chosen_id == AS_SINDEX_MAX)) {
+		cf_warning(AS_SINDEX, "SINDEX CREATE : Maxed out secondary index limit, no more indexes allowed");
+		return AS_SINDEX_ERR;
+	}
+
+	as_set *p_set = NULL;
+
+	if (imd->set) {
+		if (as_namespace_get_create_set_w_len(ns, imd->set, strlen(imd->set), &p_set, NULL) != 0) {
+			cf_warning(AS_SINDEX, "SINDEX CREATE : failed get-create set %s", imd->set);
+			return AS_SINDEX_ERR;
+		}
+	}
+
+	imd->nprts = ns->sindex_num_partitions;
+	int id = chosen_id;
+	si = &ns->sindex[id];
+	as_sindex_metadata *qimd;
+
+	if (as_sindex__populate_binid(ns, imd)) {
+		cf_warning(AS_SINDEX, "SINDEX CREATE : Populating bin id failed");
+		return AS_SINDEX_ERR_PARAM;
+	}
+
+	as_sindex_status rv = as_sindex__put_in_set_binid_hash(ns, imd->set, imd->binid, id);
+	if (rv != AS_SINDEX_OK) {
+		cf_warning(AS_SINDEX, "SINDEX CREATE : Put in set_binid hash fails with error %d", rv);
+		return AS_SINDEX_ERR;
+	}
+
+	cf_detail(AS_SINDEX, "Put binid simatch %d->%d", imd->binid, chosen_id);
+
+	char iname[AS_ID_INAME_SZ];
+	memset(iname, 0, AS_ID_INAME_SZ);
+	snprintf(iname, strlen(imd->iname)+1, "%s", imd->iname);
+	cf_shash_put(ns->sindex_iname_hash, (void *)iname, (void *)&chosen_id);
+	cf_detail(AS_SINDEX, "Put iname simatch %s:%zu->%d", iname, strlen(imd->iname), chosen_id);
+
+	// Init SI
+	si->ns = ns;
+	si->simatch = chosen_id;
+	si->state = AS_SINDEX_ACTIVE;
+	si->flag = AS_SINDEX_FLAG_WACTIVE;
+	si->recreate_imd = NULL;
+	as_sindex__config_default(si);
+
+	// Init IMD
+	as_sindex__dup_meta(imd, &qimd);
+	si->imd = qimd;
+	qimd->si = si;
+
+	// Init PIMD
+	as_sindex__create_pmeta(si, id, imd->nprts);
+	ai_btree_create(si->imd);
+	as_sindex_set_binid_has_sindex(ns, si->imd->binid);
+
+	// Update counters
+	as_sindex__setup_histogram(si);
+	as_sindex__stats_clear(si);
+	ns->sindex_cnt++;
+	if (p_set) {
+		p_set->n_sindexes++;
+	} else {
+		ns->n_setless_sindexes++;
+	}
+	cf_atomic64_add(&ns->n_bytes_sindex_memory, ai_btree_get_isize(si->imd));
+
+	// Queue this for the secondary index builder if create is done after boot.
+	// At boot time a single builder request is queued for the entire namespace.
+	if (g_sindex_boot_done) {
+		// Reserve for the ref in the queue.
+		AS_SINDEX_RESERVE(si);
+		cf_queue_push(g_sindex_populate_q, &si);
+	}
+
+	return AS_SINDEX_OK;
+}
+
+int
+as_sindex_create(as_namespace *ns, as_sindex_metadata *imd)
+{
+	// Ideally there should be one lock per namespace, but because the
+	// Aerospike Index metadata is a single global structure we need an
+	// overriding lock for that. 
NB if it becomes per namespace have a file lock + SINDEX_GWLOCK(); + if (as_sindex_lookup_by_iname_lockfree(ns, imd->iname, AS_SINDEX_LOOKUP_FLAG_NORESERVE)) { + cf_detail(AS_SINDEX,"Index %s already exists", imd->iname); + SINDEX_GWUNLOCK(); + return AS_SINDEX_ERR_FOUND; + } + + int rv = sindex_create_lockless(ns, imd); + SINDEX_GWUNLOCK(); + return rv; +} + +void +as_sindex_smd_create(as_namespace *ns, as_sindex_metadata *imd) +{ + SINDEX_GWLOCK(); + + // FIXME - wrong place for check + // If one node cannot have > AS_SINDEX_MAX then neither + // can majority in cluster. + // if (ns->sindex_cnt >= AS_SINDEX_MAX) { + // cf_warning(AS_SINDEX, "Failed to SMD create index '%s' on namespace '%s', maximum allowed number of indexes %d reached !!", + // imd->ns_name, imd->iname, ns->sindex_cnt); + // SINDEX_GWUNLOCK(); + // return; + // } + + bool found_exact_defn = false; // ns:iname ns:binid / set / sktype / itype / path_str + bool found_defn = false; // ns:binid / set / sktype / itype / path_str + bool found_iname = false; // ns:iname + + int simatch_defn = -1; + int16_t binid = as_bin_get_id(ns, imd->bname); + if (binid != -1) { + simatch_defn = as_sindex__simatch_by_set_binid(ns, imd->set, binid, + imd->sktype, imd->itype, imd->path_str); + if (simatch_defn != -1) { + as_sindex *si = &ns->sindex[simatch_defn]; + if (! strcmp(si->imd->iname, imd->iname)) { + found_exact_defn = true; + } else { + found_defn = true; + } + } + } + + int simatch_iname = as_sindex__simatch_by_iname(ns, imd->iname); + if (simatch_iname != -1) { + found_iname = true; + } + + if (found_exact_defn) { + as_sindex *si = &ns->sindex[simatch_defn]; + if (si->state == AS_SINDEX_ACTIVE) { + SINDEX_GWUNLOCK(); + return; + } + } + + if (found_defn) { + as_sindex *si = &ns->sindex[simatch_defn]; + if (si->state == AS_SINDEX_ACTIVE) { + si->state = AS_SINDEX_DESTROY; + as_sindex_reset_binid_has_sindex(ns, si->imd->binid); + AS_SINDEX_RELEASE(si); + } + } + + if (found_iname) { + as_sindex *si = &ns->sindex[simatch_iname]; + if (si->state == AS_SINDEX_ACTIVE) { + si->state = AS_SINDEX_DESTROY; + as_sindex_reset_binid_has_sindex(ns, si->imd->binid); + AS_SINDEX_RELEASE(si); + } + } + + // If found set setop; Use si found with same definition to set op. + if (found_defn || found_exact_defn || found_iname) { + if (simatch_defn != -1) { + as_sindex *si = &ns->sindex[simatch_defn]; + as_sindex__dup_meta(imd, &si->recreate_imd); + SINDEX_GWUNLOCK(); + return; + } + + as_sindex *si = &ns->sindex[simatch_iname]; + as_sindex__dup_meta(imd, &si->recreate_imd); + SINDEX_GWUNLOCK(); + return; + } + + // Not found. + sindex_create_lockless(ns, imd); + SINDEX_GWUNLOCK(); + return; +} + +/* + * Description : When a index has to be dropped and recreated during cluster state change + * this function is called. + * Parameters : imd, which is constructed from the final index defn given by paxos principal. + * + * Returns : 0 on all cases. Check log for errors. 
+ *
+ * Synchronization : Does not explicitly take any locks.
+ */
+int
+as_sindex_recreate(as_sindex_metadata* imd)
+{
+	as_namespace *ns = as_namespace_get_byname(imd->ns_name);
+	int ret = as_sindex_create(ns, imd);
+	if (ret != 0) {
+		cf_warning(AS_SINDEX, "Index %s creation failed at the accept callback", imd->iname);
+	}
+	return 0;
+}
+// END - SINDEX CREATE
+// ************************************************************************************************
+// ************************************************************************************************
+// SINDEX DELETE
+
+void
+as_sindex_destroy_pmetadata(as_sindex *si)
+{
+	for (int i = 0; i < si->imd->nprts; i++) {
+		as_sindex_pmetadata *pimd = &si->imd->pimd[i];
+		pthread_rwlock_destroy(&pimd->slock);
+	}
+	as_sindex__destroy_histogram(si);
+	cf_free(si->imd->pimd);
+	si->imd->pimd = NULL;
+}
+
+// TODO : Would not harm if it reserved and released the sindex.
+// Keep it simple.
+bool
+as_sindex_delete_checker(as_namespace *ns, as_sindex_metadata *imd)
+{
+	if (as_sindex_lookup_by_iname_lockfree(ns, imd->iname,
+			AS_SINDEX_LOOKUP_FLAG_NORESERVE | AS_SINDEX_LOOKUP_FLAG_ISACTIVE)) {
+		return true;
+	} else {
+		return false;
+	}
+}
+
+/*
+ * Client API to destroy a secondary index - marks it for destroy.
+ * Deletes via smd or info-command user-delete requests.
+ */
+int
+as_sindex_destroy(as_namespace *ns, as_sindex_metadata *imd)
+{
+	SINDEX_GWLOCK();
+	as_sindex *si = NULL;
+
+	if (imd->iname) {
+		si = as_sindex_lookup_by_iname_lockfree(ns, imd->iname,
+				AS_SINDEX_LOOKUP_FLAG_NORESERVE | AS_SINDEX_LOOKUP_FLAG_ISACTIVE);
+	}
+	else {
+		int16_t bin_id = as_bin_get_id(ns, imd->bname);
+
+		if (bin_id == -1) {
+			SINDEX_GWUNLOCK();
+			return AS_SINDEX_ERR_NOTFOUND;
+		}
+
+		si = as_sindex_lookup_by_defns_lockfree(ns, imd->set, (int)bin_id,
+				imd->sktype, imd->itype, imd->path_str,
+				AS_SINDEX_LOOKUP_FLAG_NORESERVE | AS_SINDEX_LOOKUP_FLAG_ISACTIVE);
+	}
+
+	if (si) {
+		si->state = AS_SINDEX_DESTROY;
+		as_sindex_reset_binid_has_sindex(ns, si->imd->binid);
+		AS_SINDEX_RELEASE(si);
+		SINDEX_GWUNLOCK();
+		return AS_SINDEX_OK;
+	}
+
+	SINDEX_GWUNLOCK();
+	return AS_SINDEX_ERR_NOTFOUND;
+}
+
+// On emptying an index:
+//     reset objects and keys
+//     reset memory used
+//     add the previous number of objects as deletes
+void
+as_sindex_clear_stats_on_empty_index(as_sindex *si)
+{
+	cf_atomic64_add(&si->stats.n_deletes, cf_atomic64_get(si->stats.n_objects));
+	cf_atomic64_set(&si->stats.n_keys, 0);
+	cf_atomic64_set(&si->stats.n_objects, 0);
+}
+
+void
+as_sindex_empty_index(as_sindex_metadata * imd)
+{
+	as_sindex_pmetadata * pimd;
+	cf_atomic64_sub(&imd->si->ns->n_bytes_sindex_memory,
+			ai_btree_get_isize(imd) + ai_btree_get_nsize(imd));
+	for (int i=0; i<imd->nprts; i++) {
+		pimd = &imd->pimd[i];
+		PIMD_WLOCK(&pimd->slock);
+		struct btree * ibtr = pimd->ibtr;
+		ai_btree_reinit_pimd(pimd, imd->sktype);
+		PIMD_WUNLOCK(&pimd->slock);
+		ai_btree_delete_ibtr(ibtr);
+	}
+	cf_atomic64_add(&imd->si->ns->n_bytes_sindex_memory,
+			ai_btree_get_isize(imd));
+	as_sindex_clear_stats_on_empty_index(imd->si);
+}
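+// For illustration (hypothetical numbers): emptying an index that holds 1000
+// objects leaves n_objects at 0 and adds 1000 to n_deletes, so the running
+// totals stay consistent.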
+
+// TODO - formerly used during set deletion - leaving it for now, but if nothing
+// needs it going forward, we'll remove it.
+void
+as_sindex_delete_set(as_namespace * ns, char * set_name)
+{
+	SINDEX_GRLOCK();
+	as_sindex * si_arr[ns->sindex_cnt];
+	int sindex_count = as_sindex_arr_lookup_by_setname_lockfree(ns, set_name, si_arr);
+
+	for (int i=0; i<sindex_count; i++) {
+		cf_info(AS_SINDEX, "Initiating si set delete for index %s in set %s", si_arr[i]->imd->iname, set_name);
+		as_sindex_empty_index(si_arr[i]->imd);
+		cf_info(AS_SINDEX, "Finished si set delete for index %s in set %s", si_arr[i]->imd->iname, set_name);
+	}
+	SINDEX_GRUNLOCK();
+	as_sindex_release_arr(si_arr, sindex_count);
+}
+// END - SINDEX DELETE
+// ************************************************************************************************
+// ************************************************************************************************
+// SINDEX POPULATE
+/*
+ * Client API to mark index population finished - ticks it ready for read.
+ */
+int
+as_sindex_populate_done(as_sindex *si)
+{
+	// Setting the flag is atomic: meta lockless.
+	si->flag |= AS_SINDEX_FLAG_RACTIVE;
+	si->flag &= ~AS_SINDEX_FLAG_POPULATING;
+	return AS_SINDEX_OK;
+}
+/*
+ * Client API to start a namespace scan to populate the secondary indexes. The
+ * scan is performed only if the namespace is warm starting, or if its data is
+ * not in memory and is loaded from storage. For cold start with data in
+ * memory, the indexes are populated upfront.
+ *
+ * This call is only made at boot time.
+ */
+int
+as_sindex_boot_populateall()
+{
+	// Initialize the secondary index builder. The thread pool is initialized
+	// with maximum threads to go full throttle, then down-sized to the
+	// configured number after the startup population job is done.
+	as_sbld_init();
+
+	int ns_cnt = 0;
+
+	// Trigger a namespace scan to populate all secondary indexes, or mark all
+	// secondary indexes for a namespace as populated.
+	for (int i = 0; i < g_config.n_namespaces; i++) {
+		as_namespace *ns = g_config.namespaces[i];
+		if (!ns || (ns->sindex_cnt == 0)) {
+			continue;
+		}
+
+		if (! ns->storage_data_in_memory) {
+			// Data-not-in-memory (cold or warm restart) - have not yet built
+			// sindex, build it now.
+			as_sindex_populator_reserve_all(ns);
+			as_sbld_build_all(ns);
+			cf_info(AS_SINDEX, "Queuing namespace %s for sindex population ", ns->name);
+		} else {
+			// Data-in-memory (cold or cool restart) - already built sindex.
+			as_sindex_boot_populateall_done(ns);
+		}
+		ns_cnt++;
+	}
+	for (int i = 0; i < ns_cnt; i++) {
+		int ret;
+		// Blocking call - wait till an item is popped out of the queue.
+		cf_queue_pop(g_sindex_populateall_done_q, &ret, CF_QUEUE_FOREVER);
+		// TODO: Check for failure - it is generally fatal if it fails.
+	}
+
+	for (int i = 0; i < g_config.n_namespaces; i++) {
+		as_namespace *ns = g_config.namespaces[i];
+		if (!ns || (ns->sindex_cnt == 0)) {
+			continue;
+		}
+
+		if (! ns->storage_data_in_memory) {
+			// Data-not-in-memory - finished sindex building job.
+			as_sindex_populator_release_all(ns);
+		}
+	}
+
+	// Down-size the builder thread pool to the configured value. 
+ as_sbld_resize_thread_pool(g_config.sindex_builder_threads); + + g_sindex_boot_done = true; + + return AS_SINDEX_OK; +} + +/* + * Client API to mark all the indexes in namespace populated and ready for read + */ +int +as_sindex_boot_populateall_done(as_namespace *ns) +{ + SINDEX_GWLOCK(); + int ret = AS_SINDEX_OK; + + for (int i = 0; i < AS_SINDEX_MAX; i++) { + as_sindex *si = &ns->sindex[i]; + if (!as_sindex_isactive(si)) continue; + // This sindex is getting populating by it self scan + if (si->flag & AS_SINDEX_FLAG_POPULATING) continue; + si->flag |= AS_SINDEX_FLAG_RACTIVE; + } + SINDEX_GWUNLOCK(); + cf_queue_push(g_sindex_populateall_done_q, &ret); + cf_info(AS_SINDEX, "Namespace %s sindex population done", ns->name); + return ret; +} + +// END - SINDEX POPULATE +// ************************************************************************************************ +// ************************************************************************************************ +// SINDEX BIN PATH +as_sindex_status +as_sindex_add_mapkey_in_path(as_sindex_metadata * imd, char * path_str, int start, int end) +{ + if (end < start) { + return AS_SINDEX_ERR; + } + + int path_length = imd->path_length; + char int_str[20]; + strncpy(int_str, path_str+start, end-start+1); + int_str[end-start+1] = '\0'; + char * str_part; + imd->path[path_length-1].value.key_int = strtol(int_str, &str_part, 10); + if (str_part == int_str || (*str_part != '\0')) { + imd->path[path_length-1].value.key_str = cf_strndup(int_str, strlen(int_str)+1); + imd->path[path_length-1].mapkey_type = AS_PARTICLE_TYPE_STRING; + } + else { + imd->path[path_length-1].mapkey_type = AS_PARTICLE_TYPE_INTEGER; + } + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_add_listelement_in_path(as_sindex_metadata * imd, char * path_str, int start, int end) +{ + if (end < start) { + return AS_SINDEX_ERR; + } + int path_length = imd->path_length; + char int_str[10]; + strncpy(int_str, path_str+start, end-start+1); + int_str[end-start+1] = '\0'; + char * str_part; + imd->path[path_length-1].value.index = strtol(int_str, &str_part, 10); + if (str_part == int_str || (*str_part != '\0')) { + return AS_SINDEX_ERR; + } + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_parse_subpath(as_sindex_metadata * imd, char * path_str, int start, int end) +{ + int path_len = strlen(path_str); + bool overflow = end >= path_len ? 
true : false; + + if (start == 0 ) { + if (overflow) { + imd->bname = cf_strndup(path_str+start, end-start); + } + else if (path_str[end] == '.') { + imd->bname = cf_strndup(path_str+start, end-start); + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_MAP; + } + else if (path_str[end] == '[') { + imd->bname = cf_strndup(path_str+start, end-start); + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_LIST; + } + else { + return AS_SINDEX_ERR; + } + } + else if (path_str[start] == '.') { + if (overflow) { + if (as_sindex_add_mapkey_in_path(imd, path_str, start+1, end-1) != AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } + } + else if (path_str[end] == '.') { + // take map value + if (as_sindex_add_mapkey_in_path(imd, path_str, start+1, end-1) != AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } + // add type for next node in path + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_MAP; + } + else if (path_str[end] == '[') { + // value + if (as_sindex_add_mapkey_in_path(imd, path_str, start+1, end-1) != AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } + // add type for next node in path + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_LIST; + } + else { + return AS_SINDEX_ERR; + } + } + else if (path_str[start] == '[') { + if (!overflow && path_str[end] == ']') { + //take list value + if (as_sindex_add_listelement_in_path(imd, path_str, start+1, end-1) != AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } + } + else { + return AS_SINDEX_ERR; + } + } + else if (path_str[start] == ']') { + if (end - start != 1) { + return AS_SINDEX_ERR; + } + else if (overflow) { + return AS_SINDEX_OK; + } + if (path_str[end] == '.') { + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_MAP; + } + else if (path_str[end] == '[') { + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_LIST; + } + else { + return AS_SINDEX_ERR; + } + } + else { + return AS_SINDEX_ERR; + } + return AS_SINDEX_OK; +} +/* + * This function parses the path_str and populate array of path structure in + * imd. + * Each element of the path is the way to reach the the next path. + * For e.g + * bin.k1[1][0] + * array of the path structure would be like - + * path[0].type = AS_PARTICLE_TYPE_MAP . path[0].value.key_str = k1 path[0].value.ke + * path[1].type = AS_PARTICLE_TYPE_LIST . path[1].value.index = 1 + * path[2].type = AS_PARTICLE_TYPE_LIST . path[2].value.index = 0 +*/ +as_sindex_status +as_sindex_extract_bin_path(as_sindex_metadata * imd, char * path_str) +{ + int path_len = strlen(path_str); + int start = 0; + int end = 0; + if (path_len > AS_SINDEX_MAX_PATH_LENGTH) { + cf_warning(AS_SINDEX, "Bin path length exceeds the maximum allowed."); + return AS_SINDEX_ERR; + } + // Iterate through the path_str and search for character (., [, ]) + // which leads to sublevels in maps and lists + while (end < path_len) { + if (path_str[end] == '.' 
|| path_str[end] == '[' || path_str[end] == ']') {
+			if (as_sindex_parse_subpath(imd, path_str, start, end) != AS_SINDEX_OK) {
+				return AS_SINDEX_ERR;
+			}
+			start = end;
+			if (imd->path_length >= AS_SINDEX_MAX_DEPTH) {
+				cf_warning(AS_SINDEX, "Bin position depth level exceeds the max depth allowed %d", AS_SINDEX_MAX_DEPTH);
+				return AS_SINDEX_ERR;
+			}
+		}
+		end++;
+	}
+	if (as_sindex_parse_subpath(imd, path_str, start, end) != AS_SINDEX_OK) {
+		return AS_SINDEX_ERR;
+	}
+/*
+// For debugging
+	cf_info(AS_SINDEX, "After parsing : bin name: %s", imd->bname);
+	for (int i=0; i<imd->path_length; i++) {
+		if (imd->path[i].type == AS_PARTICLE_TYPE_MAP ) {
+			if (imd->path[i].key_type == AS_PARTICLE_TYPE_INTEGER) {
+				cf_info(AS_SINDEX, "map key_int %d", imd->path[i].value.key_int);
+			}
+			else if (imd->path[i].key_type == AS_PARTICLE_TYPE_STRING) {
+				cf_info(AS_SINDEX, "map key_str %s", imd->path[i].value.key_str);
+			}
+			else {
+				cf_info(AS_SINDEX, "ERROR: unexpected map key type");
+			}
+		}
+		else {
+			cf_info(AS_SINDEX, "list index %d", imd->path[i].value.index);
+		}
+	}
+*/
+	return AS_SINDEX_OK;
+}
+
+as_sindex_status
+as_sindex_extract_bin_from_path(char * path_str, char *bin)
+{
+	int path_len = strlen(path_str);
+	int end = 0;
+	if (path_len > AS_SINDEX_MAX_PATH_LENGTH) {
+		cf_warning(AS_SINDEX, "Bin path length exceeds the maximum allowed.");
+		return AS_SINDEX_ERR;
+	}
+
+	while (end < path_len && path_str[end] != '.' && path_str[end] != '[' && path_str[end] != ']') {
+		end++;
+	}
+
+	if (end > 0 && end < AS_ID_BIN_SZ) {
+		strncpy(bin, path_str, end);
+		bin[end] = '\0';
+	}
+	else {
+		return AS_SINDEX_ERR;
+	}
+
+	return AS_SINDEX_OK;
+}
+
+as_sindex_status
+as_sindex_destroy_value_path(as_sindex_metadata * imd)
+{
+	for (int i=0; i<imd->path_length; i++) {
+		if (imd->path[i].type == AS_PARTICLE_TYPE_MAP &&
+				imd->path[i].mapkey_type == AS_PARTICLE_TYPE_STRING) {
+			cf_free(imd->path[i].value.key_str);
+		}
+	}
+	return AS_SINDEX_OK;
+}
+
+/*
+ * This function checks the existence of the path stored in the sindex
+ * metadata in a bin.
+ */
+as_val *
+as_sindex_extract_val_from_path(as_sindex_metadata * imd, as_val * v)
+{
+	if (!v) {
+		return NULL;
+	}
+
+	as_val * val = v;
+
+	as_particle_type imd_sktype = as_sindex_pktype(imd);
+	if (imd->path_length == 0) {
+		goto END;
+	}
+	as_sindex_path *path = imd->path;
+	for (int i=0; i<imd->path_length; i++) {
+		switch (val->type) {
+			case AS_STRING:
+			case AS_INTEGER:
+				return NULL;
+			case AS_LIST: {
+				if (path[i].type != AS_PARTICLE_TYPE_LIST) {
+					return NULL;
+				}
+				int index = path[i].value.index;
+				as_arraylist* list = (as_arraylist*) as_list_fromval(val);
+				as_arraylist_iterator it;
+				as_arraylist_iterator_init(&it, list);
+				int j = 0;
+				while (as_arraylist_iterator_has_next(&it) && j <= index) {
+					val = (as_val*) as_arraylist_iterator_next(&it);
+					j++;
+				}
+				if (j-1 != index) {
+					return NULL;
+				}
+				break;
+			}
+			case AS_MAP: {
+				if (path[i].type != AS_PARTICLE_TYPE_MAP) {
+					return NULL;
+				}
+				as_map * map = as_map_fromval(val);
+				as_val * key;
+				if (path[i].mapkey_type == AS_PARTICLE_TYPE_STRING) {
+					key = (as_val *)as_string_new(path[i].value.key_str, false);
+				}
+				else if (path[i].mapkey_type == AS_PARTICLE_TYPE_INTEGER) {
+					key = (as_val *)as_integer_new(path[i].value.key_int);
+				}
+				else {
+					cf_warning(AS_SINDEX, "Possible false data in sindex metadata");
+					return NULL;
+				}
+				val = as_map_get(map, key);
+				if (key) {
+					as_val_destroy(key);
+				}
+				if (!val) {
+					return NULL;
+				}
+				break;
+			}
+			default:
+				return NULL;
+		}
+	}
+
+END:
+	if (imd->itype == 
AS_SINDEX_ITYPE_DEFAULT) { + if (val->type == AS_INTEGER && imd_sktype == AS_PARTICLE_TYPE_INTEGER) { + return val; + } + else if (val->type == AS_STRING && imd_sktype == AS_PARTICLE_TYPE_STRING) { + return val; + } + } + else if (imd->itype == AS_SINDEX_ITYPE_MAPKEYS || imd->itype == AS_SINDEX_ITYPE_MAPVALUES) { + if (val->type == AS_MAP) { + return val; + } + } + else if (imd->itype == AS_SINDEX_ITYPE_LIST) { + if (val->type == AS_LIST) { + return val; + } + } + return NULL; +} +// END - SINDEX BIN PATH +// ************************************************************************************************ +// ************************************************************************************************ +// SINDEX QUERY +/* + * Returns - + * NULL - On failure + * si - On success. + * Notes - + * Reserves the si if found in the srange + * Releases the si if imd is null or bin type is mis matched. + * + */ +as_sindex * +as_sindex_from_range(as_namespace *ns, char *set, as_sindex_range *srange) +{ + cf_debug(AS_SINDEX, "as_sindex_from_range"); + if (ns->single_bin) { + cf_warning(AS_SINDEX, "Secondary index query not allowed on single bin namespace %s", ns->name); + return NULL; + } + as_sindex *si = as_sindex_lookup_by_defns(ns, set, srange->start.id, + as_sindex_sktype_from_pktype(srange->start.type), srange->itype, srange->bin_path, + AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + if (si && si->imd) { + // Do the type check + as_sindex_metadata *imd = si->imd; + if ((imd->binid == srange->start.id) && (srange->start.type != as_sindex_pktype(imd))) { + cf_warning(AS_SINDEX, "Query and Index Bin Type Mismatch: " + "[binid %d : Index Bin type %d : Query Bin Type %d]", + imd->binid, as_sindex_pktype(imd), srange->start.type ); + AS_SINDEX_RELEASE(si); + return NULL; + } + } + return si; +} + +/* + * The way to filter out imd information from the as_msg which is primarily + * query with all the details. For the normal operations the imd is formed out + * of the as_op. + */ +/* + * Returns - + * NULL - On failure. + * as_sindex - On success. + * + * Description - + * Firstly obtains the simatch using ns name and set name. + * Then returns the corresponding slot from sindex array. + * + * TODO + * log messages + */ +as_sindex * +as_sindex_from_msg(as_namespace *ns, as_msg *msgp) +{ + cf_debug(AS_SINDEX, "as_sindex_from_msg"); + as_msg_field *ifp = as_msg_field_get(msgp, AS_MSG_FIELD_TYPE_INDEX_NAME); + + if (!ifp) { + cf_debug(AS_SINDEX, "Index name not found in the query request"); + return NULL; + } + + uint32_t iname_len = as_msg_field_get_value_sz(ifp); + + if (iname_len >= AS_ID_INAME_SZ) { + cf_warning(AS_SINDEX, "index name too long"); + return NULL; + } + + char iname[AS_ID_INAME_SZ]; + + memcpy(iname, ifp->data, iname_len); + iname[iname_len] = 0; + + as_sindex *si = as_sindex_lookup_by_iname(ns, iname, AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + if (!si) { + cf_detail(AS_SINDEX, "Search did not find index "); + } + + return si; +} + + +/* + * Internal Function - as_sindex_range_free + * frees the sindex range + * + * Returns + * AS_SINDEX_OK - In every case + */ +int +as_sindex_range_free(as_sindex_range **range) +{ + cf_debug(AS_SINDEX, "as_sindex_range_free"); + as_sindex_range *sk = (*range); + if (sk->region) { + geo_region_destroy(sk->region); + } + cf_free(sk); + return AS_SINDEX_OK; +} + +/* + * Extract out range information from the as_msg and create the irange structure + * if required allocates the memory. 
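+ * For illustration (hypothetical caller): after a successful
+ * as_sindex_range_from_msg(ns, msgp, srange), the matching cleanup call is
+ * as_sindex_range_free(&range), which also destroys any geo region that
+ * geo_parse() may have allocated.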
+ * NB: It is responsibility of caller to call the cleanup routine to clean the + * range structure up and free up its memory + * + * query range field layout: contains - numranges, binname, start, end + * + * generic field header + * 0 4 size = size of data only + * 4 1 field_type = CL_MSG_FIELD_TYPE_INDEX_RANGE + * + * numranges + * 5 1 numranges (max 255 ranges) + * + * binname + * 6 1 binnamelen b + * 7 b binname + * + * particle (start & end) + * +b 1 particle_type + * +b+1 4 start_particle_size x + * +b+5 x start_particle_data + * +b+5+x 4 end_particle_size y + * +b+5+x+y+4 y end_particle_data + * + * repeat "numranges" times from "binname" + */ + +/* + * Function as_sindex_binlist_from_msg + * + * Returns - + * binlist - On success + * NULL - On failure + * + */ +cf_vector * +as_sindex_binlist_from_msg(as_namespace *ns, as_msg *msgp, int * num_bins) +{ + cf_debug(AS_SINDEX, "as_sindex_binlist_from_msg"); + as_msg_field *bfp = as_msg_field_get(msgp, AS_MSG_FIELD_TYPE_QUERY_BINLIST); + if (!bfp) { + return NULL; + } + const uint8_t *data = bfp->data; + int numbins = *data++; + *num_bins = numbins; + + cf_vector *binlist = cf_vector_create(AS_ID_BIN_SZ, numbins, 0); + + for (int i = 0; i < numbins; i++) { + int binnamesz = *data++; + if (binnamesz <= 0 || binnamesz > AS_ID_BIN_SZ - 1) { + cf_warning(AS_SINDEX, "Size of the bin name in bin list of sindex query is out of bounds. Size %d", binnamesz); + cf_vector_destroy(binlist); + return NULL; + } + char binname[AS_ID_BIN_SZ]; + memcpy(&binname, data, binnamesz); + binname[binnamesz] = 0; + cf_vector_set(binlist, i, (void *)binname); + data += binnamesz; + } + + cf_debug(AS_SINDEX, "Queried Bin List %d ", numbins); + for (int i = 0; i < cf_vector_size(binlist); i++) { + char binname[AS_ID_BIN_SZ]; + cf_vector_get(binlist, i, (void*)&binname); + cf_debug(AS_SINDEX, " String Queried is |%s| \n", binname); + } + + return binlist; +} + +/* + * Returns - + * AS_SINDEX_OK - On success. + * AS_SINDEX_ERR_PARAM - On failure. + * AS_SINDEX_ERR_BIN_NOTFOUND - On failure. + * + * Description - + * Frames a sane as_sindex_range from msg. + * + * We are not supporting multiranges right now. So numrange is always expected to be 1. + */ +int +as_sindex_range_from_msg(as_namespace *ns, as_msg *msgp, as_sindex_range *srange) +{ + cf_debug(AS_SINDEX, "as_sindex_range_from_msg"); + srange->num_binval = 0; + // Ensure region is initialized in case we need to return an error code early. + srange->region = NULL; + + // getting ranges + as_msg_field *itype_fp = as_msg_field_get(msgp, AS_MSG_FIELD_TYPE_INDEX_TYPE); + as_msg_field *rfp = as_msg_field_get(msgp, AS_MSG_FIELD_TYPE_INDEX_RANGE); + if (!rfp) { + cf_warning(AS_SINDEX, "Required Index Range Not Found"); + return AS_SINDEX_ERR_PARAM; + } + const uint8_t *data = rfp->data; + int numrange = *data++; + + if (numrange != 1) { + cf_warning(AS_SINDEX, + "can't handle multiple ranges right now %d", rfp->data[0]); + return AS_SINDEX_ERR_PARAM; + } + // NOTE - to support geospatial queries the srange object is actually a vector + // of MAX_REGION_CELLS elements. Normal queries only use the first element. + // Geospatial queries use multiple elements. 
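+	// For illustration (hypothetical request): a numeric range [5, 9] on bin
+	// "age" arrives as numranges = 1, binnamelen = 3, "age", particle_type =
+	// integer, start size = 8 with the value 5 as a big-endian 64-bit, then
+	// end size = 8 with the value 9.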
+ // + memset(srange, 0, sizeof(as_sindex_range) * MAX_REGION_CELLS); + if (itype_fp) { + srange->itype = *itype_fp->data; + } + else { + srange->itype = AS_SINDEX_ITYPE_DEFAULT; + } + for (int i = 0; i < numrange; i++) { + as_sindex_bin_data *start = &(srange->start); + as_sindex_bin_data *end = &(srange->end); + // Populate Bin id + uint8_t bin_path_len = *data++; + if (bin_path_len >= AS_SINDEX_MAX_PATH_LENGTH) { + cf_warning(AS_SINDEX, "Index position size %d exceeds the max length %d", bin_path_len, AS_SINDEX_MAX_PATH_LENGTH); + return AS_SINDEX_ERR_PARAM; + } + + strncpy(srange->bin_path, (char *)data, bin_path_len); + srange->bin_path[bin_path_len] = '\0'; + + char binname[AS_ID_BIN_SZ]; + if (as_sindex_extract_bin_from_path(srange->bin_path, binname) == AS_SINDEX_OK) { + int16_t id = as_bin_get_id(ns, binname); + if (id != -1) { + start->id = id; + end->id = id; + } else { + return AS_SINDEX_ERR_BIN_NOTFOUND; + } + } + else { + return AS_SINDEX_ERR_PARAM; + } + + data += bin_path_len; + + // Populate type + int type = *data++; + start->type = type; + end->type = start->type; + + // TODO - Refactor these into generic conversion from + // buffer to as_sindex_bin_data functions. Can be used + // by write code path as well. + if ((type == AS_PARTICLE_TYPE_INTEGER)) { + // get start point + uint32_t startl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + if (startl != 8) { + cf_warning(AS_SINDEX, + "Can only handle 8 byte numerics right now %u", startl); + goto Cleanup; + } + start->u.i64 = __cpu_to_be64(*((uint64_t *)data)); + data += sizeof(uint64_t); + + // get end point + uint32_t endl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + if (endl != 8) { + cf_warning(AS_SINDEX, + "can only handle 8 byte numerics right now %u", endl); + goto Cleanup; + } + end->u.i64 = __cpu_to_be64(*((uint64_t *)data)); + data += sizeof(uint64_t); + if (start->u.i64 > end->u.i64) { + cf_warning(AS_SINDEX, + "Invalid range from %ld to %ld", start->u.i64, end->u.i64); + goto Cleanup; + } else { + srange->isrange = start->u.i64 != end->u.i64; + } + cf_debug(AS_SINDEX, "Range is equal %"PRId64", %"PRId64"", + start->u.i64, end->u.i64); + } else if (type == AS_PARTICLE_TYPE_STRING) { + // get start point + uint32_t startl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + char* start_binval = (char *)data; + data += startl; + srange->isrange = false; + + if (startl >= AS_SINDEX_MAX_STRING_KSIZE) { + cf_warning(AS_SINDEX, "Query on bin %s fails. 
Value length %u too long.", binname, startl); + goto Cleanup; + } + uint32_t endl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + char * end_binval = (char *)data; + if (startl != endl && strncmp(start_binval, end_binval, startl)) { + cf_warning(AS_SINDEX, + "Only Equality Query Supported in Strings %s-%s", + start_binval, end_binval); + goto Cleanup; + } + cf_digest_compute(start_binval, startl, &(start->digest)); + cf_debug(AS_SINDEX, "Range is equal %s ,%s", + start_binval, end_binval); + } else if (type == AS_PARTICLE_TYPE_GEOJSON) { + // get start point + uint32_t startl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + char* start_binval = (char *)data; + data += startl; + + if ((startl == 0) || (startl >= AS_SINDEX_MAX_GEOJSON_KSIZE)) { + cf_warning(AS_SINDEX, "Out of bound query key size %u", startl); + goto Cleanup; + } + uint32_t endl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + char * end_binval = (char *)data; + if (startl != endl && strncmp(start_binval, end_binval, startl)) { + cf_warning(AS_SINDEX, + "Only Geospatial Query Supported on GeoJSON %s-%s", + start_binval, end_binval); + goto Cleanup; + } + + srange->cellid = 0; + if (!geo_parse(ns, start_binval, startl, + &srange->cellid, &srange->region)) { + cf_warning(AS_GEO, "failed to parse query GeoJSON"); + goto Cleanup; + } + + if (srange->cellid && srange->region) { + geo_region_destroy(srange->region); + srange->region = NULL; + cf_warning(AS_GEO, "query geo_parse: both point and region"); + goto Cleanup; + } + + if (!srange->cellid && !srange->region) { + cf_warning(AS_GEO, "query geo_parse: neither point nor region"); + goto Cleanup; + } + + if (srange->cellid) { + // REGIONS-CONTAINING-POINT QUERY + + uint64_t center[MAX_REGION_LEVELS]; + int numcenters; + if (!geo_point_centers(ns, srange->cellid, MAX_REGION_LEVELS, + center, &numcenters)) { + cf_warning(AS_GEO, "Query point invalid"); + goto Cleanup; + } + + // Geospatial queries use multiple srange elements. Many + // of the fields are copied from the first cell because + // they were filled in above. + for (int ii = 0; ii < numcenters; ++ii) { + srange[ii].num_binval = 1; + srange[ii].isrange = true; + srange[ii].start.id = srange[0].start.id; + srange[ii].start.type = srange[0].start.type; + srange[ii].start.u.i64 = center[ii]; + srange[ii].end.id = srange[0].end.id; + srange[ii].end.type = srange[0].end.type; + srange[ii].end.u.i64 = center[ii]; + srange[ii].itype = srange[0].itype; + } + } else { + // POINTS-INSIDE-REGION QUERY + + uint64_t cellmin[MAX_REGION_CELLS]; + uint64_t cellmax[MAX_REGION_CELLS]; + int numcells; + if (!geo_region_cover(ns, srange->region, MAX_REGION_CELLS, + NULL, cellmin, cellmax, &numcells)) { + cf_warning(AS_GEO, "Query region invalid."); + goto Cleanup; + } + + cf_atomic64_incr(&ns->geo_region_query_count); + cf_atomic64_add(&ns->geo_region_query_cells, numcells); + + // Geospatial queries use multiple srange elements. Many + // of the fields are copied from the first cell because + // they were filled in above. 
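+ // For example (numbers assumed): a cover that returns numcells == 3 yields + // srange[0..2], each holding the integer cell range [cellmin[ii], cellmax[ii]] + // for the same bin id and itype.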
+ for (int ii = 0; ii < numcells; ++ii) { + srange[ii].num_binval = 1; + srange[ii].isrange = true; + srange[ii].start.id = srange[0].start.id; + srange[ii].start.type = srange[0].start.type; + srange[ii].start.u.i64 = cellmin[ii]; + srange[ii].end.id = srange[0].end.id; + srange[ii].end.type = srange[0].end.type; + srange[ii].end.u.i64 = cellmax[ii]; + srange[ii].itype = srange[0].itype; + } + } + } else { + cf_warning(AS_SINDEX, "Only handle String, Numeric and GeoJSON type"); + goto Cleanup; + } + srange->num_binval = numrange; + } + return AS_SINDEX_OK; + +Cleanup: + return AS_SINDEX_ERR_PARAM; +} + +/* + * Function as_sindex_rangep_from_msg + * + * Arguments + * ns - the namespace on which srange has to be build + * msgp - the msgp from which sent + * srange - it builds this srange + * + * Returns + * AS_SINDEX_OK - On success + * else the return value of as_sindex_range_from_msg + * + * Description + * Allocating space for srange and then calling as_sindex_range_from_msg. + */ +int +as_sindex_rangep_from_msg(as_namespace *ns, as_msg *msgp, as_sindex_range **srange) +{ + cf_debug(AS_SINDEX, "as_sindex_rangep_from_msg"); + + // NOTE - to support geospatial queries we allocate an array of + // MAX_REGION_CELLS length. Nongeospatial queries use only the + // first element. Geospatial queries use one element per region + // cell, up to MAX_REGION_CELLS. + *srange = cf_malloc(sizeof(as_sindex_range) * MAX_REGION_CELLS); + + int ret = as_sindex_range_from_msg(ns, msgp, *srange); + if (AS_SINDEX_OK != ret) { + as_sindex_range_free(srange); + *srange = NULL; + return ret; + } + return AS_SINDEX_OK; +} + +/* + * Returns - + * AS_SINDEX_ERR_PARAM + * o/w return value from ai_btree_query + * + * Notes - + * Client API to do range get from index based on passed in range key, returns + * digest list + * + * Synchronization - + * + */ +int +as_sindex_query(as_sindex *si, as_sindex_range *srange, as_sindex_qctx *qctx) +{ + if (! si || ! srange) { + return AS_SINDEX_ERR_PARAM; + } + + as_sindex_metadata *imd = si->imd; + as_sindex_pmetadata *pimd = &imd->pimd[qctx->pimd_idx]; + + if (! as_sindex_can_query(si)) { + return AS_SINDEX_ERR_NOT_READABLE; + } + + PIMD_RLOCK(&pimd->slock); + int ret = ai_btree_query(imd, srange, qctx); + PIMD_RUNLOCK(&pimd->slock); + + as_sindex__process_ret(si, ret, AS_SINDEX_OP_READ, + 0 /* No histogram for query per call */, __LINE__); + + return ret; +} +// END - SINDEX QUERY +// ************************************************************************************************ +// ************************************************************************************************ +// SBIN UTILITY +void +as_sindex_init_sbin(as_sindex_bin * sbin, as_sindex_op op, as_particle_type type, as_sindex * si) +{ + sbin->si = si; + sbin->to_free = false; + sbin->num_values = 0; + sbin->op = op; + sbin->heap_capacity = 0; + sbin->type = type; + sbin->values = NULL; +} + +int +as_sindex_sbin_free(as_sindex_bin *sbin) +{ + if (sbin->to_free) { + if (sbin->values) { + cf_free(sbin->values); + } + } + return AS_SINDEX_OK; +} + +int +as_sindex_sbin_freeall(as_sindex_bin *sbin, int numbins) +{ + for (int i = 0; i < numbins; i++) { + as_sindex_sbin_free(&sbin[i]); + } + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex__op_by_sbin(as_namespace *ns, const char *set, int numbins, as_sindex_bin *start_sbin, cf_digest * pkey) +{ + // If numbins == 0 return AS_SINDEX_OK + // Iterate through sbins + // Reserve the SI. 
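+ // (Key layout note for the steps below: integer and geo values are keyed as + // 8-byte uint64_t keys, string values as 20-byte cf_digest hashes; the key + // picks the partition via pimd = &imd->pimd[ai_btree_key_hash(imd, skey)].)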
+ // Take the read lock on imd + // Get a value from sbin + // Get the related pimd + // Get the pimd write lock + // If op is DELETE, delete the sbin's values from the sindex + // If op is INSERT, put all the values from the bin into the sindex. + // Release the pimd lock + // Release the imd lock. + // Release the SI. + + as_sindex_status retval = AS_SINDEX_OK; + if (!ns || !start_sbin) { + return AS_SINDEX_ERR; + } + + // If numbins != 1 return AS_SINDEX_OK + if (numbins != 1) { + return AS_SINDEX_OK; + } + + as_sindex * si = NULL; + as_sindex_bin * sbin = NULL; + as_sindex_metadata * imd = NULL; + as_sindex_pmetadata * pimd = NULL; + as_sindex_op op; + // Iterate through sbins + for (int i = 0; i < numbins; i++) { + sbin = &start_sbin[i]; + si = sbin->si; + if (!si) { + cf_warning(AS_SINDEX, "as_sindex_op_by_sbin : si is null in sbin"); + return AS_SINDEX_ERR; + } + imd = si->imd; + op = sbin->op; + // Take the read lock on imd + for (int j = 0; j < sbin->num_values; j++) { + + // Get a value from sbin + void * skey; + switch (sbin->type) { + case AS_PARTICLE_TYPE_INTEGER: + case AS_PARTICLE_TYPE_GEOJSON: + if (j == 0) { + skey = (void *)&(sbin->value.int_val); + } + else { + skey = (void *)((uint64_t *)(sbin->values) + j); + } + break; + case AS_PARTICLE_TYPE_STRING: + if (j == 0) { + skey = (void *)&(sbin->value.str_val); + } + else { + skey = (void *)((cf_digest *)(sbin->values) + j); + } + break; + default: + retval = AS_SINDEX_ERR; + goto Cleanup; + } + // Get the related pimd + pimd = &imd->pimd[ai_btree_key_hash(imd, skey)]; + uint64_t starttime = 0; + if (si->enable_histogram) { + starttime = cf_getns(); + } + + // Get the pimd write lock + PIMD_WLOCK(&pimd->slock); + + // If op is DELETE, delete the value from the sindex + int ret = AS_SINDEX_OK; + if (op == AS_SINDEX_OP_DELETE) { + ret = ai_btree_delete(imd, pimd, skey, pkey); + } + else if (op == AS_SINDEX_OP_INSERT) { + // If op is INSERT, put the value in the sindex. + ret = ai_btree_put(imd, pimd, skey, pkey); + } + + // Release the pimd lock + PIMD_WUNLOCK(&pimd->slock); + as_sindex__process_ret(si, ret, op, starttime, __LINE__); + } + cf_debug(AS_SINDEX, " Secondary Index Op Finish------------- "); + + // Release the imd lock. + // Release the SI. + + } +Cleanup: + return retval; +} +// END - SBIN UTILITY +// ************************************************************************************************ +// ************************************************************************************************ +// ADD TO SBIN + + +as_sindex_status +as_sindex_add_sbin_value_in_heap(as_sindex_bin * sbin, void * val) +{ + // Get the size of the data we are going to store + // If to_free == false, this is the first time we are storing a value + // for this sbin on the heap. + // Check whether the existing data needs to be copied from stack_buf + // init_storage(num_values) + // If num_values != 0 + // Copy the existing data from stack to heap + // reduce the used stack_buf size + // to_free = true; + // Else + // If (num_values == heap_capacity) + // extend the allocation and capacity + // Copy the value to the appropriate position.
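+ // Growth policy sketch (sizes assumed, 8-byte integer keys): the first heap + // allocation is 2 slots (or 2 * num_values when spilling a larger stack_buf + // run), after which capacity doubles via cf_realloc() - 2 -> 4 -> 8 slots - + // so appends are amortized O(1).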
+ + uint32_t size = 0; + bool to_copy = false; + uint8_t data_sz = 0; + void * tmp_value = NULL; + sbin_value_pool * stack_buf = sbin->stack_buf; + + // Get the size of the data we are going to store + if (sbin->type == AS_PARTICLE_TYPE_INTEGER || + sbin->type == AS_PARTICLE_TYPE_GEOJSON) { + data_sz = sizeof(uint64_t); + } + else if (sbin->type == AS_PARTICLE_TYPE_STRING) { + data_sz = sizeof(cf_digest); + } + else { + cf_warning(AS_SINDEX, "Bad type of data to index %d", sbin->type); + return AS_SINDEX_ERR; + } + + // If to_free = false, this means this is the first + // time we are storing value for this sbin to heap + // Check if there is need to copy the existing data from stack_buf + if (!sbin->to_free) { + if (sbin->num_values == 0) { + size = 2; + } + else if (sbin->num_values == 1) { + to_copy = true; + size = 2; + tmp_value = &sbin->value; + } + else if (sbin->num_values > 1) { + to_copy = true; + size = 2 * sbin->num_values; + tmp_value = sbin->values; + } + else { + cf_warning(AS_SINDEX, "num_values in sbin is less than 0 %"PRIu64"", sbin->num_values); + return AS_SINDEX_ERR; + } + + sbin->values = cf_malloc(data_sz * size); + sbin->to_free = true; + sbin->heap_capacity = size; + + // Copy the existing data from stack to heap + // reduce the used stack_buf size + if (to_copy) { + if (!memcpy(sbin->values, tmp_value, data_sz * sbin->num_values)) { + cf_warning(AS_SINDEX, "memcpy failed"); + return AS_SINDEX_ERR; + } + if (sbin->num_values != 1) { + stack_buf->used_sz -= (sbin->num_values * data_sz); + } + } + } + else + { + // Else + // If (num_values == heap_capacity) + // extend the allocation and capacity + if (sbin->heap_capacity == sbin->num_values) { + sbin->heap_capacity = 2 * sbin->heap_capacity; + sbin->values = cf_realloc(sbin->values, sbin->heap_capacity * data_sz); + } + } + + // Copy the value to the appropriate position. + if (sbin->type == AS_PARTICLE_TYPE_INTEGER || + sbin->type == AS_PARTICLE_TYPE_GEOJSON) { + if (!memcpy((void *)((uint64_t *)sbin->values + sbin->num_values), (void *)val, data_sz)) { + cf_warning(AS_SINDEX, "memcpy failed"); + return AS_SINDEX_ERR; + } + } + else if (sbin->type == AS_PARTICLE_TYPE_STRING) { + if (!memcpy((void *)((cf_digest *)sbin->values + sbin->num_values), (void *)val, data_sz)) { + cf_warning(AS_SINDEX, "memcpy failed"); + return AS_SINDEX_ERR; + } + } + else { + cf_warning(AS_SINDEX, "Bad type of data to index %d", sbin->type); + return AS_SINDEX_ERR; + } + + sbin->num_values++; + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_add_value_to_sbin(as_sindex_bin * sbin, uint8_t * val) +{ + // If this is the first value coming to the sbin + // assign the value to the local variable of struct. 
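+ // (Illustrative trace of the tiers handled below, assuming 8-byte integer + // keys: value #1 lands in the inline sbin->value union; value #2 moves both + // values into stack_buf; later values append there until + // AS_SINDEX_VALUESZ_ON_STACK would be exceeded, after which + // as_sindex_add_sbin_value_in_heap() takes over.)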
+ // Else + // If to_free is true or stack_buf is full + // add value to the heap + // else + // If needed copy the values stored in sbin to stack_buf + // add the value to end of stack buf + + int data_sz = 0; + if (sbin->type == AS_PARTICLE_TYPE_STRING) { + data_sz = sizeof(cf_digest); + } + else if (sbin->type == AS_PARTICLE_TYPE_INTEGER || + sbin->type == AS_PARTICLE_TYPE_GEOJSON) { + data_sz = sizeof(uint64_t); + } + else { + cf_warning(AS_SINDEX, "sbin type is invalid %d", sbin->type); + return AS_SINDEX_ERR; + } + + sbin_value_pool * stack_buf = sbin->stack_buf; + if (sbin->num_values == 0 ) { + if (sbin->type == AS_PARTICLE_TYPE_STRING) { + sbin->value.str_val = *(cf_digest *)val; + } + else if (sbin->type == AS_PARTICLE_TYPE_INTEGER || + sbin->type == AS_PARTICLE_TYPE_GEOJSON) { + sbin->value.int_val = *(int64_t *)val; + } + sbin->num_values++; + } + else if (sbin->num_values == 1) { + if ((stack_buf->used_sz + data_sz + data_sz) > AS_SINDEX_VALUESZ_ON_STACK ) { + if (as_sindex_add_sbin_value_in_heap(sbin, (void *)val)) { + cf_warning(AS_SINDEX, "Adding value in sbin failed."); + return AS_SINDEX_ERR; + } + } + else { + // sbin->values gets initiated here + sbin->values = stack_buf->value + stack_buf->used_sz; + + if (!memcpy(sbin->values, (void *)&sbin->value, data_sz)) { + cf_warning(AS_SINDEX, "Memcpy failed"); + return AS_SINDEX_ERR; + } + stack_buf->used_sz += data_sz; + + if (!memcpy((void *)((uint8_t *)sbin->values + data_sz * sbin->num_values), (void *)val, data_sz)) { + cf_warning(AS_SINDEX, "Memcpy failed"); + return AS_SINDEX_ERR; + } + sbin->num_values++; + stack_buf->used_sz += data_sz; + } + } + else if (sbin->num_values > 1) { + if (sbin->to_free || (stack_buf->used_sz + data_sz ) > AS_SINDEX_VALUESZ_ON_STACK ) { + if (as_sindex_add_sbin_value_in_heap(sbin, (void *)val)) { + cf_warning(AS_SINDEX, "Adding value in sbin failed."); + return AS_SINDEX_ERR; + } + } + else { + if (!memcpy((void *)((uint8_t *)sbin->values + data_sz * sbin->num_values), (void *)val, data_sz)) { + cf_warning(AS_SINDEX, "Memcpy failed"); + return AS_SINDEX_ERR; + } + sbin->num_values++; + stack_buf->used_sz += data_sz; + } + } + else { + cf_warning(AS_SINDEX, "numvalues is coming as negative. 
Possible memory corruption in sbin."); + return AS_SINDEX_ERR; + } + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_add_integer_to_sbin(as_sindex_bin * sbin, uint64_t val) +{ + return as_sindex_add_value_to_sbin(sbin, (uint8_t * )&val); +} + +as_sindex_status +as_sindex_add_digest_to_sbin(as_sindex_bin * sbin, cf_digest val_dig) +{ + return as_sindex_add_value_to_sbin(sbin, (uint8_t * )&val_dig); +} + +as_sindex_status +as_sindex_add_string_to_sbin(as_sindex_bin * sbin, char * val) +{ + if (!val) { + return AS_SINDEX_ERR; + } + // Calculate digest and cal add_digest_to_sbin + cf_digest val_dig; + cf_digest_compute(val, strlen(val), &val_dig); + return as_sindex_add_digest_to_sbin(sbin, val_dig); +} +// END - ADD TO SBIN +// ************************************************************************************************ +// ************************************************************************************************ +// ADD KEYTYPE FROM BASIC TYPE ASVAL +as_sindex_status +as_sindex_add_long_from_asval(as_val *val, as_sindex_bin *sbin) +{ + if (!val) { + return AS_SINDEX_ERR; + } + if (sbin->type != AS_PARTICLE_TYPE_INTEGER) { + return AS_SINDEX_ERR; + } + + as_integer *i = as_integer_fromval(val); + if (!i) { + return AS_SINDEX_ERR; + } + uint64_t int_val = (uint64_t)as_integer_get(i); + return as_sindex_add_integer_to_sbin(sbin, int_val); +} + +as_sindex_status +as_sindex_add_digest_from_asval(as_val *val, as_sindex_bin *sbin) +{ + if (!val) { + return AS_SINDEX_ERR; + } + if (sbin->type != AS_PARTICLE_TYPE_STRING) { + return AS_SINDEX_ERR; + } + + as_string *s = as_string_fromval(val); + if (!s) { + return AS_SINDEX_ERR; + } + char * str_val = as_string_get(s); + return as_sindex_add_string_to_sbin(sbin, str_val); +} + +as_sindex_status +as_sindex_add_geo2dsphere_from_as_val(as_val *val, as_sindex_bin *sbin) +{ + if (!val) { + return AS_SINDEX_ERR; + } + if (sbin->type != AS_PARTICLE_TYPE_GEOJSON) { + return AS_SINDEX_ERR; + } + + as_geojson *g = as_geojson_fromval(val); + if (!g) { + return AS_SINDEX_ERR; + } + + const char *s = as_geojson_get(g); + size_t jsonsz = as_geojson_len(g); + uint64_t parsed_cellid = 0; + geo_region_t parsed_region = NULL; + + if (! geo_parse(NULL, s, jsonsz, &parsed_cellid, &parsed_region)) { + cf_warning(AS_PARTICLE, "geo_parse() failed - unexpected"); + geo_region_destroy(parsed_region); + return AS_SINDEX_ERR; + } + + if (parsed_cellid) { + if (parsed_region) { + geo_region_destroy(parsed_region); + cf_warning(AS_PARTICLE, "geo_parse found both point and region"); + return AS_SINDEX_ERR; + } + + // POINT + if (as_sindex_add_integer_to_sbin(sbin, parsed_cellid) != AS_SINDEX_OK) { + cf_warning(AS_PARTICLE, "as_sindex_add_integer_to_sbin() failed - unexpected"); + return AS_SINDEX_ERR; + } + } + else if (parsed_region) { + // REGION + int numcells; + uint64_t outcells[MAX_REGION_CELLS]; + + if (! 
geo_region_cover(NULL, parsed_region, MAX_REGION_CELLS, outcells, NULL, NULL, &numcells)) { + geo_region_destroy(parsed_region); + cf_warning(AS_PARTICLE, "geo_region_cover failed"); + return AS_SINDEX_ERR; + } + + geo_region_destroy(parsed_region); + + int added = 0; + for (size_t i = 0; i < numcells; i++) { + if (as_sindex_add_integer_to_sbin(sbin, outcells[i]) == AS_SINDEX_OK) { + added++; + } + else { + cf_warning(AS_PARTICLE, "as_sindex_add_integer_to_sbin() failed - unexpected"); + } + } + + if (added == 0 && numcells > 0) { + return AS_SINDEX_ERR; + } + } + else { + cf_warning(AS_PARTICLE, "geo_parse found neither point nor region"); + return AS_SINDEX_ERR; + } + + return AS_SINDEX_OK; +} + +typedef as_sindex_status (*as_sindex_add_keytype_from_asval_fn) +(as_val *val, as_sindex_bin * sbin); +static const as_sindex_add_keytype_from_asval_fn + as_sindex_add_keytype_from_asval[COL_TYPE_MAX] = { + NULL, + as_sindex_add_long_from_asval, + as_sindex_add_digest_from_asval, + as_sindex_add_geo2dsphere_from_as_val // 3 +}; + +// END - ADD KEYTYPE FROM BASIC TYPE ASVAL +// ************************************************************************************************ +// ************************************************************************************************ +// ADD ASVAL TO SINDEX TYPE +as_sindex_status +as_sindex_add_asval_to_default_sindex(as_val *val, as_sindex_bin * sbin) +{ + return as_sindex_add_keytype_from_asval[as_sindex_sktype_from_pktype(sbin->type)](val, sbin); +} + +static bool as_sindex_add_listvalues_foreach(as_val * element, void * udata) +{ + as_sindex_bin * sbin = (as_sindex_bin *)udata; + as_sindex_add_keytype_from_asval[as_sindex_sktype_from_pktype(sbin->type)](element, sbin); + return true; +} + +as_sindex_status +as_sindex_add_asval_to_list_sindex(as_val *val, as_sindex_bin * sbin) +{ + // If val type is not AS_LIST + // return AS_SINDEX_ERR + // Else iterate through all values of list + // If type == AS_PARTICLE_TYPE_STRING + // add all string type values to the sbin + // If type == AS_PARTICLE_TYPE_INTEGER + // add all integer type values to the sbin + + // If val type is not AS_LIST + // return AS_SINDEX_ERR + if (!val) { + return AS_SINDEX_ERR; + } + if (val->type != AS_LIST) { + return AS_SINDEX_ERR; + } + // Else iterate through all elements of map + as_list * list = as_list_fromval(val); + if (as_list_foreach(list, as_sindex_add_listvalues_foreach, sbin)) { + return AS_SINDEX_OK; + } + return AS_SINDEX_ERR; +} + +static bool as_sindex_add_mapkeys_foreach(const as_val * key, const as_val * val, void * udata) +{ + as_sindex_bin * sbin = (as_sindex_bin *)udata; + as_sindex_add_keytype_from_asval[as_sindex_sktype_from_pktype(sbin->type)]((as_val *)key, sbin); + return true; +} + +static bool as_sindex_add_mapvalues_foreach(const as_val * key, const as_val * val, void * udata) +{ + as_sindex_bin * sbin = (as_sindex_bin *)udata; + as_sindex_add_keytype_from_asval[as_sindex_sktype_from_pktype(sbin->type)]((as_val *)val, sbin); + return true; +} + +as_sindex_status +as_sindex_add_asval_to_mapkeys_sindex(as_val *val, as_sindex_bin * sbin) +{ + // If val type is not AS_MAP + // return AS_SINDEX_ERR + // Defensive check. Should not happen. 
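+ // Illustrative call path, assuming a mapkeys index with string keys: + // as_sindex_add_asval_to_mapkeys_sindex(val, sbin) + // -> as_map_foreach(map, as_sindex_add_mapkeys_foreach, sbin) + // -> as_sindex_add_digest_from_asval(key, sbin) + // (chosen from as_sindex_add_keytype_from_asval[] by the sbin's key type)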
+ if (!val) { + return AS_SINDEX_ERR; + } + if (val->type != AS_MAP) { + cf_warning(AS_SINDEX, "Unexpected wrong type %d", val->type); + return AS_SINDEX_ERR; + } + + // Else iterate through all keys of map + as_map * map = as_map_fromval(val); + if (as_map_foreach(map, as_sindex_add_mapkeys_foreach, sbin)) { + return AS_SINDEX_OK; + } + return AS_SINDEX_ERR; +} + +as_sindex_status +as_sindex_add_asval_to_mapvalues_sindex(as_val *val, as_sindex_bin * sbin) +{ + // If val type is not AS_MAP + // return AS_SINDEX_ERR + // Else iterate through all values of all keys of the map + // If type == AS_PARTICLE_TYPE_STRING + // add all string type values to the sbin + // If type == AS_PARTICLE_TYPE_INTEGER + // add all integer type values to the sbin + + // If val type is not AS_MAP + // return AS_SINDEX_ERR + if (!val) { + return AS_SINDEX_ERR; + } + if (val->type != AS_MAP) { + return AS_SINDEX_ERR; + } + // Else iterate through all keys, values of map + as_map * map = as_map_fromval(val); + if (as_map_foreach(map, as_sindex_add_mapvalues_foreach, sbin)) { + return AS_SINDEX_OK; + } + return AS_SINDEX_ERR; +} + +typedef as_sindex_status (*as_sindex_add_asval_to_itype_sindex_fn) +(as_val *val, as_sindex_bin * sbin); +static const as_sindex_add_asval_to_itype_sindex_fn + as_sindex_add_asval_to_itype_sindex[AS_SINDEX_ITYPE_MAX] = { + as_sindex_add_asval_to_default_sindex, + as_sindex_add_asval_to_list_sindex, + as_sindex_add_asval_to_mapkeys_sindex, + as_sindex_add_asval_to_mapvalues_sindex +}; +// END - ADD ASVAL TO SINDEX TYPE +// ************************************************************************************************ +// ************************************************************************************************ +// DIFF FROM BIN TO SINDEX + +static bool +as_sindex_bin_add_skey(as_sindex_bin *sbin, const void *skey, as_val_t type) +{ + if (type == AS_STRING) { + if (as_sindex_add_digest_to_sbin(sbin, *((cf_digest *)skey)) == AS_SINDEX_OK) { + return true; + } + } + else if (type == AS_INTEGER) { + if (as_sindex_add_integer_to_sbin(sbin, *((uint64_t *)skey)) == AS_SINDEX_OK) { + return true; + } + } + + return false; +} + +static void +packed_val_init_unpacker(const cdt_payload *val, as_unpacker *pk) +{ + pk->buffer = val->ptr; + pk->length = val->sz; + pk->offset = 0; +} + +static bool +packed_val_make_skey(const cdt_payload *val, as_val_t type, void *skey) +{ + as_unpacker pk; + packed_val_init_unpacker(val, &pk); + + as_val_t packed_type = as_unpack_peek_type(&pk); + + if (packed_type != type) { + return false; + } + + if (type == AS_STRING) { + int32_t size = as_unpack_blob_size(&pk); + + if (size < 0) { + return false; + } + + if (pk.buffer[pk.offset++] != AS_BYTES_STRING) { + return false; + } + + cf_digest_compute(pk.buffer + pk.offset, pk.length - pk.offset, (cf_digest *)skey); + } + else if (type == AS_INTEGER) { + if (as_unpack_int64(&pk, (int64_t *)skey) < 0) { + return false; + } + } + else { + return false; + } + + return true; +} + +static bool +packed_val_add_sbin_or_update_shash(cdt_payload *val, as_sindex_bin *sbin, cf_shash *hash, as_val_t type) +{ + uint8_t skey[sizeof(cf_digest)]; + + if (! packed_val_make_skey(val, type, skey)) { + // packed_vals that aren't of type are ignored. + return true; + } + + bool found = false; + + if (cf_shash_get(hash, skey, &found) != CF_SHASH_OK) { + // Item not in hash, add to sbin. + return as_sindex_bin_add_skey(sbin, skey, type); + } + else { + // Item is in hash, set it to true. 
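+ // Marking true means the key is present in both the old and new lists, so + // shash_diff_reduce_fn() below will skip it; keys left false exist only in + // the short list and are turned into index ops.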
+ found = true; + cf_shash_put(hash, skey, &found); + + return true; + } + + return false; +} + +static void +shash_add_packed_val(cf_shash *h, const cdt_payload *val, as_val_t type, bool value) +{ + uint8_t skey[sizeof(cf_digest)]; + + if (! packed_val_make_skey(val, type, skey)) { + // packed_vals that aren't of type are ignored. + return; + } + + cf_shash_put(h, skey, &value); +} + +static int +shash_diff_reduce_fn(const void *skey, void *data, void *udata) +{ + bool value = *(bool *)data; + as_sindex_bin *sbin = (as_sindex_bin *)udata; + + if (! sbin) { + cf_debug(AS_SINDEX, "SBIN sent as NULL"); + return -1; + } + + if (! value) { + // Add in the sbin. + if (sbin->type == AS_PARTICLE_TYPE_STRING) { + as_sindex_add_digest_to_sbin(sbin, *(const cf_digest*)skey); + } + else if (sbin->type == AS_PARTICLE_TYPE_INTEGER) { + as_sindex_add_integer_to_sbin(sbin, *(const uint64_t*)skey); + } + } + + return 0; +} + +// Find delta list elements and put them into sbins. +// Currently supports only string/integer index types. +static int32_t +as_sindex_sbins_sindex_list_diff_populate(as_sindex_bin *sbins, as_sindex *si, const as_bin *b_old, const as_bin *b_new) +{ + // Algorithm + // Add elements of short_list into hash with value = false + // Iterate through all the values in the long_list + // For all elements of long_list in hash, set value = true + // For all elements of long_list not in hash, add to sbin (insert or delete) + // Iterate through all the elements of hash + // For all elements where value == false, add to sbin (insert or delete) + + as_particle_type type = as_sindex_pktype(si->imd); + int data_size; + as_val_t expected_type; + + if (type == AS_PARTICLE_TYPE_STRING) { + data_size = 20; + expected_type = AS_STRING; + } + else if (type == AS_PARTICLE_TYPE_INTEGER) { + data_size = 8; + expected_type = AS_INTEGER; + } + else { + cf_debug(AS_SINDEX, "Invalid data type %d", type); + return -1; + } + + cdt_payload old_val; + cdt_payload new_val; + + as_bin_particle_list_get_packed_val(b_old, &old_val); + as_bin_particle_list_get_packed_val(b_new, &new_val); + + as_unpacker pk_old; + as_unpacker pk_new; + + packed_val_init_unpacker(&old_val, &pk_old); + packed_val_init_unpacker(&new_val, &pk_new); + + int64_t old_list_count = as_unpack_list_header_element_count(&pk_old); + int64_t new_list_count = as_unpack_list_header_element_count(&pk_new); + + if (old_list_count < 0 || new_list_count < 0) { + return -1; + } + + // Skip msgpack ext if it exist as the first element. + if (old_list_count != 0 && as_unpack_peek_is_ext(&pk_old)) { + if (as_unpack_size(&pk_old) < 0) { + return -1; + } + + old_list_count--; + } + + if (new_list_count != 0 && as_unpack_peek_is_ext(&pk_new)) { + if (as_unpack_size(&pk_new) < 0) { + return -1; + } + + new_list_count--; + } + + bool old_list_is_short = old_list_count < new_list_count; + + uint32_t short_list_count; + uint32_t long_list_count; + as_unpacker *pk_short; + as_unpacker *pk_long; + + if (old_list_is_short) { + short_list_count = (uint32_t)old_list_count; + long_list_count = (uint32_t)new_list_count; + pk_short = &pk_old; + pk_long = &pk_new; + } + else { + short_list_count = (uint32_t)new_list_count; + long_list_count = (uint32_t)old_list_count; + pk_short = &pk_new; + pk_long = &pk_old; + } + + if (short_list_count == 0) { + if (long_list_count == 0) { + return 0; + } + + as_sindex_init_sbin(sbins, old_list_is_short ? 
AS_SINDEX_OP_INSERT : AS_SINDEX_OP_DELETE, type, si); + + for (uint32_t i = 0; i < long_list_count; i++) { + cdt_payload ele; + + ele.ptr = pk_long->buffer + pk_long->offset; + ele.sz = as_unpack_size(pk_long); + + // sizeof(cf_digest) is big enough for all key types we support so far. + uint8_t skey[sizeof(cf_digest)]; + + if (! packed_val_make_skey(&ele, expected_type, skey)) { + // packed_vals that aren't of type are ignored. + continue; + } + + if (! as_sindex_bin_add_skey(sbins, skey, expected_type)) { + cf_warning(AS_SINDEX, "as_sindex_sbins_sindex_list_diff_populate() as_sindex_bin_add_skey failed"); + as_sindex_sbin_free(sbins); + return -1; + } + } + + return sbins->num_values == 0 ? 0 : 1; + } + + cf_shash *hash = cf_shash_create(cf_shash_fn_u32, data_size, 1, short_list_count, 0); + + // Add elements of shorter list into hash with value = false. + for (uint32_t i = 0; i < short_list_count; i++) { + cdt_payload ele = { + .ptr = pk_short->buffer + pk_short->offset + }; + + int size = as_unpack_size(pk_short); + + if (size < 0) { + cf_warning(AS_SINDEX, "as_sindex_sbins_sindex_list_diff_populate() list unpack failed"); + cf_shash_destroy(hash); + return -1; + } + + ele.sz = size; + shash_add_packed_val(hash, &ele, expected_type, false); + } + + as_sindex_init_sbin(sbins, old_list_is_short ? AS_SINDEX_OP_INSERT : AS_SINDEX_OP_DELETE, type, si); + + for (uint32_t i = 0; i < long_list_count; i++) { + cdt_payload ele; + + ele.ptr = pk_long->buffer + pk_long->offset; + ele.sz = as_unpack_size(pk_long); + + if (! packed_val_add_sbin_or_update_shash(&ele, sbins, hash, expected_type)) { + cf_warning(AS_SINDEX, "as_sindex_sbins_sindex_list_diff_populate() hash update failed"); + as_sindex_sbin_free(sbins); + cf_shash_destroy(hash); + return -1; + } + } + + // Need to keep track of start for unwinding on error. + as_sindex_bin *start_sbin = sbins; + int found = 0; + + if (sbins->num_values > 0) { + sbins++; + found++; + } + + as_sindex_init_sbin(sbins, old_list_is_short ? AS_SINDEX_OP_DELETE : AS_SINDEX_OP_INSERT, type, si); + + // Iterate through all the elements of hash. + if (cf_shash_reduce(hash, shash_diff_reduce_fn, sbins) != 0) { + as_sindex_sbin_freeall(start_sbin, found + 1); + cf_shash_destroy(hash); + return -1; + } + + if (sbins->num_values > 0) { + found++; + } + + cf_shash_destroy(hash); + + return found; +} + +void +as_sindex_sbins_debug_print(as_sindex_bin *sbins, uint32_t count) +{ + cf_warning( AS_SINDEX, "as_sindex_sbins_list_update_diff() found=%d", count); + for (uint32_t i = 0; i < count; i++) { + as_sindex_bin *p = sbins + i; + + cf_warning( AS_SINDEX, " %d: values= %"PRIu64" type=%d op=%d", + i, p->num_values, p->type, p->op); + + if (p->type == AS_PARTICLE_TYPE_INTEGER) { + int64_t *values = (int64_t *)p->values; + + if (p->num_values == 1) { + cf_warning( AS_SINDEX, " %ld", p->value.int_val); + } + else { + for (uint64_t j = 0; j < p->num_values; j++) { + cf_warning( AS_SINDEX, " %"PRIu64": %"PRId64"", j, values[j]); + } + } + } + } +} + +// Assumes b_old and b_new are AS_PARTICLE_TYPE_LIST bins. +// Assumes b_old and b_new have the same id. +static int32_t +as_sindex_sbins_list_diff_populate(as_sindex_bin *sbins, as_namespace *ns, const char *set_name, const as_bin *b_old, const as_bin *b_new) +{ + uint16_t id = b_new->id; + + if (! as_sindex_binid_has_sindex(ns, id)) { + return 0; + } + + cf_ll *simatch_ll = NULL; + as_sindex__simatch_list_by_set_binid(ns, set_name, id, &simatch_ll); + + if (! 
simatch_ll) { + return 0; + } + + uint32_t populated = 0; + + for (cf_ll_element *ele = cf_ll_get_head(simatch_ll); ele; ele = ele->next) { + sindex_set_binid_hash_ele *si_ele = (sindex_set_binid_hash_ele *)ele; + int simatch = si_ele->simatch; + as_sindex *si = &ns->sindex[simatch]; + + if (! as_sindex_isactive(si)) { + // Note - the for loop's increment already advances ele. + continue; + } + + int32_t delta = as_sindex_sbins_sindex_list_diff_populate(&sbins[populated], si, b_old, b_new); + + if (delta < 0) { + return -1; + } + + populated += delta; + } + + return populated; +} + +uint32_t +as_sindex_sbins_populate(as_sindex_bin *sbins, as_namespace *ns, const char *set_name, const as_bin *b_old, const as_bin *b_new) +{ + if (as_bin_get_particle_type(b_old) == AS_PARTICLE_TYPE_LIST && as_bin_get_particle_type(b_new) == AS_PARTICLE_TYPE_LIST) { + int32_t ret = as_sindex_sbins_list_diff_populate(sbins, ns, set_name, b_old, b_new); + + if (ret >= 0) { + return (uint32_t)ret; + } + } + + uint32_t populated = 0; + + // TODO - might want an optimization that detects the (rare) case when a + // particle was rewritten with the exact old value. + populated += as_sindex_sbins_from_bin(ns, set_name, b_old, &sbins[populated], AS_SINDEX_OP_DELETE); + populated += as_sindex_sbins_from_bin(ns, set_name, b_new, &sbins[populated], AS_SINDEX_OP_INSERT); + + return populated; +} +// END - DIFF FROM BIN TO SINDEX +// ************************************************************************************************ +// ************************************************************************************************ +// SBIN INTERFACE FUNCTIONS +int +as_sindex_sbin_from_sindex(as_sindex * si, const as_bin *b, as_sindex_bin * sbin, as_val ** cdt_asval) +{ + as_sindex_metadata * imd = si->imd; + as_particle_type imd_sktype = as_sindex_pktype(imd); + as_val * cdt_val = * cdt_asval; + uint32_t valsz = 0; + int sindex_found = 0; + as_particle_type bin_type = 0; + bool found = false; + + bin_type = as_bin_get_particle_type(b); + + // Prepare si + // If path_length == 0 + if (imd->path_length == 0) { + // If itype == AS_SINDEX_ITYPE_DEFAULT and bin_type == STRING OR INTEGER + // Add the value to the sbin. + if (imd->itype == AS_SINDEX_ITYPE_DEFAULT && bin_type == imd_sktype) { + if (bin_type == AS_PARTICLE_TYPE_INTEGER) { + found = true; + sbin->value.int_val = as_bin_particle_integer_value(b); + + if (as_sindex_add_integer_to_sbin(sbin, (uint64_t)sbin->value.int_val) == AS_SINDEX_OK) { + if (sbin->num_values) { + sindex_found++; + } + } + } + else if (bin_type == AS_PARTICLE_TYPE_STRING) { + found = true; + char* bin_val; + valsz = as_bin_particle_string_ptr(b, &bin_val); + + if (valsz > AS_SINDEX_MAX_STRING_KSIZE) { + cf_warning(AS_SINDEX, "sindex key size out of bounds %u", valsz); + cf_warning(AS_SINDEX, "Sindex on bin %s fails. Value length %u too long.", imd->bname, valsz); + } + else { + cf_digest buf_dig; + cf_digest_compute(bin_val, valsz, &buf_dig); + + if (as_sindex_add_digest_to_sbin(sbin, buf_dig) == AS_SINDEX_OK) { + if (sbin->num_values) { + sindex_found++; + } + } + } + } + else if (bin_type == AS_PARTICLE_TYPE_GEOJSON) { + // GeoJSON is like AS_PARTICLE_TYPE_STRING when + // reading the value and AS_PARTICLE_TYPE_INTEGER for + // adding the result to the index.
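+ // E.g. (shapes assumed): a point value contributes a single cellid key, + // while a region value contributes up to MAX_REGION_CELLS covering cells - + // all added as integer keys via as_sindex_add_integer_to_sbin() below.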
+ found = true; + bool added = false; + uint64_t * cells; + size_t ncells = as_bin_particle_geojson_cellids(b, &cells); + for (size_t ndx = 0; ndx < ncells; ++ndx) { + if (as_sindex_add_integer_to_sbin(sbin, cells[ndx]) == AS_SINDEX_OK) { + added = true; + } + } + if (added && sbin->num_values) { + sindex_found++; + } + } + } + } + // Else if path_length > 0 OR type == MAP or LIST + // Deserialize the bin if have not deserialized it yet. + // Extract as_val from path within the bin. + // Add the values to the sbin. + if (!found) { + if (bin_type == AS_PARTICLE_TYPE_MAP || bin_type == AS_PARTICLE_TYPE_LIST) { + if (! cdt_val) { + cdt_val = as_bin_particle_to_asval(b); + } + as_val * res_val = as_sindex_extract_val_from_path(imd, cdt_val); + if (!res_val) { + goto END; + } + if (as_sindex_add_asval_to_itype_sindex[imd->itype](res_val, sbin) == AS_SINDEX_OK) { + if (sbin->num_values) { + sindex_found++; + } + } + } + } +END: + *cdt_asval = cdt_val; + return sindex_found; +} + +// Returns the number of sindex found +// TODO - deprecate and conflate body with as_sindex_sbins_from_bin() below. +int +as_sindex_sbins_from_bin_buf(as_namespace *ns, const char *set, const as_bin *b, as_sindex_bin * start_sbin, + as_sindex_op op) +{ + // Check the sindex bit array. + // If there is not sindex present on this bin return 0 + // Get the simatch_ll from set_binid_hash + // If simatch_ll is NULL return 0 + // Iterate through simatch_ll + // If path_length == 0 + // If itype == AS_SINDEX_ITYPE_DEFAULT and bin_type == STRING OR INTEGER + // Add the value to the sbin. + // If itype == AS_SINDEX_ITYPE_MAP or AS_SINDEX_ITYPE_INVMAP and type = MAP + // Deserialize the bin if have not deserialized it yet. + // Extract as_val from path within the bin + // Add them to the sbin. + // If itype == AS_SINDEX_ITYPE_LIST and type = LIST + // Deserialize the bin if have not deserialized it yet. + // Extract as_val from path within the bin. + // Add the values to the sbin. + // Else if path_length > 0 and type == MAP or LIST + // Deserialize the bin if have not deserialized it yet. + // Extract as_val from path within the bin. + // Add the values to the sbin. + // Return the number of sbins found. + + int sindex_found = 0; + if (!b) { + cf_warning(AS_SINDEX, "Null Bin Passed, No sbin created"); + return sindex_found; + } + if (!ns) { + cf_warning(AS_SINDEX, "NULL Namespace Passed"); + return sindex_found; + } + if (!as_bin_inuse(b)) { + return sindex_found; + } + + // Check the sindex bit array. 
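+ // (as_sindex_binid_has_sindex() is assumed here to be a cheap per-bin-id + // bitmap probe over ns->binid_has_sindex - its body is outside this hunk - + // so records with no indexed bins exit before any hash lookup.)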
+ // If there is not sindex present on this bin return 0 + if (!as_sindex_binid_has_sindex(ns, b->id) ) { + return sindex_found; + } + + // Get the simatch_ll from set_binid_hash + cf_ll * simatch_ll = NULL; + as_sindex__simatch_list_by_set_binid(ns, set, b->id, &simatch_ll); + + // If simatch_ll is NULL return 0 + if (!simatch_ll) { + return sindex_found; + } + + // Iterate through simatch_ll + cf_ll_element * ele = cf_ll_get_head(simatch_ll); + sindex_set_binid_hash_ele * si_ele = NULL; + int simatch = -1; + as_sindex * si = NULL; + as_val * cdt_val = NULL; + int sbins_in_si = 0; + while (ele) { + si_ele = (sindex_set_binid_hash_ele *) ele; + simatch = si_ele->simatch; + si = &ns->sindex[simatch]; + if (!as_sindex_isactive(si)) { + ele = ele->next; + continue; + } + as_sindex_init_sbin(&start_sbin[sindex_found], op, as_sindex_pktype(si->imd), si); + uint64_t s_time = cf_getns(); + sbins_in_si = as_sindex_sbin_from_sindex(si, b, &start_sbin[sindex_found], &cdt_val); + if (sbins_in_si == 1) { + sindex_found += sbins_in_si; + // sbin free will happen once sbin is updated in sindex tree + SINDEX_HIST_INSERT_DATA_POINT(si, si_prep_hist, s_time); + } + else { + as_sindex_sbin_free(&start_sbin[sindex_found]); + if (sbins_in_si) { + cf_warning(AS_SINDEX, "sbins found in si is neither 1 nor 0. It is %d", sbins_in_si); + } + } + ele = ele->next; + } + + // FREE as_val + if (cdt_val) { + as_val_destroy(cdt_val); + } + // Return the number of sbin found. + return sindex_found; +} + +int +as_sindex_sbins_from_bin(as_namespace *ns, const char *set, const as_bin *b, as_sindex_bin * start_sbin, as_sindex_op op) +{ + return as_sindex_sbins_from_bin_buf(ns, set, b, start_sbin, op); +} + +/* + * returns number of sbins found. + */ +int +as_sindex_sbins_from_rd(as_storage_rd *rd, uint16_t from_bin, uint16_t to_bin, as_sindex_bin sbins[], as_sindex_op op) +{ + uint16_t count = 0; + for (uint16_t i = from_bin; i < to_bin; i++) { + as_bin *b = &rd->bins[i]; + count += as_sindex_sbins_from_bin(rd->ns, as_index_get_set_name(rd->r, rd->ns), b, &sbins[count], op); + } + return count; +} + +// Needs comments +int +as_sindex_update_by_sbin(as_namespace *ns, const char *set, as_sindex_bin *start_sbin, int num_sbins, cf_digest * pkey) +{ + cf_debug(AS_SINDEX, "as_sindex_update_by_sbin"); + + // Need to address sbins which have OP as AS_SINDEX_OP_DELETE before the ones which have + // OP as AS_SINDEX_OP_INSERT. This is because same secondary index key can exist in sbins + // with different OPs + int sindex_ret = AS_SINDEX_OK; + for (int i=0; isindex in parallel. + while (count < AS_SINDEX_MAX && valid < ns->sindex_cnt) { + as_sindex *si = &ns->sindex[count]; + if (! as_sindex_put_rd(si, rd)) { + valid++; + } + count++; + } +} + +as_sindex_status +as_sindex_put_rd(as_sindex *si, as_storage_rd *rd) +{ + // Proceed only if sindex is active + SINDEX_GRLOCK(); + if (! as_sindex_isactive(si)) { + SINDEX_GRUNLOCK(); + return AS_SINDEX_ERR; + } + + as_sindex_metadata *imd = si->imd; + // Validate Set name. Other function do this check while + // performing searching for simatch. 
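+ // Flow below: while holding the global sindex read lock, match the record's + // set name, build at most one sbin from the indexed bin, then release the + // lock and apply the sbin via as_sindex_update_by_sbin() before freeing it.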
+ const char *setname = NULL; + if (as_index_has_set(rd->r)) { + setname = as_index_get_set_name(rd->r, si->ns); + } + + if (!as_sindex__setname_match(imd, setname)) { + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; + } + + // collect sbins + SINDEX_BINS_SETUP(sbins, 1); + + int sbins_populated = 0; + as_val * cdt_val = NULL; + + as_bin *b = as_bin_get(rd, imd->bname); + + if (!b) { + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; + } + + as_sindex_init_sbin(&sbins[sbins_populated], AS_SINDEX_OP_INSERT, + as_sindex_pktype(si->imd), si); + sbins_populated = as_sindex_sbin_from_sindex(si, b, &sbins[sbins_populated], &cdt_val); + + // Only 1 sbin should be populated here. + // If populated should be freed after sindex update + if (sbins_populated != 1) { + as_sindex_sbin_free(&sbins[sbins_populated]); + if (sbins_populated) { + cf_warning(AS_SINDEX, "Number of sbins found for 1 sindex is neither 1 nor 0. It is %d", + sbins_populated); + } + } + SINDEX_GRUNLOCK(); + + if (cdt_val) { + as_val_destroy(cdt_val); + } + + if (sbins_populated) { + as_sindex_update_by_sbin(rd->ns, setname, sbins, sbins_populated, &rd->r->keyd); + as_sindex_sbin_freeall(sbins, sbins_populated); + } + + return AS_SINDEX_OK; +} +// END - PUT RD IN SINDEX +// ************************************************************************************************ + + +// ************************************************************************************************ +// SMD CALLBACKS +/* + * +------------------+ + * client --> | Secondary Index | + * +------------------+ + * /|\ + * | 4 accept + * +----------+ 2 + * | |<------- +------------------+ 1 request + * | SMD | 3 merge | Secondary Index | <------------| + * | |<-------> | | 5 response | CLIENT + * | | 4 accept | | ------------>| + * | |--------> +------------------+ + * +----------+ + * | 4 accept + * \|/ + * +------------------+ + * client --> | Secondary Index | + * +------------------+ + * + * + * System Metadta module sits in the middle of multiple secondary index + * module on multiple nodes. The changes which eventually are made to the + * secondary index are always triggerred from SMD. Here is the flow. + * + * Step1: Client send (could possibly be secondary index thread) triggers + * create / delete / update related to secondary index metadata. + * + * Step2: The request passed through secondary index module (may be few + * node specific info is added on the way) to the SMD. + * + * Step3: SMD send out the request to the paxos master. + * + * Step4: Paxos master request the relevant metadata info from all the + * nodes in the cluster once it has all the data... [SMD always + * stores copy of the data, it is stored when the first time + * create happens]..it call secondary index merge callback + * function. The function is responsible for resolving the winning + * version ... + * + * Step5: Once winning version is decided for all the registered module + * the changes are sent to all the node. + * + * Step6: At each node accept_fn is called for each module. Which triggers + * the call to the secondary index create/delete/update functions + * which would be used to in-memory operation and make it available + * for the system. + * + * There are two types of operations which look at the secondary index + * operations. + * + * a) Normal operation .. they all look a the in-memory structure and + * data which is in sindex and ai_btree layer. + * + * b) Other part which do DDL operation like which work through the SMD + * layer. 
Multiple operations from multiple nodes + * come through this layer; the SMD layer is responsible for their + * synchronization. The sindex / ai_btree code is responsible for making + * sure that, when the call from SMD arrives, it is properly synchronized + * with the operations in section a. + * + */ + +// Global flag to signal that all secondary index SMD is restored. +static bool g_sindex_smd_restored = false; + +void +as_sindex_init_smd() +{ + int retval = as_smd_create_module(SINDEX_MODULE, + as_smd_majority_consensus_merge, NULL, + NULL, NULL, + as_sindex_smd_accept_cb, NULL, + NULL, NULL); + + cf_assert(retval == 0, AS_SINDEX, "failed to create sindex SMD module (rv %d)", retval); + + // Wait for Secondary Index SMD to be completely restored. + while (! g_sindex_smd_restored) { + usleep(1000); + } +} + +/* + * This function is called when the SMD has resolved the correct state of + * metadata. Based on that value, it needs to look at the current state of + * the index and trigger the necessary requests to the secondary index. At + * the start of time there is nothing in sindex, and this code sets up the + * indexes. + * + * Expectation: SMD is responsible for persisting data and communicating back + * to the sindex layer to create the in-memory structures. + * + * Description: Performs sindex operations (ADD, MODIFY, DELETE) through SMD. + * This function is called on every node after the paxos master decides + * the final version of the sindex to be created. This is the final + * version and the only allowed version in the sindex. Operations coming + * to this function are least expected to fail; ideally they should + * never fail. + * + * Parameters: + * module: SINDEX_MODULE + * as_smd_item_list_t: list of action items, to be performed on sindex. + * udata: ??
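+ * + * Example (names assumed for illustration): creating an integer index + * "idx1" on bin "age" in namespace "test", set "people", arrives as an item + * with key "test|people|age|.|I" and value "idx1" - see smd_key_to_imd() + * and as_sindex_imd_to_smd_key() below.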
+ * + * Returns: + * always 0 + * + * Synchronization: + * underlying secondary index all needs to take corresponding lock and + * SMD is today single threaded no sync needed there + */ + +as_sindex_ktype +as_sindex_ktype_from_smd_char(char c) +{ + if (c == 'I') { + return COL_TYPE_LONG; + } + else if (c == 'S') { + return COL_TYPE_DIGEST; + } + else if (c == 'G') { + return COL_TYPE_GEOJSON; + } + else { + cf_warning(AS_SINDEX, "unknown smd ktype %c", c); + return COL_TYPE_INVALID; + } +} + +char +as_sindex_ktype_to_smd_char(as_sindex_ktype ktype) +{ + if (ktype == COL_TYPE_LONG) { + return 'I'; + } + else if (ktype == COL_TYPE_DIGEST) { + return 'S'; + } + else if (ktype == COL_TYPE_GEOJSON) { + return 'G'; + } + else { + cf_crash(AS_SINDEX, "unknown ktype %d", ktype); + return '?'; + } +} + +as_sindex_type +as_sindex_type_from_smd_char(char c) +{ + if (c == '.') { + return AS_SINDEX_ITYPE_DEFAULT; // or - "scalar" + } + else if (c == 'L') { + return AS_SINDEX_ITYPE_LIST; + } + else if (c == 'K') { + return AS_SINDEX_ITYPE_MAPKEYS; + } + else if (c == 'V') { + return AS_SINDEX_ITYPE_MAPVALUES; + } + else { + cf_warning(AS_SINDEX, "unknown smd type %c", c); + return AS_SINDEX_ITYPE_MAX; // since there's no named illegal value + } +} + +char +as_sindex_type_to_smd_char(as_sindex_type itype) +{ + if (itype == AS_SINDEX_ITYPE_DEFAULT) { + return '.'; + } + else if (itype == AS_SINDEX_ITYPE_LIST) { + return 'L'; + } + else if (itype == AS_SINDEX_ITYPE_MAPKEYS) { + return 'K'; + } + else if (itype == AS_SINDEX_ITYPE_MAPVALUES) { + return 'V'; + } + else { + cf_crash(AS_SINDEX, "unknown type %d", itype); + return '?'; + } +} + +#define TOK_CHAR_DELIMITER '|' + +bool +smd_key_to_imd(const char *smd_key, as_sindex_metadata *imd) +{ + // ns-name||path|itype|sktype + // Note - sktype a.k.a. ktype and dtype. + + const char *read = smd_key; + const char *tok = strchr(read, TOK_CHAR_DELIMITER); + + if (! tok) { + cf_warning(AS_SINDEX, "smd - namespace name missing delimiter"); + return false; + } + + uint32_t ns_name_len = tok - read; + + imd->ns_name = cf_malloc(ns_name_len + 1); + memcpy(imd->ns_name, read, ns_name_len); + imd->ns_name[ns_name_len] = 0; + + read = tok + 1; + tok = strchr(read, TOK_CHAR_DELIMITER); + + if (! tok) { + cf_warning(AS_SINDEX, "smd - set name missing delimiter"); + return false; + } + + uint32_t set_name_len = tok - read; + + if (set_name_len != 0) { + imd->set = cf_malloc(set_name_len + 1); + memcpy(imd->set, read, set_name_len); + imd->set[set_name_len] = 0; + } + // else - imd->set remains NULL. + + read = tok + 1; + tok = strchr(read, TOK_CHAR_DELIMITER); + + if (! tok) { + cf_warning(AS_SINDEX, "smd - path missing delimiter"); + return false; + } + + uint32_t path_len = tok - read; + + imd->path_str = cf_malloc(path_len + 1); + memcpy(imd->path_str, read, path_len); + imd->path_str[path_len] = 0; + + if (as_sindex_extract_bin_path(imd, imd->path_str) != AS_SINDEX_OK) { + cf_warning(AS_SINDEX, "smd - can't parse path"); + return false; + } + + read = tok + 1; + tok = strchr(read, TOK_CHAR_DELIMITER); + + if (! 
tok) { + cf_warning(AS_SINDEX, "smd - itype missing delimiter"); + return false; + } + + if ((imd->itype = as_sindex_type_from_smd_char(*read)) == + AS_SINDEX_ITYPE_MAX) { + cf_warning(AS_SINDEX, "smd - bad itype"); + return false; + } + + read = tok + 1; + + if ((imd->sktype = as_sindex_ktype_from_smd_char(*read)) == + COL_TYPE_INVALID) { + cf_warning(AS_SINDEX, "smd - bad sktype"); + return false; + } + + return true; +} + +void +smd_value_to_imd(const char *smd_value, as_sindex_metadata *imd) +{ + // For now, it's only index-name + imd->iname = cf_strdup(smd_value); +} + +void +as_sindex_imd_to_smd_key(const as_sindex_metadata *imd, char *smd_key) +{ + // ns-name||path|itype|sktype + // Note - sktype a.k.a. ktype and dtype. + + sprintf(smd_key, "%s|%s|%s|%c|%c", + imd->ns_name, + imd->set ? imd->set : "", + imd->path_str, + as_sindex_type_to_smd_char(imd->itype), + as_sindex_ktype_to_smd_char(imd->sktype)); +} + +bool +as_sindex_delete_imd_to_smd_key(as_namespace *ns, as_sindex_metadata *imd, char *smd_key) +{ + // ns-name||path|sktype| + // Note - sktype a.k.a. ktype and dtype. + + // The imd passed in doesn't have enough to make SMD key - use a full imd + // from the existing sindex, if it's there. + + // TODO - takes lock - is this ok? Flags ok? + as_sindex *si = as_sindex_lookup_by_iname(ns, imd->iname, + AS_SINDEX_LOOKUP_FLAG_NORESERVE | AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + + if (! si) { + return false; + } + + as_sindex_imd_to_smd_key(si->imd, smd_key); + + return true; +} + +int +as_sindex_smd_accept_cb(char *module, as_smd_item_list_t *items, void *udata, uint32_t accept_opt) +{ + if ((accept_opt & AS_SMD_ACCEPT_OPT_CREATE) != 0) { + g_sindex_smd_restored = true; + return 0; + } + + for (int i = 0; i < (int)items->num_items; i++) { + as_smd_item_t *item = items->item[i]; + as_sindex_metadata imd; + + memset(&imd, 0, sizeof(imd)); // TODO - arrange to use { 0 } ??? + + if (! smd_key_to_imd(item->key, &imd)) { + as_sindex_imd_free(&imd); + continue; + } + + as_namespace *ns = as_namespace_get_byname(imd.ns_name); + + if (! ns) { + cf_detail(AS_SINDEX, "skipping invalid namespace %s", imd.ns_name); + as_sindex_imd_free(&imd); + continue; + } + + if (item->action == AS_SMD_ACTION_SET) { + smd_value_to_imd(item->value, &imd); // sets index name + as_sindex_smd_create(ns, &imd); + } + else if (item->action == AS_SMD_ACTION_DELETE) { + as_sindex_destroy(ns, &imd); + } + else { + cf_warning(AS_SINDEX, "smd accept cb - unknown action"); + } + + as_sindex_imd_free(&imd); + } + + return 0; +} +// END - SMD CALLBACKS +// ************************************************************************************************ +// ************************************************************************************************ +// SINDEX TICKER +// Sindex ticker start +void +as_sindex_ticker_start(as_namespace * ns, as_sindex * si) +{ + cf_info(AS_SINDEX, "Sindex-ticker start: ns=%s si=%s job=%s", ns->name ? ns->name : "", + si ? si->imd->iname : "", si ? "SINDEX_POPULATE" : "SINDEX_POPULATEALL"); + +} +// Sindex ticker +void +as_sindex_ticker(as_namespace * ns, as_sindex * si, uint64_t n_obj_scanned, uint64_t start_time) +{ + const uint64_t sindex_ticker_obj_count = 500000; + + if (n_obj_scanned % sindex_ticker_obj_count == 0 && n_obj_scanned != 0) { + // Ticker can be dumped from here, we'll be in this place for both + // sindex populate and populate-all. 
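+ // The est-time arithmetic below is simple linear extrapolation. Worked + // example (numbers assumed): 500k of 2M objects scanned in 10,000 ms gives + // progress = 25% and est-time = 10,000 * 2,000,000 / 500,000 - 10,000 + // = 30,000 ms remaining.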
+ // si memory gets set from as_sindex_reserve_data_memory() which in turn gets set from : + // ai_btree_put() <- for every single sindex insertion (boot-time/dynamic) + // as_sindex_create() : for dynamic si creation, cluster change, smd on boot-up. + + uint64_t si_memory = 0; + char * si_name = NULL; + + if (si) { + si_memory += ai_btree_get_isize(si->imd); + si_memory += ai_btree_get_nsize(si->imd); + si_name = si->imd->iname; + } + else { + si_memory = (uint64_t)cf_atomic64_get(ns->n_bytes_sindex_memory); + si_name = ""; + } + + uint64_t n_objects = cf_atomic64_get(ns->n_objects); + uint64_t pct_obj_scanned = n_objects == 0 ? 100 : ((n_obj_scanned * 100) / n_objects); + uint64_t elapsed = (cf_getms() - start_time); + uint64_t est_time = (elapsed * n_objects)/n_obj_scanned - elapsed; + + cf_info(AS_SINDEX, " Sindex-ticker: ns=%s si=%s obj-scanned=%"PRIu64" si-mem-used=%"PRIu64"" + " progress= %"PRIu64"%% est-time=%"PRIu64" ms", + ns->name, si_name, n_obj_scanned, si_memory, pct_obj_scanned, est_time); + } +} + +// Sindex ticker end +void +as_sindex_ticker_done(as_namespace * ns, as_sindex * si, uint64_t start_time) +{ + uint64_t si_memory = 0; + char * si_name = NULL; + + if (si) { + si_memory += ai_btree_get_isize(si->imd); + si_memory += ai_btree_get_nsize(si->imd); + si_name = si->imd->iname; + } + else { + si_memory = (uint64_t)cf_atomic64_get(ns->n_bytes_sindex_memory); + si_name = ""; + } + + cf_info(AS_SINDEX, "Sindex-ticker done: ns=%s si=%s si-mem-used=%"PRIu64" elapsed=%"PRIu64" ms", + ns->name, si_name, si_memory, cf_getms() - start_time); + +} +// END - SINDEX TICKER +// ************************************************************************************************ +// ************************************************************************************************ +// INDEX KEYS ARR +// Functions are not used in this file. +static cf_queue *g_q_index_keys_arr = NULL; +int +as_index_keys_ll_reduce_fn(cf_ll_element *ele, void *udata) +{ + return CF_LL_REDUCE_DELETE; +} + +void +as_index_keys_ll_destroy_fn(cf_ll_element *ele) +{ + as_index_keys_ll_element * node = (as_index_keys_ll_element *) ele; + if (node) { + if (node->keys_arr) { + as_index_keys_release_arr_to_queue(node->keys_arr); + node->keys_arr = NULL; + } + cf_free(node); + } +} + +as_index_keys_arr * +as_index_get_keys_arr(void) +{ + as_index_keys_arr *keys_arr; + if (cf_queue_pop(g_q_index_keys_arr, &keys_arr, CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) { + keys_arr = cf_malloc(sizeof(as_index_keys_arr)); + } + keys_arr->num = 0; + return keys_arr; +} + +void +as_index_keys_release_arr_to_queue(as_index_keys_arr *v) +{ + as_index_keys_arr * keys_arr = (as_index_keys_arr *)v; + if (cf_queue_sz(g_q_index_keys_arr) < AS_INDEX_KEYS_ARRAY_QUEUE_HIGHWATER) { + cf_queue_push(g_q_index_keys_arr, &keys_arr); + } + else { + cf_free(keys_arr); + } + +} +// END - INDEX KEYS ARR +// ************************************************************************************************ + +/* + * Main initialization function. 
Talks to Aerospike Index to pull up all the indexes
+ * and populates the sindexes hanging from the namespace.
+ */
+int
+as_sindex_init(as_namespace *ns)
+{
+	ns->sindex = cf_malloc(sizeof(as_sindex) * AS_SINDEX_MAX);
+
+	ns->sindex_cnt = 0;
+	for (int i = 0; i < AS_SINDEX_MAX; i++) {
+		as_sindex *si = &ns->sindex[i];
+		memset(si, 0, sizeof(as_sindex));
+		si->state = AS_SINDEX_INACTIVE;
+		si->stats._delete_hist = NULL;
+		si->stats._query_hist = NULL;
+		si->stats._query_batch_lookup = NULL;
+		si->stats._query_batch_io = NULL;
+		si->stats._query_rcnt_hist = NULL;
+		si->stats._query_diff_hist = NULL;
+	}
+
+	// binid to simatch lookup
+	ns->sindex_set_binid_hash = cf_shash_create(cf_shash_fn_zstr,
+			AS_SINDEX_PROP_KEY_SIZE, sizeof(cf_ll *), AS_SINDEX_MAX, 0);
+
+	// iname to simatch lookup
+	ns->sindex_iname_hash = cf_shash_create(cf_shash_fn_zstr, AS_ID_INAME_SZ,
+			sizeof(uint32_t), AS_SINDEX_MAX, 0);
+
+	// Init binid_has_sindex to zero.
+	memset(ns->binid_has_sindex, 0, sizeof(uint32_t)*AS_BINID_HAS_SINDEX_SIZE);
+	if (!g_q_index_keys_arr) {
+		g_q_index_keys_arr = cf_queue_create(sizeof(void *), true);
+	}
+	return AS_SINDEX_OK;
+}
+
+void
+as_sindex_dump(char *nsname, char *iname, char *fname, bool verbose)
+{
+	as_namespace *ns = as_namespace_get_byname(nsname);
+	as_sindex *si = as_sindex_lookup_by_iname(ns, iname, AS_SINDEX_LOOKUP_FLAG_ISACTIVE);
+	ai_btree_dump(si->imd, fname, verbose);
+	AS_SINDEX_RELEASE(si);
+}
diff --git a/as/src/base/security_ce.c b/as/src/base/security_ce.c
new file mode 100644
index 00000000..49137dbb
--- /dev/null
+++ b/as/src/base/security_ce.c
@@ -0,0 +1,163 @@
+/*
+ * security_ce.c
+ *
+ * Copyright (C) 2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "base/security.h"
+#include "base/security_config.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/socket.h>
+
+#include "citrusleaf/alloc.h"
+
+#include "fault.h"
+#include "socket.h"
+
+#include "base/proto.h"
+#include "base/transaction.h"
+
+
+//==========================================================
+// Public API.
+//
+
+// Security is an enterprise feature - here, do nothing.
+void
+as_security_init()
+{
+}
+
+// Security is an enterprise feature - here, allow all operations.
+uint8_t
+as_security_check(const as_file_handle* fd_h, as_sec_perm perm)
+{
+	return AS_PROTO_RESULT_OK;
+}
+
+// Security is an enterprise feature - here, allow all operations.
+bool
+as_security_check_data_op(as_transaction* tr, as_namespace* ns,
+		as_sec_perm perm)
+{
+	return true;
+}
+
+// Security is an enterprise feature - here, there's no filter.
+void*
+as_security_filter_create()
+{
+	return NULL;
+}
+
+// Security is an enterprise feature - here, there's no filter.
+void
+as_security_filter_destroy(void* pv_filter)
+{
+}
+
+// Security is an enterprise feature - here, do nothing.
+void
+as_security_log(const as_file_handle* fd_h, uint8_t result, as_sec_perm perm,
+		const char* action, const char* detail)
+{
+}
+
+// Security is an enterprise feature - here, do nothing.
+void
+as_security_refresh(as_file_handle* fd_h)
+{
+}
+
+// Security is an enterprise feature. If we receive a security message from a
+// client here, quickly return AS_SEC_ERR_NOT_SUPPORTED. The client may choose
+// to continue using this (unsecured) socket.
+void
+as_security_transact(as_transaction* tr)
+{
+	// We don't need the request, since we're ignoring it.
+	cf_free(tr->msgp);
+	tr->msgp = NULL;
+
+	// Set up a simple response with a single as_sec_msg that has no fields.
+	size_t resp_size = sizeof(as_proto) + sizeof(as_sec_msg);
+	uint8_t resp[resp_size];
+
+	// Fill out the as_proto fields.
+	as_proto* p_resp_proto = (as_proto*)resp;
+
+	p_resp_proto->version = PROTO_VERSION;
+	p_resp_proto->type = PROTO_TYPE_SECURITY;
+	p_resp_proto->sz = sizeof(as_sec_msg);
+
+	// Switch to network byte order.
+	as_proto_swap(p_resp_proto);
+
+	uint8_t* p_proto_body = resp + sizeof(as_proto);
+
+	memset((void*)p_proto_body, 0, sizeof(as_sec_msg));
+
+	// Fill out the relevant as_sec_msg fields.
+	as_sec_msg* p_sec_msg = (as_sec_msg*)p_proto_body;
+
+	p_sec_msg->scheme = AS_SEC_MSG_SCHEME;
+	p_sec_msg->result = AS_SEC_ERR_NOT_SUPPORTED;
+
+	// Send the complete response.
+	cf_socket *sock = &tr->from.proto_fd_h->sock;
+
+	if (cf_socket_send_all(sock, resp, resp_size, MSG_NOSIGNAL,
+			CF_SOCKET_TIMEOUT) < 0) {
+		cf_warning(AS_SECURITY, "fd %d send failed, errno %d",
+				CSFD(sock), errno);
+		as_end_of_transaction_force_close(tr->from.proto_fd_h);
+		tr->from.proto_fd_h = NULL;
+		return;
+	}
+
+	as_end_of_transaction_ok(tr->from.proto_fd_h);
+	tr->from.proto_fd_h = NULL;
+}
+
+
+//==========================================================
+// Public API - security configuration.
+//
+
+// Security is an enterprise feature - here, do nothing.
+void
+as_security_config_check()
+{
+}
+
+// Security is an enterprise feature - here, do nothing.
+void
+as_security_config_log_scope(uint32_t sink, const char* ns_name,
+		const char* set_name)
+{
+}
diff --git a/as/src/base/signal.c b/as/src/base/signal.c
new file mode 100644
index 00000000..b5eca1da
--- /dev/null
+++ b/as/src/base/signal.c
@@ -0,0 +1,249 @@
+/*
+ * signal.c
+ *
+ * Copyright (C) 2010-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include <pthread.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "fault.h"
+
+#include "base/xdr_serverside.h"
+
+
+//==========================================================
+// Constants.
+//
+
+// String constants in version.c, generated by make.
+extern const char aerospike_build_type[]; +extern const char aerospike_build_id[]; +extern const char aerospike_build_os[]; + + +//========================================================== +// Globals. +// + +// The mutex that the main function deadlocks on after starting the service. +extern pthread_mutex_t g_main_deadlock; +extern bool g_startup_complete; + + +//========================================================== +// Local helpers. +// + +static inline void +register_signal_handler(int sig_num, sighandler_t handler) +{ + sighandler_t old_handler = signal(sig_num, handler); + + if (old_handler == SIG_ERR) { + cf_crash(AS_AS, "could not register signal handler for %d", sig_num); + } + // Occasionally we've seen the value 1 (SIG_IGN) returned, assume it's ok. + else if (old_handler && old_handler != SIG_IGN) { + cf_warning(AS_AS, "found unexpected old signal handler %p for %d", + old_handler, sig_num); + // This should never happen, but for now, proceed anyway... + } +} + +static inline void +reraise_signal(int sig_num, sighandler_t handler) +{ + if (signal(sig_num, SIG_DFL) != handler) { + cf_warning(AS_AS, "could not register default signal handler for %d", + sig_num); + _exit(-1); + } + + raise(sig_num); +} + + +//========================================================== +// Signal handlers. +// + +// We get here on some crashes. +void +as_sig_handle_abort(int sig_num) +{ + cf_warning(AS_AS, "SIGABRT received, aborting %s build %s os %s", + aerospike_build_type, aerospike_build_id, aerospike_build_os); + + xdr_sig_handler(sig_num); + + PRINT_STACKTRACE(); + reraise_signal(sig_num, as_sig_handle_abort); +} + +void +as_sig_handle_bus(int sig_num) +{ + cf_warning(AS_AS, "SIGBUS received, aborting %s build %s", + aerospike_build_type, aerospike_build_id); + + xdr_sig_handler(sig_num); + + PRINT_STACKTRACE(); + reraise_signal(sig_num, as_sig_handle_bus); +} + +// Floating point exception. +void +as_sig_handle_fpe(int sig_num) +{ + cf_warning(AS_AS, "SIGFPE received, aborting %s build %s os %s", + aerospike_build_type, aerospike_build_id, aerospike_build_os); + + xdr_sig_handler(sig_num); + + PRINT_STACKTRACE(); + reraise_signal(sig_num, as_sig_handle_fpe); +} + +// This signal is our cue to roll the log. +void +as_sig_handle_hup(int sig_num) +{ + cf_info(AS_AS, "SIGHUP received, rolling log"); + + cf_fault_sink_logroll(); +} + +// We get here on some crashes. +void +as_sig_handle_ill(int sig_num) +{ + cf_warning(AS_AS, "SIGILL received, aborting %s build %s os %s", + aerospike_build_type, aerospike_build_id, aerospike_build_os); + + PRINT_STACKTRACE(); + reraise_signal(sig_num, as_sig_handle_ill); +} + +// We get here on cf_crash_nostack(), cf_assert_nostack(). +void +as_sig_handle_int(int sig_num) +{ + cf_warning(AS_AS, "SIGINT received, shutting down"); + + if (! g_startup_complete) { + cf_warning(AS_AS, "startup was not complete, exiting immediately"); + _exit(1); + } + + xdr_sig_handler(sig_num); + + pthread_mutex_unlock(&g_main_deadlock); +} + +// We get here if we intentionally trigger the signal. +void +as_sig_handle_quit(int sig_num) +{ + cf_warning(AS_AS, "SIGQUIT received, aborting %s build %s os %s", + aerospike_build_type, aerospike_build_id, aerospike_build_os); + + PRINT_STACKTRACE(); + reraise_signal(sig_num, as_sig_handle_quit); +} + +// We get here on some crashes. 
+void
+as_sig_handle_segv(int sig_num)
+{
+	cf_warning(AS_AS, "SIGSEGV received, aborting %s build %s os %s",
+			aerospike_build_type, aerospike_build_id, aerospike_build_os);
+
+	xdr_sig_handler(sig_num);
+
+	PRINT_STACKTRACE();
+	reraise_signal(sig_num, as_sig_handle_segv);
+}
+
+// We get here on normal shutdown.
+void
+as_sig_handle_term(int sig_num)
+{
+	cf_info(AS_AS, "SIGTERM received, starting normal shutdown");
+
+	if (! g_startup_complete) {
+		cf_warning(AS_AS, "startup was not complete, exiting immediately");
+		_exit(0);
+	}
+
+	xdr_sig_handler(sig_num);
+
+	pthread_mutex_unlock(&g_main_deadlock);
+}
+
+// We get here on cf_crash() and cf_assert().
+void
+as_sig_handle_usr1(int sig_num)
+{
+	cf_warning(AS_AS, "SIGUSR1 received, aborting %s build %s os %s",
+			aerospike_build_type, aerospike_build_id, aerospike_build_os);
+
+	xdr_sig_handler(sig_num);
+
+	PRINT_CALL_STACK(CF_INFO);
+	reraise_signal(SIGABRT, as_sig_handle_abort);
+}
+
+
+//==========================================================
+// Public API.
+//
+
+void
+as_signal_setup()
+{
+	register_signal_handler(SIGABRT, as_sig_handle_abort);
+	register_signal_handler(SIGBUS, as_sig_handle_bus);
+	register_signal_handler(SIGFPE, as_sig_handle_fpe);
+	register_signal_handler(SIGHUP, as_sig_handle_hup);
+	register_signal_handler(SIGILL, as_sig_handle_ill);
+	register_signal_handler(SIGINT, as_sig_handle_int);
+	register_signal_handler(SIGQUIT, as_sig_handle_quit);
+	register_signal_handler(SIGSEGV, as_sig_handle_segv);
+	register_signal_handler(SIGTERM, as_sig_handle_term);
+	register_signal_handler(SIGUSR1, as_sig_handle_usr1);
+
+	// Ignore the SIGPIPE signal raised when there is an error writing to a
+	// pipe or socket - the write() call will then return a normal error
+	// which we can handle.
+	struct sigaction sigact;
+
+	memset(&sigact, 0, sizeof(sigact));
+	sigact.sa_handler = SIG_IGN;
+	sigemptyset(&sigact.sa_mask);
+	sigaddset(&sigact.sa_mask, SIGPIPE);
+
+	if (sigaction(SIGPIPE, &sigact, NULL) != 0) {
+		cf_warning(AS_AS, "could not ignore the SIGPIPE signal");
+	}
+}
diff --git a/as/src/base/system_metadata.c b/as/src/base/system_metadata.c
new file mode 100644
index 00000000..2633731b
--- /dev/null
+++ b/as/src/base/system_metadata.c
@@ -0,0 +1,3471 @@
+/*
+ * system_metadata.c
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * SYNOPSIS
+ * The System Metadata module provides a mechanism for synchronizing
+ * module metadata cluster-wide. While each module is responsible
+ * for the interpretation of its own metadata, the System Metadata
+ * module provides persistence and automatic distribution of changes
+ * to that opaque metadata.
+ */
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "aerospike/as_hashmap.h"
+#include "aerospike/as_integer.h"
+#include "aerospike/as_stringmap.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_queue.h"
+#include "citrusleaf/cf_rchash.h"
+
+#include "msg.h"
+#include "shash.h"
+
+#include "base/cfg.h"
+#include "base/secondary_index.h"
+#include "base/system_metadata.h"
+#include "fabric/exchange.h"
+#include "fabric/fabric.h"
+#include "fabric/hb.h"
+#include "jansson.h"
+
+
+/*
+** System Metadata Theory of Operation
+** ===================================
+**
+** Overview:
+** ---------
+**
+** The System Metadata (SMD) module provides the means for an Aerospike cluster to manage and
+** automatically and consistently distribute data describing the state of any number of modules
+** within each of the cluster nodes. This data is called "system metadata." System metadata
+** is managed on a module-by-module basis, where each registered module has a set of zero or
+** more SMD items. An SMD item has properties describing the item (module name, key, value,
+** generation, and modification timestamp.) The contents (value) of an SMD item are opaque to
+** the SMD module itself. At creation time, modules may register policy callback functions
+** to perform the actions of merging and accepting metadata updates, or else select the system
+** default policy for these operations.
+**
+** Initialization:
+** ---------------
+**
+** Prior to use, the System Metadata module must first be initialized by calling "as_smd_init()"
+** to create the SMD internal data structures and launch a captive thread to process all
+** incoming system metadata events. During this phase, all system metadata operations will be
+** handled locally on each node.
+**
+** Once all server components have been initialized, SMD may be started via "as_smd_start()".
+** At this point, SMD will begin handling cluster state change events and begin
+** communicating with SMD in other cluster nodes via SMD fabric messages to synchronize
+** system metadata cluster-wide. Fabric transactions are used to guarantee that message delivery
+** succeeds or fails atomically, with re-try handled automatically at the fabric level.
+**
+** The System Metadata module may be terminated using "as_smd_shutdown()", which de-registers
+** the SMD fabric message type and causes the captive thread to exit. At this point, it is
+** permissible to re-initialize (and then re-start) the System Metadata module again.
+**
+** Life Cycle of System Metadata:
+** ------------------------------
+**
+** For a server component to use System Metadata, the component must first create its SMD
+** module. The SMD API names modules via a name string which must be unique within the
+** server. Calling "as_smd_create_module()" will create a container object in SMD to
+** hold the module's metadata and register any supplied policy callback functions provided by
+** the component. To release the component's SMD module, call "as_smd_destroy_module()".
+**
+** After a module has been created, new metadata items may be added, or existing items may
+** be modified, using "as_smd_set_metadata()". Existing metadata items may be removed using
+** "as_smd_delete_metadata()". Metadata may be searched using "as_smd_get_metadata()", which
+** can return one or more items for one or more modules, depending upon the item list passed
+** in, and sends the search results to a user-supplied callback function.
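+**
+** As a minimal sketch of that life cycle (hypothetical module/key names and
+** get-callback; NULL policy callbacks select the system default policies):
+**
+**     as_smd_create_module("example", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+**     as_smd_set_metadata("example", "some-key", "some-value");
+**     as_smd_get_metadata("example", NULL, example_get_cb, NULL); // NULL key ==> all items
+**     as_smd_delete_metadata("example", "some-key");
+**     as_smd_destroy_module("example");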
+**
+** Each module's metadata is automatically persisted via serialization (in JSON format) to a file
+** upon each accepted metadata item change and also when the module is destroyed. When a module
+** is created (usually at server start-up time), if an existing SMD file is found for the module,
+** its contents will be loaded in as the initial values of the module's metadata.
+**
+** System Metadata Policy Callback Functions:
+** ------------------------------------------
+**
+** There are three SMD policy callback functions a module may register. If NULL is passed
+** for a callback function pointer in "as_smd_create_module()", the system default policy
+** will be selected for that operation. All policy callbacks are executed in the context
+** of the SMD thread.
+**
+** The SMD policy callbacks operate as follows:
+**
+** 1). The Merge Callback ("as_smd_merge_cb()"): When a cluster state change occurs,
+** each module's Merge callback will be executed on the SMD principal to create a new,
+** unified view of each module's metadata. The system default merge policy is to simply
+** form a union of all nodes' metadata items for the given module, taking the latest
+** version of metadata items with duplicate keys, chosen first by highest generation
+** and second by highest timestamp.
+**
+** 2). The Accept Callback ("as_smd_accept_cb()"): When a module's SMD item(s) are changed,
+** or when a module is created and fully restored from persistence, the Accept callback
+** will be invoked on every node to commit the change, with the originator of the accept
+** event passed as the accept option parameter value.
+**
+** This callback will be invoked in three distinct cases:
+**
+** First, when a module is created and its persisted metadata (if any) has been fully
+** restored, this callback will be invoked with the OPT_CREATE accept option and a
+** NULL item list. This event is the proper point for synchronizing with any other
+** thread(s) that depend upon the given module being fully initialized.
+**
+** Second, after the SMD principal has determined the merged metadata for a
+** module, it will distribute the new metadata to all cluster nodes (including itself)
+** for processing via the Accept callback with the OPT_MERGE accept option and an item
+** list of length 0 or greater. The system default accept policy is simply to replace
+** any preexisting metadata items for the module with the received metadata items.
+** Modules will generally, however, define their own Accept callback to take actions
+** based upon the changed metadata, such as creating secondary indexes or defining new
+** User Defined Functions (UDFs.)
+**
+** Third, when a metadata item is set or deleted via the SMD API (or at module creation
+** time, via restoration from persisted state), the Accept callback will be invoked with
+** the OPT_API accept option and an item list of length 1. Note that at system start-up
+** time, prior to cluster formation, the metadata change will be handled locally. Once
+** a cluster has been joined, however, each metadata change event will be proxied to
+** the SMD principal, who will forward it to every cluster node (including itself)
+** for acceptance.
+**
+** 3). The Can Accept Callback ("as_smd_can_accept_cb()"): When the SMD principal
+** receives a metadata change request (set or delete), it will first attempt to
+** validate the request via any registered Can Accept callback. If the callback
+** exists, it must return non-zero for the item to be processed. Otherwise the item
+** will be rejected.
+**
+** Threading Structure:
+** --------------------
+**
+** The System Metadata module relies on a single, captive thread to handle all incoming SMD
+** fabric messages, public SMD API operations, and to invoke modules' registered policy
+** callbacks. Single-thread access means no locking of SMD data structures is necessary.
+**
+** The SMD thread waits on a queue for messages from either the local node (created and sent
+** via the System Metadata API functions) or from other cluster nodes (via System Metadata
+** fabric messages.)
+**
+** Initially the System Metadata module is inactive until the "as_smd_init()" function launches
+** the System Metadata thread. At this point, only node-local SMD commands and events will be
+** processed. When "as_smd_start()" is called, a START message will be sent telling the SMD
+** thread to also begin receiving SMD events for cluster state change notifications
+** and from other cluster nodes via SMD fabric messages. SMD will now perform the full
+** policy callback processing as described above. The System Metadata module will be running
+** until the "as_smd_shutdown()" function sends a SHUTDOWN message, upon receipt of which the
+** System Metadata thread will exit cleanly.
+**
+** Internal Messaging Structure:
+** -----------------------------
+**
+** Each public SMD API function invocation corresponds to an event message being sent to the
+** System Metadata thread via its message queue for processing. Internal command messages
+** (those not generated by API calls) are also sent via the message queue to handle cluster
+** state change events, incoming SMD fabric messages, and other internal utility functions.
+**
+** Each event is defined by an event type, options bits, and a metadata item (which may be
+** NULL or partially populated, depending upon the command type.)
+**
+** The SMD command message types are:
+**
+** 1). INIT / START / SHUTDOWN: These messages correspond to the APIs controlling the
+** running of the SMD subsystem itself and its captive thread.
+**
+** 2). CREATE_MODULE / DESTROY_MODULE: These messages create and destroy module objects
+** containing metadata items.
+**
+** 3). SET_METADATA / DELETE_METADATA / GET_METADATA: The SMD API sends these messages to
+** set, delete, and get metadata items.
+**
+** 4). INTERNAL: This message type is used for non-API "internal" events such as the event
+** triggered by a cluster state change notification, incoming SMD fabric
+** messages from other nodes, or to dump info. about the state of system metadata to
+** the system log.
+**
+** Debugging Utilities:
+** --------------------
+**
+** The state of the System Metadata module can be logged using the "dump-smd:" Info command:
+**
+** dump-smd:[verbose={"true"|"false"}] (Default: "false".)
+**
+** The optional "verbose" parameter may be set to "true" to log additional detailed
+** information about the system metadata, such as information about all modules' metadata items.
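+**
+** For example (a sketch assuming the standard asinfo tool and a local node):
+**
+** asinfo -v "dump-smd:verbose=true"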
+**
+** System Metadata may be directly manipulated using the "smd:" Info command:
+**
+** smd:cmd=<cmd>[;module=<module>;node=<node>;key=<key>;value=<value>]
+**
+** where <cmd> is one of: {create|destroy|set|delete|get|init|start|shutdown}, and:
+** - The "init", "start", and "shutdown" commands take no parameters;
+** - The "create" and "destroy" commands require a "module" parameter;
+** - The "set" command requires "key" and "value", the "delete" command only requires "key";
+** - The "get" command can take "module", "key" and "node" parameters, which if specified as
+** empty, e.g., "module=;key=", will perform a wildcard metadata item retrieval.
+**
+** Open Issues:
+** ------------
+**
+** The SMD API currently provides no mechanism for notifying the caller whether (or when)
+** the request has succeeded (or failed.) The challenge is that in general the asynchronous
+** event may be triggered on a remote node, e.g., the SMD principal. Support for an optional
+** callback for this purpose (per-module or per-API call) may be added in the future.
+**
+*/
+
+
+/* Define constants. */
+
+
+/* Maximum length for System Metadata persistence file paths. */
+#define MAX_PATH_LEN (1024)
+
+/* Time in milliseconds to wait for an incoming message. */
+#define AS_SMD_WAIT_INTERVAL_MS (1000)
+
+/* Time in milliseconds for System Metadata proxy transactions to the SMD principal. */
+#define AS_SMD_TRANSACT_TIMEOUT_MS (1000)
+
+#define SMD_MAX_STACK_MODULES 128
+#define SMD_MAX_STACK_NUM_ITEMS (1 << 14)
+
+/* Declare Private Types */
+
+
+/*
+ * Type for System Metadata command option flags.
+ */
+typedef enum as_smd_cmd_opt_e {
+	AS_SMD_CMD_OPT_NONE = 0x00,
+	AS_SMD_CMD_OPT_DUMP_SMD = 0x01,
+	AS_SMD_CMD_OPT_VERBOSE = 0x02
+} as_smd_cmd_opt_t;
+
+/*
+ * Types of API commands sent to the System Metadata module.
+ */
+typedef enum as_smd_cmd_type_e {
+	AS_SMD_CMD_INIT,            // System Metadata API initialization
+	AS_SMD_CMD_START,           // System Metadata start receiving cluster state changes
+	AS_SMD_CMD_CREATE_MODULE,   // Metadata container creation
+	AS_SMD_CMD_DESTROY_MODULE,  // Metadata container destruction
+	AS_SMD_CMD_SET_METADATA,    // Add new, or modify existing, metadata item
+	AS_SMD_CMD_DELETE_METADATA, // Existing metadata item deletion
+	AS_SMD_CMD_GET_METADATA,    // Get single metadata item
+	AS_SMD_CMD_CLUSTER_CHANGED, // Cluster state change
+	AS_SMD_CMD_INTERNAL,        // System Metadata system internal command
+	AS_SMD_CMD_SHUTDOWN         // System Metadata shut down
+} as_smd_cmd_type_t;
+
+/*
+ * Name of the given System Metadata API command type.
+ */
+#define AS_SMD_CMD_TYPE_NAME(cmd) (AS_SMD_CMD_INIT == cmd ? "INIT" : \
+	(AS_SMD_CMD_START == cmd ? "START" : \
+	(AS_SMD_CMD_CREATE_MODULE == cmd ? "CREATE" : \
+	(AS_SMD_CMD_DESTROY_MODULE == cmd ? "DESTROY" : \
+	(AS_SMD_CMD_SET_METADATA == cmd ? "SET" : \
+	(AS_SMD_CMD_DELETE_METADATA == cmd ? "DELETE" : \
+	(AS_SMD_CMD_GET_METADATA == cmd ? "GET" : \
+	(AS_SMD_CMD_CLUSTER_CHANGED == cmd ? "CLUSTER" : \
+	(AS_SMD_CMD_INTERNAL == cmd ? "INTERNAL" : \
+	(AS_SMD_CMD_SHUTDOWN == cmd ? "SHUTDOWN" : ""))))))))))
+
+/*
+ * Type for System Metadata event messages sent via the API.
+ */
+typedef struct as_smd_cmd_s {
+	as_smd_cmd_type_t type; // System Metadata command type
+	uint32_t options;       // Bit vector of event options of type "as_smd_cmd_opt_t"
+	as_smd_item_t *item;    // Metadata item associated with this event (only relevant fields are set)
+	void *a, *b, *c, *d, *e, *f, *g, *h; // Generic storage for command parameters.
+} as_smd_cmd_t; + +/* + * Types of operation messages handled by the System Metadata module, received as msg events. + */ +typedef enum as_smd_msg_op_e { + AS_SMD_MSG_OP_SET_ITEM, // Add a new, or modify an existing, metadata item + AS_SMD_MSG_OP_DELETE_ITEM, // Delete an existing metadata item (must already exist) [[Deprecated]] + AS_SMD_MSG_OP_MY_CURRENT_METADATA, // Current metadata sent from a node to the principal + AS_SMD_MSG_OP_ACCEPT_THIS_METADATA, // New blessed metadata sent from the principal to a node + AS_SMD_MSG_OP_SET_FROM_PR // Accept item (OPT_API) from principal. +} as_smd_msg_op_t; + +/* + * Name of the given System Metadata message operation. + */ +#define AS_SMD_MSG_OP_NAME(op) (AS_SMD_MSG_OP_SET_ITEM == op ? "SET_ITEM" : \ + (AS_SMD_MSG_OP_DELETE_ITEM == op ? "DELETE_ITEM" : \ + (AS_SMD_MSG_OP_MY_CURRENT_METADATA == op ? "MY_CURRENT_METADATA" : \ + (AS_SMD_MSG_OP_ACCEPT_THIS_METADATA == op ? "ACCEPT_THIS_METADATA" : \ + (AS_SMD_MSG_OP_SET_FROM_PR == op ? "SET_FROM_PR" : ""))))) + +/* + * Name of the given System Metadata action. + */ +#define AS_SMD_ACTION_NAME(action) (AS_SMD_ACTION_SET == action ? "SET" : \ + (AS_SMD_ACTION_DELETE == action ? "DELETE" : "")) + + +/* Define API Command / Message Type / Callback Action Correspondence Macros. */ + + +/* + * Message operation corresponding to the given API command type. + * (Default to SET_ITEM for the unknown case.) + */ +#define CMD_TYPE2MSG_OP(cmd) (AS_SMD_CMD_SET_METADATA == cmd ? AS_SMD_MSG_OP_SET_ITEM : \ + (AS_SMD_CMD_DELETE_METADATA == cmd ? AS_SMD_MSG_OP_DELETE_ITEM : AS_SMD_MSG_OP_SET_ITEM)) + +/* + * API action corresponding to the given message operation. + * (Default to SET for the unknown case.) + */ +#define MSG_OP2ACTION(op) (AS_SMD_MSG_OP_SET_ITEM == op ? AS_SMD_ACTION_SET : \ + (AS_SMD_MSG_OP_DELETE_ITEM == op ? AS_SMD_ACTION_DELETE : AS_SMD_ACTION_SET)) + +/* + * Type for System Metadata messages transmitted via the fabric. + */ +typedef struct as_smd_msg_s { + as_smd_msg_op_t op; // System Metadata operation + uint64_t cluster_key; // Sending node's cluster key + cf_node node_id; // Sending node's ID + char *module_name; // Name of the module. + uint32_t num_items; // Number of metadata items + as_smd_item_list_t *items; // List of metadata items associated with this message (only relevant fields are set) + uint32_t options; // Message options (originator) +} as_smd_msg_t; + +/* + * Types of events sent to and processed by the System Metadata thread. + */ +typedef enum as_smd_event_type_e { + AS_SMD_CMD, // SMD API command + AS_SMD_MSG, // SMD fabric message +} as_smd_event_type_t; + +/* + * Type for an event object handled by the System Metadata system. + * An event can either be an API command or a message transmitted via the fabric. + */ +typedef struct as_smd_event_s { + as_smd_event_type_t type; // Selector determining event type (command or message) + union { + as_smd_cmd_t cmd; // SMD command event sent via the SMD API + as_smd_msg_t msg; // SMD message event sent via fabric + } u; +} as_smd_event_t; + +/* + * Type for the key for items in the external metadata hash table: node_id, key_len, key (flexible array member, sized by key_len.) + */ +typedef struct as_smd_external_item_key_s { + cf_node node_id; // ID of the source cluster node. + size_t key_len; // Length of the key string. + char key[]; // Flexible array member for the null-terminated key string. 
+} as_smd_external_item_key_t; + +typedef enum { + AS_SMD_MSG_TRID, + AS_SMD_MSG_ID, + AS_SMD_MSG_CLUSTER_KEY, + AS_SMD_MSG_OP, + AS_SMD_MSG_NUM_ITEMS, // deprecated + AS_SMD_MSG_ACTION, // deprecated + AS_SMD_MSG_MODULE, // deprecated + AS_SMD_MSG_KEY, // deprecated + AS_SMD_MSG_VALUE, // deprecated + AS_SMD_MSG_GENERATION, // deprecated + AS_SMD_MSG_TIMESTAMP, + AS_SMD_MSG_MODULE_NAME, + AS_SMD_MSG_OPTIONS, // deprecated + + AS_SMD_MSG_MODULE_LIST, + AS_SMD_MSG_MODULE_COUNTS, + AS_SMD_MSG_KEY_LIST, + AS_SMD_MSG_VALUE_LIST, + AS_SMD_MSG_GEN_LIST, + + AS_SMD_MSG_SINGLE_KEY, + AS_SMD_MSG_SINGLE_VALUE, + AS_SMD_MSG_SINGLE_GENERATION, + AS_SMD_MSG_SINGLE_TIMESTAMP, + + NUM_SMD_FIELDS +} smd_msg_fields; + +#define AS_SMD_MSG_V2_IDENTIFIER 0x123B + +/* + * Define the template for System Metadata messages. + * + * System Metadata message structure: + * 0). Transaction ID - UINT64 (Required for Fabric Transact.) + * 1). System Metadata Protocol Version Identifier - (uint32_t <==> UINT32) [Only V2 for now.] + * 2). Cluster Key - (uint64_t <==> UINT64) + * 3). Operation - (uint32_t <==> UINT32) + * 4). Number of items - (uint32_t <==> UINT32) + * 5). Action[] - Array of (uint32_t <==> UINT32) + * 6). Module[] - Array of (char * <==> STR) + * 7). Key[] - Array of (char * <==> STR) + * 8). Value[] - Array of (char * <==> STR) + * 9). Generation[] - Array of (uint32_t <==> UINT32) + * 10). Timestamp[] - Array of (uint64_t <==> UINT64) + * 11). Module Name - (char * <==> STR) + * 12). Options - (uint32_t <==> UINT32) + */ +static const msg_template as_smd_msg_template[] = { + { AS_SMD_MSG_TRID, M_FT_UINT64 }, // Transaction ID for Fabric Transact + { AS_SMD_MSG_ID, M_FT_UINT32 }, // Version of the System Metadata protocol + { AS_SMD_MSG_CLUSTER_KEY, M_FT_UINT64 }, // Cluster key corresponding to msg contents + { AS_SMD_MSG_OP, M_FT_UINT32 }, // Metadata operation + { AS_SMD_MSG_NUM_ITEMS, M_FT_UINT32 }, // Number of metadata items + { AS_SMD_MSG_ACTION, M_FT_ARRAY_UINT32 }, // Metadata action array + { AS_SMD_MSG_MODULE, M_FT_ARRAY_STR }, // Metadata module array + { AS_SMD_MSG_KEY, M_FT_ARRAY_STR }, // Metadata key array + { AS_SMD_MSG_VALUE, M_FT_ARRAY_STR }, // Metadata value array + { AS_SMD_MSG_GENERATION, M_FT_ARRAY_UINT32 }, // Metadata generation array + { AS_SMD_MSG_TIMESTAMP, M_FT_ARRAY_UINT64 }, // Metadata timestamp array + { AS_SMD_MSG_MODULE_NAME, M_FT_STR }, // Name of module the message is from or else NULL if from all. + { AS_SMD_MSG_OPTIONS, M_FT_UINT32 }, // Option flags specifying the originator of the message (i.e., MERGE/API) + + { AS_SMD_MSG_MODULE_LIST, M_FT_MSGPACK }, + { AS_SMD_MSG_MODULE_COUNTS, M_FT_MSGPACK }, + { AS_SMD_MSG_KEY_LIST, M_FT_MSGPACK }, + { AS_SMD_MSG_VALUE_LIST, M_FT_MSGPACK }, + { AS_SMD_MSG_GEN_LIST, M_FT_MSGPACK }, + + { AS_SMD_MSG_SINGLE_KEY, M_FT_STR }, + { AS_SMD_MSG_SINGLE_VALUE, M_FT_STR }, + { AS_SMD_MSG_SINGLE_GENERATION, M_FT_UINT32 }, + { AS_SMD_MSG_SINGLE_TIMESTAMP, M_FT_UINT64 }, +}; + +COMPILER_ASSERT(sizeof(as_smd_msg_template) / sizeof(msg_template) == NUM_SMD_FIELDS); + +#define AS_SMD_MSG_SCRATCH_SIZE 64 // accommodate module name + +/* + * State of operation of the System Metadata module. + */ +typedef enum as_smd_state_e { + AS_SMD_STATE_IDLE, // Not initialized yet + AS_SMD_STATE_INITIALIZED, // Ready to receive API calls + AS_SMD_STATE_RUNNING, // Normal operation: Receiving cluster state changes + AS_SMD_STATE_EXITING // Shutting down +} as_smd_state_t; + +/* + * Name of the given System Metadata state. 
+ */ +#define AS_SMD_STATE_NAME(state) (AS_SMD_STATE_IDLE == state ? "IDLE" : \ + (AS_SMD_STATE_INITIALIZED == state ? "INITIALIZED" : \ + (AS_SMD_STATE_RUNNING == state ? "RUNNING" : \ + (AS_SMD_STATE_EXITING == state ? "EXITING" : "UNKNOWN")))) + +#define SMD_PENDING_MERGE_TIMEOUT_SEC 30 + +typedef struct smd_pending_merge_s { + as_smd_msg_t m; + uint64_t expire; +} smd_pending_merge; + +/* + * Internal representation of the state of the System Metadata module. + */ +struct as_smd_s { + + // System Metadata thread ID. + pthread_t thr_id; + + // System Metadata thread attributes. + pthread_attr_t thr_attr; + + // Is the System Metadata module up and running? + as_smd_state_t state; + + // Hash table mapping module name (char *) ==> module object (as_smd_module_t *). + cf_rchash *modules; + + // Message queue for receiving System Metadata messages. + cf_queue *msgq; + + // Scoreboard of what cluster nodes the SMD principal has received metadata from: cf_node ==> cf_shash *. + cf_shash *scoreboard; + + cf_queue pending_merge_queue; // elements are (smd_pending_merge) +}; + +/* + * Type representing a module and holding all metadata for the module. + */ +typedef struct as_smd_module_s { + + // Name of this module. + char *module; + + // This module's merge metadata callback function (or NULL if none.) + as_smd_merge_cb merge_cb; + + // User data for the merge metadata callback (or NULL if none.) + void *merge_udata; + + // This module's item conflict resolution callback function (or NULL if none.) + as_smd_conflict_cb conflict_cb; + + // User data for the item conflict resolution callback (or NULL if none.) + void *conflict_udata; + + // This module's accept metadata callback function (or NULL if none.) + as_smd_accept_cb accept_cb; + + // User data for the accept metadata callback (or NULL if none.) + void *accept_udata; + + // This module's user_op validation callback (or NULL if none.) + as_smd_can_accept_cb can_accept_cb; + + // User data for the user_op validation callback (or NULL if none.) + void *can_accept_udata; + + // Parsed JSON representation of the module's metadata. + json_t *json; + + // Hash table of metadata registered by this node mapping key (char *) ==> metadata item (as_smd_item_t *). + cf_rchash *my_metadata; + + // Hash table of metadata received from all external nodes mapping key (as_smd_external_item_key_t *) ==> metadata item (as_smd_item_t *). + cf_rchash *external_metadata; + + // Does the module need to be persisted? + bool dirty; +} as_smd_module_t; + + +/* Define macros. */ + + +/* + * Free and set to NULL a pointer if non-NULL. + */ +#define CF_FREE_AND_NULLIFY(ptr) \ + if (ptr) { \ + cf_free(ptr); \ + ptr = NULL; \ + } + +/* + * Free members of a metadata item if non-NULL. + */ +#define RELEASE_ITEM_MEMBERS(ptr) \ + CF_FREE_AND_NULLIFY(ptr->module_name); \ + CF_FREE_AND_NULLIFY(ptr->key); \ + CF_FREE_AND_NULLIFY(ptr->value); + + +/* Function forward references. */ + + +static int as_smd_module_persist(as_smd_module_t *module_obj); +void *as_smd_thr(void *arg); + + +/* Globals. */ + +as_smd_t *g_smd; + +static uint64_t g_cluster_key; +static uint32_t g_cluster_size; +static cf_node g_succession[AS_CLUSTER_SZ]; + +static void as_smd_destroy_event(as_smd_event_t *evt); + +/* Get SMD's principal node */ + + +static inline cf_node as_smd_principal() +{ + return g_succession[0]; +} + + +/* Internal message passing functions. */ + + +/* + * Allocate a System Metadata cmd event object to handle API commands. 
+ * (Note: Using 0 for "node_id" is shorthand for the current node.) + * + * Release using "as_smd_destroy_event()". + */ +static as_smd_event_t *as_smd_create_cmd_event(as_smd_cmd_type_t type, ...) +{ + as_smd_event_t *evt = NULL; + as_smd_item_t *item = NULL; + + // In Commands: Internal + uint32_t options = 0; + + // (Always zero.) + cf_node node_id = 0; + + // In Commands: Create / Destroy / Set / Delete / Get + char *module = NULL; + + // In Commands: Set / Delete / Get + char *key = NULL; + + // In Commands: Set + char *value = NULL; + uint32_t generation = 0; + uint64_t timestamp = 0UL; + + // In Commands: Create + as_smd_merge_cb merge_cb = NULL; + void *merge_udata = NULL; + as_smd_conflict_cb conflict_cb = NULL; + void *conflict_udata = NULL; + as_smd_accept_cb accept_cb = NULL; + void *accept_udata = NULL; + as_smd_can_accept_cb can_accept_cb = NULL; + void *can_accept_udata = NULL; + + // In Commands: Get + as_smd_get_cb get_cb = NULL; + void *get_udata = NULL; + + // In Command: Cluster-changed + uint64_t cluster_key = 0; + uint32_t cluster_size = 0; + cf_node *succession = NULL; + + // Handle variable arguments. + va_list args; + va_start(args, type); + switch (type) { + case AS_SMD_CMD_INIT: + case AS_SMD_CMD_START: + case AS_SMD_CMD_SHUTDOWN: + // (No additional arguments.) + break; + + case AS_SMD_CMD_CREATE_MODULE: + module = va_arg(args, char *); + merge_cb = va_arg(args, as_smd_merge_cb); + merge_udata = va_arg(args, void *); + conflict_cb = va_arg(args, as_smd_conflict_cb); + conflict_udata = va_arg(args, void *); + accept_cb = va_arg(args, as_smd_accept_cb); + accept_udata = va_arg(args, void *); + can_accept_cb = va_arg(args, as_smd_can_accept_cb); + can_accept_udata = va_arg(args, void *); + break; + + case AS_SMD_CMD_DESTROY_MODULE: + module = va_arg(args, char *); + break; + + case AS_SMD_CMD_SET_METADATA: + module = va_arg(args, char *); + key = va_arg(args, char *); + value = va_arg(args, char *); + generation = va_arg(args, uint32_t); + timestamp = va_arg(args, uint64_t); + break; + + case AS_SMD_CMD_DELETE_METADATA: + module = va_arg(args, char *); + key = va_arg(args, char *); + break; + + case AS_SMD_CMD_GET_METADATA: + module = va_arg(args, char *); + key = va_arg(args, char *); + get_cb = va_arg(args, as_smd_get_cb); + get_udata = va_arg(args, void *); + break; + + case AS_SMD_CMD_CLUSTER_CHANGED: + cf_debug(AS_SMD, "At event creation for cluster state change"); + cluster_key = va_arg(args, uint64_t); + cluster_size = va_arg(args, uint32_t); + succession = va_arg(args, cf_node *); + break; + + case AS_SMD_CMD_INTERNAL: + options = va_arg(args, uint32_t); + break; + } + va_end(args); + + // Allocate an event object and initialize it as a command. + evt = (as_smd_event_t *) cf_calloc(1, sizeof(as_smd_event_t)); + evt->type = AS_SMD_CMD; + as_smd_cmd_t *cmd = &(evt->u.cmd); + cmd->type = type; + cmd->options = options; + + // Only events with the module specified will create a cmd containing a metadata item. + if (module) { + // Create the metadata item. + // [NB: Reference-counted for insertion in metadata "rchash" table.] + item = (as_smd_item_t *) cf_rc_alloc(sizeof(as_smd_item_t)); + memset(item, 0, sizeof(as_smd_item_t)); + + cmd->item = item; + + // Set the originating node ID. + // (Note: Using 0 for "node_id" is shorthand for the current node.) + item->node_id = (!node_id ? 
g_config.self_node : node_id); + + item->action = MSG_OP2ACTION(CMD_TYPE2MSG_OP(type)); + + // Populate the item with duplicated metadata + // (Note: The caller is responsible for releasing any dynamically-allocated values passed in.) + + if (module) { + item->module_name = cf_strdup(module); + } + + if (key) { + item->key = cf_strdup(key); + } + + if (value) { + size_t value_len = strlen(value) + 1; + item->value = (char *) cf_malloc(value_len); + strncpy(item->value, value, value_len); + } + + item->generation = generation; + + item->timestamp = timestamp; + } + + // Store the policy callback information generically. + if (AS_SMD_CMD_CREATE_MODULE == type) { + cmd->a = merge_cb; + cmd->b = merge_udata; + cmd->c = conflict_cb; + cmd->d = conflict_udata; + cmd->e = accept_cb; + cmd->f = accept_udata; + cmd->g = can_accept_cb; + cmd->h = can_accept_udata; + } else if (AS_SMD_CMD_GET_METADATA == type) { + cmd->a = get_cb; + cmd->b = get_udata; + } else if (AS_SMD_CMD_CLUSTER_CHANGED == type) { + cmd->a = (void *)cluster_key; + cmd->b = (void *)(uint64_t)cluster_size; + cmd->c = succession; + } + + return evt; +} + +static bool +smd_msg_read_items(as_smd_msg_t *sm, const msg *m, const cf_vector *mod_vec, + const uint32_t *counts, cf_vector *key_vec, cf_vector *value_vec, + uint32_t *gen_list) +{ + if (! msg_msgpack_list_get_buf_array_presized(m, AS_SMD_MSG_KEY_LIST, + key_vec)) { + cf_warning(AS_SMD, "KEY_LIST invalid"); + return false; + } + + msg_msgpack_list_get_buf_array_presized(m, AS_SMD_MSG_VALUE_LIST, + value_vec); + + uint32_t check = sm->num_items; + + if (! msg_msgpack_list_get_uint32_array(m, AS_SMD_MSG_GEN_LIST, gen_list, + &check) || check != sm->num_items) { + cf_warning(AS_SMD, "GEN_LIST invalid with count %u num_items %u", check, sm->num_items); + return false; + } + + if (msg_get_uint64_array_count(m, AS_SMD_MSG_TIMESTAMP, &check) != 0 || + check != sm->num_items) { + cf_warning(AS_SMD, "TIMESTAMP invalid with count %u num_items %u", check, sm->num_items); + return false; + } + + sm->items = as_smd_item_list_create(sm->num_items); + + uint32_t msg_idx = 0; + + for (uint32_t i = 0; i < cf_vector_size(mod_vec); i++) { + const msg_buf_ele *p_mod = cf_vector_getp((cf_vector *)mod_vec, i); + + for (uint32_t j = 0; j < counts[i]; j++) { + as_smd_item_t *item = sm->items->item[msg_idx]; + + item->node_id = sm->node_id; + item->module_name = cf_strndup((const char *)p_mod->ptr, p_mod->sz); + + const msg_buf_ele *p_key = cf_vector_getp(key_vec, msg_idx); + const msg_buf_ele *p_value = (msg_idx < cf_vector_size(value_vec)) ? + cf_vector_getp(value_vec, msg_idx) : NULL; + + if (! p_key->ptr) { + cf_warning(AS_SMD, "invalid packed key at %u/%u", msg_idx, sm->num_items); + return false; + } + + item->key = cf_strndup((const char *)p_key->ptr, p_key->sz); + item->value = (p_value && p_value->ptr) ? + cf_strndup((const char *)p_value->ptr, p_value->sz) : NULL; + + item->generation = gen_list[msg_idx]; + msg_get_uint64_array(m, AS_SMD_MSG_TIMESTAMP, msg_idx, + &item->timestamp); + + item->action = item->value ? + AS_SMD_ACTION_SET : AS_SMD_ACTION_DELETE; + + msg_idx++; + } + } + + return true; +} + +// New message protocol. 
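+// Items are packed as a MODULE_LIST of module names plus a parallel
+// MODULE_COUNTS list, with the keys, values and generations of all modules
+// flattened, in module order, into the KEY_LIST / VALUE_LIST / GEN_LIST
+// msgpack fields (timestamps ride in the TIMESTAMP array). A single item for
+// a named module is instead packed in the SINGLE_* fields. A NULL value
+// denotes a delete.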
+static bool +smd_new_create_msg_event(as_smd_msg_t *sm, cf_node node_id, msg *m) +{ + uint32_t counts[SMD_MAX_STACK_MODULES]; + cf_vector_define(mod_vec, sizeof(msg_buf_ele), SMD_MAX_STACK_MODULES, 0); + + if (sm->op == AS_SMD_MSG_OP_ACCEPT_THIS_METADATA) { + sm->options = AS_SMD_ACCEPT_OPT_MERGE; + } + else if (sm->op == AS_SMD_MSG_OP_SET_FROM_PR) { + sm->op = AS_SMD_MSG_OP_ACCEPT_THIS_METADATA; + sm->options = AS_SMD_ACCEPT_OPT_API; + } + + if (sm->module_name) { + // Check single item optimized packing. + char *key; + + if (msg_get_str(m, AS_SMD_MSG_SINGLE_KEY, &key, NULL, + MSG_GET_DIRECT) == 0) { + sm->num_items = 1; + + sm->items = as_smd_item_list_create(1); + + as_smd_item_t *item = sm->items->item[0]; + + item->node_id = node_id; + item->module_name = cf_strdup(sm->module_name); + item->key = cf_strdup(key); + msg_get_str(m, AS_SMD_MSG_SINGLE_VALUE, &item->value, NULL, + MSG_GET_COPY_MALLOC); + msg_get_uint32(m, AS_SMD_MSG_SINGLE_GENERATION, &item->generation); + msg_get_uint64(m, AS_SMD_MSG_SINGLE_TIMESTAMP, &item->timestamp); + item->action = item->value ? + AS_SMD_ACTION_SET : AS_SMD_ACTION_DELETE; + + return true; + } + + if (! msg_msgpack_container_get_count(m, AS_SMD_MSG_KEY_LIST, + &sm->num_items) || sm->num_items == 0) { + sm->items = as_smd_item_list_create(0); + return true; + } + + msg_buf_ele ele = { + .sz = (uint32_t)strlen(sm->module_name), + .ptr = (uint8_t *)sm->module_name + }; + + cf_vector_append(&mod_vec, &ele); + counts[0] = sm->num_items; + } + else { + if (! msg_msgpack_container_get_count(m, AS_SMD_MSG_KEY_LIST, + &sm->num_items) || sm->num_items == 0) { + sm->items = as_smd_item_list_create(0); + return true; + } + + if (! msg_msgpack_list_get_buf_array_presized(m, AS_SMD_MSG_MODULE_LIST, + &mod_vec)) { + cf_warning(AS_SMD, "MODULE_LIST invalid"); + return false; + } + + if (cf_vector_size(&mod_vec) == 0) { + cf_warning(AS_SMD, "MODULE_LIST zero module names with num_items %u", sm->num_items); + return false; + } + + uint32_t check = SMD_MAX_STACK_MODULES; + + if (! msg_msgpack_list_get_uint32_array(m, AS_SMD_MSG_MODULE_COUNTS, + counts, &check) || + check != cf_vector_size(&mod_vec)) { + cf_warning(AS_SMD, "MODULE_COUNTS invalid with counts %u vector_size(mod_vec) %u", check, cf_vector_size(&mod_vec)); + return false; + } + + uint32_t total_check = 0; + + for (uint32_t i = 0; i < cf_vector_size(&mod_vec); i++) { + total_check += counts[i]; + } + + if (total_check != sm->num_items) { + cf_warning(AS_SMD, "MODULE_COUNTS total %u does not match num_items %u", total_check, sm->num_items); + return false; + } + } + + if (sm->num_items < SMD_MAX_STACK_NUM_ITEMS) { + uint32_t gen_list[sm->num_items]; + cf_vector_define(key_vec, sizeof(msg_buf_ele), sm->num_items, 0); + cf_vector_define(value_vec, sizeof(msg_buf_ele), sm->num_items, 0); + + return smd_msg_read_items(sm, m, &mod_vec, counts, &key_vec, &value_vec, + gen_list); + } + + cf_vector key_vec; + cf_vector value_vec; + uint32_t *gen_list = cf_malloc(sizeof(uint32_t) * sm->num_items); + + cf_vector_init(&key_vec, sizeof(msg_buf_ele), sm->num_items, 0); + cf_vector_init(&value_vec, sizeof(msg_buf_ele), sm->num_items, 0); + + bool ret = smd_msg_read_items(sm, m, &mod_vec, counts, &key_vec, &value_vec, + gen_list); + + cf_vector_destroy(&key_vec); + cf_vector_destroy(&value_vec); + cf_free(gen_list); + + return ret; +} + +/* + * Allocate a System Metadata msg event object to handle an incoming SMD fabric msg. + * + * Release using "as_smd_destroy_event()". 
+ */
+static as_smd_event_t *
+as_smd_old_create_msg_event(as_smd_msg_op_t op, cf_node node_id, msg *msg)
+{
+	as_smd_event_t *evt = NULL;
+	int e = 0;
+
+	// Allocate an event object and initialize it as a msg.
+	evt = (as_smd_event_t *) cf_calloc(1, sizeof(as_smd_event_t));
+	evt->type = AS_SMD_MSG;
+	as_smd_msg_t *smd_msg = &(evt->u.msg);
+
+	smd_msg->op = op;
+	smd_msg->node_id = node_id;
+
+	if ((e = msg_get_uint64(msg, AS_SMD_MSG_CLUSTER_KEY, &(smd_msg->cluster_key)))) {
+		cf_warning(AS_SMD, "failed to get cluster key from System Metadata fabric msg (err %d)", e);
+		cf_free(evt);
+		return NULL;
+	}
+
+	if ((e = msg_get_str(msg, AS_SMD_MSG_MODULE_NAME, &(smd_msg->module_name), 0, MSG_GET_COPY_MALLOC))) {
+		cf_debug(AS_SMD, "failed to get module name from System Metadata fabric msg (err %d)", e);
+	}
+
+	if (msg_get_uint32(msg, AS_SMD_MSG_NUM_ITEMS, &smd_msg->num_items) != 0) {
+		if (! smd_new_create_msg_event(smd_msg, node_id, msg)) {
+			as_smd_destroy_event(evt);
+			return NULL;
+		}
+
+		return evt;
+	}
+
+	as_smd_destroy_event(evt);
+	return NULL;
+}
+
+
+/* Memory release functions for object types passed to the callback functions. */
+
+
+/*
+ * Release a reference-counted metadata item.
+ * (Note: This is *not* a public API.)
+ */
+static void as_smd_item_destroy(as_smd_item_t *item)
+{
+	if (item) {
+		if (!cf_rc_release(item)) {
+			RELEASE_ITEM_MEMBERS(item);
+			cf_rc_free(item);
+		}
+	}
+}
+
+/*
+ * Allocate an empty list to contain metadata items.
+ * (Note: This is *not* a public API.)
+ */
+static as_smd_item_list_t *as_smd_item_list_alloc(size_t num_items)
+{
+	as_smd_item_list_t *item_list = (as_smd_item_list_t *)
+			cf_malloc(sizeof(as_smd_item_list_t) + num_items * sizeof(as_smd_item_t *));
+
+	item_list->num_items = num_items;
+	memset(item_list->item, 0, num_items * sizeof(as_smd_item_t *));
+
+	return item_list;
+}
+
+/*
+ * Create an empty list of reference-counted metadata items.
+ * (Note: This is a public API for creating merge callback function arguments.)
+ */
+as_smd_item_list_t *as_smd_item_list_create(size_t num_items)
+{
+	as_smd_item_list_t *item_list = as_smd_item_list_alloc(num_items);
+
+	// Use num_items to count the number of successfully allocated items.
+	item_list->num_items = 0;
+	for (int i = 0; i < num_items; i++) {
+		item_list->item[i] = (as_smd_item_t *) cf_rc_alloc(sizeof(as_smd_item_t));
+		memset(item_list->item[i], 0, sizeof(as_smd_item_t));
+		item_list->num_items++;
+	}
+
+	return item_list;
+}
+
+/*
+ * Release a list of reference-counted metadata items.
+ * (Note: This is a public API for releasing merge callback function arguments.)
+ */
+void as_smd_item_list_destroy(as_smd_item_list_t *items)
+{
+	if (items) {
+		for (int i = 0; i < items->num_items; i++) {
+			as_smd_item_destroy(items->item[i]);
+			items->item[i] = NULL;
+		}
+		cf_free(items);
+	}
+}
+
+/*
+ * Release a System Metadata event object (either a cmd or a msg.)
+ */
+static void as_smd_destroy_event(as_smd_event_t *evt)
+{
+	if (evt) {
+		if (AS_SMD_CMD == evt->type) {
+			as_smd_cmd_t *cmd = &(evt->u.cmd);
+
+			// Give back the item reference if necessary.
+			as_smd_item_destroy(cmd->item);
+			cmd->item = NULL;
+		} else if (AS_SMD_MSG == evt->type) {
+			as_smd_msg_t *msg = &(evt->u.msg);
+
+			// Release the module name.
+			if (msg->module_name) {
+				cf_free(msg->module_name);
+				msg->module_name = NULL;
+			}
+
+			// Release the msg item list.
+ as_smd_item_list_destroy(msg->items); + msg->num_items = 0; + msg->items = NULL; + } else { + cf_warning(AS_SMD, "not destroying unknown type of System Metadata event (%d)", evt->type); + return; + } + + // Release the event itself. + cf_free(evt); + } else { + cf_warning(AS_SMD, "not freeing NULL System Metadata event"); + } +} + +/* + * Send an event to the System Metadata thread via the message queue. + */ +static int as_smd_send_event(as_smd_t *smd, as_smd_event_t *evt) +{ + if (!smd) { + cf_warning(AS_SMD, "System Metadata is not initialized ~~ Not sending event!"); + as_smd_destroy_event(evt); + return -1; + } + + cf_queue_push(smd->msgq, &evt); + + return 0; +} + + +/* System Metadata Module Init / Start / Shutdown API */ + + +/* + * Free a module object from the modules rchash table. + */ +static void modules_rchash_destructor_fn(void *object) +{ + as_smd_module_t *module_obj = (as_smd_module_t *) object; + + cf_debug(AS_SMD, "mrdf(%p) [module \"%s\"] called!", object, module_obj->module); + + // Ensure that the module's callbacks cannot be called again. + module_obj->merge_cb = module_obj->merge_udata = NULL; + module_obj->conflict_cb = module_obj->conflict_udata = NULL; + module_obj->accept_cb = module_obj->accept_udata = NULL; + module_obj->can_accept_cb = module_obj->can_accept_udata = NULL; + + // Release the module's JSON if necessary. + json_decref(module_obj->json); + module_obj->json = NULL; + + // Free the module's name. + CF_FREE_AND_NULLIFY(module_obj->module); + + // Free both of the module's metadata hash tables. + cf_rchash_destroy(module_obj->my_metadata); + cf_rchash_destroy(module_obj->external_metadata); +} + +/* + * Free a metadata item from the metadata rchash table. + */ +static void metadata_rchash_destructor_fn(void *object) +{ + as_smd_item_t *item = (as_smd_item_t *) object; + + cf_debug(AS_SMD, "mdrdf(%p) [key \"%s\"] called!", object, item->key); + + // Free up the members of the item. + RELEASE_ITEM_MEMBERS(item); +} + +/* + * Handle a cluster state change event notification from as_exchange. + */ +static void as_smd_cluster_state_changed_fn(const as_exchange_cluster_changed_event *event, void *udata) +{ + as_smd_t *smd = (as_smd_t *) udata; + + cf_debug(AS_SMD, "Received cluster state changed event!"); + + size_t succession_size = event->cluster_size * sizeof(cf_node); + cf_node *succession = cf_malloc(succession_size); + + memcpy(succession, event->succession, succession_size); + + // Send a Cluster Changed command to the System Metadata thread. + as_smd_send_event(smd, as_smd_create_cmd_event(AS_SMD_CMD_CLUSTER_CHANGED, event->cluster_key, event->cluster_size, succession)); +} + +/* + * Create and initialize a System Metadata module. (Local method for now.) + */ +static as_smd_t *as_smd_create(void) +{ + as_smd_t *smd = (as_smd_t *) cf_calloc(1, sizeof(as_smd_t)); + + // Go to the not yet initialized state. + smd->state = AS_SMD_STATE_IDLE; + + // Create the System Metadata modules hash table. + cf_rchash_create(&(smd->modules), cf_rchash_fn_fnv32, modules_rchash_destructor_fn, 0, 127, CF_RCHASH_BIG_LOCK); + + // Create the scoreboard hash table. + smd->scoreboard = cf_shash_create(cf_shash_fn_ptr, sizeof(cf_node), sizeof(cf_shash *), 127, CF_SHASH_BIG_LOCK); + + // Create the System Metadata message queue. + smd->msgq = cf_queue_create(sizeof(as_smd_event_t *), true); + + cf_queue_init(&smd->pending_merge_queue, sizeof(smd_pending_merge), 128, false); + + // Create the System Metadata thread. 
+ + if (pthread_attr_init(&(smd->thr_attr))) { + cf_crash(AS_SMD, "failed to initialize the System Metadata thread attributes"); + } + + if (pthread_create(&(smd->thr_id), &(smd->thr_attr), as_smd_thr, smd)) { + cf_crash(AS_SMD, "failed to create the System Metadata thread"); + } + + // Send an INIT message to the System Metadata thread. + if (as_smd_send_event(smd, as_smd_create_cmd_event(AS_SMD_CMD_INIT))) { + cf_crash(AS_SMD, "failed to send INIT message to System Metadata thread"); + } + + return smd; +} + +/* + * Initialize the single global System Metadata module. + */ +as_smd_t *as_smd_init(void) +{ + // This is here only because we happen to use the absence of the old + // sindex SMD files as proof of a proper live jump from v3 to v5. We'll + // need to keep this around for a long time - perhaps move it to a + // better place when SMD is overhauled. + + char smd_path[MAX_PATH_LEN]; + char smd_save_path[MAX_PATH_LEN]; + + snprintf(smd_path, MAX_PATH_LEN, "%s/smd/%s.smd", g_config.work_directory, OLD_SINDEX_MODULE); + snprintf(smd_save_path, MAX_PATH_LEN, "%s.save", smd_path); + + struct stat buf; + bool both_gone = + stat(smd_path, &buf) != 0 && errno == ENOENT && + stat(smd_save_path, &buf) != 0 && errno == ENOENT; + + if (! both_gone) { + cf_crash_nostack(AS_SMD, + "Aerospike server was not properly switched to paxos-protocol v5 - " + "see Aerospike documentation http://www.aerospike.com/docs/operations/upgrade/cluster_to_3_13"); + } + + if (! g_smd) { + g_smd = as_smd_create(); + } else { + cf_warning(AS_SMD, "System Metadata is already initialized"); + } + + return g_smd; +} + +/* + * Convert an incoming fabric message into the corresponding msg event and post it to the System Metadata message queue. + */ +static int as_smd_msgq_push(cf_node node_id, msg *msg, void *udata) +{ + as_smd_t *smd = (as_smd_t *) udata; + + cf_debug(AS_SMD, "asmp(): Receiving a System Metadata message from node %016lX", node_id); + + // Make sure System Metadata is running before processing msg. + if (smd && smd->state != AS_SMD_STATE_RUNNING) { + cf_warning(AS_SMD, "System Metadata not initialized ~~ Ignoring incoming fabric msg!"); + return -1; + } + + // Verify the System Metadata fabric protocol version. + uint32_t version; + int e = msg_get_uint32(msg, AS_SMD_MSG_ID, &version); + if (0 > e) { + cf_warning(AS_SMD, "failed to get protocol version from System Metadata fabric msg"); + return -1; + } else if (AS_SMD_MSG_V2_IDENTIFIER != version) { + cf_warning(AS_SMD, "received System Metadata fabric msg for unknown protocol version (read: %d ; expected: %d) ~~ Ignoring message!", + version, AS_SMD_MSG_V2_IDENTIFIER); + return -1; + } + + // Extract the operation from the incoming fabric msg. + uint32_t op = 0; + msg_get_uint32(msg, AS_SMD_MSG_OP, &op); + + cf_debug(AS_SMD, "Operation received %s", AS_SMD_MSG_OP_NAME(op)); + + // Create a System Metadata msg event object and populate it from the fabric msg. + as_smd_event_t *evt = as_smd_old_create_msg_event(op, node_id, msg); + + cf_assert(evt, AS_SMD, "failed to create a System Metadata msg event"); + + // Send the msg event to the System Metadata thread. + return as_smd_send_event(smd, evt); +} + +/* + * Receiver function for System Metadata fabric transactions. 
+ */
+static int as_smd_transact_recv_fn(cf_node node_id, msg *msg, void *transact_data, void *udata)
+{
+	as_smd_t *smd = (as_smd_t *) udata;
+	int retval = 0;
+
+	cf_debug(AS_SMD, "astrf(): node %016lX (%s) received SMD transaction from node %016lX (%s)",
+			g_config.self_node, (as_smd_principal() == g_config.self_node ? "SMD principal" : "regular node"),
+			node_id, (as_smd_principal() == node_id ? "SMD principal" : "regular node"));
+
+	// Send the received msg to the System Metadata thread.
+	if ((retval = as_smd_msgq_push(node_id, msg, smd))) {
+		cf_warning(AS_SMD, "failed to push received transact msg (retval %d)", retval);
+	}
+
+	// Complete the transaction by replying to the received msg.
+	msg_reset(msg);
+	as_fabric_transact_reply(msg, transact_data);
+
+	return retval;
+}
+
+/*
+ * Start the System Metadata module to begin receiving cluster state change events.
+ */
+int as_smd_start(as_smd_t *smd)
+{
+	// Register System Metadata fabric transact message type.
+	if (as_fabric_transact_register(M_TYPE_SMD, as_smd_msg_template,
+			sizeof(as_smd_msg_template), AS_SMD_MSG_SCRATCH_SIZE,
+			as_smd_transact_recv_fn, smd)) {
+		cf_crash(AS_SMD, "Failed to register System Metadata fabric transact msg type!");
+	}
+
+	// Register to receive cluster state changed events.
+	as_exchange_register_listener(as_smd_cluster_state_changed_fn, (void *)smd);
+
+	// Send a START message to the System Metadata thread.
+	int retval = 0;
+	if ((retval = as_smd_send_event(smd, as_smd_create_cmd_event(AS_SMD_CMD_START)))) {
+		cf_crash(AS_SMD, "failed to send START message to System Metadata thread");
+	}
+
+	return retval;
+}
+
+/*
+ * Terminate the System Metadata module.
+ */
+int as_smd_shutdown(as_smd_t *smd)
+{
+	// Send a SHUTDOWN message to the System Metadata thread.
+	return as_smd_send_event(smd, as_smd_create_cmd_event(AS_SMD_CMD_SHUTDOWN));
+}
+
+
+/*
+ * Public System Metadata Manipulation API Functions:
+ * These functions are executed in the context of a module using System Metadata.
+ */
+
+
+/*
+ * Create a container for the named module's metadata and register the policy callback functions.
+ * (Pass a NULL callback function pointer to select the default policy.)
+ */
+int as_smd_create_module(char *module,
+		as_smd_merge_cb merge_cb, void *merge_udata,
+		as_smd_conflict_cb conflict_cb, void *conflict_udata,
+		as_smd_accept_cb accept_cb, void *accept_udata,
+		as_smd_can_accept_cb can_accept_cb, void *can_accept_udata)
+{
+	// Send a CREATE command to the System Metadata thread.
+	return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_CREATE_MODULE, module,
+			merge_cb, merge_udata, conflict_cb, conflict_udata,
+			accept_cb, accept_udata, can_accept_cb, can_accept_udata));
+}
+
+/*
+ * Destroy the container for the named module's metadata, releasing all of its metadata.
+ */
+int as_smd_destroy_module(char *module)
+{
+	// Send a DESTROY command to the System Metadata thread.
+	return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_DESTROY_MODULE, module));
+}
+
+/*
+ * Add a new, or modify an existing, metadata item in an existing module.
+ */
+int as_smd_set_metadata(char *module, char *key, char *value)
+{
+	// Send a SET command to the System Metadata thread.
+	return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_SET_METADATA, module, key, value, 0, 0UL));
+}
+
+/*
+ * Add a new, or modify an existing, metadata item (with generation and timestamp) in an existing module.
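+ *
+ * Example (hypothetical values ~~ for illustration only):
+ *
+ *    as_smd_set_metadata_gen_ts("my-module", "my-key", "my-value", 3, 1520000000000UL);
+ *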
+ * (Note: This is an internal-only function, not available via the public SMD API.)
+ */
+int as_smd_set_metadata_gen_ts(char *module, char *key, char *value, uint32_t generation, uint64_t timestamp)
+{
+	// Send a SET command to the System Metadata thread.
+	return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_SET_METADATA, module, key, value, generation, timestamp));
+}
+
+/*
+ * Delete an existing metadata item from an existing module.
+ */
+int as_smd_delete_metadata(char *module, char *key)
+{
+	// Send a DELETE command to the System Metadata thread.
+	return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_DELETE_METADATA, module, key));
+}
+
+/*
+ * Retrieve metadata item(s). (Pass NULL for module and/or key for "all".)
+ */
+int as_smd_get_metadata(char *module, char *key, as_smd_get_cb cb, void *udata)
+{
+	// Send a GET command to the System Metadata thread.
+	return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_GET_METADATA, module, key, cb, udata));
+}
+
+
+/*
+ * Info Command Functions:
+ * These functions are executed in the context of the Info system.
+ */
+
+
+/*
+ * Reduce function to print a single metadata item.
+ */
+static int as_smd_metadata_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+//	char *smd_key = (char *) key;   // (Not used.)
+	as_smd_item_t *item = (as_smd_item_t *) object;
+
+	cf_info(AS_SMD, "%016lX\t\"%s\"\t\"%s\"\t\"%s\"\t%u\t\t%lu", item->node_id, item->module_name, item->key, item->value, item->generation, item->timestamp);
+
+	return 0;
+}
+
+/*
+ * Reduce function to print info. about a single System Metadata module.
+ */
+static int as_smd_dump_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+	const char *module = (const char *) key;
+	as_smd_module_t *module_obj = (as_smd_module_t *) object;
+	int *module_num = (int *) udata;
+	int num_items = 0;
+
+	cf_info(AS_SMD, "Module %d: \"%s\" [\"%s\"]: ", (*module_num)++, module, module_obj->module);
+	cf_info(AS_SMD, "merge cb: %p", module_obj->merge_cb);
+	cf_info(AS_SMD, "merge udata: %p", module_obj->merge_udata);
+	cf_info(AS_SMD, "conflict cb: %p", module_obj->conflict_cb);
+	cf_info(AS_SMD, "conflict udata: %p", module_obj->conflict_udata);
+	cf_info(AS_SMD, "accept cb: %p", module_obj->accept_cb);
+	cf_info(AS_SMD, "accept udata: %p", module_obj->accept_udata);
+	cf_info(AS_SMD, "can accept cb: %p", module_obj->can_accept_cb);
+	cf_info(AS_SMD, "can accept udata: %p", module_obj->can_accept_udata);
+
+	cf_info(AS_SMD, "My Metadata:");
+	cf_info(AS_SMD, "number of metadata items: %d", num_items = cf_rchash_get_size(module_obj->my_metadata));
+	if (num_items) {
+		cf_info(AS_SMD, "Node ID\t\tModule\tKey\tValue\t\tGeneration\tTimestamp");
+		cf_rchash_reduce(module_obj->my_metadata, as_smd_metadata_reduce_fn, NULL);
+	}
+
+	cf_info(AS_SMD, "External Metadata:");
+	cf_info(AS_SMD, "number of metadata items: %d", num_items = cf_rchash_get_size(module_obj->external_metadata));
+	if (num_items) {
+		cf_info(AS_SMD, "Node ID\t\tModule\tKey\tValue\t\tGeneration\tTimestamp");
+		cf_rchash_reduce(module_obj->external_metadata, as_smd_metadata_reduce_fn, NULL);
+	}
+
+	return 0;
+}
+
+/*
+ * Print info. about the System Metadata state to the log.
+ * (Verbose event option prints detailed info. about the metadata values.)
+ */
+void as_smd_dump_metadata(as_smd_t *smd, as_smd_cmd_t *cmd)
+{
+	// Print info. about the System Metadata system.
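+	// (Global state first; per-module detail follows below only when the
+	// VERBOSE command option is set.)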
+ cf_info(AS_SMD, "System Metadata Status:"); + cf_info(AS_SMD, "-----------------------"); + cf_info(AS_SMD, "thr_id: 0x%lx", smd->thr_id); + cf_info(AS_SMD, "thr_attr: %p", &smd->thr_attr); + cf_info(AS_SMD, "state: %s", AS_SMD_STATE_NAME(smd->state)); + cf_info(AS_SMD, "number of modules: %d", cf_rchash_get_size(smd->modules)); + cf_info(AS_SMD, "number of pending messages in queue: %d", cf_queue_sz(smd->msgq)); + + // If verbose, dump info. about the metadata itself. + if (cmd->options & AS_SMD_CMD_OPT_VERBOSE) { + int module_num = 0; + cf_rchash_reduce(smd->modules, as_smd_dump_reduce_fn, &module_num); + } +} + +/* + * Print info. about the System Metadata state to the log. + * (Verbose true prints detailed info. about the metadata values.) + */ +void as_smd_dump(bool verbose) +{ + // Send an INTERNAL + DUMP_SMD + verbosity command to the System Metadata thread. + as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_INTERNAL, + (AS_SMD_CMD_OPT_DUMP_SMD | (verbose ? AS_SMD_CMD_OPT_VERBOSE : 0)))); +} + +/* + * Callback used to receive System Metadata items requested via the Info SMD "get" command. + */ +static int as_smd_info_get_fn(char *module, as_smd_item_list_t *items, void *udata) +{ + for (int i = 0; i < items->num_items; i++) { + as_smd_item_t *item = items->item[i]; + cf_info(AS_SMD, "SMD Info get metadata item[%d]: module \"%s\" ; key \"%s\" ; value \"%s\" ; generation %u ; timestamp %lu", + i, item->module_name, item->key, item->value, item->generation, item->timestamp); + } + + return 0; +} + +/* + * Manipulate the System Metadata and log the result. + */ +void as_smd_info_cmd(char *cmd, cf_node node_id, char *module, char *key, char *value) +{ + int retval = 0; + + // Invoke the appropriate System Metadata API function. + + if (!strcmp(cmd, "create")) { + if ((retval = as_smd_create_module(module, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL))) { + cf_warning(AS_SMD, "System Metadata create module \"%s\" failed (retval %d)", module, retval); + } + } else if (!strcmp(cmd, "destroy")) { + if ((retval = as_smd_destroy_module(module))) { + cf_warning(AS_SMD, "System Metadata destroy module \"%s\" failed (retval %d)", module, retval); + } + } else if (!strcmp(cmd, "set")) { + if (((retval = as_smd_set_metadata(module, key, value)))) { + cf_warning(AS_SMD, "System Metadata set item: module: \"%s\" key: \"%s\" value: \"%s\" failed (retval %d)", module, key, value, retval); + } + } else if (!strcmp(cmd, "delete")) { + if (((retval = as_smd_delete_metadata(module, key)))) { + cf_warning(AS_SMD, "System Metadata delete item: module: \"%s\" key: \"%s\" failed (retval %d)", module, key, retval); + } + } else if (!strcmp(cmd, "get")) { + if ((retval = as_smd_get_metadata(module, key, as_smd_info_get_fn, NULL))) { + cf_warning(AS_SMD, "System Metadata get node: %016lX module: \"%s\" key: \"%s\" failed (retval %d)", node_id, module, key, retval); + } + } else if (!strcmp(cmd, "init")) { + as_smd_init(); + } else if (!strcmp(cmd, "start")) { + if (g_smd) { + if ((retval = as_smd_start(g_smd))) { + cf_warning(AS_SMD, "System Metadata start up failed (retval %d)", retval); + } + } else { + cf_warning(AS_SMD, "System Metadata is not initialized"); + } + } else if (!strcmp(cmd, "shutdown")) { + if (g_smd) { + as_smd_shutdown(g_smd); + } else { + cf_warning(AS_SMD, "System Metadata is not initialized"); + } + } else { + cf_warning(AS_SMD, "unknown System Metadata command: \"%s\"", cmd); + } +} + + +/* + * System Metadata Internals: + * These functions are executed in the context of the 
+ * except for the fabric callbacks.
+ */
+
+
+/* Metadata persistence functions. */
+
+
+/*
+ * Read in metadata for the given module from the standard location.
+ * Return: 0 if successful, -1 otherwise.
+ */
+static int as_smd_read(char *module, json_t **module_smd)
+{
+	int retval = 0;
+	json_t *root = NULL;
+
+	char smd_path[MAX_PATH_LEN];
+	size_t load_flags = JSON_REJECT_DUPLICATES;
+	json_error_t json_error;
+
+	snprintf(smd_path, MAX_PATH_LEN, "%s/smd/%s.smd", g_config.work_directory, module);
+
+	// Check if the persisted metadata file exists before attempting to read it.
+	struct stat buf;
+	if (!stat(smd_path, &buf)) {
+		if (!(root = json_load_file(smd_path, load_flags, &json_error))) {
+			cf_warning(AS_SMD, "failed to load System Metadata for module \"%s\" from file \"%s\" with JSON error: %s ; source: %s ; line: %d ; column: %d ; position: %d",
+					module, smd_path, json_error.text, json_error.source, json_error.line, json_error.column, json_error.position);
+			retval = -1;
+		}
+	} else {
+		cf_debug(AS_SMD, "failed to read persisted System Metadata file \"%s\" for module \"%s\": %s (%d)", smd_path, module, cf_strerror(errno), errno);
+	}
+
+	if (module_smd) {
+		*module_smd = root;
+	}
+
+	return retval;
+}
+
+/*
+ * Write out metadata for the given module to the standard location.
+ * Return: 0 if successful, -1 otherwise.
+ *
+ * Note: The metadata is first dumped to a temporary ".save" file, which is
+ * then renamed over the module's metadata file, so that a partial write
+ * cannot clobber the previous contents in case of system failure.
+ */
+static int as_smd_write(char *module, json_t *module_smd)
+{
+	int retval = 0;
+
+	char smd_path[MAX_PATH_LEN];
+	char smd_save_path[MAX_PATH_LEN];
+	size_t dump_flags = JSON_INDENT(3) | JSON_ENSURE_ASCII | JSON_PRESERVE_ORDER;
+
+	snprintf(smd_path, MAX_PATH_LEN, "%s/smd/%s.smd", g_config.work_directory, module);
+	snprintf(smd_save_path, MAX_PATH_LEN, "%s.save", smd_path);
+
+	if (json_dump_file(module_smd, smd_save_path, dump_flags) < 0) {
+		cf_warning(AS_SMD, "failed to dump System Metadata for module \"%s\" to file \"%s\": %s (%d)", module, smd_save_path, cf_strerror(errno), errno);
+		return -1;
+	}
+
+	if (rename(smd_save_path, smd_path) != 0) {
+		cf_warning(AS_SMD, "failed to rename metadata file \"%s\" to \"%s\": %s (%d)", smd_save_path, smd_path, cf_strerror(errno), errno);
+		return -1;
+	}
+
+	return retval;
+}
+
+/*
+ * Load persisted System Metadata for the given module:
+ * Read the module's JSON file (if it exists) and add each metadata item found therein.
+ * Return: The number of metadata items restored (which may be 0) if reading
+ * the metadata file was successful, -1 otherwise.
+ */
+static int as_smd_module_restore(as_smd_module_t *module_obj)
+{
+	int retval = 0;
+
+	// Load the module's metadata (if persisted.)
+	if ((retval = as_smd_read(module_obj->module, &(module_obj->json)))) {
+		cf_warning(AS_SMD, "failed to read persisted System Metadata for module \"%s\"", module_obj->module);
+		return -1;
+	}
+
+	size_t num_items = json_array_size(module_obj->json);
+	for (int i = 0; i < num_items; i++) {
+		json_t *json_item = json_array_get(module_obj->json, i);
+
+		if (!json_is_object(json_item)) {
+			// Warn and skip the bad item.
+			cf_warning(AS_SMD, "non-JSON object %d of type %d in persisted System Metadata for module \"%s\" ~~ Skipping!", i, json_typeof(json_item), module_obj->module);
+			continue;
+		}
+
+		size_t num_fields = json_object_size(json_item);
+		if (5 != num_fields) {
+			// Warn if the item doesn't have the right number of fields.
+ cf_warning(AS_SMD, "wrong number of fields %zu (expected 5) for object %d in persisted System Metadata for module \"%s\"", num_fields, i, module_obj->module); + } + + char *module = (char *) json_string_value(json_object_get(json_item, "module")); + if (!module) { + cf_warning(AS_SMD, "missing \"module\" for object %d in persisted System Metadata for module \"%s\" ~~ Skipping!", i, module_obj->module); + continue; + } else if (strcmp(module_obj->module, module)) { + cf_warning(AS_SMD, "incorrect module \"%s\" for object %d in persisted System Metadata for module \"%s\" ~~ Skipping!", module, i, module_obj->module); + continue; + } + + char *key = (char *) json_string_value(json_object_get(json_item, "key")); + if (!key) { + cf_warning(AS_SMD, "missing \"key\" for object %d in persisted System Metadata for module \"%s\" ~~ Skipping!", i, module_obj->module); + continue; + } + + char *value = (char *) json_string_value(json_object_get(json_item, "value")); + if (!value) { + cf_warning(AS_SMD, "missing \"value\" for object %d in persisted System Metadata for module \"%s\" ~~ Skipping!", i, module_obj->module); + continue; + } + + // [Note: Should really use uint32_t, but Jansson integers are longs.] + uint64_t generation = 1; + json_t *generation_obj = json_object_get(json_item, "generation"); + if (!generation_obj) { + cf_warning(AS_SMD, "missing \"generation\" for object %d in persisted System Metadata for module \"%s\" ~~ Using 1!", i, module_obj->module); + } else { + if (0 == (generation = json_integer_value(generation_obj))) { + cf_warning(AS_SMD, "bad \"generation\" for object %d in persisted System Metadata for module \"%s\" ~~ Using 1!", i, module_obj->module); + generation = 1; + } + } + + uint64_t timestamp = cf_getms(); + json_t *timestamp_obj = json_object_get(json_item, "timestamp"); + if (!timestamp_obj) { + cf_warning(AS_SMD, "missing \"timestamp\" for object %d in persisted System Metadata for module \"%s\" ~~ Using now!", i, module_obj->module); + } else { + if (0 == (timestamp = json_integer_value(timestamp_obj))) { + cf_warning(AS_SMD, "bad \"timestamp\" for object %d in persisted System Metadata for module \"%s\" ~~ Using now!", i, module_obj->module); + timestamp = cf_getms(); + } + } + + // Send the item metadata add command. + as_smd_set_metadata_gen_ts(module, key, value, generation, timestamp); + + // Another metadata item was successfully restored. + retval++; + } + + // Release the module's JSON if necessary. + json_decref(module_obj->json); + module_obj->json = NULL; + + return retval; +} + +/* + * Serialize a single metadata item into a JSON object and add it to the array passed in via "udata". + */ +static int as_smd_serialize_into_json_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *smd_key = (char *) key; // (Not used.) + as_smd_item_t *item = (as_smd_item_t *) object; + json_t *array = (json_t *) udata; + json_t *metadata_obj = NULL; + + // Create an empty JSON object to hold the + if (!(metadata_obj = json_object())) { + cf_warning(AS_SMD, "failed to create JSON object to serialize metadata item: module \"%s\" ; key \"%s\"", item->module_name, item->key); + return 0; + } + + // Add each of the item's properties to the JSON object. 
+	int e = 0;
+	e += json_object_set_new(metadata_obj, "module", json_string(item->module_name));
+	e += json_object_set_new(metadata_obj, "key", json_string(item->key));
+	e += json_object_set_new(metadata_obj, "value", json_string(item->value));
+	e += json_object_set_new(metadata_obj, "generation", json_integer(item->generation));
+	e += json_object_set_new(metadata_obj, "timestamp", json_integer(item->timestamp));
+
+	if (e) {
+		cf_warning(AS_SMD, "failed to serialize fields of metadata item: module \"%s\" ; key \"%s\"", item->module_name, item->key);
+	} else {
+		if (json_array_append_new(array, metadata_obj)) {
+			cf_warning(AS_SMD, "failed to add to array metadata item: module \"%s\" ; key \"%s\"", item->module_name, item->key);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Persistently store System Metadata for the given module:
+ * Convert each of the module's metadata items into a JSON object and write an array of the results to the module's JSON file.
+ */
+static int as_smd_module_persist(as_smd_module_t *module_obj)
+{
+	int retval = 0;
+
+	// Avoid unnecessary writes.
+	if (!module_obj->dirty) {
+		return retval;
+	}
+
+	if (module_obj->json) {
+		cf_warning(AS_SMD, "module \"%s\" JSON is unexpectedly non-NULL (rc %zu) ~~ Nulling!", module_obj->module, module_obj->json->refcount);
+		json_decref(module_obj->json);
+		module_obj->json = NULL;
+	}
+
+	// Create an empty JSON array.
+	if (!(module_obj->json = json_array())) {
+		cf_warning(AS_SMD, "failed to create JSON array for persisting module \"%s\"", module_obj->module);
+		return -1;
+	}
+
+	// Walk the module's metadata hash table and create a JSON array of objects, one for each item.
+	cf_rchash_reduce(module_obj->my_metadata, as_smd_serialize_into_json_reduce_fn, module_obj->json);
+
+	// Store the module's metadata persistently if necessary.
+	if (module_obj->json && (retval = as_smd_write(module_obj->module, module_obj->json))) {
+		cf_warning(AS_SMD, "failed to write persisted System Metadata file for module \"%s\"", module_obj->module);
+		retval = -1;
+	} else {
+		// The module's SMD has been persisted.
+		module_obj->dirty = false;
+	}
+
+	// Release the module's JSON if necessary.
+	json_decref(module_obj->json);
+	module_obj->json = NULL;
+
+	return retval;
+}
+
+/*
+ * Create a metadata container for the given module.
+ */
+static int as_smd_module_create(as_smd_t *smd, as_smd_cmd_t *cmd)
+{
+	as_smd_item_t *item = cmd->item;
+	as_smd_module_t *module_obj;
+	int retval = 0;
+
+	cf_debug(AS_SMD, "System Metadata thread - creating module \"%s\"", item->module_name);
+
+	// Verify the module does not yet exist.
+	if (CF_RCHASH_OK == (retval = cf_rchash_get(smd->modules, item->module_name, strlen(item->module_name) + 1, (void **) &module_obj))) {
+		// (Note: This is not a problem ~~ May have come over the wire.)
+		cf_detail(AS_SMD, "System Metadata module \"%s\" already exists", item->module_name);
+
+		// Give back the reference.
+		cf_rc_release(module_obj);
+
+		return retval;
+	}
+
+	// Create the module object.
+	// [NB: Reference-counted for insertion in modules "rchash" table.]
+	module_obj = (as_smd_module_t *) cf_rc_alloc(sizeof(as_smd_module_t));
+	memset(module_obj, 0, sizeof(as_smd_module_t));
+
+	// Set the module's name.
+	module_obj->module = cf_strdup(item->module_name);
+
+	// Create the module's local metadata hash table.
+	cf_rchash_create(&(module_obj->my_metadata), cf_rchash_fn_fnv32, metadata_rchash_destructor_fn, 0, 127, CF_RCHASH_BIG_LOCK);
+
+	// Create the module's external metadata hash table.
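+	// (Keyed by node ID plus metadata key ~~ holds items received from other
+	// nodes while a cluster state change merge is in progress.)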
+	cf_rchash_create(&(module_obj->external_metadata), cf_rchash_fn_fnv32, metadata_rchash_destructor_fn, 0, 127, CF_RCHASH_BIG_LOCK);
+
+	// Add the module to the modules hash table.
+	if (CF_RCHASH_OK != (retval = cf_rchash_put_unique(smd->modules, item->module_name, strlen(item->module_name) + 1, module_obj))) {
+		cf_crash(AS_SMD, "failed to add System Metadata module \"%s\" to modules table (retval %d)", item->module_name, retval);
+	}
+
+	// Set the callback functions and their respective user data.
+	module_obj->merge_cb = cmd->a;
+	module_obj->merge_udata = cmd->b;
+	module_obj->conflict_cb = cmd->c;
+	module_obj->conflict_udata = cmd->d;
+	module_obj->accept_cb = cmd->e;
+	module_obj->accept_udata = cmd->f;
+	module_obj->can_accept_cb = cmd->g;
+	module_obj->can_accept_udata = cmd->h;
+
+	int num_items = as_smd_module_restore(module_obj);
+	if (0 > num_items) {
+		cf_warning(AS_SMD, "failed to restore persisted System Metadata for module \"%s\"", item->module_name);
+	}
+
+	// Set an empty metadata item, signifying the completion of module creation,
+	// including the restoration of zero or more persisted metadata items.
+	// (Will trigger an Accept callback with the OPT_CREATE accept option.)
+	if ((retval = as_smd_set_metadata(module_obj->module, NULL, NULL))) {
+		cf_warning(AS_SMD, "failed to send SMD module \"%s\" creation complete event", module_obj->module);
+	}
+
+	return retval;
+}
+
+/*
+ * Find or create a System Metadata module object.
+ * The name of the module can come from two places:
+ *  1. The message (if given).
+ *  2. The item itself.
+ *
+ * The message's module name is checked first; if the message is NULL or has
+ * no module name set, the item's module name is used instead.
+ */
+static as_smd_module_t *
+as_smd_module_get(as_smd_t *smd, as_smd_item_t *item, as_smd_msg_t *msg)
+{
+	as_smd_module_t *module_obj = NULL;
+	int retval = 0;
+
+	char *module_name = NULL;
+
+	// First check for a given message with the module name set.
+	if (msg && msg->module_name) {
+		module_name = msg->module_name;
+		cf_debug(AS_SMD, "asmg(): Name of module from message: \"%s\"", module_name);
+	}
+	else if (item && item->module_name) {
+		// Next, see if an item is passed and it has the module name set.
+		module_name = item->module_name;
+		cf_debug(AS_SMD, "asmg(): Name of module from the item: \"%s\"", module_name);
+	}
+	else {
+		// If neither the message nor the item carries a module name, we cannot do anything.
+		cf_debug(AS_SMD, "asmg(): No module name found!");
+		return NULL;
+	}
+
+	if (CF_RCHASH_OK != (retval = cf_rchash_get(smd->modules, module_name, strlen(module_name) + 1, (void **) &module_obj))) {
+		as_smd_cmd_t cmd;
+		as_smd_item_t fakeitem;
+		// Could not find the module object corresponding to the module name. Create one.
+		// Note: No policy callback will be set if the module is created on-the-fly.
+		//
+		// Ideally, we should not land in this situation at all ~~
+		// all the legal module objects should be created upfront.
+		// TODO: Should we warn/crash here instead of creating a new module?
+		memset(&cmd, 0, sizeof(as_smd_cmd_t));
+		fakeitem.module_name = module_name;   // Only the module name is used. All the callback pointers will be NULL.
+ cmd.type = AS_SMD_CMD_CREATE_MODULE; + cmd.item = &fakeitem; + if ((retval = as_smd_module_create(smd, &cmd))) { + cf_warning(AS_SMD, "failed to create System Metadata module \"%s\" (rv %d)", module_name, retval); + } else { + cf_debug(AS_SMD, "created System Metadata module \"%s\" on-the-fly", module_name); + + if (CF_RCHASH_OK != (retval = cf_rchash_get(smd->modules, module_name, strlen(module_name) + 1, (void **) &module_obj))) { + cf_crash(AS_SMD, "failed to get System Metadata module \"%s\" after creation (rv %d)", module_name, retval); + } + } + } + + return module_obj; +} + +/* + * Destroy a metadata container for the given module after releasing all contained metadata. + */ +static int as_smd_module_destroy(as_smd_t *smd, as_smd_cmd_t *cmd) +{ + as_smd_item_t *item = cmd->item; + int retval = 0; + + cf_debug(AS_SMD, "System Metadata thread - destroying module \"%s\"", item->module_name); + + // Remove the module's object from the hash table. + if (CF_RCHASH_OK != (retval = cf_rchash_delete(smd->modules, item->module_name, strlen(item->module_name) + 1))) { + cf_warning(AS_SMD, "failed to delete System Metadata module \"%s\" (retval %d)", item->module_name, retval); + return retval; + } + + return retval; +} + +static void +smd_msg_fill_items(msg *m, as_smd_item_t **items, uint32_t num_items, + cf_vector *key_vec, cf_vector *value_vec, uint32_t *gen_list) +{ + uint32_t value_count = 0; + + msg_set_uint64_array_size(m, AS_SMD_MSG_TIMESTAMP, num_items); + + for (uint32_t i = 0; i < num_items; i++) { + msg_buf_ele key_ele = { + .sz = (uint32_t)strlen(items[i]->key), + .ptr = (uint8_t *)items[i]->key + }; + + cf_vector_append(key_vec, &key_ele); + + msg_buf_ele value_ele = { + .ptr = (uint8_t *)items[i]->value + }; + + if (items[i]->value) { + value_ele.sz = (uint32_t)strlen(items[i]->value); + value_count++; + } + + cf_vector_append(value_vec, &value_ele); + + gen_list[i] = items[i]->generation; + msg_set_uint64_array(m, AS_SMD_MSG_TIMESTAMP, i, items[i]->timestamp); + } + + msg_msgpack_list_set_buf(m, AS_SMD_MSG_KEY_LIST, key_vec); + + if (value_count != 0) { + msg_msgpack_list_set_buf(m, AS_SMD_MSG_VALUE_LIST, value_vec); + } + + msg_msgpack_list_set_uint32(m, AS_SMD_MSG_GEN_LIST, gen_list, num_items); +} + +// New message protocol. +static msg * +smd_create_msg(as_smd_msg_op_t op, as_smd_item_t **items, uint32_t num_items, + const char *module_name, uint32_t accept_opt) +{ + msg *m = as_fabric_msg_get(M_TYPE_SMD); + + msg_set_uint32(m, AS_SMD_MSG_ID, AS_SMD_MSG_V2_IDENTIFIER); + msg_set_uint64(m, AS_SMD_MSG_CLUSTER_KEY, g_cluster_key); + + if (op == AS_SMD_MSG_OP_ACCEPT_THIS_METADATA && + (accept_opt & AS_SMD_ACCEPT_OPT_API) != 0) { + op = AS_SMD_MSG_OP_SET_FROM_PR; + } + else if (op == AS_SMD_MSG_OP_DELETE_ITEM) { + op = AS_SMD_MSG_OP_SET_ITEM; + } + + msg_set_uint32(m, AS_SMD_MSG_OP, op); + + if (module_name) { + msg_set_str(m, AS_SMD_MSG_MODULE_NAME, module_name, MSG_SET_COPY); + + // Single item optimized packing. + if (num_items == 1) { + msg_set_str(m, AS_SMD_MSG_SINGLE_KEY, items[0]->key, + MSG_SET_COPY); + + if (items[0]->value) { + msg_set_str(m, AS_SMD_MSG_SINGLE_VALUE, items[0]->value, + MSG_SET_COPY); + } + + if (items[0]->generation != 0) { + msg_set_uint32(m, AS_SMD_MSG_SINGLE_GENERATION, + items[0]->generation); + } + + if (items[0]->timestamp != 0) { + msg_set_uint64(m, AS_SMD_MSG_SINGLE_TIMESTAMP, + items[0]->timestamp); + } + + return m; + } + } + + if (num_items == 0) { + return m; + } + + if (! 
module_name) { + uint32_t mod_max = cf_rchash_get_size(g_smd->modules); + uint32_t mod_counts[mod_max]; + uint32_t count = 0; + const char *prev = NULL; + cf_vector_define(mod_vec, sizeof(msg_buf_ele), mod_max, 0); + + // Assume same item module names are clustered together. + for (uint32_t i = 0; i < num_items; i++) { + if (count != 0 && strcmp(prev, items[i]->module_name) == 0) { + mod_counts[count - 1]++; + continue; + } + + msg_buf_ele ele = { + .sz = (uint32_t)strlen(items[i]->module_name), + .ptr = (uint8_t *)items[i]->module_name + }; + + cf_vector_append(&mod_vec, &ele); + prev = items[i]->module_name; + + cf_assert(count < mod_max, AS_SMD, "unexpected item module name ordering"); + + mod_counts[count++] = 1; + } + + msg_msgpack_list_set_buf(m, AS_SMD_MSG_MODULE_LIST, &mod_vec); + msg_msgpack_list_set_uint32(m, AS_SMD_MSG_MODULE_COUNTS, mod_counts, + count); + } + + if (num_items < SMD_MAX_STACK_NUM_ITEMS) { + uint32_t gen_list[num_items]; + cf_vector_define(key_vec, sizeof(msg_buf_ele), num_items, 0); + cf_vector_define(value_vec, sizeof(msg_buf_ele), num_items, 0); + + smd_msg_fill_items(m, items, num_items, &key_vec, &value_vec, gen_list); + } + else { + cf_vector key_vec; + cf_vector value_vec; + uint32_t *gen_list = cf_malloc(sizeof(uint32_t) * num_items); + + if (cf_vector_init(&key_vec, sizeof(msg_buf_ele), num_items, 0) != 0) { + cf_crash(AS_SMD, "cf_vector_init"); + } + + if (cf_vector_init(&value_vec, sizeof(msg_buf_ele), num_items, 0) != + 0) { + cf_crash(AS_SMD, "cf_vector_init"); + } + + smd_msg_fill_items(m, items, num_items, &key_vec, &value_vec, gen_list); + + cf_vector_destroy(&key_vec); + cf_vector_destroy(&value_vec); + cf_free(gen_list); + } + + return m; +} + +/* + * Get or create a new System Metadata fabric msg to perform the given operation on the given metadata items. + */ +static msg * +as_smd_msg_get(as_smd_msg_op_t op, as_smd_item_t **item, size_t num_items, const char *module_name, uint32_t accept_opt) +{ + // TODO - collapse - don't need two functions any more. + return smd_create_msg(op, item, (uint32_t)num_items, module_name, accept_opt); +} + +/* + * Callback for fabric transact responses, both when forwarding metadata change commands to the SMD principal + * and when receiving message events from the SMD principal. + * + * Note: This function is currently shared between all System Metadata transactions, which works for now + * since the different transaction types don't require separate completion processing. + */ +static int transact_complete_fn(msg *response, void *udata, int fabric_err) +{ +// as_smd_t *smd = (as_smd_t *) udata; // (Not used.) + + if (!response) { + cf_warning(AS_SMD, "Null response message passed in transaction complete!"); + return -1; + } + + as_fabric_msg_put(response); + + if (AS_FABRIC_SUCCESS != fabric_err) { + cf_warning(AS_SMD, "System Metadata transaction failed with fabric error %d", fabric_err); + return -1; + } + + return 0; +} + +static void +smd_fabric_send(cf_node node_id, msg *m) +{ + if (node_id == g_config.self_node) { + as_smd_msgq_push(node_id, m, g_smd); + as_fabric_msg_put(m); + return; + } + + as_fabric_transact_start(node_id, m, AS_SMD_TRANSACT_TIMEOUT_MS, + transact_complete_fn, NULL); +} + +/* + * Send the metadata item change message to the SMD principal. 
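+ * (If this node is itself the principal, smd_fabric_send() short-circuits the
+ * fabric and loops the msg back onto the local System Metadata queue.)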
+ */ +static int as_smd_proxy_to_principal(as_smd_t *smd, as_smd_msg_op_t op, as_smd_item_t *item) +{ + if (as_smd_principal() == (cf_node)0) { + cf_warning(AS_SMD, "failed to get the SMD principal node ~~ Not proxying SMD msg"); + return -1; + } + + msg *msg = NULL; + + cf_debug(AS_SMD, "forwarding %s metadata request to SMD principal node %016lX", AS_SMD_MSG_OP_NAME(op), as_smd_principal()); + + // Get an existing (or create a new) System Metadata fabric msg for the appropriate operation and metadata item. + size_t num_items = 1; + if (!(msg = as_smd_msg_get(op, &item, num_items, item->module_name, AS_SMD_ACCEPT_OPT_API))) { + cf_warning(AS_SMD, "failed to get a System Metadata fabric msg for operation %s transact start for module \"%s\"", AS_SMD_MSG_OP_NAME(op), item->module_name); + return -1; + } + + smd_fabric_send(as_smd_principal(), msg); + + return 0; +} + +/* + * Locally change a metadata item. + */ +static int as_smd_metadata_change_local(as_smd_t *smd, as_smd_msg_op_t op, as_smd_item_t *item) +{ + int retval = 0; + + as_smd_module_t *module_obj = NULL; + + cf_debug(AS_SMD, "System Metadata thread - locally %s'ing metadata: node %016lX ; action %s ; module \"%s\" ; key \"%s\"", + AS_SMD_MSG_OP_NAME(op), item->node_id, AS_SMD_ACTION_NAME(item->action), item->module_name, item->key); + + // Find the module's object. + if (CF_RCHASH_OK != (retval = cf_rchash_get(smd->modules, item->module_name, strlen(item->module_name) + 1, (void **) &module_obj))) { + cf_warning(AS_SMD, "failed to find System Metadata module \"%s\" (retval %d)", item->module_name, retval); + return retval; + } + + if (AS_SMD_ACTION_DELETE == item->action) { + // Delete the metadata from the module's local metadata hash table. + if (CF_RCHASH_OK != (retval = cf_rchash_delete(module_obj->my_metadata, item->key, strlen(item->key) + 1))) { + cf_warning(AS_SMD, "failed to delete key \"%s\" from System Metadata module \"%s\" (retval %d)", item->key, item->module_name, retval); + } + } else if (item->key) { + // Handle the Set case: + + // Select metadata local hash table for incoming metadata. + cf_rchash *metadata_hash = module_obj->my_metadata; + + // The length of the key string includes the NULL terminator. + uint32_t key_len = strlen(item->key) + 1; + + // If the item is local, simply use the key string within the item. + void *key = item->key; + + // Default to generation 1. + if (!item->generation) { + item->generation = 1; + } + + // Default timestamp to now. + if (!item->timestamp) { + item->timestamp = cf_clepoch_milliseconds(); + } + + // Add new, replace or keep existing, metadata in the module's metadata hash table. + + as_smd_item_t *existing_item; + bool existing_wins = false; + + if (CF_RCHASH_OK == cf_rchash_get(metadata_hash, key, key_len, (void **)&existing_item)) { + existing_wins = (existing_item->generation > item->generation) || + ((existing_item->generation == item->generation) && + (existing_item->timestamp > item->timestamp)); + as_smd_item_destroy(existing_item); + } + + if (! existing_wins) { + // Add reference to item for storage in the hash table. + // (Note: One reference to the item will be released by the thread when it releases the containing command.) + cf_rc_reserve(item); + cf_rchash_put(metadata_hash, key, key_len, item); + } + } else { + cf_debug(AS_SMD, "(not setting empty metadata item for module \"%s\")", module_obj->module); + } + + // Give back the module reference. 
+	cf_rc_release(module_obj);
+
+	return retval;
+}
+
+/*
+ * Handle a metadata change request by proxying to the SMD principal or short-circuiting locally during node start-up.
+ */
+static int as_smd_metadata_change(as_smd_t *smd, as_smd_msg_op_t op, as_smd_item_t *item)
+{
+	int retval = 0;
+
+	if ((AS_SMD_STATE_RUNNING == smd->state) && item->key) {
+		// Forward to the SMD principal.
+		// [Ideally, would re-try or at least notify (via an as-yet nonexistent mechanism) upon failure.]
+		return as_smd_proxy_to_principal(smd, op, item);
+	} else {
+		// Short-circuit to handle the change locally when this node is starting up
+		// or when an initially-empty module is being created, as indicated by a NULL item key (and value.)
+
+		cf_debug(AS_SMD, "handling metadata change type %s locally: module \"%s\" ; key \"%s\"", AS_SMD_MSG_OP_NAME(op), item->module_name, item->key);
+
+		if ((retval = as_smd_metadata_change_local(smd, op, item))) {
+			cf_warning(AS_SMD, "failed to %s a metadata item locally: module \"%s\" ; key \"%s\" ; value \"%s\"", AS_SMD_MSG_OP_NAME(op), item->module_name, item->key, item->value);
+		}
+
+		uint32_t accept_opt = AS_SMD_ACCEPT_OPT_API;
+		as_smd_item_list_t *item_list = NULL;
+
+		if (!item->key) {
+			// An empty key (and value) indicates creation of an initially-empty module.
+			accept_opt = AS_SMD_ACCEPT_OPT_CREATE;
+		} else {
+			// While restoring, pass this info to the module as well. This is needed
+			// at boot to make sure metadata init is done before data init.
+			item_list = as_smd_item_list_alloc(1);
+			item_list->item[0] = item;
+		}
+
+		as_smd_module_t *module_obj = as_smd_module_get(smd, item, NULL);
+
+		// At the end of module creation, SMD will be persisted.
+		if (AS_SMD_ACCEPT_OPT_CREATE == accept_opt) {
+			module_obj->dirty = true;
+		}
+
+		if (module_obj->accept_cb) {
+			// Invoke the module's registered accept policy callback function.
+			(module_obj->accept_cb)(module_obj->module, item_list, module_obj->accept_udata, accept_opt);
+		}
+
+		// Persist the accepted metadata for this module.
+		if (as_smd_module_persist(module_obj)) {
+			cf_warning(AS_SMD, "failed to persist accepted metadata for module \"%s\"", module_obj->module);
+		}
+
+		cf_rc_release(module_obj);
+
+		if (item_list) {
+			cf_free(item_list);
+		}
+	}
+
+	return retval;
+}
+
+/*
+ * Type representing the state of a metadata get request.
+ */
+typedef struct as_smd_metadata_get_state_s {
+	size_t num_items;                  // Number of matching items.
+	as_smd_item_t *item;               // Item to compare with each item.
+	as_smd_item_list_t *item_list;     // List of matching items.
+	cf_rchash_reduce_fn reduce_fn;     // Reduce function to apply to matching items.
+} as_smd_metadata_get_state_t;
+
+/*
+ * Reduce function to count one matching metadata item.
+ */
+static int as_smd_count_matching_item_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+//	char *smd_key = (char *) key;   // (Not used.)
+	as_smd_item_t *item = (as_smd_item_t *) object;
+	as_smd_metadata_get_state_t *get_state = (as_smd_metadata_get_state_t *) udata;
+
+	// Count each matching item.
+	if (!strcmp(get_state->item->key, "") || !strcmp(get_state->item->key, item->key)) {
+		get_state->num_items += 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Reduce function to return a single metadata item, if it matches the pattern.
+ */
+static int as_smd_metadata_get_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+//	char *smd_key = (char *) key;   // (Not used.)
+ as_smd_item_t *item = (as_smd_item_t *) object; + as_smd_metadata_get_state_t *get_state = (as_smd_metadata_get_state_t *) udata; + as_smd_item_list_t *item_list = get_state->item_list; + + // Add each matching item to the list. + if (!strcmp(get_state->item->key, "") || !strcmp(get_state->item->key, item->key)) { + cf_rc_reserve(item); + item_list->item[item_list->num_items] = item; + item_list->num_items += 1; + } + + return 0; +} + +/* + * Reduce function to perform a given reduce function on each matching module. + */ +static int as_smd_matching_module_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ + const char *module = (const char *) key; + as_smd_module_t *module_obj = (as_smd_module_t *) object; + as_smd_metadata_get_state_t *get_state = (as_smd_metadata_get_state_t *) udata; + + // Perform the given reduce function on matching module's metadata. + if (!strcmp(get_state->item->module_name, "") || !strcmp(get_state->item->module_name, module)) { + cf_rchash_reduce(module_obj->my_metadata, get_state->reduce_fn, get_state); + } + + return 0; +} + +/* + * Search for metadata according to the given search criteria. + * The incoming item's module and/or key can be NULL to perform a wildcard match. + */ +static int as_smd_metadata_get(as_smd_t *smd, as_smd_cmd_t *cmd) +{ + as_smd_item_t *item = cmd->item; + int retval = 0; + + cf_debug(AS_SMD, "System Metadata thread - get metadata: module \"%s\" ; node %016lX ; key \"%s\"", item->module_name, item->node_id, item->key); + + // Extract the user's callback function and user data. + as_smd_get_cb get_cb = cmd->a; + void *get_udata = cmd->b; + + if (!get_cb) { + cf_warning(AS_SMD, "no System Metadata get callback supplied ~~ Ignoring get metadata request!"); + return -1; + } + + as_smd_metadata_get_state_t get_state; + get_state.num_items = 0; + get_state.item = item; + get_state.item_list = NULL; + get_state.reduce_fn = as_smd_count_matching_item_reduce_fn; + + // Count the number of matching items. + cf_rchash_reduce(smd->modules, as_smd_matching_module_reduce_fn, &get_state); + + // Allocate a list of sufficient size for the get result. + as_smd_item_list_t *item_list = as_smd_item_list_alloc(get_state.num_items); + get_state.item_list = item_list; + + // (Note: Use num_items to count the position for each metadata item.) + item_list->num_items = 0; + + // Add matching items to the list. + get_state.reduce_fn = as_smd_metadata_get_reduce_fn; + cf_rchash_reduce(smd->modules, as_smd_matching_module_reduce_fn, &get_state); + + // Invoke the user's callback function. + (get_cb)(item->module_name, item_list, get_udata); + + // Release the item list. + as_smd_item_list_destroy(item_list); + + return retval; +} + +/* + * Cleanly release all System Metadata resources. + */ +static void as_smd_terminate(as_smd_t *smd) +{ + cf_debug(AS_SMD, "SMD Terminate called"); + + // After this is NULLed out, no more messages will be sent to the System Metadata queue. + g_smd = NULL; + + // De-register the System Metadata fabric transact message type. + // [Note: Don't need to remove the handler, simply drop the msg in the handler function.] +// as_fabric_transact_register(M_TYPE_SMD, NULL, 0, NULL, NULL); + + // Go to the not started up yet state. + smd->state = AS_SMD_STATE_IDLE; + + // Destroy the message queue. + cf_queue_destroy(smd->msgq); + + // Release the scoreboard hash table. + cf_shash_destroy(smd->scoreboard); + + // Release the modules hash table. 
+	cf_rchash_destroy(smd->modules);
+
+	// Release the System Metadata object.
+	cf_free(smd);
+}
+
+/*
+ * Reduce function to count one metadata item.
+ */
+static int as_smd_count_item_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+//	char *smd_key = (char *) key;   // (Not used.)
+//	as_smd_item_t *item = (as_smd_item_t *) object;   // (Not used.)
+	size_t *num_items = (size_t *) udata;
+
+	*num_items += 1;
+
+	return 0;
+}
+
+/*
+ * Reduce function to count metadata items in one module.
+ */
+static int as_smd_module_count_items_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+//	char *module = (char *) key;   // (Not used.)
+	as_smd_module_t *module_obj = (as_smd_module_t *) object;
+	size_t *num_items = (size_t *) udata;
+
+	// Increase the running total by the number of metadata items in this module.
+	cf_rchash_reduce(module_obj->my_metadata, as_smd_count_item_reduce_fn, num_items);
+
+	return 0;
+}
+
+/*
+ * Reduce function to serialize one metadata item.
+ */
+static int as_smd_item_serialize_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+//	char *smd_key = (char *) key;   // (Not used.)
+	as_smd_item_t *item = (as_smd_item_t *) object;
+	as_smd_item_list_t *item_list = (as_smd_item_list_t *) udata;
+
+	// Add this metadata item to the list.
+	cf_rc_reserve(item);
+	item_list->item[item_list->num_items] = item;
+	item_list->num_items += 1;
+
+	return 0;
+}
+
+/*
+ * Reduce function to serialize all of a module's metadata items.
+ */
+static int as_smd_module_serialize_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+//	char *module = (char *) key;   // (Not used.)
+	as_smd_module_t *module_obj = (as_smd_module_t *) object;
+	as_smd_item_list_t *item_list = (as_smd_item_list_t *) udata;
+
+	// Serialize all of this module's metadata items.
+	cf_rchash_reduce(module_obj->my_metadata, as_smd_item_serialize_reduce_fn, item_list);
+
+	return 0;
+}
+
+static int as_smd_receive_metadata(as_smd_t *smd, as_smd_msg_t *smd_msg);
+
+static void
+smd_expire_pending_merges()
+{
+	if (cf_queue_sz(&g_smd->pending_merge_queue) == 0) {
+		return;
+	}
+
+	smd_pending_merge item;
+	uint64_t now = cf_getms();
+
+	while (cf_queue_pop(&g_smd->pending_merge_queue, &item, CF_QUEUE_NOWAIT) ==
+			CF_QUEUE_OK) {
+		if (item.expire > now) {
+			cf_queue_push_head(&g_smd->pending_merge_queue, &item);
+			break;
+		}
+
+		cf_free(item.m.module_name);
+		as_smd_item_list_destroy(item.m.items);
+	}
+}
+
+static void
+smd_process_pending_merges()
+{
+	uint64_t now = cf_getms();
+	smd_pending_merge item;
+	int count = cf_queue_sz(&g_smd->pending_merge_queue);
+
+	for (int i = 0; i < count; i++) {
+		cf_queue_pop(&g_smd->pending_merge_queue, &item, CF_QUEUE_NOWAIT);
+
+		if (item.m.cluster_key == g_cluster_key) {
+			as_smd_receive_metadata(g_smd, &item.m);
+		}
+		else if (item.expire > now) {
+			cf_queue_push(&g_smd->pending_merge_queue, &item);
+			continue;
+		}
+
+		cf_free(item.m.module_name);
+		as_smd_item_list_destroy(item.m.items);
+	}
+}
+
+/*
+ * Handle a cluster state changed message.
+ * This function collects all metadata items on this node, from all the
+ * modules (currently UDF and SINDEX), and sends them to the SMD principal for merging.
+ */
+static void as_smd_cluster_changed(as_smd_t *smd, as_smd_cmd_t *cmd)
+{
+	cf_debug(AS_SMD, "System Metadata thread received cluster state changed cmd event!");
+
+	g_cluster_key = (uint64_t)cmd->a;
+	g_cluster_size = (uint32_t)(uint64_t)cmd->b;
+	memcpy(g_succession, cmd->c, g_cluster_size * sizeof(cf_node));
+
+	cf_free(cmd->c);
+
+	// Determine the number of metadata items to be sent.
+	size_t num_items = 0;
+	cf_rchash_reduce(smd->modules, as_smd_module_count_items_reduce_fn, &num_items);
+
+	cf_debug(AS_SMD, "sending %zu serialized metadata items to the SMD principal", num_items);
+
+	// Copy all reference-counted metadata item pointers from the hash table into an item list.
+	// (Note: Even if this node has no metadata items, we must still send a message to the principal.)
+	as_smd_item_list_t *item_list = as_smd_item_list_alloc(num_items);
+	// (Note: Use num_items to count the position for each serialized metadata item.)
+	item_list->num_items = 0;
+	cf_rchash_reduce(smd->modules, as_smd_module_serialize_reduce_fn, item_list);
+
+	cf_debug(AS_SMD, "aspc(): num_items = %zu (%zu)", item_list->num_items, num_items);
+
+	// Build a System Metadata fabric msg containing serialized metadata from the item list.
+	msg *msg = NULL;
+	as_smd_msg_op_t my_smd_op = AS_SMD_MSG_OP_MY_CURRENT_METADATA;
+	if (!(msg = as_smd_msg_get(my_smd_op, item_list->item, item_list->num_items, NULL, 0))) {
+		cf_crash(AS_SMD, "failed to get a System Metadata fabric msg for operation %s transact start", AS_SMD_MSG_OP_NAME(my_smd_op));
+	}
+
+	// The metadata has been copied into the fabric msg and can now be released.
+	as_smd_item_list_destroy(item_list);
+
+	smd_fabric_send(as_smd_principal(), msg);
+
+	smd_process_pending_merges();
+}
+
+/*
+ * Destroy a node's scoreboard hash table mapping module to metadata item count.
+ */
+static int as_smd_scoreboard_reduce_delete_fn(const void *key, void *data, void *udata)
+{
+	cf_node node_id = *(const cf_node *) key;
+	cf_shash *module_item_count_hash = *((cf_shash **) data);
+
+	cf_debug(AS_SMD, "destroying module item count hash for node %016lX", node_id);
+
+	cf_shash_destroy(module_item_count_hash);
+
+	return CF_SHASH_REDUCE_DELETE;
+}
+
+/*
+ * Remove the metadata item from the hash table.
+ */
+static int as_smd_reduce_delete_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+	return CF_RCHASH_REDUCE_DELETE;
+}
+
+/*
+ * Delete all of this module's external metadata items.
+ */
+static int as_smd_delete_external_metadata_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+//	char *module = (char *) key;   // (Not used.)
+	as_smd_module_t *module_obj = (as_smd_module_t *) object;
+	as_smd_t *smd = (as_smd_t *) udata;
+
+	cf_rchash_reduce(module_obj->external_metadata, as_smd_reduce_delete_fn, smd);
+	cf_debug(AS_SMD, "deleted all external metadata entries for module \"%s\"", module_obj->module);
+
+	return 0;
+}
+
+/*
+ * Clear out the temporary state used to merge metadata upon cluster state change.
+ */
+static void as_smd_clear_scoreboard(as_smd_t *smd)
+{
+	cf_shash_reduce(smd->scoreboard, as_smd_scoreboard_reduce_delete_fn, smd);
+	cf_rchash_reduce(smd->modules, as_smd_delete_external_metadata_reduce_fn, smd);
+}
+
+/*
+ * Apply a metadata change locally using the registered merge policy, defaulting to union.
+ */
+static int as_smd_apply_metadata_change(as_smd_t *smd, as_smd_module_t *module_obj, as_smd_msg_t *smd_msg)
+{
+	int retval = 0;
+
+	as_smd_item_t *item = smd_msg->items->item[0];   // (Only log the first item.)
+ cf_debug(AS_SMD, "System Metadata thread - applying metadata %s change: item 0: module \"%s\" ; key \"%s\" ; value \"%s\" ; action %d", + AS_SMD_MSG_OP_NAME(smd_msg->op), module_obj->module, item->key, item->value, item->action); + + // [Note: Only 1 item should ever be changed via this path.] + if (1 != smd_msg->num_items) { + cf_crash(AS_SMD, "unexpected number of metadata items being changed: %d != 1", smd_msg->num_items); + } + +#if 0 + if (module_obj->merge_cb) { + // Invoke the module's registered merge policy callback function. + (module_obj->merge_cb)(module_obj->module, smd_msg->item, NULL, module_obj->merge_udata); + } else { +#endif + cf_debug(AS_SMD, "asamc(): num_items %d", smd_msg->num_items); + + // By default, simply perform a union operation on an item-by-item basis. + for (int i = 0; i < smd_msg->num_items; i++) { + item = smd_msg->items->item[i]; + if (module_obj->can_accept_cb) { + int ret = (module_obj->can_accept_cb)(module_obj->module, item, module_obj->can_accept_udata); + if (ret != 0) { + cf_debug(AS_SMD, "SMD principal rejected the user operation with error code %s", as_sindex_err_str(ret)); + continue; + } else { + cf_debug(AS_SMD, "SMD principal validity check succeeded."); + } + } + + // Default timestamp to now. + if (!item->timestamp) { + item->timestamp = cf_clepoch_milliseconds(); + } + + cf_debug(AS_SMD, "asamc(): processing item %d: module \"%s\" key \"%s\" action %s gen %u ts %lu", i, item->module_name, item->key, AS_SMD_ACTION_NAME(item->action), item->generation, item->timestamp); + + // Perform the appropriate union operation. + + as_smd_item_t *existing_item = NULL; + if (CF_RCHASH_OK == cf_rchash_get(module_obj->my_metadata, item->key, strlen(item->key) + 1, (void **) &existing_item)) { + cf_debug(AS_SMD, "asamc(): Old item exists."); + } else { + cf_debug(AS_SMD, "asamc(): Old item does not exist."); + + if (AS_SMD_ACTION_DELETE == item->action) { + cf_debug(AS_SMD, "deleting a non-extant item: module \"%s\" ; key \"%s\"", item->module_name, item->key); + } + } + + if (!existing_item) { + // For delete, if item already doesn't exist, there's nothing to do. + if (AS_SMD_ACTION_DELETE == item->action) { + continue; + } else { + // Otherwise, default to generation 1. + if (!item->generation) { + item->generation = 1; + } + } + } + + // Choose the most up-to-date item data. + if (existing_item && (AS_SMD_ACTION_DELETE != item->action)) { + // Default to the next generation. + if (!item->generation) { + item->generation = existing_item->generation + 1; + } + + // Choose the newest first by the highest generation and second by the highest timestamp. + if ((existing_item->generation > item->generation) || + ((existing_item->generation == item->generation) && (existing_item->timestamp > item->timestamp))) { + + cf_debug(AS_SMD, "old item is newer"); + + // If the existing item is newer, skip the incoming item. + cf_rc_release(existing_item); + continue; + } else { + // Otherwise, advance the generation. + item->generation = existing_item->generation + 1; + + cf_debug(AS_SMD, "New items is newer: Going to gen %u ts %lu", item->generation, item->timestamp); + } + cf_rc_release(existing_item); + existing_item = NULL; + } + + // For each member of the succession list, + // Generate a new SMD fabric msg sharing the properties of the incoming msg event. + // Start a transaction to send the msg out to the node. + // The transaction recv function performs the accept metadata function locally. 
+ + for (uint32_t i = 0; i < g_cluster_size; i++) { + msg *msg = NULL; + cf_node node_id = g_succession[i]; + as_smd_msg_op_t accept_op = AS_SMD_MSG_OP_ACCEPT_THIS_METADATA; + if (!(msg = as_smd_msg_get(accept_op, smd_msg->items->item, smd_msg->num_items, module_obj->module, AS_SMD_ACCEPT_OPT_API))) { + cf_warning(AS_SMD, "failed to get a System Metadata fabric msg for operation %s transact start ~~ Skipping node %016lX!", + AS_SMD_MSG_OP_NAME(accept_op), node_id); + continue; + } + + smd_fabric_send(node_id, msg); + } + } +#if 0 + } +#endif + + return retval; +} + +/* + * Increment hash table value by the given delta, starting from zero if not found, and return the new total. + */ +static int as_smd_shash_incr(cf_shash *ht, as_smd_module_t *module_obj, size_t delta) +{ + size_t count = 0; + + if (CF_SHASH_OK != cf_shash_get(ht, &module_obj, &count)) { + // If not found, start at zero. + count = 0; + } + + count += delta; + + cf_shash_put(ht, &module_obj, &count); + + cf_debug(AS_SMD, "incrementing metadata item count for module \"%s\" to %zu", module_obj->module, count); + + return count; +} + +/* + * Add the metadata items from this msg to the appropriate modules' external hash tables. + */ +static cf_shash *as_smd_store_metadata_by_module(as_smd_t *smd, as_smd_msg_t *smd_msg) +{ + as_smd_item_list_t *items = smd_msg->items; + cf_shash *module_item_count_hash = cf_shash_create(cf_shash_fn_ptr, sizeof(as_smd_module_t *), sizeof(size_t), 19, CF_SHASH_BIG_LOCK); + + for (int i = 0; i < items->num_items; i++) { + as_smd_item_t *item = items->item[i]; + + // Find the appropriate module's external hash table for this item. + as_smd_module_t *module_obj = NULL; + if (! (module_obj = as_smd_module_get(smd, item, NULL))) { + cf_warning(AS_SMD, "failed to get System Metadata module \"%s\" ~~ Skipping item!", item->module_name); + continue; + } + + // The length of the key string includes the NULL terminator. + uint32_t key_len = strlen(item->key) + 1; + uint32_t stack_key_len = sizeof(as_smd_external_item_key_t) + key_len; + + as_smd_external_item_key_t *stack_key = alloca(stack_key_len); + if (!stack_key) { + cf_crash(AS_SMD, "Failed to allocate stack key of size %d bytes!", stack_key_len); + } + stack_key->node_id = item->node_id; + stack_key->key_len = key_len; + memcpy(&(stack_key->key), item->key, key_len); + + // Warn if the item is already present. + as_smd_item_t *old_item = NULL; + cf_rchash *metadata_hash = module_obj->external_metadata; + if (CF_RCHASH_OK == cf_rchash_get(metadata_hash, stack_key, stack_key_len, (void **) &old_item)) { + cf_warning(AS_SMD, "found existing metadata item: node: %016lX module: \"%s\" key: \"%s\" value: \"%s\" ~~ Replacing with value: \"%s\"!", + item->node_id, item->module_name, item->key, old_item->value, item->value); + // Give back the item reference. + cf_rc_release(old_item); + } + + // Add reference to item for storage in the hash table. + // (Note: One reference to the item will be released by the thread when it releases the containing msg.) + cf_rc_reserve(item); + + // Insert the new metadata into the module's external metadata hash table, replacing any previous contents. + cf_rchash_put(metadata_hash, stack_key, stack_key_len, item); + + cf_debug(AS_SMD, "Stored metadata by module for item %d: module \"%s\" ; key \"%s\"", i, module_obj->module, stack_key->key); + // Increment the number of items for this module in this node's hash table. + as_smd_shash_incr(module_item_count_hash, module_obj, 1); + + // Give back the module reference. 
+		cf_rc_release(module_obj);
+	}
+
+	return module_item_count_hash;
+}
+
+typedef struct smd_ext_item_search_s {
+	cf_node node_id;
+	as_smd_item_list_t *item_list;
+	uint32_t count;
+} smd_ext_item_search;
+
+static int
+smd_ext_items_fn(const void *key, uint32_t keylen, void *obj, void *udata)
+{
+	const as_smd_external_item_key_t *extkey =
+			(const as_smd_external_item_key_t *)key;
+	as_smd_item_t *item = (as_smd_item_t *)obj;
+	smd_ext_item_search *search = (smd_ext_item_search *)udata;
+
+	if (extkey->node_id == search->node_id) {
+		cf_rc_reserve(item);
+		search->item_list->item[search->item_list->num_items] = item;
+		search->item_list->num_items++;
+		cf_debug(AS_SMD, "For the node \"%016lX\", num_items is %zu", extkey->node_id, search->item_list->num_items);
+	}
+
+	return 0;
+}
+
+static int
+smd_ext_items_count_fn(const void *key, uint32_t keysz, void *obj, void *udata)
+{
+	const as_smd_external_item_key_t *extkey =
+			(const as_smd_external_item_key_t *)key;
+	smd_ext_item_search *search = (smd_ext_item_search *)udata;
+
+	if (extkey->node_id == search->node_id) {
+		search->count++;
+	}
+
+	return 0;
+}
+
+/*
+ * Reduce function to create a list of metadata items from an rchash table.
+ */
+static int as_smd_list_items_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+//	char *item_key = (char *) key;   // (Not used.)
+	as_smd_item_t *item = (as_smd_item_t *) object;
+	as_smd_item_list_t *item_list = (as_smd_item_list_t *) udata;
+
+	cf_debug(AS_SMD, "adding to item list item: node: %016lX ; module: \"%s\" ; key: \"%s\"", item->node_id, item->module_name, item->key);
+	cf_debug(AS_SMD, "item list: %p", item_list);
+	cf_debug(AS_SMD, "item list length: %zu", item_list->num_items);
+
+	cf_rc_reserve(item);
+
+	item_list->item[item_list->num_items] = item;
+	item_list->num_items += 1;
+
+	return 0;
+}
+
+/*
+ * Invoke the merge policy callback function for this module.
+ */
+static int as_smd_invoke_merge_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+	const char *module = (const char *) key;
+	as_smd_module_t *module_obj = (as_smd_module_t *) object;
+
+	cf_debug(AS_SMD, "invoking merge policy for module \"%s\"", module);
+
+	as_smd_item_list_t *item_list_out = NULL;
+	as_smd_item_list_t *item_lists_in[g_cluster_size];
+	int list_num = (int)g_cluster_size;
+
+	for (uint32_t i = 0; i < g_cluster_size; i++) {
+		smd_ext_item_search search = {
+			.node_id = g_succession[i]
+		};
+
+		cf_rchash_reduce(module_obj->external_metadata, smd_ext_items_count_fn,
+				&search);
+		item_lists_in[i] = as_smd_item_list_alloc(search.count);
+
+		if (search.count != 0) {
+			search.item_list = item_lists_in[i];
+			item_lists_in[i]->num_items = 0;
+			cf_rchash_reduce(module_obj->external_metadata, smd_ext_items_fn,
+					&search);
+		}
+	}
+
+	// Merge the metadata item lists for this module.
+	if (module_obj->merge_cb) {
+		// Invoke the module's registered merge policy callback function.
+		(module_obj->merge_cb)(module, &item_list_out, item_lists_in, list_num, module_obj->merge_udata);
+	} else {
+		cf_debug(AS_SMD, "no merge cb registered ~~ performing default merge policy: union");
+
+		// No merge policy registered ~~ Default to union.
+		cf_rchash *merge_hash = NULL;
+		cf_rchash_create(&merge_hash, cf_rchash_fn_fnv32, metadata_rchash_destructor_fn, 0, 127, 0);
+
+		// Run through all metadata items in all nodes' lists.
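+		// (Per-key conflicts between nodes' items are resolved via the module's
+		// conflict_cb when registered, else by highest generation, then highest timestamp.)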
+ for (int i = 0; i < list_num; i++) {
+ if (item_lists_in[i]) {
+ for (int j = 0; j < item_lists_in[i]->num_items; j++) {
+ as_smd_item_t *new_item = item_lists_in[i]->item[j];
+ uint32_t key_len = strlen(new_item->key) + 1;
+
+ // Look for an existing item with this key.
+ as_smd_item_t *existing_item = NULL;
+ if (CF_RCHASH_OK != cf_rchash_get(merge_hash, new_item->key, key_len, (void **) &existing_item)) {
+ // If not found, insert this item.
+ cf_rc_reserve(new_item);
+ cf_rchash_put(merge_hash, new_item->key, key_len, new_item);
+ } else {
+ // Otherwise, choose a winner.
+ bool existing_wins;
+
+ if (module_obj->conflict_cb) {
+ // Use registered callback to determine winner.
+ existing_wins = (module_obj->conflict_cb)((char *)module, existing_item, new_item, module_obj->conflict_udata);
+ } else {
+ // Otherwise, choose a winner first by the highest generation and second by the highest timestamp.
+ existing_wins = (existing_item->generation > new_item->generation) ||
+ ((existing_item->generation == new_item->generation) &&
+ (existing_item->timestamp > new_item->timestamp));
+ }
+
+ // Leave existing item in hash, or replace existing item
+ // with new item (put releases existing item).
+ if (! existing_wins) {
+ cf_rc_reserve(new_item);
+ cf_rchash_put(merge_hash, new_item->key, key_len, new_item);
+ }
+
+ as_smd_item_destroy(existing_item); // for cf_rchash_get
+ }
+ }
+ }
+ }
+
+ // Create a merged items list.
+ size_t num_items = cf_rchash_get_size(merge_hash);
+ item_list_out = as_smd_item_list_alloc(num_items);
+
+ // Populate the merged items list from the hash table.
+ // (Note: Use num_items to count the position for each metadata item.)
+ item_list_out->num_items = 0;
+ cf_rchash_reduce(merge_hash, as_smd_list_items_reduce_fn, item_list_out);
+ cf_rchash_destroy(merge_hash);
+ }
+
+ // Send out a merged metadata msg via fabric transaction to every cluster node.
+ msg *msg = NULL;
+ as_smd_msg_op_t merge_op = AS_SMD_MSG_OP_ACCEPT_THIS_METADATA;
+ for (uint32_t i = 0; i < g_cluster_size; i++) {
+ cf_node node_id = g_succession[i];
+ if (!(msg = as_smd_msg_get(merge_op, item_list_out->item, item_list_out->num_items, module, AS_SMD_ACCEPT_OPT_MERGE))) {
+ cf_crash(AS_SMD, "failed to get a System Metadata fabric msg for operation %s", AS_SMD_MSG_OP_NAME(merge_op));
+ }
+
+ smd_fabric_send(node_id, msg);
+ }
+
+ // Release the item lists.
+ for (int i = 0; i < list_num; i++) {
+ as_smd_item_list_destroy(item_lists_in[i]);
+ }
+
+ // Release the merged items list.
+ as_smd_item_list_destroy(item_list_out);
+
+ return 0;
+}
+
+static void
+smd_add_pending_merge(as_smd_msg_t *sm)
+{
+ smd_pending_merge add = {
+ .m = *sm,
+ .expire = cf_getms() + SMD_PENDING_MERGE_TIMEOUT_SEC * 1000
+ };
+
+ // Steal memory from the original.
+ sm->items = NULL;
+ sm->module_name = NULL;
+
+ cf_queue_push(&g_smd->pending_merge_queue, &add);
+}
+
+/*
+ * Receive a node's metadata on the SMD principal to be combined via the registered merge policy.
+ */
+static int as_smd_receive_metadata(as_smd_t *smd, as_smd_msg_t *smd_msg)
+{
+ int retval = 0;
+
+ // Only the SMD principal receives other nodes' metadata.
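+ // (A msg with a stale cluster key is not dropped: smd_add_pending_merge()
+ // parks it with an SMD_PENDING_MERGE_TIMEOUT_SEC deadline, giving a
+ // pending cluster change a chance to catch up before the msg is expired.)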
+ if (g_config.self_node != as_smd_principal()) {
+ if (smd_msg->cluster_key != g_cluster_key) {
+ smd_add_pending_merge(smd_msg);
+ }
+
+ cf_debug(AS_SMD, "non-principal node %016lX received metadata from node %016lX", g_config.self_node, smd_msg->node_id);
+ return -1;
+ }
+
+ cf_debug(AS_SMD, "System Metadata thread - received %d metadata items from node %016lX", smd_msg->num_items, smd_msg->node_id);
+
+ if (g_cluster_key != smd_msg->cluster_key) {
+ smd_add_pending_merge(smd_msg);
+ cf_debug(AS_SMD, "received SMD with non-current cluster key (%016lx != %016lx) from node %016lX -> Pending",
+ smd_msg->cluster_key, g_cluster_key, smd_msg->node_id);
+ return -1;
+ }
+
+ // Store all of the metadata items received from this node in the appropriate modules' external metadata hash tables,
+ // and return the item counts by module in a hash table.
+ cf_shash *module_item_count_hash = NULL;
+ if (!(module_item_count_hash = as_smd_store_metadata_by_module(smd, smd_msg))) {
+ cf_crash(AS_SMD, "failed to store metadata by module from node %016lX", smd_msg->node_id);
+ }
+
+ // If something is already there, it's obsolete, so release it.
+ cf_shash *prev_module_item_count_hash = NULL;
+ if (CF_SHASH_OK == cf_shash_get(smd->scoreboard, &(smd_msg->node_id), &prev_module_item_count_hash)) {
+ cf_debug(AS_SMD, "found an obsolete module item count hash for node %016lX ~~ Deleting!", smd_msg->node_id);
+ if (CF_SHASH_OK != cf_shash_delete(smd->scoreboard, &(smd_msg->node_id))) {
+ cf_warning(AS_SMD, "failed to delete obsolete module item count hash for node %016lX", smd_msg->node_id);
+ }
+ cf_shash_destroy(prev_module_item_count_hash);
+ }
+
+ // Note that this node has provided its metadata for this cluster state change.
+ if (CF_SHASH_OK != cf_shash_put_unique(smd->scoreboard, &(smd_msg->node_id), &module_item_count_hash)) {
+ cf_warning(AS_SMD, "failed to put unique node %016lX into System Metadata scoreboard hash table", smd_msg->node_id);
+ }
+
+ // Merge the metadata when all nodes have reported in.
+ if (cf_shash_get_size(smd->scoreboard) == g_cluster_size) {
+ cf_debug(AS_SMD, "received metadata from all %u cluster nodes ~~ invoking merge policies", g_cluster_size);
+
+ cf_debug(AS_SMD, "Invoking merge reduce in SMD principal");
+ // Invoke the merge policy for each module and send the results to all nodes.
+ cf_rchash_reduce(smd->modules, as_smd_invoke_merge_reduce_fn, smd);
+
+ // Clear out the state used to notify cluster nodes of the new metadata.
+ as_smd_clear_scoreboard(smd);
+ } else if (cf_shash_get_size(smd->scoreboard) > g_cluster_size) {
+ // Cluster is unstable - while one node was coming up, other nodes went
+ // down. E.g., consider a 3-node cluster that adds a node: cluster size
+ // is 4. The SMD principal has received metadata from 3 nodes and is
+ // waiting for the fourth, so the scoreboard size is 3. But now two
+ // nodes go down, reducing the cluster size to 2 - less than the
+ // scoreboard size.
+ as_smd_clear_scoreboard(smd);
+ } else {
+ cf_debug(AS_SMD, "Cluster size = %u and smd->scoreboard size = %d", g_cluster_size, cf_shash_get_size(smd->scoreboard));
+ }
+
+ return retval;
+}
+
+static int metadata_local_deleteall_fn(const void *key, uint32_t key_len, void *object, void *udata)
+{
+ return CF_RCHASH_REDUCE_DELETE;
+}
+
+/*
+ * Accept a metadata change from the SMD principal using the registered accept policy.
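+ *
+ * This path serves both single-item API changes (AS_SMD_ACCEPT_OPT_API)
+ * and whole-module merge results (AS_SMD_ACCEPT_OPT_MERGE); an empty item
+ * list is only legal for merge and module-create.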
+ */
+static int as_smd_accept_metadata(as_smd_t *smd, as_smd_module_t *module_obj, as_smd_msg_t *smd_msg)
+{
+ int retval = 0;
+
+ // There will be:
+ // 0 items when, after the merge, no valid metadata items remain according to the merge algorithm.
+ // 1 item when the user issues a set/delete metadata API call to a specific module (e.g., SINDEX, UDF).
+ // >= 1 items when, after the merge, a non-empty list of items is valid according to the merge algorithm.
+ if (smd_msg->items->num_items) {
+ as_smd_item_t *item = smd_msg->items->item[0]; // (Only log the first item.)
+ cf_debug(AS_SMD, "System Metadata thread - accepting metadata %s change: %zu items: item 0: module \"%s\" ; key \"%s\" ; value \"%s\"",
+ AS_SMD_MSG_OP_NAME(smd_msg->op), smd_msg->items->num_items, module_obj->module, item->key, item->value);
+ } else {
+ // Allow an empty item list for merge and module create.
+ if (smd_msg->options & (AS_SMD_ACCEPT_OPT_MERGE | AS_SMD_ACCEPT_OPT_CREATE)) {
+ cf_debug(AS_SMD, "System Metadata thread - accepting metadata %s change: Zero items coming from merge", AS_SMD_MSG_OP_NAME(smd_msg->op));
+ } else {
+ cf_debug(AS_SMD, "System Metadata thread - accepting metadata %s change: Zero items ~~ Returning!", AS_SMD_MSG_OP_NAME(smd_msg->op));
+ return retval;
+ }
+ }
+
+ cf_debug(AS_SMD, "accepting replacement metadata from incoming System Metadata msg");
+
+#if 1 // DEBUG
+ // It should never be NULL ~~ being defensive and bailing out just in case.
+ if (!module_obj) {
+ cf_crash(AS_SMD, "SMD module NULL in accept metadata!");
+ }
+#endif
+
+ // In case of a merge (after a cluster state change), drop the existing
+ // local metadata definitions. This cleans up metadata that may have been
+ // dropped during the merge.
+ if (smd_msg->options & AS_SMD_ACCEPT_OPT_MERGE) {
+ cf_rchash_reduce(module_obj->my_metadata, metadata_local_deleteall_fn, NULL);
+ }
+
+ for (int i = 0; i < smd_msg->items->num_items; i++) {
+ as_smd_item_t *item = smd_msg->items->item[i];
+ if ((retval = as_smd_metadata_change_local(smd, smd_msg->op, item))) {
+ cf_warning(AS_SMD, "failed to perform the default accept replace local metadata operation %s (rv %d) for item %d: module \"%s\" ; key \"%s\" ; value \"%s\"",
+ AS_SMD_MSG_OP_NAME(smd_msg->op), retval, i, item->module_name, item->key, item->value);
+ }
+ }
+
+ // Accept the metadata item list for this module.
+ if (module_obj->accept_cb) {
+ // Invoke the module's registered accept policy callback function.
+ cf_debug(AS_SMD, "Calling accept callback for module %s with nitems %zu", smd_msg->module_name, smd_msg->items->num_items);
+ (module_obj->accept_cb)(module_obj->module, smd_msg->items, module_obj->accept_udata, smd_msg->options);
+ }
+
+ // SMD should now be persisted.
+ module_obj->dirty = true;
+
+ // Persist the accepted metadata for this module.
+ if (as_smd_module_persist(module_obj)) {
+ cf_warning(AS_SMD, "failed to persist accepted metadata for module \"%s\"", module_obj->module);
+ }
+
+ return retval;
+}
+
+// Map a key to a stable index, assigning the next free index on first sight.
+static uint32_t key2idx_get_index(as_hashmap *map, const char *key)
+{
+ const as_integer *i = as_stringmap_get_integer((as_map *)map, key);
+
+ if (i) {
+ return (uint32_t)as_integer_get(i);
+ }
+
+ uint32_t new_index = as_hashmap_size(map);
+
+ as_stringmap_set_int64((as_map *)map, key, (int64_t)new_index);
+
+ return new_index;
+}
+
+int as_smd_majority_consensus_merge(const char *module, as_smd_item_list_t **merged_list,
+ as_smd_item_list_t **lists_to_merge, size_t num_list, void *udata)
+{
+ typedef struct {
+ as_smd_item_t *item; // does not hold ref to item
+ uint32_t count;
+ } merge_item;
+
+ cf_vector merge_list;
+ as_hashmap key2idx;
+
+ as_hashmap_init(&key2idx, 1024);
+ cf_vector_init(&merge_list, sizeof(merge_item), 1024, 0);
+
+ for (size_t i = 0; i < num_list; i++) {
+ size_t num_items = lists_to_merge[i]->num_items;
+
+ for (size_t j = 0; j < num_items; j++) {
+ as_smd_item_t *item = lists_to_merge[i]->item[j];
+ uint32_t idx = key2idx_get_index(&key2idx, item->key);
+
+ if (idx >= cf_vector_size(&merge_list)) {
+ // First sighting of this key - start counting at 1.
+ merge_item mitem = {
+ .item = item,
+ .count = 1
+ };
+
+ cf_vector_append(&merge_list, &mitem);
+ continue;
+ }
+
+ merge_item *p_mitem = (merge_item *)cf_vector_getp(&merge_list, idx);
+ bool existing_wins = (p_mitem->item->generation > item->generation) ||
+ ((p_mitem->item->generation == item->generation) &&
+ (p_mitem->item->timestamp > item->timestamp));
+
+ if (! existing_wins) {
+ p_mitem->item = item;
+ }
+
+ p_mitem->count++;
+ }
+ }
+
+ as_hashmap_destroy(&key2idx);
+ *merged_list = as_smd_item_list_alloc(cf_vector_size(&merge_list));
+
+ // At least half the input lists, rounding up when num_list is odd - e.g.
+ // 5 lists require a count of 3, while 4 lists require a count of 2.
+ uint32_t majority_count = ((uint32_t)num_list + 1) / 2;
+
+ for (uint32_t i = 0; i < cf_vector_size(&merge_list); i++) {
+ merge_item *p_mitem = (merge_item *)cf_vector_getp(&merge_list, i);
+
+ if (p_mitem->count >= majority_count) {
+ cf_rc_reserve(p_mitem->item);
+ (*merged_list)->item[i] = p_mitem->item;
+ }
+ else {
+ // Not present on enough nodes - emit a delete with a bumped
+ // generation so the key is dropped everywhere.
+ as_smd_item_t *item = (as_smd_item_t *)cf_rc_alloc(sizeof(as_smd_item_t));
+
+ memset(item, 0, sizeof(as_smd_item_t));
+ item->action = AS_SMD_ACTION_DELETE;
+ item->key = cf_strdup(p_mitem->item->key);
+ item->generation = p_mitem->item->generation + 1;
+ item->timestamp = cf_clepoch_milliseconds();
+ (*merged_list)->item[i] = item;
+ }
+ }
+
+ cf_vector_destroy(&merge_list);
+
+ return 0;
+}
+
+/*
+ * Process an SMD event, which may be either an SMD API command or an incoming SMD fabric msg.
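+ *
+ * API commands arrive via the local msg queue; fabric msgs arrive from
+ * other cluster nodes. Both kinds are handled serially by the single
+ * System Metadata thread.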
+ */
+static void as_smd_process_event(as_smd_t *smd, as_smd_event_t *evt)
+{
+ if (AS_SMD_CMD == evt->type) {
+
+ /***** Handle SMD API Command Event *****/
+
+ as_smd_cmd_t *cmd = &(evt->u.cmd);
+
+ cf_debug(AS_SMD, "SMD thread received command: \"%s\" ; options: 0x%08x", AS_SMD_CMD_TYPE_NAME(cmd->type), cmd->options);
+
+ if (cmd->item) {
+ cf_debug(AS_SMD, "SMD event item: node %016lX ; module \"%s\" ; key \"%s\" ; value %p ; generation %u ; timestamp %zu",
+ cmd->item->node_id, cmd->item->module_name, cmd->item->key, cmd->item->value, cmd->item->generation, cmd->item->timestamp);
+ }
+
+ switch (cmd->type) {
+ case AS_SMD_CMD_INIT:
+ smd->state = AS_SMD_STATE_INITIALIZED;
+ break;
+
+ case AS_SMD_CMD_START:
+ smd->state = AS_SMD_STATE_RUNNING;
+ break;
+
+ case AS_SMD_CMD_CREATE_MODULE:
+ as_smd_module_create(smd, cmd);
+ break;
+
+ case AS_SMD_CMD_DESTROY_MODULE:
+ as_smd_module_destroy(smd, cmd);
+ break;
+
+ case AS_SMD_CMD_SET_METADATA:
+ case AS_SMD_CMD_DELETE_METADATA:
+ as_smd_metadata_change(smd, CMD_TYPE2MSG_OP(cmd->type), cmd->item);
+ break;
+
+ case AS_SMD_CMD_GET_METADATA:
+ as_smd_metadata_get(smd, cmd);
+ break;
+
+ case AS_SMD_CMD_CLUSTER_CHANGED:
+ as_smd_cluster_changed(smd, cmd);
+ break;
+
+ case AS_SMD_CMD_INTERNAL:
+ if (cmd->options & AS_SMD_CMD_OPT_DUMP_SMD) {
+ as_smd_dump_metadata(smd, cmd);
+ } else {
+ cf_warning(AS_SMD, "Unknown System Metadata internal event options received: 0x%08x ~~ Ignoring event!", cmd->options);
+ }
+ break;
+
+ case AS_SMD_CMD_SHUTDOWN:
+ smd->state = AS_SMD_STATE_EXITING;
+ break;
+
+ default:
+ cf_crash(AS_SMD, "received unknown System Metadata event type %d", cmd->type);
+ break;
+ }
+ } else if (AS_SMD_MSG == evt->type) {
+
+ /***** Handle SMD Fabric Transaction Message Event *****/
+
+ as_smd_msg_t *msg = &(evt->u.msg);
+ as_smd_item_t *item = NULL;
+
+ if (msg->num_items) {
+ item = msg->items->item[0]; // (Only log the first item.)
+ cf_debug(AS_SMD, "SMD thread received fabric msg event with op %s item: item 0: node %016lX module \"%s\" ; key \"%s\" ; value \"%s\"",
+ AS_SMD_MSG_OP_NAME(msg->op), item->node_id, item->module_name, item->key, item->value);
+ } else {
+ cf_debug(AS_SMD, "SMD thread received fabric msg event with op %s [Zero metadata items]", AS_SMD_MSG_OP_NAME(msg->op));
+ if ((AS_SMD_MSG_OP_SET_ITEM == msg->op) || (AS_SMD_MSG_OP_DELETE_ITEM == msg->op)) {
+ cf_crash(AS_SMD, "SMD thread received invalid empty metadata items list from node %016lX for message %s",
+ msg->node_id, AS_SMD_MSG_OP_NAME(msg->op));
+ }
+ }
+
+ // Find (or create) the module's object.
+ as_smd_module_t *module_obj = as_smd_module_get(smd, (msg->num_items > 0 ? msg->items->item[0] : NULL), msg);
+
+ switch (msg->op) {
+ case AS_SMD_MSG_OP_SET_ITEM:
+ case AS_SMD_MSG_OP_DELETE_ITEM:
+ as_smd_apply_metadata_change(smd, module_obj, msg);
+ break;
+
+ case AS_SMD_MSG_OP_MY_CURRENT_METADATA:
+ as_smd_receive_metadata(smd, msg);
+ break;
+
+ case AS_SMD_MSG_OP_ACCEPT_THIS_METADATA:
+ case AS_SMD_MSG_OP_SET_FROM_PR:
+ as_smd_accept_metadata(smd, module_obj, msg);
+ break;
+ }
+
+ if (module_obj) {
+ // Give back the reference.
+ cf_rc_release(module_obj);
+ }
+ } else {
+ // This should never happen.
+ cf_warning(AS_SMD, "received unknown type of System Metadata event (%d)", evt->type);
+ }
+}
+
+/*
+ * Thread to handle all System Metadata events, incoming via the API or the fabric.
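+ *
+ * Blocks on the msg queue for up to AS_SMD_WAIT_INTERVAL_MS per pop; a pop
+ * timeout doubles as the periodic tick used to expire parked pending
+ * merges.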
+ */ +void *as_smd_thr(void *arg) +{ + as_smd_t *smd = (as_smd_t *) arg; + int retval = 0; + + cf_debug(AS_SMD, "System Metadata thread created"); + + // Receive incoming messages via the message queue. + // Process each message. + // Destroy the message after processing. + + for ( ; smd->state != AS_SMD_STATE_EXITING ; ) { + + as_smd_event_t *evt = NULL; + + if ((retval = cf_queue_pop(smd->msgq, &evt, AS_SMD_WAIT_INTERVAL_MS))) { + if (CF_QUEUE_ERR == retval) { + cf_warning(AS_SMD, "failed to pop an event (retval %d)", retval); + } + } + + if (CF_QUEUE_EMPTY == retval) { + // [Could handle any periodic / background events here when there's nothing else to do.] + cf_detail(AS_SMD, "System Metadata thread - received timeout event"); + smd_expire_pending_merges(); + } else { + as_smd_process_event(smd, evt); + + // Release the event message. + as_smd_destroy_event(evt); + } + } + + // Release System Metadata resources. + as_smd_terminate(smd); + + // Exit the System Metadata thread. + return NULL; +} diff --git a/as/src/base/thr_batch.c b/as/src/base/thr_batch.c new file mode 100644 index 00000000..9696f433 --- /dev/null +++ b/as/src/base/thr_batch.c @@ -0,0 +1,467 @@ +/* + * thr_batch.c + * + * Copyright (C) 2012-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "base/thr_batch.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "aerospike/as_thread_pool.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" + +#include "dynbuf.h" +#include "hist.h" +#include "node.h" +#include "socket.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/stats.h" +#include "base/transaction.h" +#include "fabric/partition.h" +#include "storage/storage.h" + +typedef struct { + cf_node node; + cf_digest keyd; + bool done; +} batch_digest; + +typedef struct { + int n_digests; + batch_digest digest[]; +} batch_digests; + +typedef struct { + uint64_t trid; + uint64_t end_time; + as_namespace* ns; + as_file_handle* fd_h; + batch_digests* digests; + cf_vector* binlist; + bool get_data; + bool complete; +} batch_transaction; + +static as_thread_pool batch_direct_thread_pool; + +static void +as_msg_make_error_response_bufbuilder(cf_digest *keyd, int result_code, + cf_buf_builder **bb_r, const char *ns_name) +{ + size_t ns_len = strlen(ns_name); + size_t msg_sz = sizeof(as_msg) + + sizeof(as_msg_field) + sizeof(cf_digest) + + sizeof(as_msg_field) + ns_len; + + uint8_t *buf; + cf_buf_builder_reserve(bb_r, (int)msg_sz, &buf); + + as_msg *msgp = (as_msg *)buf; + + msgp->header_sz = (uint8_t)sizeof(as_msg); + msgp->info1 = 0; + msgp->info2 = 0; + msgp->info3 = 0; + msgp->unused = 0; + msgp->result_code = (uint8_t)result_code; + msgp->generation = 0; + msgp->record_ttl = 0; + msgp->transaction_ttl = 0; + msgp->n_fields = 2; + msgp->n_ops = 0; + as_msg_swap_header(msgp); + + buf += sizeof(as_msg); + + as_msg_field *mf = (as_msg_field *)buf; + + mf->field_sz = sizeof(cf_digest) + 1; + mf->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE; + memcpy(mf->data, keyd, sizeof(cf_digest)); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + sizeof(cf_digest); + + mf = (as_msg_field *)buf; + mf->field_sz = (uint32_t)ns_len + 1; + mf->type = AS_MSG_FIELD_TYPE_NAMESPACE; + memcpy(mf->data, ns_name, ns_len); + as_msg_swap_field(mf); +} + +// Build response to batch request. +static void +batch_build_response(batch_transaction* btr, cf_buf_builder** bb_r) +{ + as_namespace* ns = btr->ns; + batch_digests *bmds = btr->digests; + bool get_data = btr->get_data; + uint32_t yield_count = 0; + + for (int i = 0; i < bmds->n_digests; i++) + { + batch_digest *bmd = &bmds->digest[i]; + + if (bmd->done == false) { + // try to get the key + as_partition_reservation rsv; + cf_node other_node = 0; + + if (! *bb_r) { + *bb_r = cf_buf_builder_create_size(1024 * 4); + } + + int rv = as_partition_reserve_read(ns, as_partition_getid(&bmd->keyd), &rsv, false, &other_node); + + if (rv == 0) { + as_index_ref r_ref; + r_ref.skip_lock = false; + int rec_rv = as_record_get_live(rsv.tree, &bmd->keyd, &r_ref, ns); + + if (rec_rv == 0) { + as_index *r = r_ref.r; + + // Check to see this isn't a record waiting to die. + if (as_record_is_doomed(r, ns)) { + as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOT_FOUND, bb_r, ns->name); + } + else { + // Make sure it's brought in from storage if necessary. 
+ as_storage_rd rd;
+ as_storage_record_open(ns, r, &rd);
+
+ if (get_data) {
+ as_storage_rd_load_n_bins(&rd); // TODO - handle error returned
+ }
+
+ // Note: this array must stay in scope until the
+ // response for this record has been built, since in the
+ // get data w/ record on device case, it's copied by
+ // reference directly into the record descriptor.
+ as_bin stack_bins[!get_data || ns->storage_data_in_memory ? 0 : rd.n_bins];
+
+ if (get_data) {
+ // Figure out which bins you want - for now, all.
+ as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned
+ rd.n_bins = as_bin_inuse_count(&rd);
+ }
+
+ as_msg_make_response_bufbuilder(bb_r, &rd, !get_data, false, false, btr->binlist);
+
+ as_storage_record_close(&rd);
+ }
+ as_record_done(&r_ref, ns);
+ }
+ else {
+ // TODO - what about empty records?
+ cf_debug(AS_BATCH, "batch_build_response: as_record_get_live returned %d : key %lx", rec_rv, *(uint64_t *)&bmd->keyd);
+ as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOT_FOUND, bb_r, ns->name);
+ }
+
+ bmd->done = true;
+
+ as_partition_release(&rsv);
+ }
+ else {
+ cf_debug(AS_BATCH, "batch_build_response: partition reserve read failed: rv %d", rv);
+
+ as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOT_FOUND, bb_r, ns->name);
+
+ if (other_node != 0) {
+ bmd->node = other_node;
+ cf_debug(AS_BATCH, "other_node is: %lx", other_node);
+ } else {
+ cf_debug(AS_BATCH, "other_node is 0.");
+ }
+ }
+
+ yield_count++;
+ if (yield_count % g_config.batch_priority == 0) {
+ usleep(1);
+ }
+ }
+ }
+}
+
+// Send response to client socket.
+static int
+batch_send(cf_socket *sock, uint8_t* buf, size_t len, int flags)
+{
+ if (cf_socket_send_all(sock, buf, len, flags,
+ CF_SOCKET_TIMEOUT) < 0) {
+ // Common when a client aborts.
+ cf_debug(AS_BATCH, "batch send response error, errno %d fd %d",
+ errno, CSFD(sock));
+ return -1;
+ }
+
+ return 0;
+}
+
+// Send protocol header to the requesting client.
+static int
+batch_send_header(cf_socket *sock, size_t len)
+{
+ as_proto proto;
+ proto.version = PROTO_VERSION;
+ proto.type = PROTO_TYPE_AS_MSG;
+ proto.sz = len;
+ as_proto_swap(&proto);
+
+ return batch_send(sock, (uint8_t*) &proto, 8, MSG_NOSIGNAL | MSG_MORE); // 8 == sizeof(as_proto)
+}
+
+// Send protocol trailer to the requesting client.
+static int
+batch_send_final(cf_socket *sock, uint32_t result_code)
+{
+ cl_msg m;
+ m.proto.version = PROTO_VERSION;
+ m.proto.type = PROTO_TYPE_AS_MSG;
+ m.proto.sz = sizeof(as_msg);
+ as_proto_swap(&m.proto);
+ m.msg.header_sz = sizeof(as_msg);
+ m.msg.info1 = 0;
+ m.msg.info2 = 0;
+ m.msg.info3 = AS_MSG_INFO3_LAST;
+ m.msg.unused = 0;
+ m.msg.result_code = result_code;
+ m.msg.generation = 0;
+ m.msg.record_ttl = 0;
+ m.msg.transaction_ttl = 0;
+ m.msg.n_fields = 0;
+ m.msg.n_ops = 0;
+ as_msg_swap_header(&m.msg);
+
+ return batch_send(sock, (uint8_t*) &m, sizeof(m), MSG_NOSIGNAL);
+}
+
+
+// Release memory for batch transaction.
+static void
+batch_transaction_done(batch_transaction* btr, bool force_close)
+{
+ if (btr->fd_h) {
+ as_end_of_transaction(btr->fd_h, force_close);
+ btr->fd_h = 0;
+ }
+
+ if (btr->digests) {
+ cf_free(btr->digests);
+ btr->digests = 0;
+ }
+
+ if (btr->binlist) {
+ cf_vector_destroy(btr->binlist);
+ btr->binlist = 0;
+ }
+}
+
+// Process a batch request.
+static void
+batch_process_request(batch_transaction* btr)
+{
+ // Keep the reaper at bay.
+ btr->fd_h->last_used = cf_getms(); + + cf_buf_builder* bb = 0; + batch_build_response(btr, &bb); + + cf_socket *sock = &btr->fd_h->sock; + int brv; + + if (bb) { + brv = batch_send_header(sock, bb->used_sz); + + if (brv == 0) { + brv = batch_send(sock, bb->buf, bb->used_sz, MSG_NOSIGNAL | MSG_MORE); + + if (brv == 0) { + brv = batch_send_final(sock, 0); + } + } + cf_buf_builder_free(bb); + } + else { + cf_info(AS_BATCH, " batch request: returned no local responses"); + brv = batch_send_final(sock, 0); + } + + batch_transaction_done(btr, brv != 0); +} + +// Process one queue's batch requests. +static void +batch_worker(void* udata) +{ + batch_transaction* btr = (batch_transaction*)udata; + + // Check for timeouts. + if (btr->end_time != 0 && cf_getns() > btr->end_time) { + cf_atomic64_incr(&g_stats.batch_timeout); + + if (btr->fd_h) { + as_msg_send_reply(btr->fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, + 0, 0, 0, 0, 0, btr->ns, btr->trid); + btr->fd_h = 0; + } + batch_transaction_done(btr, false); + return; + } + + // Process batch request. + batch_process_request(btr); +} + +// Create bin name list from message. +static cf_vector* +as_binlist_from_op(as_msg* msg) +{ + if (msg->n_ops == 0) { + return 0; + } + + cf_vector* binlist = cf_vector_create(AS_ID_BIN_SZ, 5, 0); + as_msg_op* op = 0; + int n = 0; + int len; + char name[AS_ID_BIN_SZ]; + + while ((op = as_msg_op_iterate(msg, op, &n))) { + len = (op->name_sz <= AS_ID_BIN_SZ - 1)? op->name_sz : AS_ID_BIN_SZ - 1; + memcpy(name, op->name, len); + name[len] = 0; + cf_vector_append(binlist, name); + } + return binlist; +} + +// Initialize batch queues and worker threads. +int +as_batch_direct_init() +{ + uint32_t threads = g_config.n_batch_threads; + cf_info(AS_BATCH, "starting %u batch-threads", threads); + int status = as_thread_pool_init_fixed(&batch_direct_thread_pool, threads, batch_worker, sizeof(batch_transaction), offsetof(batch_transaction,complete)); + + if (status) { + cf_warning(AS_BATCH, "Failed to initialize batch-threads to %u: %d", threads, status); + } + return status; +} + +// Put batch request on a separate batch queue. +int +as_batch_direct_queue_task(as_transaction* tr, as_namespace *ns) +{ + cf_atomic64_incr(&g_stats.batch_initiate); + + if (g_config.n_batch_threads <= 0) { + cf_warning(AS_BATCH, "batch-threads has been disabled."); + return AS_PROTO_RESULT_FAIL_BATCH_DISABLED; + } + + as_msg* msg = &tr->msgp->msg; + + as_msg_field* dfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY); + if (! dfp) { + cf_warning(AS_BATCH, "Batch digests are required."); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t n_digests = dfp->field_sz / sizeof(cf_digest); + + if (n_digests > g_config.batch_max_requests) { + cf_warning(AS_BATCH, "Batch request size %u exceeds max %u.", n_digests, g_config.batch_max_requests); + return AS_PROTO_RESULT_FAIL_BATCH_MAX_REQUESTS; + } + + batch_transaction btr; + btr.trid = as_transaction_trid(tr); + btr.end_time = tr->end_time; + btr.get_data = !(msg->info1 & AS_MSG_INFO1_GET_NO_BINS); + btr.complete = false; + btr.ns = ns; + + // Create the master digest table. 
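+ // (batch_digests ends in a flexible array member, so the allocation below
+ // is the fixed header plus n_digests digest slots.)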
+ btr.digests = (batch_digests*) cf_malloc(sizeof(batch_digests) + (sizeof(batch_digest) * n_digests)); + + batch_digests* bmd = btr.digests; + bmd->n_digests = n_digests; + uint8_t* digest_field_data = dfp->data; + + for (int i = 0; i < n_digests; i++) { + bmd->digest[i].done = false; + bmd->digest[i].node = 0; + memcpy(&bmd->digest[i].keyd, digest_field_data, sizeof(cf_digest)); + digest_field_data += sizeof(cf_digest); + } + + btr.binlist = as_binlist_from_op(msg); + btr.fd_h = tr->from.proto_fd_h; + tr->from.proto_fd_h = NULL; + btr.fd_h->last_used = cf_getms(); + + int status = as_thread_pool_queue_task_fixed(&batch_direct_thread_pool, &btr); + + if (status) { + cf_warning(AS_BATCH, "Batch enqueue failed"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + return 0; +} + +int +as_batch_direct_queue_size() +{ + return batch_direct_thread_pool.dispatch_queue? cf_queue_sz(batch_direct_thread_pool.dispatch_queue) : 0; +} + +int +as_batch_direct_threads_resize(uint32_t threads) +{ + if (threads > MAX_BATCH_THREADS) { + cf_warning(AS_BATCH, "batch-threads %u exceeds max %u", threads, MAX_BATCH_THREADS); + return -1; + } + + cf_info(AS_BATCH, "Resize batch-threads from %u to %u", g_config.n_batch_threads, threads); + int status = as_thread_pool_resize(&batch_direct_thread_pool, threads); + g_config.n_batch_threads = batch_direct_thread_pool.thread_size; + + if (status) { + cf_warning(AS_BATCH, "Failed to resize batch-threads. status=%d, batch-threads=%d", + status, g_config.n_batch_threads); + } + return status; +} diff --git a/as/src/base/thr_demarshal.c b/as/src/base/thr_demarshal.c new file mode 100644 index 00000000..bf6f9b89 --- /dev/null +++ b/as/src/base/thr_demarshal.c @@ -0,0 +1,914 @@ +/* + * thr_demarshal.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/thr_demarshal.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <sys/param.h> // for MIN()
+#include
+#include
+#include
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "fault.h"
+#include "hardware.h"
+#include "hist.h"
+#include "socket.h"
+#include "tls.h"
+
+#include "base/as_stap.h"
+#include "base/batch.h"
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/packet_compression.h"
+#include "base/proto.h"
+#include "base/security.h"
+#include "base/stats.h"
+#include "base/thr_info.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction.h"
+#include "base/xdr_serverside.h"
+
+#define POLL_SZ 1024
+
+#define XDR_WRITE_BUFFER_SIZE (5 * 1024 * 1024)
+#define XDR_READ_BUFFER_SIZE (15 * 1024 * 1024)
+
+extern void *thr_demarshal(void *arg);
+
+typedef struct {
+ cf_poll polls[MAX_DEMARSHAL_THREADS];
+ unsigned int num_threads;
+ pthread_t dm_th[MAX_DEMARSHAL_THREADS];
+} demarshal_args;
+
+static demarshal_args *g_demarshal_args = 0;
+
+as_info_access g_access = {
+ .service = { .addrs = { .n_addrs = 0 }, .port = 0 },
+ .alt_service = { .addrs = { .n_addrs = 0 }, .port = 0 },
+ .tls_service = { .addrs = { .n_addrs = 0 }, .port = 0 },
+ .alt_tls_service = { .addrs = { .n_addrs = 0 }, .port = 0 }
+};
+
+cf_serv_cfg g_service_bind = { .n_cfgs = 0 };
+cf_tls_info *g_service_tls;
+
+static cf_sockets g_sockets;
+
+//
+// File handle reaper.
+//
+
+pthread_mutex_t g_file_handle_a_LOCK = PTHREAD_MUTEX_INITIALIZER;
+as_file_handle **g_file_handle_a = 0;
+uint32_t g_file_handle_a_sz;
+pthread_t g_demarshal_reaper_th;
+
+void *thr_demarshal_reaper_fn(void *arg);
+static cf_queue *g_freeslot = 0;
+
+void
+thr_demarshal_rearm(as_file_handle *fd_h)
+{
+ // This causes ENOENT when we have already reached NextEvent_FD_Cleanup
+ // (e.g., because the client disconnected) while the transaction was still
+ // ongoing.
+
+ static int32_t err_ok[] = { ENOENT };
+ CF_IGNORE_ERROR(cf_poll_modify_socket_forgiving(fd_h->poll, &fd_h->sock,
+ EPOLLIN | EPOLLONESHOT | EPOLLRDHUP, fd_h,
+ sizeof(err_ok) / sizeof(int32_t), err_ok));
+}
+
+void
+demarshal_file_handle_init()
+{
+ struct rlimit rl;
+
+ pthread_mutex_lock(&g_file_handle_a_LOCK);
+
+ if (g_file_handle_a == 0) {
+ if (-1 == getrlimit(RLIMIT_NOFILE, &rl)) {
+ cf_crash(AS_DEMARSHAL, "getrlimit: %s", cf_strerror(errno));
+ }
+
+ // Initialize the file handle pointer array.
+ g_file_handle_a = cf_calloc(rl.rlim_cur, sizeof(as_proto *));
+ g_file_handle_a_sz = rl.rlim_cur;
+
+ for (int i = 0; i < g_file_handle_a_sz; i++) {
+ cf_queue_push(g_freeslot, &i);
+ }
+
+ pthread_create(&g_demarshal_reaper_th, 0, thr_demarshal_reaper_fn, 0);
+
+ // If the config value is 0, set the maximum number of client file
+ // descriptors based on the rlimit.
+ if (g_config.n_proto_fd_max == 0) {
+ g_config.n_proto_fd_max = rl.rlim_cur / 2;
+ cf_info(AS_DEMARSHAL, "setting default client file descriptors to %d", g_config.n_proto_fd_max);
+ }
+ }
+
+ pthread_mutex_unlock(&g_file_handle_a_LOCK);
+}
+
+// Keep track of the connections, since they're precious. Kill anything that
+// hasn't been used in a while. The file handle array keeps a reference count,
+// and allows the reaper to run through and find the ones to reap. The table is
+// written by the demarshal threads and by the reaper (which clears reaped
+// slots), always under g_file_handle_a_LOCK.
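+// ("In a while" means proto-fd-idle-ms: handles idle longer than that are
+// shut down, unless the FH_INFO_DONOT_REAP protection bit is set.)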
+void * +thr_demarshal_reaper_fn(void *arg) +{ + uint64_t last = cf_getms(); + + while (true) { + uint64_t now = cf_getms(); + uint32_t inuse_cnt = 0; + uint64_t kill_ms = g_config.proto_fd_idle_ms; + bool refresh = false; + + if (now - last > (uint64_t)g_config.sec_cfg.privilege_refresh_period * 1000) { + refresh = true; + last = now; + } + + pthread_mutex_lock(&g_file_handle_a_LOCK); + + for (int i = 0; i < g_file_handle_a_sz; i++) { + if (g_file_handle_a[i]) { + as_file_handle *fd_h = g_file_handle_a[i]; + + if (refresh) { + as_security_refresh(fd_h); + } + + // Reap, if asked to. + if (fd_h->reap_me) { + cf_debug(AS_DEMARSHAL, "Reaping FD %d as requested", CSFD(&fd_h->sock)); + g_file_handle_a[i] = 0; + cf_queue_push(g_freeslot, &i); + as_release_file_handle(fd_h); + fd_h = 0; + } + // Reap if past kill time. + else if ((0 != kill_ms) && (fd_h->last_used + kill_ms < now)) { + if (fd_h->fh_info & FH_INFO_DONOT_REAP) { + cf_debug(AS_DEMARSHAL, "Not reaping the fd %d as it has the protection bit set", CSFD(&fd_h->sock)); + inuse_cnt++; + continue; + } + + cf_socket_shutdown(&fd_h->sock); // will trigger epoll errors + cf_debug(AS_DEMARSHAL, "remove unused connection, fd %d", CSFD(&fd_h->sock)); + g_file_handle_a[i] = 0; + cf_queue_push(g_freeslot, &i); + as_release_file_handle(fd_h); + fd_h = 0; + g_stats.reaper_count++; + } + else { + inuse_cnt++; + } + } + } + + pthread_mutex_unlock(&g_file_handle_a_LOCK); + + if ((g_file_handle_a_sz / 10) > (g_file_handle_a_sz - inuse_cnt)) { + cf_warning(AS_DEMARSHAL, "less than ten percent file handles remaining: %d max %d inuse", + g_file_handle_a_sz, inuse_cnt); + } + + // Validate the system statistics. + if (g_stats.proto_connections_opened - g_stats.proto_connections_closed != inuse_cnt) { + cf_debug(AS_DEMARSHAL, "reaper: mismatched connection count: %lu in stats vs %u calculated", + g_stats.proto_connections_opened - g_stats.proto_connections_closed, + inuse_cnt); + } + + sleep(1); + } + + return NULL; +} + +int +thr_demarshal_read_file(const char *path, char *buffer, size_t size) +{ + int res = -1; + int fd = open(path, O_RDONLY); + + if (fd < 0) { + cf_warning(AS_DEMARSHAL, "Failed to open %s for reading.", path); + goto cleanup0; + } + + size_t len = 0; + + while (len < size - 1) { + ssize_t n = read(fd, buffer + len, size - len - 1); + + if (n < 0) { + cf_warning(AS_DEMARSHAL, "Failed to read from %s", path); + goto cleanup1; + } + + if (n == 0) { + buffer[len] = 0; + res = 0; + goto cleanup1; + } + + len += n; + } + + cf_warning(AS_DEMARSHAL, "%s is too large.", path); + +cleanup1: + close(fd); + +cleanup0: + return res; +} + +int +thr_demarshal_read_integer(const char *path, int *value) +{ + char buffer[21]; + + if (thr_demarshal_read_file(path, buffer, sizeof(buffer)) < 0) { + return -1; + } + + char *end; + uint64_t x = strtoul(buffer, &end, 10); + + if (*end != '\n' || x > INT_MAX) { + cf_warning(AS_DEMARSHAL, "Invalid integer value in %s.", path); + return -1; + } + + *value = (int)x; + return 0; +} + +typedef enum { + BUFFER_TYPE_SEND, + BUFFER_TYPE_RECEIVE +} buffer_type; + +int +thr_demarshal_set_buffer(cf_socket *sock, buffer_type type, int size) +{ + static int rcv_max = -1; + static int snd_max = -1; + + const char *proc; + int *max; + + switch (type) { + case BUFFER_TYPE_RECEIVE: + proc = "/proc/sys/net/core/rmem_max"; + max = &rcv_max; + break; + + case BUFFER_TYPE_SEND: + proc = "/proc/sys/net/core/wmem_max"; + max = &snd_max; + break; + + default: + cf_crash(AS_DEMARSHAL, "Invalid buffer type: %d", (int32_t)type); + return 
-1; // cf_crash() should have a "noreturn" attribute, but is a macro
+ }
+
+ int tmp = ck_pr_load_int(max);
+
+ if (tmp < 0) {
+ if (thr_demarshal_read_integer(proc, &tmp) < 0) {
+ cf_warning(AS_DEMARSHAL, "Failed to read %s; should be at least %d. Please verify.", proc, size);
+ tmp = size;
+ }
+ }
+
+ if (tmp < size) {
+ cf_warning(AS_DEMARSHAL, "Buffer limit is %d, should be at least %d. Please set %s accordingly.",
+ tmp, size, proc);
+ return -1;
+ }
+
+ ck_pr_cas_int(max, -1, tmp);
+
+ switch (type) {
+ case BUFFER_TYPE_RECEIVE:
+ cf_socket_set_receive_buffer(sock, size);
+ break;
+
+ case BUFFER_TYPE_SEND:
+ cf_socket_set_send_buffer(sock, size);
+ break;
+ }
+
+ return 0;
+}
+
+int
+thr_demarshal_config_xdr(cf_socket *sock)
+{
+ if (thr_demarshal_set_buffer(sock, BUFFER_TYPE_RECEIVE, XDR_READ_BUFFER_SIZE) < 0) {
+ return -1;
+ }
+
+ if (thr_demarshal_set_buffer(sock, BUFFER_TYPE_SEND, XDR_WRITE_BUFFER_SIZE) < 0) {
+ return -1;
+ }
+
+ cf_socket_set_window(sock, XDR_READ_BUFFER_SIZE);
+ cf_socket_enable_nagle(sock);
+ return 0;
+}
+
+bool
+peek_data_in_memory(const as_msg *m)
+{
+ as_msg_field *f = as_msg_field_get(m, AS_MSG_FIELD_TYPE_NAMESPACE);
+
+ if (! f) {
+ // Should never happen, but don't bark here.
+ return false;
+ }
+
+ as_namespace *ns = as_namespace_get_bymsgfield(f);
+
+ // If ns is null, don't be the first to bark.
+ return ns && ns->storage_data_in_memory;
+}
+
+// Set of threads which talk to the client over the connection and do the
+// required processing. Note that once an fd is assigned to a thread, all the
+// work on that fd is done by that thread. Fair fd usage is expected of the
+// client. The first thread is special - it is the only one that also accepts
+// new connections.
+void *
+thr_demarshal(void *unused)
+{
+ cf_poll poll;
+ int nevents, i;
+ cf_clock last_fd_print = 0;
+
+#if defined(USE_SYSTEMTAP)
+ uint64_t nodeid = g_config.self_node;
+#endif
+
+ // Figure out my thread index.
+ pthread_t self = pthread_self();
+ int thr_id;
+ for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) {
+ if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self))
+ break;
+ }
+
+ if (thr_id == MAX_DEMARSHAL_THREADS) {
+ cf_debug(AS_DEMARSHAL, "Demarshal thread could not determine its own ID ~~ exiting!");
+ return NULL;
+ }
+
+ if (g_config.auto_pin != CF_TOPO_AUTO_PIN_NONE) {
+ cf_detail(AS_DEMARSHAL, "pinning thread to CPU %d", thr_id);
+ cf_topo_pin_to_cpu((cf_topo_cpu_index)thr_id);
+ }
+
+ cf_poll_create(&poll);
+
+ // The first thread accepts new connections on the service socket.
+ if (thr_id == 0) {
+ demarshal_file_handle_init();
+
+ cf_poll_add_sockets(poll, &g_sockets, EPOLLIN | EPOLLERR | EPOLLHUP);
+ cf_socket_show_server(AS_DEMARSHAL, "client", &g_sockets);
+ }
+
+ g_demarshal_args->polls[thr_id] = poll;
+ cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id);
+
+ int id_cntr = 0;
+
+ // Demarshal transactions from the socket.
+ for ( ; ; ) {
+ cf_poll_event events[POLL_SZ];
+
+ cf_detail(AS_DEMARSHAL, "calling epoll");
+
+ nevents = cf_poll_wait(poll, events, POLL_SZ, -1);
+ cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents);
+
+ uint64_t now_ns = cf_getns();
+ uint64_t now_ms = now_ns / 1000000;
+
+ // Iterate over all events.
+ for (i = 0; i < nevents; i++) {
+ cf_socket *ssock = events[i].data;
+
+ if (cf_sockets_has_socket(&g_sockets, ssock)) {
+ // Accept new connections on the service socket.
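+ // (EMFILE/ENFILE from accept are tolerated below, so temporary fd
+ // exhaustion doesn't crash the server.)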
+ cf_socket csock; + cf_sock_addr sa; + + if (cf_socket_accept(ssock, &csock, &sa) < 0) { + // This means we're out of file descriptors - could be a SYN + // flood attack or misbehaving client. Eventually we'd like + // to make the reaper fairer, but for now we'll just have to + // ignore the accept error and move on. + if ((errno == EMFILE) || (errno == ENFILE)) { + if (last_fd_print != (cf_getms() / 1000L)) { + cf_warning(AS_DEMARSHAL, "Hit OS file descriptor limit (EMFILE on accept). Consider raising limit for uid %d", g_config.uid); + last_fd_print = cf_getms() / 1000L; + } + continue; + } + cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno); + } + + char sa_str[sizeof(((as_file_handle *)NULL)->client)]; + cf_sock_addr_to_string_safe(&sa, sa_str, sizeof(sa_str)); + cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", sa_str, CSFD(&csock)); + + // Validate the limit of protocol connections we allow. + uint32_t conns_open = g_stats.proto_connections_opened - g_stats.proto_connections_closed; + cf_sock_cfg *cfg = ssock->cfg; + if (cfg->owner != CF_SOCK_OWNER_XDR && conns_open > g_config.n_proto_fd_max) { + if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs + cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open); + last_fd_print = cf_getms(); + } + cf_socket_shutdown(&csock); + cf_socket_close(&csock); + cf_socket_term(&csock); + continue; + } + + // Initialize the TLS part of the socket. + if (cfg->owner == CF_SOCK_OWNER_SERVICE_TLS) { + tls_socket_prepare_server(g_service_tls, &csock); + } + + // Create as_file_handle and queue it up in epoll_fd for further + // communication on one of the demarshal threads. + as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle)); + + strcpy(fd_h->client, sa_str); + cf_socket_copy(&csock, &fd_h->sock); + + fd_h->last_used = cf_getms(); + fd_h->reap_me = false; + fd_h->proto = 0; + fd_h->proto_unread = (uint64_t)sizeof(as_proto); + fd_h->fh_info = 0; + fd_h->security_filter = as_security_filter_create(); + + // Insert into the global table so the reaper can manage it. Do + // this before queueing it up for demarshal threads - once + // EPOLL_CTL_ADD is done it's difficult to back out (if insert + // into global table fails) because fd state could be anything. + cf_rc_reserve(fd_h); + + pthread_mutex_lock(&g_file_handle_a_LOCK); + + int j; + bool inserted = true; + + if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) { + inserted = false; + } + else { + g_file_handle_a[j] = fd_h; + } + + pthread_mutex_unlock(&g_file_handle_a_LOCK); + + if (!inserted) { + cf_info(AS_DEMARSHAL, "unable to add socket to file handle table"); + cf_socket_shutdown(&csock); + cf_socket_close(&csock); + cf_socket_term(&csock); + cf_rc_free(fd_h); // will free even with ref-count of 2 + } + else { + int32_t id; + + if (g_config.auto_pin == CF_TOPO_AUTO_PIN_NONE) { + cf_detail(AS_DEMARSHAL, "no CPU pinning - dispatching incoming connection round-robin"); + id = (id_cntr++) % g_demarshal_args->num_threads; + } + else { + id = cf_topo_socket_cpu(&fd_h->sock); + cf_detail(AS_DEMARSHAL, "incoming connection on CPU %d", id); + } + + fd_h->poll = g_demarshal_args->polls[id]; + + // Place the client socket in the event queue. 
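+ // (EPOLLONESHOT: each event disarms the socket, which is explicitly
+ // rearmed via thr_demarshal_rearm() once it's safe to read again.)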
+ cf_poll_add_socket(fd_h->poll, &fd_h->sock, EPOLLIN | EPOLLONESHOT | EPOLLRDHUP, fd_h);
+ cf_atomic64_incr(&g_stats.proto_connections_opened);
+ }
+ }
+ else {
+ bool has_extra_ref = false;
+ as_file_handle *fd_h = events[i].data;
+ if (fd_h == 0) {
+ cf_info(AS_DEMARSHAL, "event with null handle, continuing");
+ goto NextEvent;
+ }
+
+ cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", CSFD(&fd_h->sock), events[i].events);
+
+ // Process data on an existing connection: this might be more
+ // activity on an already existing transaction, so we have some
+ // state to manage.
+ cf_socket *sock = &fd_h->sock;
+
+ if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) {
+ cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", CSFD(sock), events[i].events);
+ // no longer in use: out of epoll etc
+ goto NextEvent_FD_Cleanup;
+ }
+
+ if (tls_socket_needs_handshake(&fd_h->sock)) {
+ int32_t tls_ev = tls_socket_accept(&fd_h->sock);
+
+ if (tls_ev == EPOLLERR) {
+ goto NextEvent_FD_Cleanup;
+ }
+
+ if (tls_ev == 0) {
+ tls_socket_must_not_have_data(&fd_h->sock, "service handshake");
+ tls_ev = EPOLLIN;
+ }
+
+ cf_poll_modify_socket(fd_h->poll, &fd_h->sock,
+ tls_ev | EPOLLONESHOT | EPOLLRDHUP, fd_h);
+ goto NextEvent;
+ }
+
+ // If the proto pointer is NULL, we haven't yet finished reading
+ // this transaction's proto header into proto_hdr.
+ if (fd_h->proto == NULL) {
+ int32_t recv_sz = cf_socket_recv(sock, (uint8_t *)&fd_h->proto_hdr + sizeof(as_proto) - fd_h->proto_unread, fd_h->proto_unread, 0);
+
+ if (recv_sz <= 0) {
+ if (recv_sz != 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
+ // This can happen because TLS protocol overhead can
+ // trip the epoll but no application-level bytes are
+ // actually available yet.
+ thr_demarshal_rearm(fd_h);
+ goto NextEvent;
+ }
+ cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d errno %d", recv_sz, errno);
+ goto NextEvent_FD_Cleanup;
+ }
+
+ fd_h->proto_unread -= recv_sz;
+
+ if (fd_h->proto_unread != 0) {
+ tls_socket_must_not_have_data(&fd_h->sock, "partial client read (size)");
+ thr_demarshal_rearm(fd_h);
+ goto NextEvent;
+ }
+
+ // Check for a TLS ClientHello arriving at a non-TLS socket. Heuristic:
+ // - tls[0] == ContentType.handshake (22)
+ // - tls[1] == ProtocolVersion.major (3)
+ // - tls[5] == HandshakeType.client_hello (1)
+
+ uint8_t *tls = (uint8_t *)&fd_h->proto_hdr;
+
+ if (tls[0] == 22 && tls[1] == 3 && tls[5] == 1) {
+ cf_warning(AS_DEMARSHAL, "ignoring incoming TLS connection from %s", fd_h->client);
+ goto NextEvent_FD_Cleanup;
+ }
+
+ if (fd_h->proto_hdr.version != PROTO_VERSION &&
+ // For backward compatibility, allow version 0 with
+ // security messages.
+ ! (fd_h->proto_hdr.version == 0 && fd_h->proto_hdr.type == PROTO_TYPE_SECURITY)) {
+ cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u",
+ fd_h->client, fd_h->proto_hdr.version);
+ goto NextEvent_FD_Cleanup;
+ }
+
+ // Swap the necessary elements of the as_proto.
+ as_proto_swap(&fd_h->proto_hdr);
+
+ if (fd_h->proto_hdr.sz > PROTO_SIZE_MAX) {
+ cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %lu",
+ fd_h->client, PROTO_SIZE_MAX, (uint64_t)fd_h->proto_hdr.sz);
+ goto NextEvent_FD_Cleanup;
+ }
+
+ // Allocate the complete message buffer.
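+ // (Bounded: the header's sz field was just validated against
+ // PROTO_SIZE_MAX above.)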
+ fd_h->proto = cf_malloc(sizeof(as_proto) + fd_h->proto_hdr.sz); + + memcpy(fd_h->proto, &fd_h->proto_hdr, sizeof(as_proto)); + + fd_h->proto_unread = fd_h->proto->sz; + } + + if (fd_h->proto_unread != 0) { + // Read the data. + int32_t recv_sz = cf_socket_recv(sock, fd_h->proto->data + (fd_h->proto->sz - fd_h->proto_unread), fd_h->proto_unread, 0); + + if (recv_sz <= 0) { + if (recv_sz != 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + thr_demarshal_rearm(fd_h); + goto NextEvent; + } + cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", recv_sz, errno, cf_strerror(errno)); + goto NextEvent_FD_Cleanup; + } + + // Decrement bytes-unread counter. + cf_detail(AS_DEMARSHAL, "read fd %d (%d %lu)", CSFD(sock), recv_sz, fd_h->proto_unread); + fd_h->proto_unread -= recv_sz; + + if (fd_h->proto_unread != 0) { + tls_socket_must_not_have_data(&fd_h->sock, "partial client read (body)"); + thr_demarshal_rearm(fd_h); + goto NextEvent; + } + } + + tls_socket_must_not_have_data(&fd_h->sock, "full client read"); + cf_debug(AS_DEMARSHAL, "running on CPU %hu", cf_topo_current_cpu()); + + // fd_h->proto_unread == 0 - finished reading complete proto. + // In current pipelining model, can't rearm fd_h until end of + // transaction. + as_proto *proto_p = fd_h->proto; + + fd_h->proto = NULL; + fd_h->proto_unread = (uint64_t)sizeof(as_proto); + fd_h->last_used = now_ms; + + cf_rc_reserve(fd_h); + has_extra_ref = true; + + // Info protocol requests. + if (proto_p->type == PROTO_TYPE_INFO) { + as_info_transaction it = { fd_h, proto_p, now_ns }; + + as_info(&it); + goto NextEvent; + } + + // INIT_TR + as_transaction tr; + as_transaction_init_head(&tr, NULL, (cl_msg *)proto_p); + + tr.origin = FROM_CLIENT; + tr.from.proto_fd_h = fd_h; + tr.start_time = now_ns; + + if (! as_proto_is_valid_type(proto_p)) { + cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type); + // We got a proto message type we don't recognize, so it + // may not do any good to send back an as_msg error, but + // it's the best we can do. At least we can keep the fd. + as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); + goto NextEvent; + } + + // Check if it's compressed. + if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) { + // Decompress it - allocate buffer to hold decompressed + // packet. + uint8_t *decompressed_buf = NULL; + size_t decompressed_buf_size = 0; + int rv = 0; + if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) { + cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv); + cf_warning_binary(AS_DEMARSHAL, (void *)proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p"); + as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); + goto NextEvent; + } + + // Free the compressed packet since we'll be using the + // decompressed packet from now on. + cf_free(proto_p); + + // Get original packet. + tr.msgp = (cl_msg *)decompressed_buf; + as_proto_swap(&(tr.msgp->proto)); + + if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) { + cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]", + tr.msgp->proto.version, tr.msgp->proto.type, (uint64_t)tr.msgp->proto.sz, decompressed_buf_size); + as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); + goto NextEvent; + } + } + + // If it's an XDR connection and we haven't yet modified the connection settings, ... 
+ if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG &&
+ as_transaction_is_xdr(&tr) &&
+ (fd_h->fh_info & FH_INFO_XDR) == 0) {
+ // ... modify them.
+ if (thr_demarshal_config_xdr(&fd_h->sock) != 0) {
+ cf_warning(AS_DEMARSHAL, "Failed to configure XDR connection");
+ goto NextEvent_FD_Cleanup;
+ }
+
+ fd_h->fh_info |= FH_INFO_XDR;
+ }
+
+ // Security protocol transactions.
+ if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) {
+ as_security_transact(&tr);
+ goto NextEvent;
+ }
+
+ // For now only AS_MSG's contribute to this benchmark.
+ if (g_config.svc_benchmarks_enabled) {
+ tr.benchmark_time = histogram_insert_data_point(g_stats.svc_demarshal_hist, now_ns);
+ }
+
+ // Fast path for batch requests.
+ if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) {
+ as_batch_queue_task(&tr);
+ goto NextEvent;
+ }
+
+ // Swap as_msg fields and bin-ops to host order, and flag
+ // which fields are present, to reduce re-parsing.
+ if (! as_transaction_prepare(&tr, true)) {
+ cf_warning(AS_DEMARSHAL, "bad client msg");
+ as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_PARAMETER);
+ goto NextEvent;
+ }
+
+ ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp, as_transaction_trid(&tr));
+
+ // Directly process or queue the transaction.
+ if (g_config.n_namespaces_inlined != 0 &&
+ (g_config.n_namespaces_not_inlined == 0 ||
+ // Only peek if at least one of each config.
+ peek_data_in_memory(&tr.msgp->msg))) {
+ // Data-in-memory namespace - process in this thread.
+ as_tsvc_process_transaction(&tr);
+ }
+ else {
+ // Data-not-in-memory namespace - process via queues.
+ as_tsvc_enqueue(&tr);
+ }
+
+ // Skip the proto message free & FD cleanup. If we get here, the
+ // above operations went smoothly, and the message free & FD
+ // cleanup job is handled elsewhere, as directed by
+ // as_tsvc_process_transaction() or as_tsvc_enqueue().
+ goto NextEvent;
+
+NextEvent_FD_Cleanup:
+ // If we allocated memory for the incoming message, free it.
+ if (fd_h->proto) {
+ cf_free(fd_h->proto);
+ fd_h->proto = 0;
+ }
+ // If the fd has an extra reference for a transaction, release it.
+ if (has_extra_ref) {
+ cf_rc_release(fd_h);
+ }
+ // Remove the fd from the events list.
+ cf_poll_delete_socket(poll, sock);
+ pthread_mutex_lock(&g_file_handle_a_LOCK);
+ fd_h->reap_me = true;
+ as_release_file_handle(fd_h);
+ fd_h = 0;
+ pthread_mutex_unlock(&g_file_handle_a_LOCK);
+NextEvent:
+ ;
+ }
+
+ // We should never be canceled externally, but just in case...
+ pthread_testcancel();
+ }
+ }
+
+ return NULL;
+}
+
+static void
+add_local(cf_serv_cfg *serv_cfg, cf_sock_owner owner)
+{
+ // Localhost is only added to the addresses if we're not yet listening on
+ // wildcard ("any") or localhost.
+
+ cf_ip_port port = 0;
+
+ for (uint32_t i = 0; i < serv_cfg->n_cfgs; ++i) {
+ if (serv_cfg->cfgs[i].owner != owner) {
+ continue;
+ }
+
+ port = serv_cfg->cfgs[i].port;
+
+ if (cf_ip_addr_is_any(&serv_cfg->cfgs[i].addr) ||
+ cf_ip_addr_is_local(&serv_cfg->cfgs[i].addr)) {
+ return;
+ }
+ }
+
+ if (port == 0) {
+ return;
+ }
+
+ cf_sock_cfg sock_cfg;
+ cf_sock_cfg_init(&sock_cfg, owner);
+ sock_cfg.port = port;
+ cf_ip_addr_set_local(&sock_cfg.addr);
+
+ if (cf_serv_cfg_add_sock_cfg(serv_cfg, &sock_cfg) < 0) {
+ cf_crash(AS_DEMARSHAL, "Couldn't add localhost listening address");
+ }
+}
+
+// Initialize the demarshal service, start demarshal threads.
+int +as_demarshal_start() +{ + demarshal_args *dm = cf_malloc(sizeof(demarshal_args)); + memset(dm, 0, sizeof(demarshal_args)); + g_demarshal_args = dm; + + g_freeslot = cf_queue_create(sizeof(int), true); + + add_local(&g_service_bind, CF_SOCK_OWNER_SERVICE); + add_local(&g_service_bind, CF_SOCK_OWNER_SERVICE_TLS); + + as_xdr_info_port(&g_service_bind); + + if (cf_socket_init_server(&g_service_bind, &g_sockets) < 0) { + cf_crash(AS_DEMARSHAL, "Couldn't initialize service socket"); + } + + // Create all the epoll_fds and wait for all the threads to come up. + + cf_info(AS_DEMARSHAL, "starting %u demarshal threads", + g_config.n_service_threads); + + dm->num_threads = g_config.n_service_threads; + + for (int32_t i = 1; i < dm->num_threads; ++i) { + if (pthread_create(&dm->dm_th[i], NULL, thr_demarshal, NULL) != 0) { + cf_crash(AS_DEMARSHAL, "Can't create demarshal threads"); + } + } + + for (int32_t i = 1; i < dm->num_threads; i++) { + while (CEFD(dm->polls[i]) == 0) { + usleep(1000); + } + } + + // Create first thread which is the listener. We do this one last, as it + // requires the other threads' epoll instances. + if (pthread_create(&dm->dm_th[0], NULL, thr_demarshal, NULL) != 0) { + cf_crash(AS_DEMARSHAL, "Can't create demarshal threads"); + } + + // For orderly startup log, wait for endpoint setup. + while (CEFD(dm->polls[0]) == 0) { + usleep(1000); + } + + return 0; +} diff --git a/as/src/base/thr_info.c b/as/src/base/thr_info.c new file mode 100644 index 00000000..7f445bac --- /dev/null +++ b/as/src/base/thr_info.c @@ -0,0 +1,7024 @@ +/* + * thr_info.c + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "base/thr_info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_queue.h" +#include "citrusleaf/cf_vector.h" + +#include "cf_str.h" +#include "dynbuf.h" +#include "fault.h" +#include "meminfo.h" +#include "shash.h" +#include "socket.h" + +#include "ai_obj.h" +#include "ai_btree.h" + +#include "base/batch.h" +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/features.h" +#include "base/index.h" +#include "base/monitor.h" +#include "base/scan.h" +#include "base/thr_batch.h" +#include "base/thr_demarshal.h" +#include "base/thr_info_port.h" +#include "base/thr_sindex.h" +#include "base/thr_tsvc.h" +#include "base/transaction.h" +#include "base/secondary_index.h" +#include "base/security.h" +#include "base/stats.h" +#include "base/system_metadata.h" +#include "base/truncate.h" +#include "base/udf_cask.h" +#include "base/xdr_config.h" +#include "base/xdr_serverside.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/hb.h" +#include "fabric/hlc.h" +#include "fabric/migrate.h" +#include "fabric/partition.h" +#include "fabric/partition_balance.h" +#include "fabric/roster.h" +#include "fabric/skew_monitor.h" +#include "transaction/proxy.h" +#include "transaction/rw_request_hash.h" + +#define STR_NS "ns" +#define STR_SET "set" +#define STR_INDEXNAME "indexname" +#define STR_NUMBIN "numbins" +#define STR_INDEXDATA "indexdata" +#define STR_TYPE_NUMERIC "numeric" +#define STR_TYPE_STRING "string" +#define STR_ITYPE "indextype" +#define STR_ITYPE_DEFAULT "DEFAULT" +#define STR_ITYPE_LIST "LIST" +#define STR_ITYPE_MAPKEYS "MAPKEYS" +#define STR_ITYPE_MAPVALUES "MAPVALUES" +#define STR_BINTYPE "bintype" + +extern int as_nsup_queue_get_size(); + +int info_get_objects(char *name, cf_dyn_buf *db); +int info_get_tree_sets(char *name, char *subtree, cf_dyn_buf *db); +int info_get_tree_bins(char *name, char *subtree, cf_dyn_buf *db); +int info_get_tree_sindexes(char *name, char *subtree, cf_dyn_buf *db); +int info_get_tree_statistics(char *name, char *subtree, cf_dyn_buf *db); +void as_storage_show_wblock_stats(as_namespace *ns); +void as_storage_summarize_wblock_stats(as_namespace *ns); +int as_storage_analyze_wblock(as_namespace* ns, int device_index, uint32_t wblock_id); + + +as_stats g_stats = { 0 }; // separate .c file not worth it + +uint64_t g_start_ms; // start time of the server + +static cf_queue *g_info_work_q = 0; + +// +// Info has its own fabric service +// which allows it to communicate things like the IP addresses of +// all the other nodes +// + +#define INFO_FIELD_OP 0 +#define INFO_FIELD_GENERATION 1 +#define INFO_FIELD_SERVICE_ADDRESS 2 +#define INFO_FIELD_ALT_ADDRESS 3 +#define INFO_FIELD_SERVICES_CLEAR_STD 4 +#define INFO_FIELD_SERVICES_TLS_STD 5 +#define INFO_FIELD_SERVICES_CLEAR_ALT 6 +#define INFO_FIELD_SERVICES_TLS_ALT 7 +#define INFO_FIELD_TLS_NAME 8 + +#define INFO_OP_UPDATE 0 +#define INFO_OP_ACK 1 +#define INFO_OP_UPDATE_REQ 2 + +msg_template info_mt[] = { + { INFO_FIELD_OP, M_FT_UINT32 }, + { INFO_FIELD_GENERATION, M_FT_UINT32 }, + { INFO_FIELD_SERVICE_ADDRESS, M_FT_STR }, + { INFO_FIELD_ALT_ADDRESS, M_FT_STR }, + { INFO_FIELD_SERVICES_CLEAR_STD, M_FT_STR }, + { INFO_FIELD_SERVICES_TLS_STD, M_FT_STR }, + { INFO_FIELD_SERVICES_CLEAR_ALT, M_FT_STR }, + { INFO_FIELD_SERVICES_TLS_ALT, M_FT_STR }, + { INFO_FIELD_TLS_NAME, 
M_FT_STR } +}; + +#define INFO_MSG_SCRATCH_SIZE 512 + + +// +// The dynamic list has a name, and a function to call +// + +typedef struct info_static_s { + struct info_static_s *next; + bool def; // default, but default is a reserved word + char *name; + char *value; + size_t value_sz; +} info_static; + + +typedef struct info_dynamic_s { + struct info_dynamic_s *next; + bool def; // default, but that's a reserved word + char *name; + as_info_get_value_fn value_fn; +} info_dynamic; + +typedef struct info_command_s { + struct info_command_s *next; + char *name; + as_info_command_fn command_fn; + as_sec_perm required_perm; // required security permission +} info_command; + +typedef struct info_tree_s { + struct info_tree_s *next; + char *name; + as_info_get_tree_fn tree_fn; +} info_tree; + + +#define EOL '\n' // incoming commands are separated by EOL +#define SEP '\t' +#define TREE_SEP '/' + +#define INFO_COMMAND_SINDEX_FAILCODE(num, message) \ + if (db) { \ + cf_dyn_buf_append_string(db, "FAIL:"); \ + cf_dyn_buf_append_int(db, num); \ + cf_dyn_buf_append_string(db, ": "); \ + cf_dyn_buf_append_string(db, message); \ + } + + +void +info_get_aggregated_namespace_stats(cf_dyn_buf *db) +{ + uint64_t total_objects = 0; + uint64_t total_tombstones = 0; + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + total_objects += ns->n_objects; + total_tombstones += ns->n_tombstones; + } + + info_append_uint64(db, "objects", total_objects); + info_append_uint64(db, "tombstones", total_tombstones); +} + +// #define INFO_SEGV_TEST 1 +#ifdef INFO_SEGV_TEST +char *segv_test = "segv test"; +int +info_segv_test(char *name, cf_dyn_buf *db) +{ + *segv_test = 'E'; + cf_dyn_buf_append_string(db, "segv"); + return(0); +} +#endif + +int +info_get_stats(char *name, cf_dyn_buf *db) +{ + info_append_uint32(db, "cluster_size", as_exchange_cluster_size()); + info_append_uint64_x(db, "cluster_key", as_exchange_cluster_key()); // not in ticker + info_append_bool(db, "cluster_integrity", as_clustering_has_integrity()); // not in ticker + info_append_bool(db, "cluster_is_member", ! 
as_clustering_is_orphan()); // not in ticker + as_hb_info_duplicates_get(db); // not in ticker + info_append_uint32(db, "cluster_clock_skew_stop_writes_sec", clock_skew_stop_writes_sec()); // not in ticker + info_append_uint64(db, "cluster_clock_skew", as_skew_monitor_skew()); + as_skew_monitor_info(db); + + info_append_uint64(db, "uptime", (cf_getms() - g_start_ms) / 1000); // not in ticker + + int freepct; + bool swapping; + + cf_meminfo(NULL, NULL, &freepct, &swapping); + info_append_int(db, "system_free_mem_pct", freepct); + info_append_bool(db, "system_swapping", swapping); + + size_t allocated_kbytes; + size_t active_kbytes; + size_t mapped_kbytes; + double efficiency_pct; + uint32_t site_count; + + cf_alloc_heap_stats(&allocated_kbytes, &active_kbytes, &mapped_kbytes, &efficiency_pct, + &site_count); + info_append_uint64(db, "heap_allocated_kbytes", allocated_kbytes); + info_append_uint64(db, "heap_active_kbytes", active_kbytes); + info_append_uint64(db, "heap_mapped_kbytes", mapped_kbytes); + info_append_int(db, "heap_efficiency_pct", (int)(efficiency_pct + 0.5)); + info_append_uint32(db, "heap_site_count", site_count); + + info_get_aggregated_namespace_stats(db); + + info_append_int(db, "tsvc_queue", as_tsvc_queue_get_size()); + info_append_int(db, "info_queue", as_info_queue_get_size()); + info_append_int(db, "delete_queue", as_nsup_queue_get_size()); + info_append_uint32(db, "rw_in_progress", rw_request_hash_count()); + info_append_uint32(db, "proxy_in_progress", as_proxy_hash_count()); + info_append_int(db, "tree_gc_queue", as_index_tree_gc_queue_size()); + + info_append_uint64(db, "client_connections", g_stats.proto_connections_opened - g_stats.proto_connections_closed); + info_append_uint64(db, "heartbeat_connections", g_stats.heartbeat_connections_opened - g_stats.heartbeat_connections_closed); + info_append_uint64(db, "fabric_connections", g_stats.fabric_connections_opened - g_stats.fabric_connections_closed); + + info_append_uint64(db, "heartbeat_received_self", g_stats.heartbeat_received_self); + info_append_uint64(db, "heartbeat_received_foreign", g_stats.heartbeat_received_foreign); + + + info_append_uint64(db, "reaped_fds", g_stats.reaper_count); // not in ticker + + info_append_uint64(db, "info_complete", g_stats.info_complete); // not in ticker + + info_append_uint64(db, "demarshal_error", g_stats.n_demarshal_error); + info_append_uint64(db, "early_tsvc_client_error", g_stats.n_tsvc_client_error); + info_append_uint64(db, "early_tsvc_batch_sub_error", g_stats.n_tsvc_batch_sub_error); + info_append_uint64(db, "early_tsvc_udf_sub_error", g_stats.n_tsvc_udf_sub_error); + + info_append_uint64(db, "batch_index_initiate", g_stats.batch_index_initiate); // not in ticker + + cf_dyn_buf_append_string(db, "batch_index_queue="); + as_batch_queues_info(db); // not in ticker + cf_dyn_buf_append_char(db, ';'); + + info_append_uint64(db, "batch_index_complete", g_stats.batch_index_complete); + info_append_uint64(db, "batch_index_error", g_stats.batch_index_errors); + info_append_uint64(db, "batch_index_timeout", g_stats.batch_index_timeout); + + // Everything below is not in ticker... 
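+ // ("Ticker" refers to the periodic stats line logged by ticker.c - items marked "not in ticker" above appear only in this info output.)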
+ + info_append_int(db, "batch_index_unused_buffers", as_batch_unused_buffers()); + info_append_uint64(db, "batch_index_huge_buffers", g_stats.batch_index_huge_buffers); + info_append_uint64(db, "batch_index_created_buffers", g_stats.batch_index_created_buffers); + info_append_uint64(db, "batch_index_destroyed_buffers", g_stats.batch_index_destroyed_buffers); + + info_append_uint64(db, "batch_initiate", g_stats.batch_initiate); + info_append_int(db, "batch_queue", as_batch_direct_queue_size()); + info_append_uint64(db, "batch_error", g_stats.batch_errors); + info_append_uint64(db, "batch_timeout", g_stats.batch_timeout); + + info_append_int(db, "scans_active", as_scan_get_active_job_count()); + + info_append_uint32(db, "query_short_running", g_query_short_running); + info_append_uint32(db, "query_long_running", g_query_long_running); + + info_append_uint64(db, "sindex_ucgarbage_found", g_stats.query_false_positives); + info_append_uint64(db, "sindex_gc_locktimedout", g_stats.sindex_gc_timedout); + info_append_uint64(db, "sindex_gc_list_creation_time", g_stats.sindex_gc_list_creation_time); + info_append_uint64(db, "sindex_gc_list_deletion_time", g_stats.sindex_gc_list_deletion_time); + info_append_uint64(db, "sindex_gc_objects_validated", g_stats.sindex_gc_objects_validated); + info_append_uint64(db, "sindex_gc_garbage_found", g_stats.sindex_gc_garbage_found); + info_append_uint64(db, "sindex_gc_garbage_cleaned", g_stats.sindex_gc_garbage_cleaned); + + char paxos_principal[16 + 1]; + sprintf(paxos_principal, "%lX", as_exchange_principal()); + info_append_string(db, "paxos_principal", paxos_principal); + + info_append_bool(db, "migrate_allowed", as_partition_balance_are_migrations_allowed()); + info_append_uint64(db, "migrate_partitions_remaining", as_partition_balance_remaining_migrations()); + + info_append_uint64(db, "fabric_bulk_send_rate", g_stats.fabric_bulk_s_rate); + info_append_uint64(db, "fabric_bulk_recv_rate", g_stats.fabric_bulk_r_rate); + info_append_uint64(db, "fabric_ctrl_send_rate", g_stats.fabric_ctrl_s_rate); + info_append_uint64(db, "fabric_ctrl_recv_rate", g_stats.fabric_ctrl_r_rate); + info_append_uint64(db, "fabric_meta_send_rate", g_stats.fabric_meta_s_rate); + info_append_uint64(db, "fabric_meta_recv_rate", g_stats.fabric_meta_r_rate); + info_append_uint64(db, "fabric_rw_send_rate", g_stats.fabric_rw_s_rate); + info_append_uint64(db, "fabric_rw_recv_rate", g_stats.fabric_rw_r_rate); + + as_xdr_get_stats(db); + + cf_dyn_buf_chomp(db); + + return 0; +} + +cf_atomic32 g_node_info_generation = 0; + + +int +info_get_cluster_generation(char *name, cf_dyn_buf *db) +{ + cf_dyn_buf_append_int(db, g_node_info_generation); + + return(0); +} + +void +info_get_printable_cluster_name(char *cluster_name) +{ + as_config_cluster_name_get(cluster_name); + if (cluster_name[0] == '\0'){ + strcpy(cluster_name, "null"); + } +} + +int +info_get_cluster_name(char *name, cf_dyn_buf *db) +{ + char cluster_name[AS_CLUSTER_NAME_SZ]; + info_get_printable_cluster_name(cluster_name); + cf_dyn_buf_append_string(db, cluster_name); + + return 0; +} + +int +info_get_features(char *name, cf_dyn_buf *db) +{ + cf_dyn_buf_append_string(db, as_features_info()); + + return 0; +} + +static cf_ip_port +bind_to_port(cf_serv_cfg *cfg, cf_sock_owner owner) +{ + for (uint32_t i = 0; i < cfg->n_cfgs; ++i) { + if (cfg->cfgs[i].owner == owner) { + return cfg->cfgs[i].port; + } + } + + return 0; +} + +char * +as_info_bind_to_string(const cf_serv_cfg *cfg, cf_sock_owner owner) +{ + cf_dyn_buf_define_size(db, 2500); + 
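// (db is backed by a 2500-byte stack buffer here; cf_dyn_buf switches to a heap allocation if appends overflow it.) +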
uint32_t count = 0; + + for (uint32_t i = 0; i < cfg->n_cfgs; ++i) { + if (cfg->cfgs[i].owner != owner) { + continue; + } + + if (count > 0) { + cf_dyn_buf_append_char(&db, ','); + } + + cf_dyn_buf_append_string(&db, cf_ip_addr_print(&cfg->cfgs[i].addr)); + ++count; + } + + char *string = cf_dyn_buf_strdup(&db); + cf_dyn_buf_free(&db); + return string != NULL ? string : cf_strdup("null"); +} + +static char * +access_to_string(cf_addr_list *addrs) +{ + cf_dyn_buf_define_size(db, 2500); + + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + if (i > 0) { + cf_dyn_buf_append_char(&db, ','); + } + + cf_dyn_buf_append_string(&db, addrs->addrs[i]); + } + + char *string = cf_dyn_buf_strdup(&db); + cf_dyn_buf_free(&db); + return string != NULL ? string : cf_strdup("null"); +} + +int +info_get_endpoints(char *name, cf_dyn_buf *db) +{ + cf_ip_port port = bind_to_port(&g_service_bind, CF_SOCK_OWNER_SERVICE); + info_append_int(db, "service.port", port); + + char *string = as_info_bind_to_string(&g_service_bind, CF_SOCK_OWNER_SERVICE); + info_append_string(db, "service.addresses", string); + cf_free(string); + + info_append_int(db, "service.access-port", g_access.service.port); + + string = access_to_string(&g_access.service.addrs); + info_append_string(db, "service.access-addresses", string); + cf_free(string); + + info_append_int(db, "service.alternate-access-port", g_access.alt_service.port); + + string = access_to_string(&g_access.alt_service.addrs); + info_append_string(db, "service.alternate-access-addresses", string); + cf_free(string); + + port = bind_to_port(&g_service_bind, CF_SOCK_OWNER_SERVICE_TLS); + info_append_int(db, "service.tls-port", port); + + string = as_info_bind_to_string(&g_service_bind, CF_SOCK_OWNER_SERVICE_TLS); + info_append_string(db, "service.tls-addresses", string); + cf_free(string); + + info_append_int(db, "service.tls-access-port", g_access.tls_service.port); + + string = access_to_string(&g_access.tls_service.addrs); + info_append_string(db, "service.tls-access-addresses", string); + cf_free(string); + + info_append_int(db, "service.tls-alternate-access-port", g_access.alt_tls_service.port); + + string = access_to_string(&g_access.alt_tls_service.addrs); + info_append_string(db, "service.tls-alternate-access-addresses", string); + cf_free(string); + + as_hb_info_endpoints_get(db); + + port = bind_to_port(&g_fabric_bind, CF_SOCK_OWNER_FABRIC); + info_append_int(db, "fabric.port", port); + + string = as_info_bind_to_string(&g_fabric_bind, CF_SOCK_OWNER_FABRIC); + info_append_string(db, "fabric.addresses", string); + cf_free(string); + + port = bind_to_port(&g_fabric_bind, CF_SOCK_OWNER_FABRIC_TLS); + info_append_int(db, "fabric.tls-port", port); + + string = as_info_bind_to_string(&g_fabric_bind, CF_SOCK_OWNER_FABRIC_TLS); + info_append_string(db, "fabric.tls-addresses", string); + cf_free(string); + + as_fabric_info_peer_endpoints_get(db); + + info_append_int(db, "info.port", g_info_port); + + string = as_info_bind_to_string(&g_info_bind, CF_SOCK_OWNER_INFO); + info_append_string(db, "info.addresses", string); + cf_free(string); + + cf_dyn_buf_chomp(db); + return(0); +} + +int +info_get_partition_generation(char *name, cf_dyn_buf *db) +{ + cf_dyn_buf_append_int(db, (int)g_partition_generation); + + return(0); +} + +int +info_get_partition_info(char *name, cf_dyn_buf *db) +{ + as_partition_getinfo_str(db); + + return(0); +} + +// Deprecate in "six months". 
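+// (The two commands below report prole and master replicas separately; "replicas-all" and the newer "replicas" further below return everything in one response.)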
+int +info_get_replicas_prole(char *name, cf_dyn_buf *db) +{ + as_partition_get_replicas_prole_str(db); + + return(0); +} + +int +info_get_replicas_master(char *name, cf_dyn_buf *db) +{ + as_partition_get_replicas_master_str(db); + + return(0); +} + +int +info_get_replicas_all(char *name, cf_dyn_buf *db) +{ + as_partition_get_replicas_all_str(db, false); + + return(0); +} + +int +info_get_replicas(char *name, cf_dyn_buf *db) +{ + as_partition_get_replicas_all_str(db, true); + + return(0); +} + +// +// COMMANDS +// + +int +info_command_get_sl(char *name, char *params, cf_dyn_buf *db) +{ + // Command Format: "get-sl:" + + as_exchange_info_get_succession(db); + + return 0; +} + +int +info_command_tip(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "tip command received: params %s", params); + + char host_str[50]; + int host_str_len = sizeof(host_str); + + char port_str[50]; + int port_str_len = sizeof(port_str); + int rv = -1; + + char tls_str[50]; + int tls_str_len = sizeof(tls_str); + + /* + * Command Format: "tip:host=<ipaddr>;port=<port>[;tls=<bool>]" + * + * where <ipaddr> is an IP address and <port> is a valid TCP port number. + */ + + if (0 != as_info_parameter_get(params, "host", host_str, &host_str_len)) { + cf_warning(AS_INFO, "tip command: no host, must add a host parameter"); + goto Exit; + } + + if (0 != as_info_parameter_get(params, "port", port_str, &port_str_len)) { + cf_warning(AS_INFO, "tip command: no port, must add a port parameter"); + goto Exit; + } + + if (0 != as_info_parameter_get(params, "tls", tls_str, &tls_str_len)) { + strcpy(tls_str, "false"); + } + + int port = 0; + if (0 != cf_str_atoi(port_str, &port)) { + cf_warning(AS_INFO, "tip command: port must be an integer in: %s", port_str); + goto Exit; + } + + bool tls; + if (strcmp(tls_str, "true") == 0) { + tls = true; + } + else if (strcmp(tls_str, "false") == 0) { + tls = false; + } + else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"tls\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, tls_str); + goto Exit; + } + + rv = as_hb_mesh_tip(host_str, port, tls); + +Exit: + if (0 == rv) { + cf_dyn_buf_append_string(db, "ok"); + } else { + cf_dyn_buf_append_string(db, "error"); + } + + return(0); +} + +/* + * Command Format: "tip-clear:{host-port-list=<hpl>}" + * + * where <hpl> is either "all" or else a comma-separated list of items of the form: <host>:<port> + */ +int32_t +info_command_tip_clear(char* name, char* params, cf_dyn_buf* db) +{ + cf_info(AS_INFO, "tip clear command received: params %s", params); + + // Command Format: "tip-clear:{host-port-list=<hpl>}" [the + // "host-port-list" argument is optional] + // where <hpl> is either "all" or else a comma-separated list of items + // of the form: <host>:<port> or [<ipv6-host>]:<port> + + char host_port_list[3000]; + int host_port_list_len = sizeof(host_port_list); + host_port_list[0] = '\0'; + bool success = true; + uint32_t cleared = 0, not_found = 0; + + if (as_info_parameter_get(params, "host-port-list", host_port_list, + &host_port_list_len) == 0) { + if (0 != strcmp(host_port_list, "all")) { + char* save_ptr = NULL; + int port = -1; + char* host_port = + strtok_r(host_port_list, ",", &save_ptr); + + while (host_port != NULL) { + char* host_port_delim = ":"; + if (*host_port == '[') { + // Parse IPv6 address differently.
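+ // (Bracketed form "[<v6-addr>]:<port>" - skip the '[' and split on ']' instead of ':'.)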
+ host_port++; + host_port_delim = "]"; + } + + char* host_port_save_ptr = NULL; + char* host = + strtok_r(host_port, host_port_delim, &host_port_save_ptr); + + if (host == NULL) { + cf_warning(AS_INFO, "tip clear command: invalid host:port string: %s", host_port); + success = false; + break; + } + + char* port_str = + strtok_r(NULL, host_port_delim, &host_port_save_ptr); + + if (port_str != NULL && *port_str == ':') { + // IPv6 case + port_str++; + } + if (port_str == NULL || + 0 != cf_str_atoi(port_str, &port)) { + cf_warning(AS_INFO, "tip clear command: port must be an integer in: %s", port_str); + success = false; + break; + } + + if (as_hb_mesh_tip_clear(host, port) == -1) { + success = false; + not_found++; + cf_warning(AS_INFO, "seed node %s:%d does not exist", host, port); + } else { + cleared++; + } + + host_port = strtok_r(NULL, ",", &save_ptr); + } + } else { + if (as_hb_mesh_tip_clear_all(&cleared)) { + success = false; + } + } + } else { + success = false; + } + + if (success) { + cf_info(AS_INFO, "tip clear command executed: cleared %"PRIu32", params %s", cleared, params); + cf_dyn_buf_append_string(db, "ok"); + } else { + cf_info(AS_INFO, "tip clear command failed: cleared %"PRIu32", params %s", cleared, params); + char error_msg[1024]; + sprintf(error_msg, "error: %"PRIu32" cleared, %"PRIu32" not found", cleared, not_found); + cf_dyn_buf_append_string(db, error_msg); + } + + return (0); +} + +int +info_command_show_devices(char *name, char *params, cf_dyn_buf *db) +{ + char ns_str[512]; + int ns_len = sizeof(ns_str); + + if (0 != as_info_parameter_get(params, "namespace", ns_str, &ns_len)) { + cf_info(AS_INFO, "show-devices requires namespace parameter"); + cf_dyn_buf_append_string(db, "error"); + return(0); + } + + as_namespace *ns = as_namespace_get_byname(ns_str); + if (!ns) { + cf_info(AS_INFO, "show-devices: namespace %s not found", ns_str); + cf_dyn_buf_append_string(db, "error"); + return(0); + } + as_storage_show_wblock_stats(ns); + + cf_dyn_buf_append_string(db, "ok"); + + return(0); +} + +int +info_command_dump_cluster(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-cluster:{verbose=<flag>}" [the "verbose" argument is optional] + * + * where <flag> is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, &param_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_clustering_dump(verbose); + as_exchange_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +int +info_command_dump_fabric(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-fabric:{verbose=<flag>}" [the "verbose" argument is optional] + * + * where <flag> is one of: {"true" | "false"} and defaults to "false".
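+ * Example: asinfo -v 'dump-fabric:verbose=true'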
+ */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, &param_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_fabric_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +int +info_command_dump_hb(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-hb:{verbose=<flag>}" [the "verbose" argument is optional] + * + * where <flag> is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, &param_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_hb_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +int +info_command_dump_hlc(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-hlc:{verbose=<flag>}" [the "verbose" argument is optional] + * + * where <flag> is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, &param_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_hlc_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + + +int +info_command_dump_migrates(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-migrates:{verbose=<flag>}" [the "verbose" argument is optional] + * + * where <flag> is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, &param_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_migrate_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +int +info_command_dump_msgs(char *name, char *params, cf_dyn_buf *db) +{ + bool once = true; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-msgs:{mode=<mode>}" [the "mode" argument is optional] + * + * where <mode> is one of: {"on" | "off" | "once"} and defaults to "once".
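+ * Example: asinfo -v 'dump-msgs:mode=once'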
+ */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "mode", param_str, &param_str_len)) { + if (!strncmp(param_str, "on", 3)) { + g_config.fabric_dump_msgs = true; + } else if (!strncmp(param_str, "off", 4)) { + g_config.fabric_dump_msgs = false; + once = false; + } else if (!strncmp(param_str, "once", 5)) { + once = true; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"mode\" value must be one of {\"on\", \"off\", \"once\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + + if (once) { + as_fabric_msg_queue_dump(); + } + + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +static int +is_numeric_string(char *str) +{ + if (!*str) + return 0; + + while (isdigit(*str)) + str++; + + return (!*str); +} + +int +info_command_dump_wb(char *name, char *params, cf_dyn_buf *db) +{ + as_namespace *ns; + int device_index, wblock_id; + char param_str[100]; + int param_str_len; + + /* + * Command Format: "dump-wb:ns=<namespace>;dev=<device index>;id=<wblock id>" + * + * where <namespace> is the name of the namespace, + * <device index> is the drive number (a non-negative integer), and + * <wblock id> is a non-negative integer corresponding to an active wblock. + */ + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "ns", param_str, &param_str_len)) { + if (!(ns = as_namespace_get_byname(param_str))) { + cf_warning(AS_INFO, "The \"%s:\" command argument \"ns\" value must be the name of an existing namespace, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an argument of the form \"ns=<namespace>\"", name); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "dev", param_str, &param_str_len)) { + if (!is_numeric_string(param_str) || (0 > (device_index = atoi(param_str)))) { + cf_warning(AS_INFO, "The \"%s:\" command argument \"dev\" value must be a non-negative integer, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an argument of the form \"dev=<device index>\"", name); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "id", param_str, &param_str_len)) { + if (!is_numeric_string(param_str) || (0 > (wblock_id = atoi(param_str)))) { + cf_warning(AS_INFO, "The \"%s:\" command argument \"id\" value must be a non-negative integer, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an argument of the form \"id=<wblock id>\"", name); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + + if (!as_storage_analyze_wblock(ns, device_index, (uint32_t) wblock_id)) + cf_dyn_buf_append_string(db, "ok"); + else + cf_dyn_buf_append_string(db, "error"); + + return(0); +} + +int +info_command_dump_wb_summary(char *name, char *params, cf_dyn_buf *db) +{ + as_namespace *ns; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-wb-summary:ns=<namespace>" + * + * where <namespace> is the name of an existing namespace.
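+ * Example (assuming an existing namespace named "test"): asinfo -v 'dump-wb-summary:ns=test'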
+ */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "ns", param_str, &param_str_len)) { + if (!(ns = as_namespace_get_byname(param_str))) { + cf_warning(AS_INFO, "The \"%s:\" command argument \"ns\" value must be the name of an existing namespace, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return(0); + } + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an argument of the form \"ns=<namespace>\"", name); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + + as_storage_summarize_wblock_stats(ns); + + cf_dyn_buf_append_string(db, "ok"); + + return(0); +} + +int +info_command_dump_rw_request_hash(char *name, char *params, cf_dyn_buf *db) +{ + rw_request_hash_dump(); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +typedef struct rack_node_s { + uint32_t rack_id; + cf_node node; +} rack_node; + +// A comparison_fn_t used with qsort() - yields ascending rack-id order. +static inline int +compare_rack_nodes(const void* pa, const void* pb) +{ + uint32_t a = ((const rack_node*)pa)->rack_id; + uint32_t b = ((const rack_node*)pb)->rack_id; + + return a > b ? 1 : (a == b ? 0 : -1); +} + +void +namespace_rack_info(as_namespace *ns, cf_dyn_buf *db, uint32_t *rack_ids, + uint32_t n_nodes, const char *tag) +{ + if (n_nodes == 0) { + return; + } + + rack_node rack_nodes[n_nodes]; + + for (uint32_t i = 0; i < n_nodes; i++) { + rack_nodes[i].rack_id = rack_ids[i]; + rack_nodes[i].node = ns->succession[i]; + } + + qsort(rack_nodes, n_nodes, sizeof(rack_node), compare_rack_nodes); + + uint32_t cur_id = rack_nodes[0].rack_id; + + cf_dyn_buf_append_string(db, tag); + cf_dyn_buf_append_uint32(db, cur_id); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint64_x(db, rack_nodes[0].node); + + for (uint32_t i = 1; i < n_nodes; i++) { + if (rack_nodes[i].rack_id == cur_id) { + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_uint64_x(db, rack_nodes[i].node); + continue; + } + + cur_id = rack_nodes[i].rack_id; + + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, tag); + cf_dyn_buf_append_uint32(db, cur_id); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint64_x(db, rack_nodes[i].node); + } +} + +int +info_command_racks(char *name, char *params, cf_dyn_buf *db) +{ + // Command format: "racks:{namespace=<namespace-name>}" + + char param_str[AS_ID_NAMESPACE_SZ] = { 0 }; + int param_str_len = (int)sizeof(param_str); + int rv = as_info_parameter_get(params, "namespace", param_str, &param_str_len); + + if (rv == -2) { + cf_warning(AS_INFO, "namespace parameter value too long"); + cf_dyn_buf_append_string(db, "ERROR::bad-namespace"); + return 0; + } + + if (rv == 0) { + as_namespace *ns = as_namespace_get_byname(param_str); + + if (!
ns) { + cf_warning(AS_INFO, "unknown namespace %s", param_str); + cf_dyn_buf_append_string(db, "ERROR::unknown-namespace"); + return 0; + } + + as_exchange_info_lock(); + + namespace_rack_info(ns, db, ns->rack_ids, ns->cluster_size, "rack_"); + + if (ns->roster_count != 0) { + cf_dyn_buf_append_char(db, ':'); + namespace_rack_info(ns, db, ns->roster_rack_ids, ns->roster_count, "roster_rack_"); + } + + as_exchange_info_unlock(); + + return 0; + } + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + cf_dyn_buf_append_string(db, "ns="); + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + + as_exchange_info_lock(); + + namespace_rack_info(ns, db, ns->rack_ids, ns->cluster_size, "rack_"); + + if (ns->roster_count != 0) { + cf_dyn_buf_append_char(db, ':'); + namespace_rack_info(ns, db, ns->roster_rack_ids, ns->roster_count, "roster_rack_"); + } + + as_exchange_info_unlock(); + + cf_dyn_buf_append_char(db, ';'); + } + + cf_dyn_buf_chomp(db); + + return 0; +} + +int +info_command_recluster(char *name, char *params, cf_dyn_buf *db) +{ + // Command format: "recluster:" + + int rv = as_clustering_cluster_reform(); + + // TODO - resolve error condition further? + cf_dyn_buf_append_string(db, + rv == 0 ? "ok" : (rv == 1 ? "ignored-by-non-principal" : "ERROR")); + + return 0; +} + +int +info_command_jem_stats(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "jem_stats command received: params %s", params); + + /* + * Command Format: "jem-stats:{file=<path>;options=<opts>;sites=<path>}" [the "file", "options", and "sites" arguments are optional] + * + * Logs the JEMalloc statistics to the console or an optionally-specified file pathname. + * Options may be a string containing any of the characters "gmablh", as defined by the jemalloc(3) man page. + * The "sites" parameter optionally specifies a file to dump memory accounting information to. + * [Note: Any options are only used if an output file is specified.] + */ + + char param_str[100]; + int param_str_len = sizeof(param_str); + char *file = NULL, *options = NULL, *sites = NULL; + + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "file", param_str, &param_str_len)) { + file = cf_strdup(param_str); + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "options", param_str, &param_str_len)) { + options = cf_strdup(param_str); + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "sites", param_str, &param_str_len)) { + sites = cf_strdup(param_str); + } + + cf_alloc_log_stats(file, options); + + if (file) { + cf_free(file); + } + + if (options) { + cf_free(options); + } + + if (sites) { + cf_alloc_log_site_infos(sites); + cf_free(sites); + } + + cf_dyn_buf_append_string(db, "ok"); + return 0; +} + +/* + * Print out System Metadata info. + */ +int +info_command_dump_smd(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "dump-smd command received: params %s", params); + + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-smd:{verbose=<flag>}" [the "verbose" argument is optional] + * + * where <flag> is one of: {"true" | "false"} and defaults to "false".
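+ * Example: asinfo -v 'dump-smd:verbose=true'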
+ */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, &param_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + + as_smd_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + + return 0; +} + +/* + * Print out Secondary Index info. + */ +int +info_command_dump_si(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "dump-si command received: params %s", params); + + char param_str[100]; + int param_str_len = sizeof(param_str); + char *nsname = NULL, *indexname = NULL, *filename = NULL; + bool verbose = false; + + /* + * Command Format: "dump-si:ns=<namespace>;indexname=<index>;file=<path>;{verbose=<flag>}" [the "verbose" argument is optional] + * + * where <flag> is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "ns", param_str, &param_str_len)) { + nsname = cf_strdup(param_str); + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an \"ns\" parameter", name); + cf_dyn_buf_append_string(db, "error"); + goto cleanup; + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "indexname", param_str, &param_str_len)) { + indexname = cf_strdup(param_str); + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an \"indexname\" parameter", name); + cf_dyn_buf_append_string(db, "error"); + goto cleanup; + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "file", param_str, &param_str_len)) { + filename = cf_strdup(param_str); + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires a \"file\" parameter", name); + cf_dyn_buf_append_string(db, "error"); + goto cleanup; + } + + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "verbose", param_str, &param_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + goto cleanup; + } + } + + as_sindex_dump(nsname, indexname, filename, verbose); + cf_dyn_buf_append_string(db, "ok"); + + + cleanup: + if (nsname) { + cf_free(nsname); + } + + if (indexname) { + cf_free(indexname); + } + + if (filename) { + cf_free(filename); + } + + return 0; +} + +/* + * Print out clock skew information.
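+ * (The report goes to the server log via as_skew_monitor_dump(); the client just receives "ok".)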
+ */ +int +info_command_dump_skew(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "dump-skew command received: params %s", params); + + /* + * Command Format: "dump-skew:" + */ + as_skew_monitor_dump(); + cf_dyn_buf_append_string(db, "ok"); + return 0; +} + +int +info_command_mon_cmd(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "add-module command received: params %s", params); + + /* + * Command Format: "jobs:[module=<module>;cmd=<command>;<parameters>]" + * asinfo -v 'jobs' -> list all jobs + * asinfo -v 'jobs:module=query' -> list all jobs for query module + * asinfo -v 'jobs:module=query;cmd=kill-job;trid=<trid>' + * asinfo -v 'jobs:module=query;cmd=set-priority;trid=<trid>;value=<value>' + * + * where <module> is one of the following: + * - query + * - scan + */ + + char cmd[13]; + char module[21]; + char job_id[24]; + char val_str[11]; + int cmd_len = sizeof(cmd); + int module_len = sizeof(module); + int job_id_len = sizeof(job_id); + int val_len = sizeof(val_str); + uint64_t trid = 0; + uint32_t value = 0; + + cmd[0] = '\0'; + module[0] = '\0'; + job_id[0] = '\0'; + val_str[0] = '\0'; + + // Read the parameters: module cmd trid value + int rv = as_info_parameter_get(params, "module", module, &module_len); + if (rv == -1) { + as_mon_info_cmd(NULL, NULL, 0, 0, db); + return 0; + } + else if (rv == -2) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":\"module\" parameter too long (> "); + cf_dyn_buf_append_int(db, module_len-1); + cf_dyn_buf_append_string(db, " chars)"); + return 0; + } + + rv = as_info_parameter_get(params, "cmd", cmd, &cmd_len); + if (rv == -1) { + as_mon_info_cmd(module, NULL, 0, 0, db); + return 0; + } + else if (rv == -2) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":\"cmd\" parameter too long (> "); + cf_dyn_buf_append_int(db, cmd_len-1); + cf_dyn_buf_append_string(db, " chars)"); + return 0; + } + + rv = as_info_parameter_get(params, "trid", job_id, &job_id_len); + if (rv == 0) { + trid = strtoull(job_id, NULL, 10); + } + else if (rv == -1) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":no \"trid\" parameter specified"); + return 0; + } + else if (rv == -2) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":\"trid\" parameter too long (> "); + cf_dyn_buf_append_int(db, job_id_len-1); + cf_dyn_buf_append_string(db, " chars)"); + return 0; + } + + rv = as_info_parameter_get(params, "value", val_str, &val_len); + if (rv == 0) { + value = strtoul(val_str, NULL, 10); + } + else if (rv == -2) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":\"value\" parameter too long (> "); + cf_dyn_buf_append_int(db, val_len-1); + cf_dyn_buf_append_string(db, " chars)"); + return 0; + } + + cf_info(AS_INFO, "%s %s %lu %u", module, cmd, trid, value); + as_mon_info_cmd(module, cmd, trid, value, db); + return 0; +} + + +static const char * +debug_allocations_string(void) +{ + switch (g_config.debug_allocations) { + case CF_ALLOC_DEBUG_NONE: + return "none"; + + case CF_ALLOC_DEBUG_TRANSIENT: + return "transient"; + + case CF_ALLOC_DEBUG_PERSISTENT: + return "persistent"; + + case CF_ALLOC_DEBUG_ALL: + return "all"; + + default: + cf_crash(CF_ALLOC, "invalid
CF_ALLOC_DEBUG_* value"); + return NULL; + } +} + +static const char * +auto_pin_string(void) +{ + switch (g_config.auto_pin) { + case CF_TOPO_AUTO_PIN_NONE: + return "none"; + + case CF_TOPO_AUTO_PIN_CPU: + return "cpu"; + + case CF_TOPO_AUTO_PIN_NUMA: + return "numa"; + + default: + cf_crash(CF_ALLOC, "invalid CF_TOPO_AUTO_* value"); + return NULL; + } +} + +void +info_service_config_get(cf_dyn_buf *db) +{ + // Note - no user, group. + info_append_uint32(db, "paxos-single-replica-limit", g_config.paxos_single_replica_limit); + info_append_string_safe(db, "pidfile", g_config.pidfile); + info_append_int(db, "proto-fd-max", g_config.n_proto_fd_max); + + info_append_bool(db, "advertise-ipv6", cf_socket_advertises_ipv6()); + info_append_string(db, "auto-pin", auto_pin_string()); + info_append_int(db, "batch-threads", g_config.n_batch_threads); + info_append_uint32(db, "batch-max-buffers-per-queue", g_config.batch_max_buffers_per_queue); + info_append_uint32(db, "batch-max-requests", g_config.batch_max_requests); + info_append_uint32(db, "batch-max-unused-buffers", g_config.batch_max_unused_buffers); + info_append_uint32(db, "batch-priority", g_config.batch_priority); + info_append_uint32(db, "batch-index-threads", g_config.n_batch_index_threads); + + char cluster_name[AS_CLUSTER_NAME_SZ]; + info_get_printable_cluster_name(cluster_name); + info_append_string(db, "cluster-name", cluster_name); + + info_append_bool(db, "enable-benchmarks-fabric", g_config.fabric_benchmarks_enabled); + info_append_bool(db, "enable-benchmarks-svc", g_config.svc_benchmarks_enabled); + info_append_bool(db, "enable-hist-info", g_config.info_hist_enabled); + info_append_string(db, "feature-key-file", g_config.feature_key_file); + info_append_uint32(db, "hist-track-back", g_config.hist_track_back); + info_append_uint32(db, "hist-track-slice", g_config.hist_track_slice); + info_append_string_safe(db, "hist-track-thresholds", g_config.hist_track_thresholds); + info_append_int(db, "info-threads", g_config.n_info_threads); + info_append_bool(db, "log-local-time", cf_fault_is_using_local_time()); + info_append_uint32(db, "migrate-max-num-incoming", g_config.migrate_max_num_incoming); + info_append_uint32(db, "migrate-threads", g_config.n_migrate_threads); + info_append_uint32(db, "min-cluster-size", g_config.clustering_config.cluster_size_min); + info_append_uint64_x(db, "node-id", g_config.self_node); // may be configured or auto-generated + info_append_string_safe(db, "node-id-interface", g_config.node_id_interface); + info_append_uint32(db, "nsup-delete-sleep", g_config.nsup_delete_sleep); + info_append_uint32(db, "nsup-period", g_config.nsup_period); + info_append_bool(db, "nsup-startup-evict", g_config.nsup_startup_evict); + info_append_int(db, "proto-fd-idle-ms", g_config.proto_fd_idle_ms); + info_append_int(db, "proto-slow-netio-sleep-ms", g_config.proto_slow_netio_sleep_ms); // dynamic only + info_append_uint32(db, "query-batch-size", g_config.query_bsize); + info_append_uint32(db, "query-buf-size", g_config.query_buf_size); // dynamic only + info_append_uint32(db, "query-bufpool-size", g_config.query_bufpool_size); + info_append_bool(db, "query-in-transaction-thread", g_config.query_in_transaction_thr); + info_append_uint32(db, "query-long-q-max-size", g_config.query_long_q_max_size); + info_append_bool(db, "query-microbenchmark", g_config.query_enable_histogram); // dynamic only + info_append_bool(db, "query-pre-reserve-partitions", g_config.partitions_pre_reserved); + info_append_uint32(db, "query-priority", 
g_config.query_priority); + info_append_uint64(db, "query-priority-sleep-us", g_config.query_sleep_us); + info_append_uint64(db, "query-rec-count-bound", g_config.query_rec_count_bound); + info_append_bool(db, "query-req-in-query-thread", g_config.query_req_in_query_thread); + info_append_uint32(db, "query-req-max-inflight", g_config.query_req_max_inflight); + info_append_uint32(db, "query-short-q-max-size", g_config.query_short_q_max_size); + info_append_uint32(db, "query-threads", g_config.query_threads); + info_append_uint32(db, "query-threshold", g_config.query_threshold); + info_append_uint64(db, "query-untracked-time-ms", g_config.query_untracked_time_ms); + info_append_uint32(db, "query-worker-threads", g_config.query_worker_threads); + info_append_bool(db, "run-as-daemon", g_config.run_as_daemon); + info_append_uint32(db, "scan-max-active", g_config.scan_max_active); + info_append_uint32(db, "scan-max-done", g_config.scan_max_done); + info_append_uint32(db, "scan-max-udf-transactions", g_config.scan_max_udf_transactions); + info_append_uint32(db, "scan-threads", g_config.scan_threads); + info_append_uint32(db, "service-threads", g_config.n_service_threads); + info_append_uint32(db, "sindex-builder-threads", g_config.sindex_builder_threads); + info_append_uint32(db, "sindex-gc-max-rate", g_config.sindex_gc_max_rate); + info_append_uint32(db, "sindex-gc-period", g_config.sindex_gc_period); + info_append_uint32(db, "ticker-interval", g_config.ticker_interval); + info_append_int(db, "transaction-max-ms", (int)(g_config.transaction_max_ns / 1000000)); + info_append_uint32(db, "transaction-pending-limit", g_config.transaction_pending_limit); + info_append_uint32(db, "transaction-queues", g_config.n_transaction_queues); + info_append_uint32(db, "transaction-retry-ms", g_config.transaction_retry_ms); + info_append_uint32(db, "transaction-threads-per-queue", g_config.n_transaction_threads_per_queue); + info_append_string_safe(db, "work-directory", g_config.work_directory); + + info_append_string(db, "debug-allocations", debug_allocations_string()); + info_append_bool(db, "fabric-dump-msgs", g_config.fabric_dump_msgs); + info_append_uint32(db, "prole-extra-ttl", g_config.prole_extra_ttl); +} + +static void +append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list) +{ + for (uint32_t i = 0; i < list->n_addrs; ++i) { + info_append_string(db, name, list->addrs[i]); + } +} + +void +info_network_config_get(cf_dyn_buf *db) +{ + // Service: + + info_append_int(db, "service.port", g_config.service.bind_port); + append_addrs(db, "service.address", &g_config.service.bind); + info_append_int(db, "service.access-port", g_config.service.std_port); + append_addrs(db, "service.access-address", &g_config.service.std); + info_append_int(db, "service.alternate-access-port", g_config.service.alt_port); + append_addrs(db, "service.alternate-access-address", &g_config.service.alt); + + info_append_int(db, "service.tls-port", g_config.tls_service.bind_port); + append_addrs(db, "service.tls-address", &g_config.tls_service.bind); + info_append_int(db, "service.tls-access-port", g_config.tls_service.std_port); + append_addrs(db, "service.tls-access-address", &g_config.tls_service.std); + info_append_int(db, "service.tls-alternate-access-port", g_config.tls_service.alt_port); + append_addrs(db, "service.tls-alternate-access-address", &g_config.tls_service.alt); + info_append_string_safe(db, "service.tls-name", g_config.tls_service.tls_our_name); + + for (uint32_t i = 0; i < 
g_config.tls_service.n_tls_peer_names; ++i) { + info_append_string(db, "service.tls-authenticate-client", + g_config.tls_service.tls_peer_names[i]); + } + + // Heartbeat: + + as_hb_info_config_get(db); + + // Fabric: + + append_addrs(db, "fabric.address", &g_config.fabric.bind); + info_append_int(db, "fabric.port", g_config.fabric.bind_port); + append_addrs(db, "fabric.tls-address", &g_config.tls_fabric.bind); + info_append_int(db, "fabric.tls-port", g_config.tls_fabric.bind_port); + info_append_string_safe(db, "fabric.tls-name", g_config.tls_fabric.tls_our_name); + info_append_int(db, "fabric.channel-bulk-fds", g_config.n_fabric_channel_fds[AS_FABRIC_CHANNEL_BULK]); + info_append_int(db, "fabric.channel-bulk-recv-threads", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK]); + info_append_int(db, "fabric.channel-ctrl-fds", g_config.n_fabric_channel_fds[AS_FABRIC_CHANNEL_CTRL]); + info_append_int(db, "fabric.channel-ctrl-recv-threads", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_CTRL]); + info_append_int(db, "fabric.channel-meta-fds", g_config.n_fabric_channel_fds[AS_FABRIC_CHANNEL_META]); + info_append_int(db, "fabric.channel-meta-recv-threads", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_META]); + info_append_int(db, "fabric.channel-rw-fds", g_config.n_fabric_channel_fds[AS_FABRIC_CHANNEL_RW]); + info_append_int(db, "fabric.channel-rw-recv-threads", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW]); + info_append_bool(db, "fabric.keepalive-enabled", g_config.fabric_keepalive_enabled); + info_append_int(db, "fabric.keepalive-intvl", g_config.fabric_keepalive_intvl); + info_append_int(db, "fabric.keepalive-probes", g_config.fabric_keepalive_probes); + info_append_int(db, "fabric.keepalive-time", g_config.fabric_keepalive_time); + info_append_int(db, "fabric.latency-max-ms", g_config.fabric_latency_max_ms); + info_append_int(db, "fabric.recv-rearm-threshold", g_config.fabric_recv_rearm_threshold); + info_append_int(db, "fabric.send-threads", g_config.n_fabric_send_threads); + + // Info: + + append_addrs(db, "info.address", &g_config.info.bind); + info_append_int(db, "info.port", g_config.info.bind_port); + + // TLS: + + for (uint32_t i = 0; i < g_config.n_tls_specs; ++i) { + cf_tls_spec *spec = g_config.tls_specs + i; + char key[100]; + + snprintf(key, sizeof(key), "tls[%u].name", i); + info_append_string_safe(db, key, spec->name); + + snprintf(key, sizeof(key), "tls[%u].cert_file", i); + info_append_string_safe(db, key, spec->cert_file); + + snprintf(key, sizeof(key), "tls[%u].key_file", i); + info_append_string_safe(db, key, spec->key_file); + + snprintf(key, sizeof(key), "tls[%u].ca_file", i); + info_append_string_safe(db, key, spec->ca_file); + + snprintf(key, sizeof(key), "tls[%u].ca_path", i); + info_append_string_safe(db, key, spec->ca_path); + + snprintf(key, sizeof(key), "tls[%u].cert_blacklist", i); + info_append_string_safe(db, key, spec->cert_blacklist); + + snprintf(key, sizeof(key), "tls[%u].protocols", i); + info_append_string_safe(db, key, spec->protocols); + + snprintf(key, sizeof(key), "tls[%u].cipher_suite", i); + info_append_string_safe(db, key, spec->cipher_suite); + } +} + + +void +info_namespace_config_get(char* context, cf_dyn_buf *db) +{ + as_namespace *ns = as_namespace_get_byname(context); + + if (! ns) { + cf_dyn_buf_append_string(db, "namespace not found;"); // TODO - start with "error"? 
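+ // (The client gets this as plain text rather than an "error"-prefixed response - hence the TODO.)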
+ return; + } + + info_append_uint32(db, "replication-factor", ns->cfg_replication_factor); + info_append_uint64(db, "memory-size", ns->memory_size); + info_append_uint64(db, "default-ttl", ns->default_ttl); + + info_append_bool(db, "enable-xdr", ns->enable_xdr); + info_append_bool(db, "sets-enable-xdr", ns->sets_enable_xdr); + info_append_bool(db, "ns-forward-xdr-writes", ns->ns_forward_xdr_writes); + info_append_bool(db, "allow-nonxdr-writes", ns->ns_allow_nonxdr_writes); + info_append_bool(db, "allow-xdr-writes", ns->ns_allow_xdr_writes); + + // Not true config, but act as config overrides: + cf_hist_track_get_settings(ns->read_hist, db); + cf_hist_track_get_settings(ns->query_hist, db); + cf_hist_track_get_settings(ns->udf_hist, db); + cf_hist_track_get_settings(ns->write_hist, db); + + info_append_uint32(db, "cold-start-evict-ttl", ns->cold_start_evict_ttl); + + if (ns->conflict_resolution_policy == AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION) { + info_append_string(db, "conflict-resolution-policy", "generation"); + } + else if (ns->conflict_resolution_policy == AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME) { + info_append_string(db, "conflict-resolution-policy", "last-update-time"); + } + else { + info_append_string(db, "conflict-resolution-policy", "undefined"); + } + + info_append_bool(db, "data-in-index", ns->data_in_index); + info_append_bool(db, "disable-write-dup-res", ns->write_dup_res_disabled); + info_append_bool(db, "disallow-null-setname", ns->disallow_null_setname); + info_append_bool(db, "enable-benchmarks-batch-sub", ns->batch_sub_benchmarks_enabled); + info_append_bool(db, "enable-benchmarks-read", ns->read_benchmarks_enabled); + info_append_bool(db, "enable-benchmarks-udf", ns->udf_benchmarks_enabled); + info_append_bool(db, "enable-benchmarks-udf-sub", ns->udf_sub_benchmarks_enabled); + info_append_bool(db, "enable-benchmarks-write", ns->write_benchmarks_enabled); + info_append_bool(db, "enable-hist-proxy", ns->proxy_hist_enabled); + info_append_uint32(db, "evict-hist-buckets", ns->evict_hist_buckets); + info_append_uint32(db, "evict-tenths-pct", ns->evict_tenths_pct); + info_append_uint32(db, "high-water-disk-pct", ns->hwm_disk_pct); + info_append_uint32(db, "high-water-memory-pct", ns->hwm_memory_pct); + info_append_uint64(db, "max-ttl", ns->max_ttl); + info_append_uint32(db, "migrate-order", ns->migrate_order); + info_append_uint32(db, "migrate-retransmit-ms", ns->migrate_retransmit_ms); + info_append_uint32(db, "migrate-sleep", ns->migrate_sleep); + info_append_uint32(db, "obj-size-hist-max", ns->obj_size_hist_max); // not original, may have been rounded + info_append_uint32(db, "partition-tree-locks", ns->tree_shared.n_lock_pairs); + info_append_uint32(db, "partition-tree-sprigs", ns->tree_shared.n_sprigs); + info_append_uint32(db, "rack-id", ns->rack_id); + info_append_string(db, "read-consistency-level-override", NS_READ_CONSISTENCY_LEVEL_NAME()); + info_append_bool(db, "single-bin", ns->single_bin); + info_append_uint32(db, "stop-writes-pct", ns->stop_writes_pct); + info_append_bool(db, "strong-consistency", ns->cp); + info_append_bool(db, "strong-consistency-allow-expunge", ns->cp_allow_drops); + info_append_uint32(db, "tomb-raider-eligible-age", ns->tomb_raider_eligible_age); + info_append_uint32(db, "tomb-raider-period", ns->tomb_raider_period); + info_append_string(db, "write-commit-level-override", NS_WRITE_COMMIT_LEVEL_NAME()); + + info_append_string(db, "storage-engine", + (ns->storage_type == AS_STORAGE_ENGINE_MEMORY ? 
"memory" : + (ns->storage_type == AS_STORAGE_ENGINE_SSD ? "device" : "illegal"))); + + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + for (int i = 0; i < AS_STORAGE_MAX_DEVICES; i++) { + if (! ns->storage_devices[i]) { + break; + } + + info_append_string(db, "storage-engine.device", ns->storage_devices[i]); + } + + for (int i = 0; i < AS_STORAGE_MAX_FILES; i++) { + if (! ns->storage_files[i]) { + break; + } + + info_append_string(db, "storage-engine.file", ns->storage_files[i]); + } + + // TODO - how to report the shadows? + + info_append_uint64(db, "storage-engine.filesize", ns->storage_filesize); + info_append_string_safe(db, "storage-engine.scheduler-mode", ns->storage_scheduler_mode); + info_append_uint32(db, "storage-engine.write-block-size", ns->storage_write_block_size); + info_append_bool(db, "storage-engine.data-in-memory", ns->storage_data_in_memory); + info_append_bool(db, "storage-engine.cold-start-empty", ns->storage_cold_start_empty); + info_append_bool(db, "storage-engine.commit-to-device", ns->storage_commit_to_device); + info_append_uint32(db, "storage-engine.commit-min-size", ns->storage_commit_min_size); + info_append_uint32(db, "storage-engine.defrag-lwm-pct", ns->storage_defrag_lwm_pct); + info_append_uint32(db, "storage-engine.defrag-queue-min", ns->storage_defrag_queue_min); + info_append_uint32(db, "storage-engine.defrag-sleep", ns->storage_defrag_sleep); + info_append_int(db, "storage-engine.defrag-startup-minimum", ns->storage_defrag_startup_minimum); + info_append_bool(db, "storage-engine.disable-odirect", ns->storage_disable_odirect); + info_append_bool(db, "storage-engine.enable-benchmarks-storage", ns->storage_benchmarks_enabled); + info_append_bool(db, "storage-engine.enable-osync", ns->storage_enable_osync); + info_append_string_safe(db, "storage-engine.encryption-key-file", ns->storage_encryption_key_file); + info_append_uint64(db, "storage-engine.flush-max-ms", ns->storage_flush_max_us / 1000); + info_append_uint64(db, "storage-engine.fsync-max-sec", ns->storage_fsync_max_us / 1000000); + info_append_uint64(db, "storage-engine.max-write-cache", ns->storage_max_write_cache); + info_append_uint32(db, "storage-engine.min-avail-pct", ns->storage_min_avail_pct); + info_append_uint32(db, "storage-engine.post-write-queue", ns->storage_post_write_queue); + info_append_uint32(db, "storage-engine.tomb-raider-sleep", ns->storage_tomb_raider_sleep); + info_append_uint32(db, "storage-engine.write-threads", ns->storage_write_threads); + } + + info_append_uint32(db, "sindex.num-partitions", ns->sindex_num_partitions); + + info_append_bool(db, "geo2dsphere-within.strict", ns->geo2dsphere_within_strict); + info_append_uint32(db, "geo2dsphere-within.min-level", (uint32_t)ns->geo2dsphere_within_min_level); + info_append_uint32(db, "geo2dsphere-within.max-level", (uint32_t)ns->geo2dsphere_within_max_level); + info_append_uint32(db, "geo2dsphere-within.max-cells", (uint32_t)ns->geo2dsphere_within_max_cells); + info_append_uint32(db, "geo2dsphere-within.level-mod", (uint32_t)ns->geo2dsphere_within_level_mod); + info_append_uint32(db, "geo2dsphere-within.earth-radius-meters", ns->geo2dsphere_within_earth_radius_meters); +} + + +// TODO - security API? 
+void +info_security_config_get(cf_dyn_buf *db) +{ + info_append_bool(db, "enable-security", g_config.sec_cfg.security_enabled); + info_append_uint32(db, "privilege-refresh-period", g_config.sec_cfg.privilege_refresh_period); + info_append_uint32(db, "report-authentication-sinks", g_config.sec_cfg.report.authentication); + info_append_uint32(db, "report-data-op-sinks", g_config.sec_cfg.report.data_op); + info_append_uint32(db, "report-sys-admin-sinks", g_config.sec_cfg.report.sys_admin); + info_append_uint32(db, "report-user-admin-sinks", g_config.sec_cfg.report.user_admin); + info_append_uint32(db, "report-violation-sinks", g_config.sec_cfg.report.violation); + info_append_int(db, "syslog-local", g_config.sec_cfg.syslog_local); +} + + +void +info_command_config_get_with_params(char *name, char *params, cf_dyn_buf *db) +{ + char context[1024]; + int context_len = sizeof(context); + + if (as_info_parameter_get(params, "context", context, &context_len) != 0) { + cf_dyn_buf_append_string(db, "Error: Invalid get-config parameter;"); + return; + } + + if (strcmp(context, "service") == 0) { + info_service_config_get(db); + } + else if (strcmp(context, "network") == 0) { + info_network_config_get(db); + } + else if (strcmp(context, "namespace") == 0) { + context_len = sizeof(context); + + if (as_info_parameter_get(params, "id", context, &context_len) != 0) { + cf_dyn_buf_append_string(db, "Error:invalid id;"); + return; + } + + info_namespace_config_get(context, db); + } + else if (strcmp(context, "security") == 0) { + info_security_config_get(db); + } + else if (strcmp(context, "xdr") == 0) { + as_xdr_get_config(db); + } + else { + cf_dyn_buf_append_string(db, "Error:Invalid context;"); + } +} + + +int +info_command_config_get(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "config-get command received: params %s", params); + + if (params && *params != 0) { + info_command_config_get_with_params(name, params, db); + cf_dyn_buf_chomp(db); + return 0; + } + + // We come here when context is not mentioned. + // In that case we want to print everything. 
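+ // (A bare request thus returns the service, network, and security sections plus the XDR config, concatenated.)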
+	info_service_config_get(db);
+	info_network_config_get(db);
+	info_security_config_get(db);
+	as_xdr_get_config(db);
+
+	cf_dyn_buf_chomp(db);
+
+	return 0;
+}
+
+
+//
+// config-set:context=service;variable=value;
+// config-set:context=network;variable=heartbeat.value;
+// config-set:context=namespace;id=test;variable=value;
+//
+int
+info_command_config_set_threadsafe(char *name, char *params, cf_dyn_buf *db)
+{
+	cf_debug(AS_INFO, "config-set command received: params %s", params);
+
+	char context[1024];
+	int context_len = sizeof(context);
+	int val;
+	char bool_val[2][6] = {"false", "true"};
+
+	if (0 != as_info_parameter_get(params, "context", context, &context_len))
+		goto Error;
+	if (strcmp(context, "service") == 0) {
+		context_len = sizeof(context);
+		if (0 == as_info_parameter_get(params, "advertise-ipv6", context, &context_len)) {
+			if (strcmp(context, "true") == 0 || strcmp(context, "yes") == 0) {
+				cf_socket_set_advertise_ipv6(true);
+			}
+			else if (strcmp(context, "false") == 0 || strcmp(context, "no") == 0) {
+				cf_socket_set_advertise_ipv6(false);
+			}
+			else {
+				goto Error;
+			}
+		}
+		else if (0 == as_info_parameter_get(params, "transaction-threads-per-queue", context, &context_len)) {
+			if (0 != cf_str_atoi(context, &val)) {
+				goto Error;
+			}
+			if (val < 1 || val > MAX_TRANSACTION_THREADS_PER_QUEUE) {
+				cf_warning(AS_INFO, "transaction-threads-per-queue must be between 1 and %u", MAX_TRANSACTION_THREADS_PER_QUEUE);
+				goto Error;
+			}
+			cf_info(AS_INFO, "Changing value of transaction-threads-per-queue from %u to %d ", g_config.n_transaction_threads_per_queue, val);
+			as_tsvc_set_threads_per_queue((uint32_t)val);
+		}
+		else if (0 == as_info_parameter_get(params, "transaction-retry-ms", context, &context_len)) {
+			if (0 != cf_str_atoi(context, &val))
+				goto Error;
+			if (val == 0)
+				goto Error;
+			cf_info(AS_INFO, "Changing value of transaction-retry-ms from %d to %d ", g_config.transaction_retry_ms, val);
+			g_config.transaction_retry_ms = val;
+		}
+		else if (0 == as_info_parameter_get(params, "transaction-max-ms", context, &context_len)) {
+			if (0 != cf_str_atoi(context, &val))
+				goto Error;
+			cf_info(AS_INFO, "Changing value of transaction-max-ms from %"PRIu64" to %d ", (g_config.transaction_max_ns / 1000000), val);
+			g_config.transaction_max_ns = (uint64_t)val * 1000000;
+		}
+		else if (0 == as_info_parameter_get(params, "transaction-pending-limit", context, &context_len)) {
+			if (0 != cf_str_atoi(context, &val))
+				goto Error;
+			cf_info(AS_INFO, "Changing value of transaction-pending-limit from %d to %d ", g_config.transaction_pending_limit, val);
+			g_config.transaction_pending_limit = val;
+		}
+		else if (0 == as_info_parameter_get(params, "ticker-interval", context, &context_len)) {
+			if (0 != cf_str_atoi(context, &val))
+				goto Error;
+			cf_info(AS_INFO, "Changing value of ticker-interval from %d to %d ", g_config.ticker_interval, val);
+			g_config.ticker_interval = val;
+		}
+		else if (0 == as_info_parameter_get(params, "scan-max-active", context, &context_len)) {
+			if (0 != cf_str_atoi(context, &val))
+				goto Error;
+			if (val < 0 || val > 200) {
+				goto Error;
+			}
+			cf_info(AS_INFO, "Changing value of scan-max-active from %d to %d ", g_config.scan_max_active, val);
+			g_config.scan_max_active = val;
+			as_scan_limit_active_jobs(g_config.scan_max_active);
+		}
+		else if (0 == as_info_parameter_get(params, "scan-max-done", context, &context_len)) {
+			if (0 != cf_str_atoi(context, &val))
+				goto Error;
+			if (val < 0 || val > 1000) {
+				goto Error;
+			}
+			cf_info(AS_INFO, "Changing 
value of scan-max-done from %d to %d ", g_config.scan_max_done, val); + g_config.scan_max_done = val; + as_scan_limit_finished_jobs(g_config.scan_max_done); + } + else if (0 == as_info_parameter_get(params, "scan-max-udf-transactions", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of scan-max-udf-transactions from %d to %d ", g_config.scan_max_udf_transactions, val); + g_config.scan_max_udf_transactions = val; + } + else if (0 == as_info_parameter_get(params, "scan-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (val < 0 || val > 128) { + goto Error; + } + cf_info(AS_INFO, "Changing value of scan-threads from %d to %d ", g_config.scan_threads, val); + g_config.scan_threads = val; + as_scan_resize_thread_pool(g_config.scan_threads); + } + else if (0 == as_info_parameter_get(params, "batch-index-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (0 != as_batch_threads_resize(val)) + goto Error; + } + else if (0 == as_info_parameter_get(params, "batch-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (0 != as_batch_direct_threads_resize(val)) + goto Error; + } + else if (0 == as_info_parameter_get(params, "batch-max-requests", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of batch-max-requests from %d to %d ", g_config.batch_max_requests, val); + g_config.batch_max_requests = val; + } + else if (0 == as_info_parameter_get(params, "batch-max-buffers-per-queue", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of batch-max-buffers-per-queue from %d to %d ", g_config.batch_max_buffers_per_queue, val); + g_config.batch_max_buffers_per_queue = val; + } + else if (0 == as_info_parameter_get(params, "batch-max-unused-buffers", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of batch-max-unused-buffers from %d to %d ", g_config.batch_max_unused_buffers, val); + g_config.batch_max_unused_buffers = val; + } + else if (0 == as_info_parameter_get(params, "batch-priority", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of batch-priority from %d to %d ", g_config.batch_priority, val); + g_config.batch_priority = val; + } + else if (0 == as_info_parameter_get(params, "proto-fd-max", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of proto-fd-max from %d to %d ", g_config.n_proto_fd_max, val); + g_config.n_proto_fd_max = val; + } + else if (0 == as_info_parameter_get(params, "proto-fd-idle-ms", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of proto-fd-idle-ms from %d to %d ", g_config.proto_fd_idle_ms, val); + g_config.proto_fd_idle_ms = val; + } + else if (0 == as_info_parameter_get(params, "proto-slow-netio-sleep-ms", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of proto-slow-netio-sleep-ms from %d to %d ", g_config.proto_slow_netio_sleep_ms, val); + g_config.proto_slow_netio_sleep_ms = val; + } + else if (0 == as_info_parameter_get(params, "nsup-delete-sleep", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + 
cf_info(AS_INFO, "Changing value of nsup-delete-sleep from %d to %d ", g_config.nsup_delete_sleep, val); + g_config.nsup_delete_sleep = val; + } + else if (0 == as_info_parameter_get(params, "nsup-period", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of nsup-period from %d to %d ", g_config.nsup_period, val); + g_config.nsup_period = val; + } + else if (0 == as_info_parameter_get( params, "cluster-name", context, &context_len)){ + if (!as_config_cluster_name_set(context)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of cluster-name to '%s'", context); + } + else if (0 == as_info_parameter_get(params, "migrate-max-num-incoming", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if ((uint32_t)val > AS_MIGRATE_LIMIT_MAX_NUM_INCOMING) { + cf_warning(AS_INFO, "migrate-max-num-incoming %d must be >= 0 and <= %u", val, AS_MIGRATE_LIMIT_MAX_NUM_INCOMING); + goto Error; + } + cf_info(AS_INFO, "Changing value of migrate-max-num-incoming from %u to %d ", g_config.migrate_max_num_incoming, val); + g_config.migrate_max_num_incoming = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "migrate-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if ((uint32_t)val > MAX_NUM_MIGRATE_XMIT_THREADS) { + cf_warning(AS_INFO, "migrate-threads %d must be >= 0 and <= %u", val, MAX_NUM_MIGRATE_XMIT_THREADS); + goto Error; + } + cf_info(AS_INFO, "Changing value of migrate-threads from %u to %d ", g_config.n_migrate_threads, val); + as_migrate_set_num_xmit_threads(val); + } + else if (0 == as_info_parameter_get(params, "min-cluster-size", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || (0 > val) || (as_clustering_cluster_size_min_set(val) < 0)) + goto Error; + } + else if (0 == as_info_parameter_get(params, "prole-extra-ttl", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of prole-extra-ttl from %d to %d ", g_config.prole_extra_ttl, val); + g_config.prole_extra_ttl = val; + } + else if (0 == as_info_parameter_get(params, "query-buf-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_debug(AS_INFO, "query-buf-size = %"PRIu64"", val); + if (val < 1024) { + goto Error; + } + cf_info(AS_INFO, "Changing value of query-buf-size from %"PRIu64" to %"PRIu64"", g_config.query_buf_size, val); + g_config.query_buf_size = val; + } + else if (0 == as_info_parameter_get(params, "query-threshold", context, &context_len)) { + uint64_t val = atoll(context); + cf_debug(AS_INFO, "query-threshold = %"PRIu64"", val); + if ((int64_t)val <= 0) { + goto Error; + } + cf_info(AS_INFO, "Changing value of query-threshold from %u to %"PRIu64, g_config.query_threshold, val); + g_config.query_threshold = val; + } + else if (0 == as_info_parameter_get(params, "query-untracked-time-ms", context, &context_len)) { + uint64_t val = atoll(context); + cf_debug(AS_INFO, "query-untracked-time = %"PRIu64" milli seconds", val); + if ((int64_t)val < 0) { + goto Error; + } + cf_info(AS_INFO, "Changing value of query-untracked-time from %"PRIu64" milli seconds to %"PRIu64" milli seconds", + g_config.query_untracked_time_ms, val); + g_config.query_untracked_time_ms = val; + } + else if (0 == as_info_parameter_get(params, "query-rec-count-bound", context, &context_len)) { + uint64_t val = atoll(context); + cf_debug(AS_INFO, "query-rec-count-bound = %"PRIu64"", val); + if 
((int64_t)val <= 0) { + goto Error; + } + cf_info(AS_INFO, "Changing value of query-rec-count-bound from %"PRIu64" to %"PRIu64" ", g_config.query_rec_count_bound, val); + g_config.query_rec_count_bound = val; + } + else if (0 == as_info_parameter_get(params, "sindex-builder-threads", context, &context_len)) { + int val = 0; + if (0 != cf_str_atoi(context, &val) || (val > MAX_SINDEX_BUILDER_THREADS)) { + cf_warning(AS_INFO, "sindex-builder-threads: value must be <= %d, not %s", MAX_SINDEX_BUILDER_THREADS, context); + goto Error; + } + cf_info(AS_INFO, "Changing value of sindex-builder-threads from %u to %d", g_config.sindex_builder_threads, val); + g_config.sindex_builder_threads = (uint32_t)val; + as_sbld_resize_thread_pool(g_config.sindex_builder_threads); + } + else if (0 == as_info_parameter_get(params, "sindex-gc-max-rate", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of sindex-gc-max-rate from %d to %d ", g_config.sindex_gc_max_rate, val); + g_config.sindex_gc_max_rate = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "sindex-gc-period", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of sindex-gc-period from %d to %d ", g_config.sindex_gc_period, val); + g_config.sindex_gc_period = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "query-threads", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-threads = %"PRIu64, val); + if (val == 0) { + cf_warning(AS_INFO, "query-threads should be a number %s", context); + goto Error; + } + int old_val = g_config.query_threads; + int new_val = 0; + if (as_query_reinit(val, &new_val) != AS_QUERY_OK) { + cf_warning(AS_INFO, "Config not changed."); + goto Error; + } + + cf_info(AS_INFO, "Changing value of query-threads from %d to %d", + old_val, new_val); + } + else if (0 == as_info_parameter_get(params, "query-worker-threads", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-worker-threads = %"PRIu64, val); + if (val == 0) { + cf_warning(AS_INFO, "query-worker-threads should be a number %s", context); + goto Error; + } + int old_val = g_config.query_threads; + int new_val = 0; + if (as_query_worker_reinit(val, &new_val) != AS_QUERY_OK) { + cf_warning(AS_INFO, "Config not changed."); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-worker-threads from %d to %d", + old_val, new_val); + } + else if (0 == as_info_parameter_get(params, "query-priority", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query_priority = %"PRIu64, val); + if (val == 0) { + cf_warning(AS_INFO, "query_priority should be a number %s", context); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-priority from %d to %"PRIu64, g_config.query_priority, val); + g_config.query_priority = val; + } + else if (0 == as_info_parameter_get(params, "query-priority-sleep-us", context, &context_len)) { + uint64_t val = atoll(context); + if(val == 0) { + cf_warning(AS_INFO, "query_sleep should be a number %s", context); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-sleep from %"PRIu64" uSec to %"PRIu64" uSec ", g_config.query_sleep_us, val); + g_config.query_sleep_us = val; + } + else if (0 == as_info_parameter_get(params, "query-batch-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-batch-size = %"PRIu64, val); + if((int)val <= 0) { + 
cf_warning(AS_INFO, "query-batch-size should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-batch-size from %d to %"PRIu64, g_config.query_bsize, val); + g_config.query_bsize = val; + } + else if (0 == as_info_parameter_get(params, "query-req-max-inflight", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-req-max-inflight = %"PRIu64, val); + if((int)val <= 0) { + cf_warning(AS_INFO, "query-req-max-inflight should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-req-max-inflight from %d to %"PRIu64, g_config.query_req_max_inflight, val); + g_config.query_req_max_inflight = val; + } + else if (0 == as_info_parameter_get(params, "query-bufpool-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-bufpool-size = %"PRIu64, val); + if((int)val <= 0) { + cf_warning(AS_INFO, "query-bufpool-size should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-bufpool-size from %d to %"PRIu64, g_config.query_bufpool_size, val); + g_config.query_bufpool_size = val; + } + else if (0 == as_info_parameter_get(params, "query-in-transaction-thread", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of query-in-transaction-thread from %s to %s", bool_val[g_config.query_in_transaction_thr], context); + g_config.query_in_transaction_thr = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of query-in-transaction-thread from %s to %s", bool_val[g_config.query_in_transaction_thr], context); + g_config.query_in_transaction_thr = false; + } + else + goto Error; + } + else if (0 == as_info_parameter_get(params, "query-req-in-query-thread", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of query-req-in-query-thread from %s to %s", bool_val[g_config.query_req_in_query_thread], context); + g_config.query_req_in_query_thread = true; + + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of query-req-in-query-thread from %s to %s", bool_val[g_config.query_req_in_query_thread], context); + g_config.query_req_in_query_thread = false; + } + else + goto Error; + } + else if (0 == as_info_parameter_get(params, "query-short-q-max-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-short-q-max-size = %"PRIu64, val); + if((int)val <= 0) { + cf_warning(AS_INFO, "query-short-q-max-size should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-short-q-max-size from %d to %"PRIu64, g_config.query_short_q_max_size, val); + g_config.query_short_q_max_size = val; + } + else if (0 == as_info_parameter_get(params, "query-long-q-max-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-long-q-max-size = %"PRIu64, val); + if((int)val <= 0) { + cf_warning(AS_INFO, "query-long-q-max-size should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-longq-max-size from %d to %"PRIu64, g_config.query_long_q_max_size, val); + g_config.query_long_q_max_size = val; + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-fabric", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || 
strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-fabric to %s", context); + g_config.fabric_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-fabric to %s", context); + g_config.fabric_benchmarks_enabled = false; + histogram_clear(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_clear(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_clear(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_clear(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_clear(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_clear(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_clear(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_clear(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_clear(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_META]); + histogram_clear(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_META]); + histogram_clear(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_META]); + histogram_clear(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_META]); + histogram_clear(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_RW]); + histogram_clear(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_RW]); + histogram_clear(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_RW]); + histogram_clear(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_RW]); + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-svc", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-svc to %s", context); + g_config.svc_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-svc to %s", context); + g_config.svc_benchmarks_enabled = false; + histogram_clear(g_stats.svc_demarshal_hist); + histogram_clear(g_stats.svc_queue_hist); + } + } + else if (0 == as_info_parameter_get(params, "enable-hist-info", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-hist-info to %s", context); + g_config.info_hist_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-hist-info to %s", context); + g_config.info_hist_enabled = false; + histogram_clear(g_stats.info_hist); + } + } + else if (0 == as_info_parameter_get(params, "query-microbenchmark", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of query-enable-histogram to %s", context); + g_config.query_enable_histogram = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of query-enable-histogram to %s", context); + g_config.query_enable_histogram = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "query-pre-reserve-partitions", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of query-pre-reserve-partitions to %s", context); + 
g_config.partitions_pre_reserved = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of query-pre-reserve-partitions to %s", context); + g_config.partitions_pre_reserved = false; + } + else { + goto Error; + } + } + else { + goto Error; + } + } + else if (strcmp(context, "network") == 0) { + context_len = sizeof(context); + if (0 == as_info_parameter_get(params, "heartbeat.interval", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (as_hb_tx_interval_set(val) != 0) { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "heartbeat.timeout", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (as_hb_max_intervals_missed_set(val) != 0){ + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "heartbeat.mtu", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + as_hb_override_mtu_set(val); + } + else if (0 == as_info_parameter_get(params, "heartbeat.protocol", context, &context_len)) { + as_hb_protocol protocol = (!strcmp(context, "v3") ? AS_HB_PROTOCOL_V3 : + (!strcmp(context, "reset") ? AS_HB_PROTOCOL_RESET : + (!strcmp(context, "none") ? AS_HB_PROTOCOL_NONE : + AS_HB_PROTOCOL_UNDEF))); + if (AS_HB_PROTOCOL_UNDEF == protocol) { + cf_warning(AS_INFO, "heartbeat protocol version %s not supported", context); + goto Error; + } + cf_info(AS_INFO, "Changing value of heartbeat protocol version to %s", context); + if (0 > as_hb_protocol_set(protocol)) + goto Error; + } + else if (0 == as_info_parameter_get(params, "fabric.channel-bulk-recv-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if (val < 1 || val > MAX_FABRIC_CHANNEL_THREADS) { + cf_warning(AS_INFO, "fabric.channel-bulk-recv-threads must be between 1 and %u", MAX_FABRIC_CHANNEL_THREADS); + goto Error; + } + cf_info(AS_FABRIC, "changing fabric.channel-bulk-recv-threads from %u to %d", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK], val); + as_fabric_set_recv_threads(AS_FABRIC_CHANNEL_BULK, val); + } + else if (0 == as_info_parameter_get(params, "fabric.channel-ctrl-recv-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if (val < 1 || val > MAX_FABRIC_CHANNEL_THREADS) { + cf_warning(AS_INFO, "fabric.channel-ctrl-recv-threads must be between 1 and %u", MAX_FABRIC_CHANNEL_THREADS); + goto Error; + } + cf_info(AS_FABRIC, "changing fabric.channel-ctrl-recv-threads from %u to %d", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_CTRL], val); + as_fabric_set_recv_threads(AS_FABRIC_CHANNEL_CTRL, val); + } + else if (0 == as_info_parameter_get(params, "fabric.channel-meta-recv-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if (val < 1 || val > MAX_FABRIC_CHANNEL_THREADS) { + cf_warning(AS_INFO, "fabric.channel-meta-recv-threads must be between 1 and %u", MAX_FABRIC_CHANNEL_THREADS); + goto Error; + } + cf_info(AS_FABRIC, "changing fabric.channel-meta-recv-threads from %u to %d", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_META], val); + as_fabric_set_recv_threads(AS_FABRIC_CHANNEL_META, val); + } + else if (0 == as_info_parameter_get(params, "fabric.channel-rw-recv-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if (val < 1 || val > MAX_FABRIC_CHANNEL_THREADS) { + cf_warning(AS_INFO, "fabric.channel-rw-recv-threads must be 
between 1 and %u", MAX_FABRIC_CHANNEL_THREADS); + goto Error; + } + cf_info(AS_FABRIC, "changing fabric.channel-rw-recv-threads from %u to %d", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW], val); + as_fabric_set_recv_threads(AS_FABRIC_CHANNEL_RW, val); + } + else if (0 == as_info_parameter_get(params, "fabric.recv-rearm-threshold", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + + if (val < 0 || val > 1024 * 1024) { + goto Error; + } + + g_config.fabric_recv_rearm_threshold = (uint32_t)val; + } + else + goto Error; + } + else if (strcmp(context, "namespace") == 0) { + context_len = sizeof(context); + if (0 != as_info_parameter_get(params, "id", context, &context_len)) + goto Error; + as_namespace *ns = as_namespace_get_byname(context); + if (!ns) + goto Error; + + context_len = sizeof(context); + // configure namespace/set related parameters: + if (0 == as_info_parameter_get(params, "set", context, &context_len)) { + if (context_len == 0 || context_len >= AS_SET_NAME_MAX_SIZE) { + cf_warning(AS_INFO, "illegal length %d for set name %s", + context_len, context); + goto Error; + } + + char set_name[AS_SET_NAME_MAX_SIZE]; + size_t set_name_len = (size_t)context_len; + + strcpy(set_name, context); + + // Ideally, set operations should not be part of configs. But, + // set-delete is exception for historical reasons. Do an early check + // and bail out if set doesn't exist. + uint16_t set_id = as_namespace_get_set_id(ns, set_name); + if (set_id == INVALID_SET_ID) { + context_len = sizeof(context); + if (0 == as_info_parameter_get(params, "set-delete", context, + &context_len)) { + cf_warning(AS_INFO, "set-delete failed because set %s doesn't exist in ns %s", + set_name, ns->name); + goto Error; + } + } + + // configurations should create set if it doesn't exist. + // checks if there is a vmap set with the same name and if so returns + // a ptr to it. if not, it creates an set structure, initializes it + // and returns a ptr to it. + as_set *p_set = NULL; + if (as_namespace_get_create_set_w_len(ns, set_name, set_name_len, + &p_set, NULL) != 0) { + goto Error; + } + + context_len = sizeof(context); + if (0 == as_info_parameter_get(params, "set-enable-xdr", context, &context_len)) { + // TODO - make sure context is null-terminated. 
+ if ((strncmp(context, "true", 4) == 0) || (strncmp(context, "yes", 3) == 0)) { + cf_info(AS_INFO, "Changing value of set-enable-xdr of ns %s set %s to %s", ns->name, p_set->name, context); + cf_atomic32_set(&p_set->enable_xdr, AS_SET_ENABLE_XDR_TRUE); + } + else if ((strncmp(context, "false", 5) == 0) || (strncmp(context, "no", 2) == 0)) { + cf_info(AS_INFO, "Changing value of set-enable-xdr of ns %s set %s to %s", ns->name, p_set->name, context); + cf_atomic32_set(&p_set->enable_xdr, AS_SET_ENABLE_XDR_FALSE); + } + else if (strncmp(context, "use-default", 11) == 0) { + cf_info(AS_INFO, "Changing value of set-enable-xdr of ns %s set %s to %s", ns->name, p_set->name, context); + cf_atomic32_set(&p_set->enable_xdr, AS_SET_ENABLE_XDR_DEFAULT); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "set-disable-eviction", context, &context_len)) { + if ((strncmp(context, "true", 4) == 0) || (strncmp(context, "yes", 3) == 0)) { + cf_info(AS_INFO, "Changing value of set-disable-eviction of ns %s set %s to %s", ns->name, p_set->name, context); + DISABLE_SET_EVICTION(p_set, true); + } + else if ((strncmp(context, "false", 5) == 0) || (strncmp(context, "no", 2) == 0)) { + cf_info(AS_INFO, "Changing value of set-disable-eviction of ns %s set %s to %s", ns->name, p_set->name, context); + DISABLE_SET_EVICTION(p_set, false); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "set-stop-writes-count", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "Changing value of set-stop-writes-count of ns %s set %s to %lu", ns->name, p_set->name, val); + cf_atomic64_set(&p_set->stop_writes_count, val); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "memory-size", context, &context_len)) { + uint64_t val; + + if (0 != cf_str_atoi_u64(context, &val)) { + goto Error; + } + cf_debug(AS_INFO, "memory-size = %"PRIu64"", val); + if (val > ns->memory_size) + ns->memory_size = val; + if (val < (ns->memory_size / 2L)) { // protect so someone does not reduce memory to below 1/2 current value + goto Error; + } + cf_info(AS_INFO, "Changing value of memory-size of ns %s from %"PRIu64" to %"PRIu64, ns->name, ns->memory_size, val); + ns->memory_size = val; + } + else if (0 == as_info_parameter_get(params, "high-water-disk-pct", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 0 || val > 100) { + goto Error; + } + cf_info(AS_INFO, "Changing value of high-water-disk-pct of ns %s from %u to %d ", ns->name, ns->hwm_disk_pct, val); + ns->hwm_disk_pct = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "high-water-memory-pct", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 0 || val > 100) { + goto Error; + } + cf_info(AS_INFO, "Changing value of high-water-memory-pct memory of ns %s from %u to %d ", ns->name, ns->hwm_memory_pct, val); + ns->hwm_memory_pct = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "evict-tenths-pct", context, &context_len)) { + cf_info(AS_INFO, "Changing value of evict-tenths-pct memory of ns %s from %d to %d ", ns->name, ns->evict_tenths_pct, atoi(context)); + ns->evict_tenths_pct = atoi(context); + } + else if (0 == as_info_parameter_get(params, "evict-hist-buckets", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 100 || val > 10000000) { + goto Error; + } + cf_info(AS_INFO, "Changing value of evict-hist-buckets of ns %s from %u to %d ", ns->name, ns->evict_hist_buckets, val); + 
ns->evict_hist_buckets = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "stop-writes-pct", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 0 || val > 100) { + goto Error; + } + cf_info(AS_INFO, "Changing value of stop-writes-pct memory of ns %s from %u to %d ", ns->name, ns->stop_writes_pct, val); + ns->stop_writes_pct = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "default-ttl", context, &context_len)) { + uint64_t val; + if (cf_str_atoi_seconds(context, &val) != 0) { + cf_warning(AS_INFO, "default-ttl must be an unsigned number with time unit (s, m, h, or d)"); + goto Error; + } + if (val > ns->max_ttl) { + cf_warning(AS_INFO, "default-ttl must be <= max-ttl (%lu seconds)", ns->max_ttl); + goto Error; + } + cf_info(AS_INFO, "Changing value of default-ttl memory of ns %s from %"PRIu64" to %"PRIu64" ", ns->name, ns->default_ttl, val); + ns->default_ttl = val; + } + else if (0 == as_info_parameter_get(params, "max-ttl", context, &context_len)) { + uint64_t val; + if (cf_str_atoi_seconds(context, &val) != 0) { + cf_warning(AS_INFO, "max-ttl must be an unsigned number with time unit (s, m, h, or d)"); + goto Error; + } + if (val == 0 || val > MAX_ALLOWED_TTL) { + cf_warning(AS_INFO, "max-ttl must be non-zero and <= %u seconds", MAX_ALLOWED_TTL); + goto Error; + } + if (val < ns->default_ttl) { + cf_warning(AS_INFO, "max-ttl must be >= default-ttl (%lu seconds)", ns->default_ttl); + goto Error; + } + cf_info(AS_INFO, "Changing value of max-ttl memory of ns %s from %"PRIu64" to %"PRIu64" ", ns->name, ns->max_ttl, val); + ns->max_ttl = val; + } + else if (0 == as_info_parameter_get(params, "migrate-order", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 1 || val > 10) { + goto Error; + } + cf_info(AS_INFO, "Changing value of migrate-order of ns %s from %u to %d", ns->name, ns->migrate_order, val); + ns->migrate_order = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "migrate-retransmit-ms", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of migrate-retransmit-ms of ns %s from %u to %d", ns->name, ns->migrate_retransmit_ms, val); + ns->migrate_retransmit_ms = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "migrate-sleep", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of migrate-sleep of ns %s from %u to %d", ns->name, ns->migrate_sleep, val); + ns->migrate_sleep = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "tomb-raider-eligible-age", context, &context_len)) { + uint64_t val; + if (cf_str_atoi_seconds(context, &val) != 0) { + cf_warning(AS_INFO, "tomb-raider-eligible-age must be an unsigned number with time unit (s, m, h, or d)"); + goto Error; + } + cf_info(AS_INFO, "Changing value of tomb-raider-eligible-age of ns %s from %u to %lu", ns->name, ns->tomb_raider_eligible_age, val); + ns->tomb_raider_eligible_age = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "tomb-raider-period", context, &context_len)) { + uint64_t val; + if (cf_str_atoi_seconds(context, &val) != 0) { + cf_warning(AS_INFO, "tomb-raider-period must be an unsigned number with time unit (s, m, h, or d)"); + goto Error; + } + cf_info(AS_INFO, "Changing value of tomb-raider-period of ns %s from %u to %lu", ns->name, ns->tomb_raider_period, val); + ns->tomb_raider_period = (uint32_t)val; + } + else if (0 == 
as_info_parameter_get(params, "tomb-raider-sleep", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of tomb-raider-sleep of ns %s from %u to %d", ns->name, ns->storage_tomb_raider_sleep, val); + ns->storage_tomb_raider_sleep = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "obj-size-hist-max", context, &context_len)) { + uint32_t hist_max = (uint32_t)atoi(context); + uint32_t round_to = OBJ_SIZE_HIST_NUM_BUCKETS; + uint32_t round_max = hist_max ? ((hist_max + round_to - 1) / round_to) * round_to : round_to; + if (round_max != hist_max) { + cf_info(AS_INFO, "rounding obj-size-hist-max %u up to %u", hist_max, round_max); + } + cf_info(AS_INFO, "Changing value of obj-size-hist-max of ns %s to %u", ns->name, round_max); + cf_atomic32_set(&ns->obj_size_hist_max, round_max); // in 128-byte blocks + } + else if (0 == as_info_parameter_get(params, "rack-id", context, &context_len)) { + if (as_config_error_enterprise_only()) { + cf_warning(AS_INFO, "rack-id is enterprise-only"); + goto Error; + } + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if ((uint32_t)val > MAX_RACK_ID) { + cf_warning(AS_INFO, "rack-id %d must be >= 0 and <= %u", val, MAX_RACK_ID); + goto Error; + } + cf_info(AS_INFO, "Changing value of rack-id of ns %s from %u to %d", ns->name, ns->rack_id, val); + ns->rack_id = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "conflict-resolution-policy", context, &context_len)) { + if (ns->cp) { + cf_warning(AS_INFO, "{%s} 'conflict-resolution-policy' is not applicable with 'strong-consistency'", ns->name); + goto Error; + } + if (strncmp(context, "generation", 10) == 0) { + cf_info(AS_INFO, "Changing value of conflict-resolution-policy of ns %s from %d to %s", ns->name, ns->conflict_resolution_policy, context); + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION; + } + else if (strncmp(context, "last-update-time", 16) == 0) { + cf_info(AS_INFO, "Changing value of conflict-resolution-policy of ns %s from %d to %s", ns->name, ns->conflict_resolution_policy, context); + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "defrag-lwm-pct", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of defrag-lwm-pct of ns %s from %d to %d ", ns->name, ns->storage_defrag_lwm_pct, val); + + uint32_t old_val = ns->storage_defrag_lwm_pct; + + ns->storage_defrag_lwm_pct = val; + ns->defrag_lwm_size = (ns->storage_write_block_size * ns->storage_defrag_lwm_pct) / 100; + + if (ns->storage_defrag_lwm_pct > old_val) { + as_storage_defrag_sweep(ns); + } + } + else if (0 == as_info_parameter_get(params, "defrag-queue-min", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of defrag-queue-min of ns %s from %u to %d", ns->name, ns->storage_defrag_queue_min, val); + ns->storage_defrag_queue_min = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "defrag-sleep", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of defrag-sleep of ns %s from %u to %d", ns->name, ns->storage_defrag_sleep, val); + ns->storage_defrag_sleep = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "flush-max-ms", context, &context_len)) { + if 
(0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of flush-max-ms of ns %s from %lu to %d", ns->name, ns->storage_flush_max_us / 1000, val); + ns->storage_flush_max_us = (uint64_t)val * 1000; + } + else if (0 == as_info_parameter_get(params, "fsync-max-sec", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of fsync-max-sec of ns %s from %lu to %d", ns->name, ns->storage_fsync_max_us / 1000000, val); + ns->storage_fsync_max_us = (uint64_t)val * 1000000; + } + else if (0 == as_info_parameter_get(params, "enable-xdr", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-xdr of ns %s from %s to %s", ns->name, bool_val[ns->enable_xdr], context); + ns->enable_xdr = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-xdr of ns %s from %s to %s", ns->name, bool_val[ns->enable_xdr], context); + ns->enable_xdr = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "sets-enable-xdr", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of sets-enable-xdr of ns %s from %s to %s", ns->name, bool_val[ns->sets_enable_xdr], context); + ns->sets_enable_xdr = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of sets-enable-xdr of ns %s from %s to %s", ns->name, bool_val[ns->sets_enable_xdr], context); + ns->sets_enable_xdr = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "ns-forward-xdr-writes", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of ns-forward-xdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_forward_xdr_writes], context); + ns->ns_forward_xdr_writes = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of ns-forward-xdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_forward_xdr_writes], context); + ns->ns_forward_xdr_writes = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "allow-nonxdr-writes", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of allow-nonxdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_allow_nonxdr_writes], context); + ns->ns_allow_nonxdr_writes = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of allow-nonxdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_allow_nonxdr_writes], context); + ns->ns_allow_nonxdr_writes = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "allow-xdr-writes", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of allow-xdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_allow_xdr_writes], context); + ns->ns_allow_xdr_writes = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of allow-xdr-writes of ns %s from %s 
to %s", ns->name, bool_val[ns->ns_allow_xdr_writes], context); + ns->ns_allow_xdr_writes = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "strong-consistency-allow-expunge", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of strong-consistency-allow-expunge of ns %s from %s to %s", ns->name, bool_val[ns->cp_allow_drops], context); + ns->cp_allow_drops = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of strong-consistency-allow-expunge of ns %s from %s to %s", ns->name, bool_val[ns->cp_allow_drops], context); + ns->cp_allow_drops = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "disable-write-dup-res", context, &context_len)) { + if (ns->cp) { + cf_warning(AS_INFO, "{%s} 'disable-write-dup-res' is not applicable with 'strong-consistency'", ns->name); + goto Error; + } + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of disable-write-dup-res of ns %s from %s to %s", ns->name, bool_val[ns->write_dup_res_disabled], context); + ns->write_dup_res_disabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of disable-write-dup-res of ns %s from %s to %s", ns->name, bool_val[ns->write_dup_res_disabled], context); + ns->write_dup_res_disabled = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "disallow-null-setname", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of disallow-null-setname of ns %s from %s to %s", ns->name, bool_val[ns->disallow_null_setname], context); + ns->disallow_null_setname = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of disallow-null-setname of ns %s from %s to %s", ns->name, bool_val[ns->disallow_null_setname], context); + ns->disallow_null_setname = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-batch-sub", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-batch-sub of ns %s from %s to %s", ns->name, bool_val[ns->batch_sub_benchmarks_enabled], context); + ns->batch_sub_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-batch-sub of ns %s from %s to %s", ns->name, bool_val[ns->batch_sub_benchmarks_enabled], context); + ns->batch_sub_benchmarks_enabled = false; + histogram_clear(ns->batch_sub_start_hist); + histogram_clear(ns->batch_sub_restart_hist); + histogram_clear(ns->batch_sub_dup_res_hist); + histogram_clear(ns->batch_sub_repl_ping_hist); + histogram_clear(ns->batch_sub_read_local_hist); + histogram_clear(ns->batch_sub_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-read", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-read of ns %s from %s to %s", ns->name, bool_val[ns->read_benchmarks_enabled], context); + 
ns->read_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-read of ns %s from %s to %s", ns->name, bool_val[ns->read_benchmarks_enabled], context); + ns->read_benchmarks_enabled = false; + histogram_clear(ns->read_start_hist); + histogram_clear(ns->read_restart_hist); + histogram_clear(ns->read_dup_res_hist); + histogram_clear(ns->read_repl_ping_hist); + histogram_clear(ns->read_local_hist); + histogram_clear(ns->read_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-storage", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-storage of ns %s from %s to %s", ns->name, bool_val[ns->storage_benchmarks_enabled], context); + ns->storage_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-storage of ns %s from %s to %s", ns->name, bool_val[ns->storage_benchmarks_enabled], context); + ns->storage_benchmarks_enabled = false; + as_storage_histogram_clear_all(ns); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-udf", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-udf of ns %s from %s to %s", ns->name, bool_val[ns->udf_benchmarks_enabled], context); + ns->udf_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-udf of ns %s from %s to %s", ns->name, bool_val[ns->udf_benchmarks_enabled], context); + ns->udf_benchmarks_enabled = false; + histogram_clear(ns->udf_start_hist); + histogram_clear(ns->udf_restart_hist); + histogram_clear(ns->udf_dup_res_hist); + histogram_clear(ns->udf_master_hist); + histogram_clear(ns->udf_repl_write_hist); + histogram_clear(ns->udf_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-udf-sub", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-udf-sub of ns %s from %s to %s", ns->name, bool_val[ns->udf_sub_benchmarks_enabled], context); + ns->udf_sub_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-udf-sub of ns %s from %s to %s", ns->name, bool_val[ns->udf_sub_benchmarks_enabled], context); + ns->udf_sub_benchmarks_enabled = false; + histogram_clear(ns->udf_sub_start_hist); + histogram_clear(ns->udf_sub_restart_hist); + histogram_clear(ns->udf_sub_dup_res_hist); + histogram_clear(ns->udf_sub_master_hist); + histogram_clear(ns->udf_sub_repl_write_hist); + histogram_clear(ns->udf_sub_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-write", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-write of ns %s from %s to %s", ns->name, bool_val[ns->write_benchmarks_enabled], context); + ns->write_benchmarks_enabled = true; + } + else if 
(strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-write of ns %s from %s to %s", ns->name, bool_val[ns->write_benchmarks_enabled], context); + ns->write_benchmarks_enabled = false; + histogram_clear(ns->write_start_hist); + histogram_clear(ns->write_restart_hist); + histogram_clear(ns->write_dup_res_hist); + histogram_clear(ns->write_master_hist); + histogram_clear(ns->write_repl_write_hist); + histogram_clear(ns->write_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-hist-proxy", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-hist-proxy of ns %s from %s to %s", ns->name, bool_val[ns->proxy_hist_enabled], context); + ns->proxy_hist_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-hist-proxy of ns %s from %s to %s", ns->name, bool_val[ns->proxy_hist_enabled], context); + ns->proxy_hist_enabled = false; + histogram_clear(ns->proxy_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "max-write-cache", context, &context_len)) { + uint64_t val_u64; + + if (0 != cf_str_atoi_u64(context, &val_u64)) { + goto Error; + } + if (val_u64 < (1024 * 1024 * 4)) { // TODO - why enforce this? And here, but not cfg.c? + cf_warning(AS_INFO, "can't set max-write-cache less than 4M"); + goto Error; + } + cf_info(AS_INFO, "Changing value of max-write-cache of ns %s from %lu to %lu ", ns->name, ns->storage_max_write_cache, val_u64); + ns->storage_max_write_cache = val_u64; + ns->storage_max_write_q = (int)(ns->storage_max_write_cache / ns->storage_write_block_size); + } + else if (0 == as_info_parameter_get(params, "min-avail-pct", context, &context_len)) { + ns->storage_min_avail_pct = atoi(context); + cf_info(AS_INFO, "Changing value of min-avail-pct of ns %s from %u to %u ", ns->name, ns->storage_min_avail_pct, atoi(context)); + } + else if (0 == as_info_parameter_get(params, "post-write-queue", context, &context_len)) { + if (ns->storage_data_in_memory) { + cf_warning(AS_INFO, "ns %s, can't set post-write-queue if data-in-memory", ns->name); + goto Error; + } + if (0 != cf_str_atoi(context, &val)) { + cf_warning(AS_INFO, "ns %s, post-write-queue %s is not a number", ns->name, context); + goto Error; + } + if ((uint32_t)val > (4 * 1024)) { + cf_warning(AS_INFO, "ns %s, post-write-queue %u must be < 4K", ns->name, val); + goto Error; + } + cf_info(AS_INFO, "Changing value of post-write-queue of ns %s from %d to %d ", ns->name, ns->storage_post_write_queue, val); + cf_atomic32_set(&ns->storage_post_write_queue, (uint32_t)val); + } + else if (0 == as_info_parameter_get(params, "read-consistency-level-override", context, &context_len)) { + if (ns->cp) { + cf_warning(AS_INFO, "{%s} 'read-consistency-level-override' is not applicable with 'strong-consistency'", ns->name); + goto Error; + } + char *original_value = NS_READ_CONSISTENCY_LEVEL_NAME(); + if (strcmp(context, "all") == 0) { + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_ALL; + } + else if (strcmp(context, "off") == 0) { + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_PROTO; + } + else if (strcmp(context, "one") == 0) { + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_ONE; + } + else { + goto Error; + } + if (strcmp(original_value, context)) { + cf_info(AS_INFO, 
"Changing value of read-consistency-level-override of ns %s from %s to %s", ns->name, original_value, context); + } + } + else if (0 == as_info_parameter_get(params, "write-commit-level-override", context, &context_len)) { + if (ns->cp) { + cf_warning(AS_INFO, "{%s} 'write-commit-level-override' is not applicable with 'strong-consistency'", ns->name); + goto Error; + } + char *original_value = NS_WRITE_COMMIT_LEVEL_NAME(); + if (strcmp(context, "all") == 0) { + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_ALL; + } + else if (strcmp(context, "master") == 0) { + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_MASTER; + } + else if (strcmp(context, "off") == 0) { + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_PROTO; + } + else { + goto Error; + } + if (strcmp(original_value, context)) { + cf_info(AS_INFO, "Changing value of write-commit-level-override of ns %s from %s to %s", ns->name, original_value, context); + } + } + else if (0 == as_info_parameter_get(params, "geo2dsphere-within-max-cells", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + cf_warning(AS_INFO, "ns %s, geo2dsphere-within-max-cells %s is not a number", ns->name, context); + goto Error; + } + if (val <= 0) { + cf_warning(AS_INFO, "ns %s, geo2dsphere-within-max-cells %u must be > 0", ns->name, val); + goto Error; + } + if ((uint32_t)val > (MAX_REGION_CELLS)) { + cf_warning(AS_INFO, "ns %s, geo2dsphere-within-max-cells %u must be <= %u", ns->name, val, MAX_REGION_CELLS); + goto Error; + } + cf_info(AS_INFO, "Changing value of geo2dsphere-within-max-cells of ns %s from %d to %d ", + ns->name, ns->geo2dsphere_within_max_cells, val); + ns->geo2dsphere_within_max_cells = val; + } + else { + if (as_xdr_set_config_ns(ns->name, params) == false) { + goto Error; + } + } + } // end of namespace stanza + else if (strcmp(context, "security") == 0) { + context_len = sizeof(context); + if (0 == as_info_parameter_get(params, "privilege-refresh-period", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 10 || val > 60 * 60 * 24) { + cf_warning(AS_INFO, "privilege-refresh-period must be an unsigned integer between 10 and 86400"); + goto Error; + } + cf_info(AS_INFO, "Changing value of privilege-refresh-period from %u to %d", g_config.sec_cfg.privilege_refresh_period, val); + g_config.sec_cfg.privilege_refresh_period = (uint32_t)val; + } + else { + goto Error; + } + } + else if (strcmp(context, "xdr") == 0) { + if (as_xdr_set_config(params) == false) { + goto Error; + } + } + else + goto Error; + + cf_info(AS_INFO, "config-set command completed: params %s",params); + cf_dyn_buf_append_string(db, "ok"); + return(0); + +Error: + cf_dyn_buf_append_string(db, "error"); + return(0); +} + +// Protect all set-config commands from concurrency issues. 
+static pthread_mutex_t g_set_cfg_lock = PTHREAD_MUTEX_INITIALIZER; + +int +info_command_config_set(char *name, char *params, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_set_cfg_lock); + + int result = info_command_config_set_threadsafe(name, params, db); + + pthread_mutex_unlock(&g_set_cfg_lock); + + return result; +} + +// +// log-set:log=id;context=foo;level=bar +// ie: +// log-set:log=0;context=rw;level=debug + + +int +info_command_log_set(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "log-set command received: params %s", params); + + char id_str[50]; + int id_str_len = sizeof(id_str); + int id = -1; + bool found_id = true; + cf_fault_sink *s = 0; + + if (0 != as_info_parameter_get(params, "id", id_str, &id_str_len)) { + if (0 != as_info_parameter_get(params, "log", id_str, &id_str_len)) { + cf_debug(AS_INFO, "log set command: no log id to be set - doing all"); + found_id = false; + } + } + if (found_id == true) { + if (0 != cf_str_atoi(id_str, &id) ) { + cf_info(AS_INFO, "log set command: id must be an integer, is: %s", id_str); + cf_dyn_buf_append_string(db, "error-id-not-integer"); + return(0); + } + s = cf_fault_sink_get_id(id); + if (!s) { + cf_info(AS_INFO, "log set command: sink id %d invalid", id); + cf_dyn_buf_append_string(db, "error-bad-id"); + return(0); + } + } + + // now, loop through all context strings. If we find a known context string, + // do the set + for (int c_id = 0; c_id < CF_FAULT_CONTEXT_UNDEF; c_id++) { + + char level_str[50]; + int level_str_len = sizeof(level_str); + char *context = cf_fault_context_strings[c_id]; + if (0 != as_info_parameter_get(params, context, level_str, &level_str_len)) { + continue; + } + for (uint32_t i = 0; level_str[i]; i++) level_str[i] = toupper(level_str[i]); + + if (0 != cf_fault_sink_addcontext(s, context, level_str)) { + cf_info(AS_INFO, "log set command: addcontext failed: context %s level %s", context, level_str); + cf_dyn_buf_append_string(db, "error-invalid-context-or-level"); + return(0); + } + } + + cf_info(AS_INFO, "log-set command executed: params %s", params); + + cf_dyn_buf_append_string(db, "ok"); + + return(0); +} + + +// latency:hist=reads;back=180;duration=60;slice=10; +// throughput:hist=reads;back=180;duration=60;slice=10; +// hist-track-start:hist=reads;back=43200;slice=30;thresholds=1,4,16,64; +// hist-track-stop:hist=reads; +// +// hist - optional histogram name - if none, command applies to all cf_hist_track objects +// +// for start command: +// back - total time span in seconds over which to cache data +// slice - period in seconds at which to cache histogram data +// thresholds - comma-separated bucket (ms) values to track, must be powers of 2. e.g: +// 1,4,16,64 +// defaults are: +// - config value for back - mandatory, serves as flag for tracking +// - config value if it exists for slice, otherwise 10 seconds +// - config value if it exists for thresholds, otherwise internal defaults (1,8,64) +// +// for query commands: +// back - start search this many seconds before now, default: minimum to get last slice +// using back=0 will get cached data from oldest cached data +// duration - seconds (forward) from start to search, default 0: everything to present +// slice - intervals (in seconds) to analyze, default 0: everything as one slice +// +// e.g. 
query: +// latency:hist=reads;back=180;duration=60;slice=10; +// output (CF_HIST_TRACK_FMT_PACKED format) is: +// requested value latency:hist=reads;back=180;duration=60;slice=10 +// value is reads:23:26:24-GMT,ops/sec,>1ms,>8ms,>64ms;23:26:34,30618.2,0.05,0.00,0.00; +// 23:26:44,31942.1,0.02,0.00,0.00;23:26:54,30966.9,0.01,0.00,0.00;23:27:04,30380.4,0.01,0.00,0.00; +// 23:27:14,37833.6,0.01,0.00,0.00;23:27:24,38502.7,0.01,0.00,0.00;23:27:34,39191.4,0.02,0.00,0.00; +// +// explanation: +// 23:26:24-GMT - timestamp of histogram starting first slice +// ops/sec,>1ms,>8ms,>64ms - labels for the columns: throughput, and which thresholds +// 23:26:34,30618.2,0.05,0.00,0.00; - timestamp of histogram ending slice, throughput, latencies + +int +info_command_hist_track(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "hist track %s command received: params %s", name, params); + + char value_str[50]; + int value_str_len = sizeof(value_str); + cf_hist_track* hist_p = NULL; + + if (0 != as_info_parameter_get(params, "hist", value_str, &value_str_len)) { + cf_debug(AS_INFO, "hist track %s command: no histogram specified - doing all", name); + } + else { + if (*value_str == '{') { + char* ns_name = value_str + 1; + char* ns_name_end = strchr(ns_name, '}'); + as_namespace* ns = as_namespace_get_bybuf((uint8_t*)ns_name, ns_name_end - ns_name); + + if (! ns) { + cf_info(AS_INFO, "hist track %s command: unrecognized histogram: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-bad-hist-name"); + return 0; + } + + char* hist_name = ns_name_end + 1; + + if (*hist_name++ != '-') { + cf_info(AS_INFO, "hist track %s command: unrecognized histogram: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-bad-hist-name"); + return 0; + } + + if (0 == strcmp(hist_name, "read")) { + hist_p = ns->read_hist; + } + else if (0 == strcmp(hist_name, "write")) { + hist_p = ns->write_hist; + } + else if (0 == strcmp(hist_name, "udf")) { + hist_p = ns->udf_hist; + } + else if (0 == strcmp(hist_name, "query")) { + hist_p = ns->query_hist; + } + else { + cf_info(AS_INFO, "hist track %s command: unrecognized histogram: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-bad-hist-name"); + return 0; + } + } + else { + cf_info(AS_INFO, "hist track %s command: unrecognized histogram: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-bad-hist-name"); + return 0; + } + } + + if (0 == strcmp(name, "hist-track-stop")) { + if (hist_p) { + cf_hist_track_stop(hist_p); + } + else { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + cf_hist_track_stop(ns->read_hist); + cf_hist_track_stop(ns->write_hist); + cf_hist_track_stop(ns->udf_hist); + cf_hist_track_stop(ns->query_hist); + } + } + + cf_dyn_buf_append_string(db, "ok"); + + return 0; + } + + bool start_cmd = 0 == strcmp(name, "hist-track-start"); + + // Note - default query params will get the most recent saved slice. + uint32_t back_sec = start_cmd ? g_config.hist_track_back : (g_config.hist_track_slice * 2) - 1; + uint32_t slice_sec = start_cmd ? g_config.hist_track_slice : 0; + int i; + + value_str_len = sizeof(value_str); + + if (0 == as_info_parameter_get(params, "back", value_str, &value_str_len)) { + if (0 == cf_str_atoi(value_str, &i)) { + back_sec = i >= 0 ? 
(uint32_t)i : (uint32_t)-i; + } + else { + cf_info(AS_INFO, "hist track %s command: back is not a number, using default", name); + } + } + + value_str_len = sizeof(value_str); + + if (0 == as_info_parameter_get(params, "slice", value_str, &value_str_len)) { + if (0 == cf_str_atoi(value_str, &i)) { + slice_sec = i >= 0 ? (uint32_t)i : (uint32_t)-i; + } + else { + cf_info(AS_INFO, "hist track %s command: slice is not a number, using default", name); + } + } + + if (start_cmd) { + char* thresholds = g_config.hist_track_thresholds; + + value_str_len = sizeof(value_str); + + if (0 == as_info_parameter_get(params, "thresholds", value_str, &value_str_len)) { + thresholds = value_str; + } + + cf_debug(AS_INFO, "hist track start command: back %u, slice %u, thresholds %s", + back_sec, slice_sec, thresholds ? thresholds : "null"); + + if (hist_p) { + if (cf_hist_track_start(hist_p, back_sec, slice_sec, thresholds)) { + cf_dyn_buf_append_string(db, "ok"); + } + else { + cf_dyn_buf_append_string(db, "error-bad-start-params"); + } + } + else { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + if ( ! (cf_hist_track_start(ns->read_hist, back_sec, slice_sec, thresholds) && + cf_hist_track_start(ns->write_hist, back_sec, slice_sec, thresholds) && + cf_hist_track_start(ns->udf_hist, back_sec, slice_sec, thresholds) && + cf_hist_track_start(ns->query_hist, back_sec, slice_sec, thresholds))) { + + cf_dyn_buf_append_string(db, "error-bad-start-params"); + return 0; + } + } + + cf_dyn_buf_append_string(db, "ok"); + } + + return 0; + } + + // From here on it's latency or throughput... + + uint32_t duration_sec = 0; + + value_str_len = sizeof(value_str); + + if (0 == as_info_parameter_get(params, "duration", value_str, &value_str_len)) { + if (0 == cf_str_atoi(value_str, &i)) { + duration_sec = i >= 0 ? (uint32_t)i : (uint32_t)-i; + } + else { + cf_info(AS_INFO, "hist track %s command: duration is not a number, using default", name); + } + } + + bool throughput_only = 0 == strcmp(name, "throughput"); + + cf_debug(AS_INFO, "hist track %s command: back %u, duration %u, slice %u", + name, back_sec, duration_sec, slice_sec); + + if (hist_p) { + cf_hist_track_get_info(hist_p, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + } + else { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + cf_hist_track_get_info(ns->read_hist, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + cf_hist_track_get_info(ns->write_hist, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + cf_hist_track_get_info(ns->udf_hist, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + cf_hist_track_get_info(ns->query_hist, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + } + } + + cf_dyn_buf_chomp(db); + + return 0; +} + +// TODO - separate all these CP-related info commands. 
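+// For reference, the strong-consistency commands below are typically driven
+// in sequence from a client, e.g. (namespace name and node-id illustrative):
+//
+//     roster-set:namespace=test;nodes=BB9040011AC4202    ->  "ok"
+//     recluster:                                         ->  applies the pending roster
+//     revive:namespace=test                              ->  "ok", then issue recluster: again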
+
+// Format is:
+//
+//     revive:{namespace=<ns-name>}
+//
+int
+info_command_revive(char *name, char *params, cf_dyn_buf *db)
+{
+    char ns_name[AS_ID_NAMESPACE_SZ] = { 0 };
+    int ns_name_len = (int)sizeof(ns_name);
+    int rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len);
+
+    if (rv == -2) {
+        cf_warning(AS_INFO, "revive: namespace parameter value too long");
+        cf_dyn_buf_append_string(db, "ERROR::bad-namespace");
+        return 0;
+    }
+
+    if (rv == 0) {
+        as_namespace *ns = as_namespace_get_byname(ns_name);
+
+        if (! ns) {
+            cf_warning(AS_INFO, "revive: unknown namespace %s", ns_name);
+            cf_dyn_buf_append_string(db, "ERROR::unknown-namespace");
+            return 0;
+        }
+
+        if (! as_partition_balance_revive(ns)) {
+            cf_warning(AS_INFO, "revive: failed - recluster in progress");
+            cf_dyn_buf_append_string(db, "ERROR::failed-revive");
+            return 0;
+        }
+
+        cf_info(AS_INFO, "revive: complete - issue 'recluster:' command");
+        cf_dyn_buf_append_string(db, "ok");
+        return 0;
+    }
+
+    for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) {
+        as_namespace *ns = g_config.namespaces[ns_ix];
+
+        if (! as_partition_balance_revive(ns)) {
+            cf_warning(AS_INFO, "revive: failed - recluster in progress");
+            cf_dyn_buf_append_string(db, "ERROR::failed-revive");
+            return 0;
+        }
+    }
+
+    cf_info(AS_INFO, "revive: complete - issue 'recluster:' command");
+    cf_dyn_buf_append_string(db, "ok");
+    return 0;
+}
+
+void
+namespace_roster_info(as_namespace *ns, cf_dyn_buf *db)
+{
+    as_exchange_info_lock();
+
+    cf_dyn_buf_append_string(db, "roster=");
+
+    if (ns->roster_count == 0) {
+        cf_dyn_buf_append_string(db, "null");
+    }
+    else {
+        for (uint32_t n = 0; n < ns->roster_count; n++) {
+            cf_dyn_buf_append_uint64_x(db, ns->roster[n]);
+
+            if (ns->roster_rack_ids[n] != 0) {
+                cf_dyn_buf_append_char(db, ROSTER_ID_PAIR_SEPARATOR);
+                cf_dyn_buf_append_uint32(db, ns->roster_rack_ids[n]);
+            }
+
+            cf_dyn_buf_append_char(db, ',');
+        }
+
+        cf_dyn_buf_chomp(db);
+    }
+
+    cf_dyn_buf_append_char(db, ':');
+
+    cf_dyn_buf_append_string(db, "pending_roster=");
+
+    if (ns->smd_roster_count == 0) {
+        cf_dyn_buf_append_string(db, "null");
+    }
+    else {
+        for (uint32_t n = 0; n < ns->smd_roster_count; n++) {
+            cf_dyn_buf_append_uint64_x(db, ns->smd_roster[n]);
+
+            if (ns->smd_roster_rack_ids[n] != 0) {
+                cf_dyn_buf_append_char(db, ROSTER_ID_PAIR_SEPARATOR);
+                cf_dyn_buf_append_uint32(db, ns->smd_roster_rack_ids[n]);
+            }
+
+            cf_dyn_buf_append_char(db, ',');
+        }
+
+        cf_dyn_buf_chomp(db);
+    }
+
+    cf_dyn_buf_append_char(db, ':');
+
+    cf_dyn_buf_append_string(db, "observed_nodes=");
+
+    if (ns->observed_cluster_size == 0) {
+        cf_dyn_buf_append_string(db, "null");
+    }
+    else {
+        for (uint32_t n = 0; n < ns->observed_cluster_size; n++) {
+            cf_dyn_buf_append_uint64_x(db, ns->observed_succession[n]);
+
+            if (ns->rack_ids[n] != 0) {
+                cf_dyn_buf_append_char(db, ROSTER_ID_PAIR_SEPARATOR);
+                cf_dyn_buf_append_uint32(db, ns->rack_ids[n]);
+            }
+
+            cf_dyn_buf_append_char(db, ',');
+        }
+
+        cf_dyn_buf_chomp(db);
+    }
+
+    as_exchange_info_unlock();
+}
+
+// Format is:
+//
+//     roster:{namespace=<ns-name>}
+//
+int
+info_command_roster(char *name, char *params, cf_dyn_buf *db)
+{
+    char ns_name[AS_ID_NAMESPACE_SZ] = { 0 };
+    int ns_name_len = (int)sizeof(ns_name);
+    int rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len);
+
+    if (rv == -2) {
+        cf_warning(AS_INFO, "namespace parameter value too long");
+        cf_dyn_buf_append_string(db, "ERROR::bad-namespace");
+        return 0;
+    }
+
+    if (rv == 0) {
+        as_namespace *ns = as_namespace_get_byname(ns_name);
+
+        if (! ns) {
+            cf_warning(AS_INFO, "unknown namespace %s", ns_name);
+            cf_dyn_buf_append_string(db, "ERROR::unknown-namespace");
+            return 0;
+        }
+
+        namespace_roster_info(ns, db);
+
+        return 0;
+    }
+
+    for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) {
+        as_namespace *ns = g_config.namespaces[ns_ix];
+
+        cf_dyn_buf_append_string(db, "ns=");
+        cf_dyn_buf_append_string(db, ns->name);
+        cf_dyn_buf_append_char(db, ':');
+
+        namespace_roster_info(ns, db);
+
+        cf_dyn_buf_append_char(db, ';');
+    }
+
+    cf_dyn_buf_chomp(db);
+
+    return 0;
+}
+
+// Format is:
+//
+//     roster-set:namespace=<ns-name>;nodes=<nodes-string>
+//
+// where <nodes-string> is a comma-separated list of node-id:rack-id pairs, and
+// the :rack-id may be absent, indicating a rack-id of 0.
+//
+int
+info_command_roster_set(char *name, char *params, cf_dyn_buf *db)
+{
+    // Get the namespace name.
+
+    char ns_name[AS_ID_NAMESPACE_SZ];
+    int ns_name_len = (int)sizeof(ns_name);
+    int ns_rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len);
+
+    if (ns_rv != 0 || ns_name_len == 0) {
+        cf_warning(AS_INFO, "roster-set command: missing or invalid namespace name in command");
+        cf_dyn_buf_append_string(db, "ERROR::namespace-name");
+        return 0;
+    }
+
+    // Get the nodes list.
+
+    char nodes[AS_CLUSTER_SZ * ROSTER_STRING_ELE_LEN];
+    int nodes_len = (int)sizeof(nodes);
+    int nodes_rv = as_info_parameter_get(params, "nodes", nodes, &nodes_len);
+
+    if (nodes_rv == -2 || (nodes_rv == 0 && nodes_len == 0)) {
+        cf_warning(AS_INFO, "roster-set command: invalid nodes in command");
+        cf_dyn_buf_append_string(db, "ERROR::nodes");
+        return 0;
+    }
+
+    // Issue the roster-set command.
+
+    bool ok = as_roster_set_nodes_cmd(ns_name, nodes);
+
+    cf_dyn_buf_append_string(db, ok ? "ok" : "ERROR::roster-set");
+
+    return 0;
+}
+
+// Format is one of:
+//
+//     truncate:namespace=<ns-name>;set=<set-name>;lut=<last-update-time>
+//     truncate:namespace=<ns-name>;set=<set-name>
+//
+//     truncate:namespace=<ns-name>;lut=<last-update-time>
+//     truncate:namespace=<ns-name>
+//
+int
+info_command_truncate(char *name, char *params, cf_dyn_buf *db)
+{
+    // Get the namespace name.
+
+    char ns_name[AS_ID_NAMESPACE_SZ];
+    int ns_name_len = (int)sizeof(ns_name);
+    int ns_rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len);
+
+    if (ns_rv != 0 || ns_name_len == 0) {
+        cf_warning(AS_INFO, "truncate command: missing or invalid namespace name in command");
+        cf_dyn_buf_append_string(db, "ERROR::namespace-name");
+        return 0;
+    }
+
+    // Get the set-name if there is one.
+
+    char set_name[AS_SET_NAME_MAX_SIZE];
+    int set_name_len = (int)sizeof(set_name);
+    int set_rv = as_info_parameter_get(params, "set", set_name, &set_name_len);
+
+    if (set_rv == -2 || (set_rv == 0 && set_name_len == 0)) {
+        cf_warning(AS_INFO, "truncate command: invalid set name in command");
+        cf_dyn_buf_append_string(db, "ERROR::set-name");
+        return 0;
+    }
+
+    // Get the threshold last-update-time if there is one.
+
+    char lut_str[24]; // allow decimal, hex or octal in C constant format
+    int lut_str_len = (int)sizeof(lut_str);
+    int lut_rv = as_info_parameter_get(params, "lut", lut_str, &lut_str_len);
+
+    if (lut_rv == -2 || (lut_rv == 0 && lut_str_len == 0)) {
+        cf_warning(AS_INFO, "truncate command: invalid last-update-time in command");
+        cf_dyn_buf_append_string(db, "ERROR::last-update-time");
+        return 0;
+    }
+
+    // Issue the truncate command.
+
+    bool ok = as_truncate_cmd(ns_name,
+            set_rv == 0 ? set_name : NULL,
+            lut_rv == 0 ? lut_str : NULL);
+
+    cf_dyn_buf_append_string(db, ok ? "ok" : "ERROR::truncate");
"ok" : "ERROR::truncate"); + + return 0; +} + +// Format is one of: +// +// truncate-undo:namespace=;set= +// +// truncate-undo:namespace= +// +int +info_command_truncate_undo(char *name, char *params, cf_dyn_buf *db) +{ + // Get the namespace name. + + char ns_name[AS_ID_NAMESPACE_SZ]; + int ns_name_len = (int)sizeof(ns_name); + int ns_rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len); + + if (ns_rv != 0 || ns_name_len == 0) { + cf_warning(AS_INFO, "truncate-undo command: missing or invalid namespace name in command"); + cf_dyn_buf_append_string(db, "ERROR::namespace-name"); + return 0; + } + + // Get the set-name if there is one. + + char set_name[AS_SET_NAME_MAX_SIZE]; + int set_name_len = (int)sizeof(set_name); + int set_rv = as_info_parameter_get(params, "set", set_name, &set_name_len); + + if (set_rv == -2 || (set_rv == 0 && set_name_len == 0)) { + cf_warning(AS_INFO, "truncate-undo command: invalid set name in command"); + cf_dyn_buf_append_string(db, "ERROR::set-name"); + return 0; + } + + // Issue the truncate-undo command. + + as_truncate_undo_cmd(ns_name, set_rv == 0 ? set_name : NULL); + + cf_dyn_buf_append_string(db, "ok"); + + return 0; +} + +// +// Log a message to the server. +// Limited to 2048 characters. +// +// Format: +// log-message:message=[;who=] +// +// Example: +// log-message:message=Example Log Message;who=Aerospike User +// +int +info_command_log_message(char *name, char *params, cf_dyn_buf *db) +{ + char who[128]; + int who_len = sizeof(who); + if (0 != as_info_parameter_get(params, "who", who, &who_len)) { + strcpy(who, "unknown"); + } + + char message[2048]; + int message_len = sizeof(message); + if (0 == as_info_parameter_get(params, "message", message, &message_len)) { + cf_info(AS_INFO, "%s: %s", who, message); + } + + return 0; +} + +// Generic info system functions +// These functions act when an INFO message comes in over the PROTO pipe +// collects the static and dynamic portions, puts it in a 'dyn buf', +// and sends a reply +// + +// Error strings for security check results. 
+static void
+append_sec_err_str(cf_dyn_buf *db, uint32_t result, as_sec_perm cmd_perm) {
+    switch (result) {
+    case AS_SEC_ERR_NOT_AUTHENTICATED:
+        cf_dyn_buf_append_string(db, "ERROR:");
+        cf_dyn_buf_append_uint32(db, result);
+        cf_dyn_buf_append_string(db, ":not authenticated");
+        return;
+    case AS_SEC_ERR_ROLE_VIOLATION:
+        switch (cmd_perm) {
+        case PERM_INDEX_MANAGE:
+            INFO_COMMAND_SINDEX_FAILCODE(result, "role violation");
+            return;
+        case PERM_UDF_MANAGE:
+            cf_dyn_buf_append_string(db, "error=role_violation");
+            return;
+        default:
+            break;
+        }
+        cf_dyn_buf_append_string(db, "ERROR:");
+        cf_dyn_buf_append_uint32(db, result);
+        cf_dyn_buf_append_string(db, ":role violation");
+        return;
+    default:
+        cf_dyn_buf_append_string(db, "ERROR:");
+        cf_dyn_buf_append_uint32(db, result);
+        cf_dyn_buf_append_string(db, ":unexpected security error");
+        return;
+    }
+}
+
+static pthread_mutex_t g_info_lock = PTHREAD_MUTEX_INITIALIZER;
+info_static *static_head = 0;
+info_dynamic *dynamic_head = 0;
+info_tree *tree_head = 0;
+info_command *command_head = 0;
+
+//
+// Pull up all elements in both lists into the buffers
+// (efficient enough if you're looking for lots of things),
+// but only gets 'default' values.
+//
+
+int
+info_all(const as_file_handle* fd_h, cf_dyn_buf *db)
+{
+    uint8_t auth_result = as_security_check(fd_h, PERM_NONE);
+
+    if (auth_result != AS_PROTO_RESULT_OK) {
+        as_security_log(fd_h, auth_result, PERM_NONE, "info-all request", NULL);
+        append_sec_err_str(db, auth_result, PERM_NONE);
+        cf_dyn_buf_append_char(db, EOL);
+        return 0;
+    }
+
+    info_static *s = static_head;
+    while (s) {
+        if (s->def == true) {
+            cf_dyn_buf_append_string( db, s->name);
+            cf_dyn_buf_append_char( db, SEP );
+            cf_dyn_buf_append_buf( db, (uint8_t *) s->value, s->value_sz);
+            cf_dyn_buf_append_char( db, EOL );
+        }
+        s = s->next;
+    }
+
+    info_dynamic *d = dynamic_head;
+    while (d) {
+        if (d->def == true) {
+            cf_dyn_buf_append_string( db, d->name);
+            cf_dyn_buf_append_char(db, SEP );
+            d->value_fn(d->name, db);
+            cf_dyn_buf_append_char(db, EOL);
+        }
+        d = d->next;
+    }
+
+    return(0);
+}
+
+//
+// Parse the input buffer. It contains a list of keys that should be spit back.
+// Do the parse, then call the necessary functions to collect the information
+// in question, filling the dynbuf.
+//
+
+int
+info_some(char *buf, char *buf_lim, const as_file_handle* fd_h, cf_dyn_buf *db)
+{
+    uint8_t auth_result = as_security_check(fd_h, PERM_NONE);
+
+    if (auth_result != AS_PROTO_RESULT_OK) {
+        // TODO - log null-terminated buf as detail?
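+        // (As in info_all() above, we still emit a well-formed, EOL-terminated
+        // error line, so the client's response parser stays in sync.)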
+ as_security_log(fd_h, auth_result, PERM_NONE, "info request", NULL); + append_sec_err_str(db, auth_result, PERM_NONE); + cf_dyn_buf_append_char(db, EOL); + return 0; + } + + // For each incoming name + char *c = buf; + char *tok = c; + + while (c < buf_lim) { + + if ( *c == EOL ) { + *c = 0; + char *name = tok; + bool handled = false; + + // search the static queue first always + info_static *s = static_head; + while (s) { + if (strcmp(s->name, name) == 0) { + // return exact command string received from client + cf_dyn_buf_append_string( db, name); + cf_dyn_buf_append_char( db, SEP ); + cf_dyn_buf_append_buf( db, (uint8_t *) s->value, s->value_sz); + cf_dyn_buf_append_char( db, EOL ); + handled = true; + break; + } + s = s->next; + } + + // didn't find in static, try dynamic + if (!handled) { + info_dynamic *d = dynamic_head; + while (d) { + if (strcmp(d->name, name) == 0) { + // return exact command string received from client + cf_dyn_buf_append_string( db, d->name); + cf_dyn_buf_append_char(db, SEP ); + d->value_fn(d->name, db); + cf_dyn_buf_append_char(db, EOL); + handled = true; + break; + } + d = d->next; + } + } + + // search the tree + if (!handled) { + + // see if there's a '/', + char *branch = strchr( name, TREE_SEP); + if (branch) { + *branch = 0; + branch++; + + info_tree *t = tree_head; + while (t) { + if (strcmp(t->name, name) == 0) { + // return exact command string received from client + cf_dyn_buf_append_string( db, t->name); + cf_dyn_buf_append_char( db, TREE_SEP); + cf_dyn_buf_append_string( db, branch); + cf_dyn_buf_append_char(db, SEP ); + t->tree_fn(t->name, branch, db); + cf_dyn_buf_append_char(db, EOL); + break; + } + t = t->next; + } + } + } + + tok = c + 1; + } + // commands have parameters + else if ( *c == ':' ) { + *c = 0; + char *name = tok; + + // parse parameters + tok = c + 1; + // make sure c doesn't go beyond buf_lim + while (*c != EOL && c < buf_lim-1) c++; + if (*c != EOL) { + cf_warning(AS_INFO, "Info '%s' parameter not terminated with '\\n'.", name); + break; + } + *c = 0; + char *param = tok; + + // search the command list + info_command *cmd = command_head; + while (cmd) { + if (strcmp(cmd->name, name) == 0) { + // return exact command string received from client + cf_dyn_buf_append_string( db, name); + cf_dyn_buf_append_char( db, ':'); + cf_dyn_buf_append_string( db, param); + cf_dyn_buf_append_char( db, SEP ); + + uint8_t result = as_security_check(fd_h, cmd->required_perm); + + as_security_log(fd_h, result, cmd->required_perm, name, param); + + if (result == AS_PROTO_RESULT_OK) { + cmd->command_fn(cmd->name, param, db); + } + else { + append_sec_err_str(db, result, cmd->required_perm); + } + + cf_dyn_buf_append_char( db, EOL ); + break; + } + cmd = cmd->next; + } + + if (!cmd) { + cf_info(AS_INFO, "received command %s, not registered", name); + } + + tok = c + 1; + } + + c++; + + } + return(0); +} + +int +as_info_buffer(uint8_t *req_buf, size_t req_buf_len, cf_dyn_buf *rsp) +{ + // Either we'e doing all, or doing some + if (req_buf_len == 0) { + info_all(NULL, rsp); + } + else { + info_some((char *)req_buf, (char *)(req_buf + req_buf_len), NULL, rsp); + } + + return(0); +} + +// +// Worker threads! +// these actually do the work. 
+// There is a lot of network activity, writes and such - we don't want to
+// clog up the main queue.
+//
+
+void *
+thr_info_fn(void *unused)
+{
+    for ( ; ; ) {
+
+        as_info_transaction it;
+
+        if (0 != cf_queue_pop(g_info_work_q, &it, CF_QUEUE_FOREVER)) {
+            cf_crash(AS_TSVC, "unable to pop from info work queue");
+        }
+
+        as_file_handle *fd_h = it.fd_h;
+        as_proto *pr = it.proto;
+
+        // Allocate an output buffer sufficiently large to avoid ever resizing.
+        cf_dyn_buf_define_size(db, 128 * 1024);
+        // Write space for the header.
+        uint64_t h = 0;
+        cf_dyn_buf_append_buf(&db, (uint8_t *) &h, sizeof(h));
+
+        // Either we're doing all, or doing some.
+        if (pr->sz == 0) {
+            info_all(fd_h, &db);
+        }
+        else {
+            info_some((char *)pr->data, (char *)pr->data + pr->sz, fd_h, &db);
+        }
+
+        // Write the proto header in the space we pre-wrote.
+        db.buf[0] = 2;
+        db.buf[1] = 1;
+        uint64_t sz = db.used_sz - 8;
+        db.buf[4] = (sz >> 24) & 0xff;
+        db.buf[5] = (sz >> 16) & 0xff;
+        db.buf[6] = (sz >> 8) & 0xff;
+        db.buf[7] = sz & 0xff;
+
+        // Write the data buffer.
+        if (cf_socket_send_all(&fd_h->sock, db.buf, db.used_sz,
+                MSG_NOSIGNAL, CF_SOCKET_TIMEOUT) < 0) {
+            cf_info(AS_INFO, "thr_info: can't write all bytes, fd %d error %d",
+                    CSFD(&fd_h->sock), errno);
+            as_end_of_transaction_force_close(fd_h);
+            fd_h = NULL;
+        }
+
+        cf_dyn_buf_free(&db);
+
+        cf_free(pr);
+
+        if (fd_h) {
+            as_end_of_transaction_ok(fd_h);
+            fd_h = NULL;
+        }
+
+        G_HIST_INSERT_DATA_POINT(info_hist, it.start_time);
+        cf_atomic64_incr(&g_stats.info_complete);
+    }
+
+    return NULL;
+}
+
+//
+// Received an info request from a file descriptor.
+// Called by the thr_tsvc when an info message is seen.
+// Calls info_all() or info_some() to collect the response,
+// and write() to send the response back.
+//
+// Proto will be freed by the caller.
+//
+
+void
+as_info(as_info_transaction *it)
+{
+    cf_queue_push(g_info_work_q, it);
+}
+
+// Return the number of pending info requests in the queue.
+int
+as_info_queue_get_size()
+{
+    return cf_queue_sz(g_info_work_q);
+}
+
+// Registers a dynamic name-value calculator.
+// The get_value_fn will be called if a request comes in for this name.
+// Only does the registration!
+// 'def' means it's part of the default results - it will get invoked for a
+// blank info command (asinfo -v "").
+
+
+int
+as_info_set_dynamic(char *name, as_info_get_value_fn gv_fn, bool def)
+{
+    int rv = -1;
+    pthread_mutex_lock(&g_info_lock);
+
+    info_dynamic *e = dynamic_head;
+    while (e) {
+        if (strcmp(name, e->name) == 0) {
+            e->value_fn = gv_fn;
+            break;
+        }
+
+        e = e->next;
+    }
+
+    if (!e) {
+        e = cf_malloc(sizeof(info_dynamic));
+        e->def = def;
+        e->name = cf_strdup(name);
+        e->value_fn = gv_fn;
+        e->next = dynamic_head;
+        dynamic_head = e;
+    }
+    rv = 0;
+
+    pthread_mutex_unlock(&g_info_lock);
+    return(rv);
+}
+
+
+// Registers a tree-based name-value calculator.
+// The get_value_fn will be called if a request comes in for this name.
+// Only does the registration!
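+// Tree names are matched on the prefix before the '/' (TREE_SEP) - e.g. a
+// request for "namespace/test" (names illustrative) invokes the handler
+// registered as "namespace", with "test" passed as the branch. See the tree
+// lookup in info_some().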
+ + +int +as_info_set_tree(char *name, as_info_get_tree_fn gv_fn) +{ + int rv = -1; + pthread_mutex_lock(&g_info_lock); + + info_tree *e = tree_head; + while (e) { + if (strcmp(name, e->name) == 0) { + e->tree_fn = gv_fn; + break; + } + + e = e->next; + } + + if (!e) { + e = cf_malloc(sizeof(info_tree)); + e->name = cf_strdup(name); + e->tree_fn = gv_fn; + e->next = tree_head; + tree_head = e; + } + rv = 0; + + pthread_mutex_unlock(&g_info_lock); + return(rv); +} + + +// Registers a command handler +// the get_value_fn will be called if a request comes in for this name, and +// parameters will be passed in +// This function only does the registration! + +int +as_info_set_command(char *name, as_info_command_fn command_fn, as_sec_perm required_perm) +{ + int rv = -1; + pthread_mutex_lock(&g_info_lock); + + info_command *e = command_head; + while (e) { + if (strcmp(name, e->name) == 0) { + e->command_fn = command_fn; + break; + } + + e = e->next; + } + + if (!e) { + e = cf_malloc(sizeof(info_command)); + e->name = cf_strdup(name); + e->command_fn = command_fn; + e->required_perm = required_perm; + e->next = command_head; + command_head = e; + } + rv = 0; + + pthread_mutex_unlock(&g_info_lock); + return(rv); +} + + + +// +// Sets a static name-value pair +// def means it's part of the default set - will get returned if nothing is passed + +int +as_info_set_buf(const char *name, const uint8_t *value, size_t value_sz, bool def) +{ + pthread_mutex_lock(&g_info_lock); + + // Delete case + if (value_sz == 0 || value == 0) { + + info_static *p = 0; + info_static *e = static_head; + + while (e) { + if (strcmp(name, e->name) == 0) { + if (p) { + p->next = e->next; + cf_free(e->name); + cf_free(e->value); + cf_free(e); + } + else { + info_static *_t = static_head->next; + cf_free(e->name); + cf_free(e->value); + cf_free(static_head); + static_head = _t; + } + break; + } + p = e; + e = e->next; + } + } + // insert case + else { + + info_static *e = static_head; + + // search for old value and overwrite + while(e) { + if (strcmp(name, e->name) == 0) { + cf_free(e->value); + e->value = cf_malloc(value_sz); + memcpy(e->value, value, value_sz); + e->value_sz = value_sz; + break; + } + e = e->next; + } + + // not found, insert fresh + if (e == 0) { + info_static *_t = cf_malloc(sizeof(info_static)); + _t->next = static_head; + _t->def = def; + _t->name = cf_strdup(name); + _t->value = cf_malloc(value_sz); + memcpy(_t->value, value, value_sz); + _t->value_sz = value_sz; + static_head = _t; + } + } + + pthread_mutex_unlock(&g_info_lock); + return(0); + +} + +// +// A helper function. Commands have the form: +// cmd:param=value;param=value +// +// The main parser gives us the entire parameter string +// so use this function to scan through and get the particular parameter value +// you're looking for +// +// The 'param_string' is the param passed by the command parser into a command +// +// @return 0 : success +// -1 : parameter not found +// -2 : parameter found but value is too long +// + +int +as_info_parameter_get(char *param_str, char *param, char *value, int *value_len) +{ + cf_detail(AS_INFO, "parameter get: paramstr %s seeking param %s", param_str, param); + + char *c = param_str; + char *tok = param_str; + int param_len = strlen(param); + + while (*c) { + if (*c == '=') { + if ( ( param_len == c - tok) && (0 == memcmp(tok, param, param_len) ) ) { + c++; + tok = c; + while ( *c != 0 && *c != ';') c++; + if (*value_len <= c - tok) { + // The found value is too long. 
+ return(-2); + } + *value_len = c - tok; + memcpy(value, tok, *value_len); + value[*value_len] = 0; + return(0); + } + c++; + } + else if (*c == ';') { + c++; + tok = c; + } + else c++; + + } + + return(-1); +} + +int +as_info_set(const char *name, const char *value, bool def) +{ + return(as_info_set_buf(name, (const uint8_t *) value, strlen(value), def ) ); +} + +// +// +// service interfaces management +// +// There's a worker thread - info_interfaces_fn --- +// which continually polls the interfaces to see if anything changed. +// When it changes, it updates a generation count. +// There's a hash table of all the other nodes in the cluster, and a counter +// to see that they're all up-to-date on the generation +// +// +// The fabric message in question can be expanded to do more than service interfaces. +// By expanding the 'info_node_info' structure, and the fabric_msg, you can carry +// more dynamic information than just the remote node's interfaces +// But that's all that we can think of at the moment - the paxos communication method +// makes sure that the distributed key system is properly distributed +// + +static pthread_mutex_t g_serv_lock = PTHREAD_MUTEX_INITIALIZER; +static char *g_serv_legacy = NULL; +static char *g_serv_clear_std = NULL; +static char *g_serv_clear_alt = NULL; +static char *g_serv_tls_std = NULL; +static char *g_serv_tls_alt = NULL; +static char *g_serv_tls_name = NULL; +static uint32_t g_serv_gen = 0; +static cf_atomic64 g_peers_gen = 1; + +// +// What other nodes are out there, and what are their ip addresses? +// + +typedef struct info_node_info_s { + char *service_addr; // string representing the service address + char *alternate_addr; // string representing the alternate address + uint32_t generation; // acked generation counter + char *services_clear_std; // non-TLS standard services list + char *services_tls_std; // TLS standard services list + char *services_clear_alt; // non-TLS alternate services list + char *services_tls_alt; // TLS alternate services list + char *tls_name; // TLS name + uint64_t last_changed; // generation count of last modification (for delta updates) +} info_node_info; + +typedef const char *(*info_node_proj_fn)(info_node_info *info); + +typedef struct services_printer_s { + info_node_proj_fn proj; + cf_dyn_buf *db; + const char *strip; + uint64_t since; + bool with_tls_name; + int32_t count; +} services_printer; + +typedef struct port_savings_context_s { + info_node_proj_fn proj; + uint64_t since; + uint32_t port_savings[65536]; +} port_savings_context; + +// To avoid the services bug, g_info_node_info_hash should *always* be a subset +// of g_info_node_info_history_hash. In order to ensure this, every modification +// of g_info_node_info_hash should first involve grabbing the lock for the same +// key in g_info_node_info_history_hash. 
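+// For example, info_msg_fn() and info_clustering_event_listener() below both
+// grab the history-hash entry's vlock for a node before creating or updating
+// that node's entry in g_info_node_info_hash.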
+cf_shash *g_info_node_info_history_hash = NULL; +cf_shash *g_info_node_info_hash = NULL; + +int info_node_info_reduce_fn(const void *key, void *data, void *udata); + +static char * +format_services_string(const char **addrs, uint32_t n_addrs, cf_ip_port port, char sep) +{ + if (n_addrs == 0) { + return NULL; + } + + cf_dyn_buf_define(db); + + for (uint32_t i = 0; i < n_addrs; ++i) { + if (cf_ip_addr_is_dns_name(addrs[i])) { + cf_dyn_buf_append_string(&db, addrs[i]); + cf_dyn_buf_append_char(&db, ':'); + cf_dyn_buf_append_string(&db, cf_ip_port_print(port)); + } + else { + cf_sock_addr addr; + CF_NEVER_FAILS(cf_sock_addr_from_host_port(addrs[i], port, &addr)); + cf_dyn_buf_append_string(&db, cf_sock_addr_print(&addr)); + } + + cf_dyn_buf_append_char(&db, sep); + } + + if (n_addrs > 0) { + cf_dyn_buf_chomp(&db); + } + + char *res = cf_dyn_buf_strdup(&db); + cf_dyn_buf_free(&db); + return res; +} + +static char * +format_services_addr(cf_ip_addr *addrs, int32_t n_addrs, cf_ip_port port, char sep) +{ + if (n_addrs == 0) { + return NULL; + } + + cf_dyn_buf_define(db); + + for (int32_t i = 0; i < n_addrs; ++i) { + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&addrs[i], port, &addr); + cf_dyn_buf_append_string(&db, cf_sock_addr_print(&addr)); + cf_dyn_buf_append_char(&db, sep); + } + + if (n_addrs > 0) { + cf_dyn_buf_chomp(&db); + } + + char *res = cf_dyn_buf_strdup(&db); + cf_dyn_buf_free(&db); + return res; +} + +static bool +detect_name_change(char **tls_name) +{ + char *node_name = cf_node_name(); + + if (node_name[0] == 0) { + cf_free(node_name); + node_name = NULL; + } + + if (*tls_name == NULL && node_name == NULL) { + return false; + } + + if (*tls_name != NULL && node_name != NULL && strcmp(*tls_name, node_name) == 0) { + cf_free(node_name); + return false; + } + + if (*tls_name != NULL) { + cf_free(*tls_name); + } + + *tls_name = node_name; + return true; +} + +static uint32_t +filter_legacy(const char **from, uint32_t n_from, const char **to) +{ + uint32_t n_to = 0; + + for (uint32_t i = 0; i < n_from; ++i) { + if (cf_ip_addr_str_is_legacy(from[i])) { + to[n_to] = from[i]; + ++n_to; + } + } + + return n_to; +} + +static void +set_static_services(void) +{ + const char *filter[CF_SOCK_CFG_MAX]; + uint32_t n_filter; + + if (g_access.service.addrs.n_addrs > 0) { + n_filter = filter_legacy(g_access.service.addrs.addrs, g_access.service.addrs.n_addrs, + filter); + g_serv_legacy = format_services_string(filter, n_filter, g_access.service.port, ';'); + + if (cf_ip_addr_legacy_only()) { + g_serv_clear_std = format_services_string(filter, n_filter, g_access.service.port, ','); + } + else { + g_serv_clear_std = format_services_string(g_access.service.addrs.addrs, + g_access.service.addrs.n_addrs, g_access.service.port, ','); + } + } + + if (g_access.alt_service.addrs.n_addrs > 0) { + if (cf_ip_addr_legacy_only()) { + n_filter = filter_legacy(g_access.alt_service.addrs.addrs, + g_access.alt_service.addrs.n_addrs, filter); + g_serv_clear_alt = format_services_string(filter, n_filter, g_access.alt_service.port, + ','); + } + else { + g_serv_clear_alt = format_services_string(g_access.alt_service.addrs.addrs, + g_access.alt_service.addrs.n_addrs, g_access.alt_service.port, ','); + } + } + + if (g_access.tls_service.addrs.n_addrs > 0 && g_access.tls_service.port != 0) { + if (cf_ip_addr_legacy_only()) { + n_filter = filter_legacy(g_access.tls_service.addrs.addrs, + g_access.tls_service.addrs.n_addrs, filter); + g_serv_tls_std = format_services_string(filter, n_filter, g_access.tls_service.port, + 
','); + } + else { + g_serv_tls_std = format_services_string(g_access.tls_service.addrs.addrs, + g_access.tls_service.addrs.n_addrs, g_access.tls_service.port, ','); + } + } + + if (g_access.alt_tls_service.addrs.n_addrs > 0 && g_access.alt_tls_service.port != 0) { + if (cf_ip_addr_legacy_only()) { + n_filter = filter_legacy(g_access.alt_tls_service.addrs.addrs, + g_access.alt_tls_service.addrs.n_addrs, filter); + g_serv_tls_alt = format_services_string(filter, n_filter, g_access.alt_tls_service.port, + ','); + } + else { + g_serv_tls_alt = format_services_string(g_access.alt_tls_service.addrs.addrs, + g_access.alt_tls_service.addrs.n_addrs, g_access.alt_tls_service.port, ','); + } + } +} + +void +info_node_info_tend() +{ + cf_shash_reduce(g_info_node_info_hash, info_node_info_reduce_fn, 0); +} + +void * +info_interfaces_fn(void *unused) +{ + cf_ip_addr legacy[CF_SOCK_CFG_MAX]; + uint32_t n_legacy = 0; + + cf_ip_addr addrs[CF_SOCK_CFG_MAX]; + uint32_t n_addrs = 0; + + char *tls_name = NULL; + bool flag = cf_ip_addr_legacy_only(); + + while (true) { + bool chg_flag = cf_ip_addr_legacy_only() != flag; + bool chg_legacy = cf_inter_detect_changes_legacy(legacy, &n_legacy, CF_SOCK_CFG_MAX); + bool chg_any; + + if (cf_ip_addr_legacy_only()) { + chg_any = cf_inter_detect_changes_legacy(addrs, &n_addrs, CF_SOCK_CFG_MAX); + } + else { + chg_any = cf_inter_detect_changes(addrs, &n_addrs, CF_SOCK_CFG_MAX); + } + + if (n_legacy + n_addrs == 0) { + cf_warning(AS_INFO, "No network interface addresses detected for client access"); + } + + bool chg_name = detect_name_change(&tls_name); + + if (chg_flag || chg_legacy || chg_any || chg_name) { + pthread_mutex_lock(&g_serv_lock); + + if (chg_flag) { + set_static_services(); + flag = cf_ip_addr_legacy_only(); + } + + if (chg_legacy && g_access.service.addrs.n_addrs == 0) { + if (g_serv_legacy != NULL) { + cf_free(g_serv_legacy); + } + + g_serv_legacy = format_services_addr(legacy, n_legacy, g_access.service.port, ';'); + } + + if (chg_any && g_access.service.addrs.n_addrs == 0) { + if (g_serv_clear_std != NULL) { + cf_free(g_serv_clear_std); + } + + g_serv_clear_std = format_services_addr(addrs, n_addrs, g_access.service.port, ','); + } + + if (chg_any && g_access.tls_service.port != 0 && + g_access.tls_service.addrs.n_addrs == 0) { + if (g_serv_tls_std != NULL) { + cf_free(g_serv_tls_std); + } + + g_serv_tls_std = format_services_addr(addrs, n_addrs, g_access.tls_service.port, + ','); + } + + if (chg_name && g_config.tls_service.tls_our_name == NULL) { + g_serv_tls_name = tls_name; + } + + ++g_serv_gen; + pthread_mutex_unlock(&g_serv_lock); + } + + info_node_info_tend(); + sleep(2); + } + + return NULL; +} + +// Free the service strings of an info node. + +static void +free_node_info_service(char **string) +{ + if (*string) { + cf_free(*string); + *string = 0; + } +} + +static void +free_node_info_services(info_node_info *info) +{ + free_node_info_service(&info->service_addr); + free_node_info_service(&info->alternate_addr); + free_node_info_service(&info->services_clear_std); + free_node_info_service(&info->services_tls_std); + free_node_info_service(&info->services_clear_alt); + free_node_info_service(&info->services_tls_alt); + free_node_info_service(&info->tls_name); +} + +// Resets the service strings of an info node without freeing them. 
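+// (Used after ownership of the cloned strings has been handed to a hash table
+// entry - see the cf_shash_put_unique() path in info_clustering_event_listener().)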
+ +static void +reset_node_info_services(info_node_info *info) +{ + info->service_addr = 0; + info->alternate_addr = 0; + info->services_clear_std = 0; + info->services_tls_std = 0; + info->services_clear_alt = 0; + info->services_tls_alt = 0; + info->tls_name = 0; +} + +// Clone the service strings of an info node. + +static char * +clone_node_info_service(const char *string) +{ + return string ? cf_strdup(string) : 0; +} + +static void +clone_node_info_services(info_node_info *from, info_node_info *to) +{ + to->service_addr = clone_node_info_service(from->service_addr); + to->alternate_addr = clone_node_info_service(from->alternate_addr); + to->services_clear_std = clone_node_info_service(from->services_clear_std); + to->services_tls_std = clone_node_info_service(from->services_tls_std); + to->services_clear_alt = clone_node_info_service(from->services_clear_alt); + to->services_tls_alt = clone_node_info_service(from->services_tls_alt); + to->tls_name = clone_node_info_service(from->tls_name); +} + +// Compare the service strings of two info nodes. + +static bool +compare_node_info_service(const char *lhs, const char *rhs) +{ + if (!lhs || !rhs) { + return !lhs && !rhs; + } + + return strcmp(lhs, rhs) == 0; +} + +static bool +compare_node_info_services(info_node_info *lhs, info_node_info *rhs) +{ + return compare_node_info_service(lhs->service_addr, rhs->service_addr) && + compare_node_info_service(lhs->alternate_addr, rhs->alternate_addr) && + compare_node_info_service(lhs->services_clear_std, rhs->services_clear_std) && + compare_node_info_service(lhs->services_tls_std, rhs->services_tls_std) && + compare_node_info_service(lhs->services_clear_alt, rhs->services_clear_alt) && + compare_node_info_service(lhs->services_tls_alt, rhs->services_tls_alt) && + compare_node_info_service(lhs->tls_name, rhs->tls_name); +} + +// Dump the service strings of an info node. 
+ +static void +dump_node_info_services(info_node_info *info) +{ + cf_debug(AS_INFO, "Service address: %s", cf_str_safe_as_null(info->service_addr)); + cf_debug(AS_INFO, "Alternate address: %s", cf_str_safe_as_null(info->alternate_addr)); + cf_debug(AS_INFO, "Clear, standard: %s", cf_str_safe_as_null(info->services_clear_std)); + cf_debug(AS_INFO, "TLS, standard: %s", cf_str_safe_as_null(info->services_tls_std)); + cf_debug(AS_INFO, "Clear, alternate: %s", cf_str_safe_as_null(info->services_clear_alt)); + cf_debug(AS_INFO, "TLS, alternate: %s", cf_str_safe_as_null(info->services_tls_alt)); + cf_debug(AS_INFO, "TLS name: %s", cf_str_safe_as_null(info->tls_name)); +} + +// This reduce function will eliminate elements from the info hash +// which are no longer in the succession list + +typedef struct reduce_context_s { + uint32_t cluster_size; + cf_node *succession; + uint32_t n_deleted; + cf_node deleted[AS_CLUSTER_SZ]; +} reduce_context; + +int32_t +info_clustering_event_reduce_fn(const void *key, void *data, void *udata) +{ + const cf_node *node = key; + info_node_info *info = data; + reduce_context *context = udata; + + for (uint32_t i = 0; i < context->cluster_size; ++i) { + if (*node == context->succession[i]) { + return CF_SHASH_OK; + } + } + + cf_debug(AS_INFO, "Clustering event reduce: removing node %" PRIx64, *node); + + uint32_t n = context->n_deleted; + context->deleted[n] = *node; + ++context->n_deleted; + + free_node_info_services(info); + return CF_SHASH_REDUCE_DELETE; +} + +// +// Maintain the info_node_info hash as a shadow of the succession list +// +static void +info_clustering_event_listener(const as_exchange_cluster_changed_event* event, void* udata) +{ + uint64_t start_ms = cf_getms(); + cf_debug(AS_INFO, "Info received new clustering state"); + + info_node_info temp; + temp.generation = 0; + temp.last_changed = 0; + reset_node_info_services(&temp); + + uint32_t i; + + for (i = 0; i < event->cluster_size; ++i) { + cf_node member_nodeid = event->succession[i]; + + if (member_nodeid == g_config.self_node) { + continue; + } + + info_node_info *info_history; + pthread_mutex_t *vlock_history; + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &member_nodeid, (void **)&info_history, + &vlock_history) != CF_SHASH_OK) { + // This may fail, but this is OK. This should only fail when info_msg_fn is also trying + // to add this key, so either way the entry will be in the hash table. + cf_shash_put_unique(g_info_node_info_history_hash, &member_nodeid, &temp); + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &member_nodeid, + (void **)&info_history, &vlock_history) != CF_SHASH_OK) { + cf_crash(AS_INFO, + "Could not create info history hash entry for %" PRIx64, member_nodeid); + continue; + } + } + + info_node_info *info; + pthread_mutex_t *vlock; + + if (cf_shash_get_vlock(g_info_node_info_hash, &member_nodeid, (void **)&info, + &vlock) != CF_SHASH_OK) { + clone_node_info_services(info_history, &temp); + temp.last_changed = cf_atomic64_incr(&g_peers_gen); + + if (cf_shash_put_unique(g_info_node_info_hash, &member_nodeid, &temp) == CF_SHASH_OK) { + reset_node_info_services(&temp); + info_history->last_changed = 0; // See info_clustering_event_reduce_fn(). 
+ cf_debug(AS_INFO, "Peers generation %" PRId64 ": added node %" PRIx64, + temp.last_changed, member_nodeid); + } + else { + free_node_info_services(&temp); + cf_crash(AS_INFO, + "Could not insert node %" PRIx64 " from clustering notification", member_nodeid); + } + + temp.last_changed = 0; + } + else { + pthread_mutex_unlock(vlock); + } + + pthread_mutex_unlock(vlock_history); + } + + uint32_t before = cf_shash_get_size(g_info_node_info_hash); + cf_debug(AS_INFO, "Clustering succession list has %d element(s), info hash has %u", i, before); + + reduce_context cont = { .cluster_size = event->cluster_size, .succession = event->succession, .n_deleted = 0 }; + cf_shash_reduce(g_info_node_info_hash, info_clustering_event_reduce_fn, &cont); + + // While an alumni is gone, its last_changed field is non-zero. When it comes back, the + // field goes back to zero. + + for (uint32_t i = 0; i < cont.n_deleted; ++i) { + cf_debug(AS_INFO, "Updating alumni %" PRIx64, cont.deleted[i]); + info_node_info *info_history; + pthread_mutex_t *vlock_history; + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &cont.deleted[i], + (void **)&info_history, &vlock_history) != CF_SHASH_OK) { + cf_crash(AS_INFO, "Removing a node (%" PRIx64 ") that is not an alumni", + cont.deleted[i]); + } + + info_history->last_changed = cf_atomic64_incr(&g_peers_gen); + cf_debug(AS_INFO, "Peers generation %" PRId64 ": removed node %" PRIx64, + info_history->last_changed, cont.deleted[i]); + pthread_mutex_unlock(vlock_history); + } + + uint32_t after = cf_shash_get_size(g_info_node_info_hash); + cf_debug(AS_INFO, "After removal, info hash has %u element(s)", after); + + cf_atomic32_incr(&g_node_info_generation); + cf_debug(AS_INFO, "info_clustering_event_listener took %" PRIu64 " ms", cf_getms() - start_ms); + + // Trigger an immediate tend to start peer list update across the cluster. + info_node_info_tend(); +} + +// This goes in a reduce function for retransmitting my information to another node + +int +info_node_info_reduce_fn(const void *key, void *data, void *udata) +{ + const cf_node *node = (const cf_node *)key; + info_node_info *infop = (info_node_info *) data; + + if (infop->generation < g_serv_gen) { + + cf_debug(AS_INFO, "sending service string %s to node %"PRIx64, g_serv_legacy, *node); + + pthread_mutex_lock(&g_serv_lock); + + msg *m = as_fabric_msg_get(M_TYPE_INFO); + + // If we don't have the remote node's service address, request it via our update info. msg. + msg_set_uint32(m, INFO_FIELD_OP, infop->service_addr && infop->services_clear_std ? + INFO_OP_UPDATE : INFO_OP_UPDATE_REQ); + msg_set_uint32(m, INFO_FIELD_GENERATION, g_serv_gen); + + if (g_serv_legacy) { + msg_set_str(m, INFO_FIELD_SERVICE_ADDRESS, g_serv_legacy, MSG_SET_COPY); + } + + // Legacy alternate address field. 
+ for (uint32_t i = 0; i < g_access.alt_service.addrs.n_addrs; ++i) { + if (cf_ip_addr_str_is_legacy(g_access.alt_service.addrs.addrs[i])) { + char tmp[250]; + snprintf(tmp, sizeof(tmp), "%s:%d", g_access.alt_service.addrs.addrs[i], + g_access.service.port); + msg_set_str(m, INFO_FIELD_ALT_ADDRESS, tmp, MSG_SET_COPY); + break; + } + } + + if (g_serv_clear_std) { + msg_set_str(m, INFO_FIELD_SERVICES_CLEAR_STD, g_serv_clear_std, MSG_SET_COPY); + } + + if (g_serv_tls_std) { + msg_set_str(m, INFO_FIELD_SERVICES_TLS_STD, g_serv_tls_std, MSG_SET_COPY); + } + + if (g_serv_clear_alt) { + msg_set_str(m, INFO_FIELD_SERVICES_CLEAR_ALT, g_serv_clear_alt, MSG_SET_COPY); + } + + if (g_serv_tls_alt) { + msg_set_str(m, INFO_FIELD_SERVICES_TLS_ALT, g_serv_tls_alt, MSG_SET_COPY); + } + + if (g_serv_tls_name) { + msg_set_str(m, INFO_FIELD_TLS_NAME, g_serv_tls_name, MSG_SET_COPY); + } + + pthread_mutex_unlock(&g_serv_lock); + + if (as_fabric_send(*node, m, AS_FABRIC_CHANNEL_CTRL) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + } + + return(0); +} + +static char * +convert_legacy_services(const char *legacy) +{ + if (legacy == NULL) { + return NULL; + } + + char *res = cf_strdup(legacy); + + for (size_t i = 0; res[i] != 0; ++i) { + if (res[i] == ';') { + res[i] = ','; + } + } + + return res; +} + +// +// Receive a message from a remote node, jam it in my table +// + +int +info_msg_fn(cf_node node, msg *m, void *udata) +{ + uint32_t op; + + if (msg_get_uint32(m, INFO_FIELD_OP, &op) != 0) { + as_fabric_msg_put(m); + return 0; + } + + switch (op) { + case INFO_OP_UPDATE: + case INFO_OP_UPDATE_REQ: + { + cf_debug(AS_INFO, "Received service address from node %" PRIx64 "; op = %u", node, op); + info_node_info temp; + temp.generation = 0; + temp.last_changed = 0; + reset_node_info_services(&temp); + bool node_info_tend_required = false; + + info_node_info *info_history; + pthread_mutex_t *vlock_history; + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &node, (void **)&info_history, + &vlock_history) != CF_SHASH_OK) { + // This may fail, but this is ok. This should only fail when as_info_paxos_event + // is concurrently trying to add this key, so either way the entry will be in the + // hash table. 
+ cf_shash_put_unique(g_info_node_info_history_hash, &node, &temp); + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &node, (void **)&info_history, + &vlock_history) != CF_SHASH_OK) { + cf_crash(AS_INFO, + "Could not create info history hash entry for %" PRIx64, node); + break; + } + } + + free_node_info_services(info_history); + + if (msg_get_str(m, INFO_FIELD_SERVICE_ADDRESS, &info_history->service_addr, + 0, MSG_GET_COPY_MALLOC) != 0 || !info_history->service_addr) { + cf_debug(AS_INFO, "No service address in message from node %" PRIx64, node); + } + + if (msg_get_str(m, INFO_FIELD_ALT_ADDRESS, &info_history->alternate_addr, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No alternate address message from node %" PRIx64, node); + } + + if (msg_get_str(m, INFO_FIELD_SERVICES_CLEAR_STD, &info_history->services_clear_std, + 0, MSG_GET_COPY_MALLOC) != 0 || !info_history->services_clear_std) { + cf_debug(AS_INFO, "No services-clear-std in message from node %" PRIx64, node); + info_history->services_clear_std = + convert_legacy_services(info_history->service_addr); + } + + if (msg_get_str(m, INFO_FIELD_SERVICES_TLS_STD, &info_history->services_tls_std, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No services-tls-std in message from node %" PRIx64, node); + } + + if (msg_get_str(m, INFO_FIELD_SERVICES_CLEAR_ALT, &info_history->services_clear_alt, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No services-clear-alt in message from node %" PRIx64, node); + info_history->services_clear_alt = + convert_legacy_services(info_history->alternate_addr); + } + + if (msg_get_str(m, INFO_FIELD_SERVICES_TLS_ALT, &info_history->services_tls_alt, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No services-tls-alt in message from node %" PRIx64, node); + } + + if (msg_get_str(m, INFO_FIELD_TLS_NAME, &info_history->tls_name, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No tls-name in message from node %" PRIx64, node); + } + + dump_node_info_services(info_history); + + info_node_info *info; + pthread_mutex_t *vlock; + info_node_info info_to_tend = { 0 }; + + if (cf_shash_get_vlock(g_info_node_info_hash, &node, (void **)&info, &vlock) == CF_SHASH_OK) { + if (!compare_node_info_services(info_history, info)) { + cf_debug(AS_INFO, "Changed node info entry, was:"); + dump_node_info_services(info); + info->last_changed = cf_atomic64_incr(&g_peers_gen); + cf_debug(AS_INFO, "Peers generation %" PRId64 ": updated node %" PRIx64, + info->last_changed, node); + } + + free_node_info_services(info); + clone_node_info_services(info_history, info); + if (INFO_OP_UPDATE_REQ == op) { + cf_debug(AS_INFO, "Received request for info update from node %" PRIx64 " ~~ setting node's info generation to 0!", node); + info->generation = 0; + node_info_tend_required = true; + memcpy(&info_to_tend, info, sizeof(info_to_tend)); + } + + pthread_mutex_unlock(vlock); + } + else { + // Before history hash was added to the code base, we would throw away the message + // in this case. + cf_debug(AS_INFO, "Node %" PRIx64 " not in info hash, saving service address in info history hash", node); + } + + pthread_mutex_unlock(vlock_history); + + // Send the ACK. 
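+    // (The ACK echoes INFO_FIELD_GENERATION; on receipt, the INFO_OP_ACK case
+    // below records it as the acked generation, which stops the retransmits
+    // done by info_node_info_reduce_fn().)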
+ msg_preserve_fields(m, 1, INFO_FIELD_GENERATION); + msg_set_uint32(m, INFO_FIELD_OP, INFO_OP_ACK); + + int rv = as_fabric_send(node, m, AS_FABRIC_CHANNEL_CTRL); + + if (rv != AS_FABRIC_SUCCESS) { + cf_warning(AS_INFO, "Failed to send message %p with type %d to node %"PRIu64" (rv %d)", + m, (int32_t)m->type, node, rv); + as_fabric_msg_put(m); + } + + if (node_info_tend_required) { + // Send our service update to the source. + info_node_info_reduce_fn(&node, &info_to_tend, NULL); + } + } + + break; + + case INFO_OP_ACK: + { + + cf_debug(AS_INFO, " received ACK from node %"PRIx64, node); + + // TODO - dangerous to continue if no generation ??? + uint32_t gen = 0; + msg_get_uint32(m, INFO_FIELD_GENERATION, &gen); + info_node_info *info; + pthread_mutex_t *vlock; + if (0 == cf_shash_get_vlock(g_info_node_info_hash, &node, (void **) &info, &vlock)) { + + info->generation = gen; + + pthread_mutex_unlock(vlock); + } + + as_fabric_msg_put(m); + + } + break; + + default: + as_fabric_msg_put(m); + break; + } + + return(0); +} + +// +// This dynamic function reduces the info_node_info hash and builds up the string of services +// + +int32_t +info_get_x_legacy_reduce_fn(const void *key, void *data, void *udata) +{ + services_printer *sp = udata; + info_node_info *info = data; + + info_node_proj_fn proj = sp->proj; + cf_dyn_buf *db = sp->db; + const char *services = proj(info); + + if (services == NULL) { + return 0; + } + + if (sp->count > 0) { + cf_dyn_buf_append_char(db, ';'); + } + + cf_dyn_buf_append_string(db, services); + ++sp->count; + return 0; +} + +int32_t +info_get_x_legacy_reduce(cf_shash *h, info_node_proj_fn proj, cf_dyn_buf *db) +{ + services_printer sp = { .proj = proj, .db = db }; + cf_shash_reduce(h, info_get_x_legacy_reduce_fn, (void *)&sp); + return 0; +} + +static const char * +project_services(info_node_info *info) +{ + return info->service_addr; +} + +int32_t +info_get_services(char *name, cf_dyn_buf *db) +{ + return info_get_x_legacy_reduce(g_info_node_info_hash, project_services, db); +} + +int32_t +info_get_services_alumni(char *name, cf_dyn_buf *db) +{ + return info_get_x_legacy_reduce(g_info_node_info_history_hash, project_services, db); +} + +static const char * +project_alt_addr(info_node_info *info) +{ + return info->alternate_addr; +} + +int32_t +info_get_alt_addr(char *name, cf_dyn_buf *db) +{ + return info_get_x_legacy_reduce(g_info_node_info_hash, project_alt_addr, db); +} + +int32_t +info_port_savings_reduce_fn(const void *key, void *data, void *udata) +{ + port_savings_context *psc = udata; + info_node_info *info = data; + + if (info->last_changed <= psc->since) { + return 0; + } + + const char *services = psc->proj(info); + + if (services == NULL) { + return 0; + } + + int32_t curr; + + for (int32_t end = strlen(services); end > 0; end = curr) { + int32_t mult = 1; + int32_t port = 0; + + for (curr = end - 1; curr >= 0; --curr) { + char ch = services[curr]; + + if (ch == ':') { + break; + } + + if (ch < '0' || ch > '9') { + cf_warning(AS_INFO, "Invalid port number in services string: %s", services); + return 0; + } + + port += (ch - '0') * mult; + mult *= 10; + } + + int32_t savings = end - curr; + cf_debug(AS_INFO, "Default port %d saves %d byte(s)", port, savings); + psc->port_savings[port] += savings; + + while (curr >= 0 && services[curr] != ',') { + --curr; + } + } + + return 0; +} + +static char * +strip_service_suffixes(const char *services, const char *strip) +{ + const int32_t services_len = strlen(services); + const int32_t strip_len = strlen(strip); + 
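+    // For illustration (port value hypothetical), with strip ":3000":
+    //     "10.1.1.1:3000,10.1.1.2:3101" -> "10.1.1.1,10.1.1.2:3101"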
+ char *clone = cf_strdup(services); + + int32_t left = services_len; + int32_t right = services_len; + + while (left >= strip_len) { + if (memcmp(clone + left - strip_len, strip, strip_len) == 0) { + left -= strip_len; + } + + while (left > 0) { + clone[--right] = clone[--left]; + + if (clone[left] == ',') { + break; + } + } + } + + memmove(clone, clone + right, services_len - right + 1); + return clone; +} + +int32_t +info_get_services_x_reduce_fn(const void *key, void *data, void *udata) +{ + services_printer *sp = udata; + const cf_node *node = key; + info_node_info *info = data; + + if (info->last_changed <= sp->since) { + return 0; + } + + const char *services = sp->proj(info); + + if (services == NULL) { + return 0; + } + + cf_dyn_buf *db = sp->db; + + if (sp->count > 0) { + cf_dyn_buf_append_char(db, ','); + } + + char node_id[17]; + cf_str_itoa_u64(*node, node_id, 16); + + cf_dyn_buf_append_char(db, '['); + cf_dyn_buf_append_string(db, node_id); + cf_dyn_buf_append_char(db, ','); + + if (sp->with_tls_name && info->tls_name) { + cf_dyn_buf_append_string(db, info->tls_name); + } + + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_char(db, '['); + + if (sp->strip != NULL) { + char *stripped = strip_service_suffixes(services, sp->strip); + cf_dyn_buf_append_string(db, stripped); + cf_free(stripped); + } + else { + cf_dyn_buf_append_string(db, services); + } + + cf_dyn_buf_append_char(db, ']'); + cf_dyn_buf_append_char(db, ']'); + + ++sp->count; + return 0; +} + +int32_t +info_get_services_x(cf_shash *h, info_node_proj_fn proj, cf_dyn_buf *db, uint64_t since, + bool with_tls_name) +{ + // Pick the default port that saves us the most space. + port_savings_context psc = { .proj = proj, .since = since }; + cf_shash_reduce(h, info_port_savings_reduce_fn, &psc); + + int32_t best_savings = 0; + int32_t best_port = 0; + + for (int32_t i = 0; i < 65536; ++i) { + if (psc.port_savings[i] > best_savings) { + best_savings = psc.port_savings[i]; + best_port = i; + } + } + + cf_debug(AS_INFO, "Best default port is %d, saves %d byte(s)", best_port, best_savings); + + cf_dyn_buf_append_uint64(db, cf_atomic64_get(g_peers_gen)); + cf_dyn_buf_append_char(db, ','); + + if (best_port > 0) { + cf_dyn_buf_append_int(db, best_port); + } + + cf_dyn_buf_append_char(db, ','); + + cf_dyn_buf_append_char(db, '['); + + char strip[7]; + snprintf(strip, sizeof(strip), ":%d", best_port); + + services_printer sp = { .proj = proj, .db = db, .strip = strip, .since = since, + .with_tls_name = with_tls_name }; + cf_shash_reduce(h, info_get_services_x_reduce_fn, (void *)&sp); + + cf_dyn_buf_append_char(db, ']'); + return sp.count; +} + +int32_t +info_get_services_x_gone_reduce_fn(const void *key, void *data, void *udata) +{ + services_printer *sp = udata; + const cf_node *node = key; + info_node_info *info = data; + + if (info->last_changed <= sp->since || sp->proj(info) == NULL) { + return 0; + } + + cf_dyn_buf *db = sp->db; + + if (sp->count > 0) { + cf_dyn_buf_append_char(db, ','); + } + + char node_id[17]; + cf_str_itoa_u64(*node, node_id, 16); + + cf_dyn_buf_append_char(db, '['); + cf_dyn_buf_append_string(db, node_id); + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_char(db, ']'); + + ++sp->count; + return 0; +} + +void +info_get_services_x_delta(info_node_proj_fn proj, cf_dyn_buf *db, char *params, bool with_tls_name) +{ + uint64_t since; + + if (cf_str_atoi_64(params, (int64_t *)&since) < 0) { + cf_warning(AS_INFO, "Invalid peers generation %s", params); + 
cf_dyn_buf_append_string(db, "ERROR"); + return; + } + + uint64_t orig_gen = cf_atomic64_get(g_peers_gen); + + while (true) { + int32_t count = info_get_services_x(g_info_node_info_hash, proj, db, since, with_tls_name); + cf_dyn_buf_chomp(db); // Remove the "]". + + services_printer sp = { .proj = proj, .db = db, .since = since, .count = count }; + cf_shash_reduce(g_info_node_info_history_hash, info_get_services_x_gone_reduce_fn, &sp); + + cf_dyn_buf_append_char(db, ']'); // Re-add the "]". + + // Doing the above two reductions doesn't happen atomically. Theoretically, peers can + // arrive or leave between the two invocations, leading to duplicate or missing peers in + // the list. In this case, simply try again. + + uint64_t gen = cf_atomic64_get(g_peers_gen); + + if (gen == orig_gen) { + break; + } + + db->used_sz = 0; + orig_gen = gen; + } +} + +static const char * +project_services_clear_std(info_node_info *info) +{ + return info->services_clear_std; +} + +int32_t +info_get_services_clear_std(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_hash, project_services_clear_std, db, 0, false); + return 0; +} + +int32_t +info_get_services_clear_std_delta(char *name, char *params, cf_dyn_buf *db) +{ + info_get_services_x_delta(project_services_clear_std, db, params, false); + return 0; +} + +int32_t +info_get_alumni_clear_std(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_history_hash, project_services_clear_std, db, 0, false); + return 0; +} + +static const char * +project_services_tls_std(info_node_info *info) +{ + return info->services_tls_std; +} + +int32_t +info_get_services_tls_std(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_hash, project_services_tls_std, db, 0, true); + return 0; +} + +int32_t +info_get_services_tls_std_delta(char *name, char *params, cf_dyn_buf *db) +{ + info_get_services_x_delta(project_services_tls_std, db, params, true); + return 0; +} + +int32_t +info_get_alumni_tls_std(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_history_hash, project_services_tls_std, db, 0, true); + return 0; +} + +static const char * +project_services_clear_alt(info_node_info *info) +{ + return info->services_clear_alt; +} + +int32_t +info_get_services_clear_alt(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_hash, project_services_clear_alt, db, 0, false); + return 0; +} + +int32_t +info_get_services_clear_alt_delta(char *name, char *params, cf_dyn_buf *db) +{ + info_get_services_x_delta(project_services_clear_alt, db, params, false); + return 0; +} + +static const char * +project_services_tls_alt(info_node_info *info) +{ + return info->services_tls_alt; +} + +int32_t +info_get_services_tls_alt(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_hash, project_services_tls_alt, db, 0, true); + return 0; +} + +int32_t +info_get_services_tls_alt_delta(char *name, char *params, cf_dyn_buf *db) +{ + info_get_services_x_delta(project_services_tls_alt, db, params, true); + return 0; +} + +int32_t +info_get_services_generation(char *name, cf_dyn_buf *db) +{ + cf_dyn_buf_append_uint64(db, cf_atomic64_get(g_peers_gen)); + return 0; +} + +// +// This dynamic function removes nodes from g_info_node_info_history_hash that +// aren't present in g_info_node_info_hash. +// +int +history_purge_reduce_fn(const void *key, void *data, void *udata) +{ + return CF_SHASH_OK == cf_shash_get(g_info_node_info_hash, key, NULL) ? 
CF_SHASH_OK : CF_SHASH_REDUCE_DELETE; +} + +int +info_services_alumni_reset(char *name, cf_dyn_buf *db) +{ + cf_shash_reduce(g_info_node_info_history_hash, history_purge_reduce_fn, NULL); + cf_info(AS_INFO, "services alumni list reset"); + cf_dyn_buf_append_string(db, "ok"); + + return(0); +} + + + +// +// Iterate through the current namespace list and cons up a string +// + +int +info_get_namespaces(char *name, cf_dyn_buf *db) +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + cf_dyn_buf_append_string(db, g_config.namespaces[i]->name); + cf_dyn_buf_append_char(db, ';'); + } + + if (g_config.n_namespaces > 0) { + cf_dyn_buf_chomp(db); + } + + return(0); +} + +int +info_get_logs(char *name, cf_dyn_buf *db) +{ + cf_fault_sink_strlist(db); + return(0); +} + +int +info_get_objects(char *name, cf_dyn_buf *db) +{ + uint64_t objects = 0; + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + objects += g_config.namespaces[i]->n_objects; + } + + cf_dyn_buf_append_uint64(db, objects); + return(0); +} + +int +info_get_sets(char *name, cf_dyn_buf *db) +{ + return info_get_tree_sets(name, "", db); +} + +int +info_get_bins(char *name, cf_dyn_buf *db) +{ + return info_get_tree_bins(name, "", db); +} + +int +info_get_config( char* name, cf_dyn_buf *db) +{ + return info_command_config_get(name, NULL, db); +} + +int +info_get_sindexes(char *name, cf_dyn_buf *db) +{ + return info_get_tree_sindexes(name, "", db); +} + + +void +info_get_namespace_info(as_namespace *ns, cf_dyn_buf *db) +{ + // Cluster size. + + // Using ns_ prefix to avoid confusion with global cluster_size. + info_append_uint32(db, "ns_cluster_size", ns->cluster_size); + + // Using effective_ prefix to avoid confusion with configured value. + info_append_uint32(db, "effective_replication_factor", ns->replication_factor); + + // Object counts. + + info_append_uint64(db, "objects", ns->n_objects); + info_append_uint64(db, "tombstones", ns->n_tombstones); + + repl_stats mp; + as_partition_get_replica_stats(ns, &mp); + + info_append_uint64(db, "master_objects", mp.n_master_objects); + info_append_uint64(db, "master_tombstones", mp.n_master_tombstones); + info_append_uint64(db, "prole_objects", mp.n_prole_objects); + info_append_uint64(db, "prole_tombstones", mp.n_prole_tombstones); + info_append_uint64(db, "non_replica_objects", mp.n_non_replica_objects); + info_append_uint64(db, "non_replica_tombstones", mp.n_non_replica_tombstones); + + // Consistency info. + + info_append_uint32(db, "dead_partitions", ns->n_dead_partitions); + info_append_uint32(db, "unavailable_partitions", ns->n_unavailable_partitions); + info_append_bool(db, "clock_skew_stop_writes", ns->clock_skew_stop_writes); + + // Expiration & eviction (nsup) stats. + + info_append_bool(db, "stop_writes", ns->stop_writes != 0); + info_append_bool(db, "hwm_breached", ns->hwm_breached != 0); + + info_append_uint64(db, "current_time", as_record_void_time_get()); + info_append_uint64(db, "non_expirable_objects", ns->non_expirable_objects); + info_append_uint64(db, "expired_objects", ns->n_expired_objects); + info_append_uint64(db, "evicted_objects", ns->n_evicted_objects); + info_append_uint64(db, "evict_ttl", ns->evict_ttl); + info_append_uint32(db, "nsup_cycle_duration", ns->nsup_cycle_duration); + info_append_uint32(db, "nsup_cycle_sleep_pct", ns->nsup_cycle_sleep_pct); + + // Truncate stats. + + info_append_uint64(db, "truncate_lut", ns->truncate.lut); + info_append_uint64(db, "truncated_records", ns->truncate.n_records); + + // Memory usage stats. 
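+	// Illustrative note: used = data + primary index + secondary index, where
+	// the primary index costs as_index_size_get(ns) bytes per object or
+	// tombstone (typically 64 bytes per record).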
+ + uint64_t data_memory = ns->n_bytes_memory; + uint64_t index_memory = as_index_size_get(ns) * (ns->n_objects + ns->n_tombstones); + uint64_t sindex_memory = ns->n_bytes_sindex_memory; + uint64_t used_memory = data_memory + index_memory + sindex_memory; + + info_append_uint64(db, "memory_used_bytes", used_memory); + info_append_uint64(db, "memory_used_data_bytes", data_memory); + info_append_uint64(db, "memory_used_index_bytes", index_memory); + info_append_uint64(db, "memory_used_sindex_bytes", sindex_memory); + + uint64_t free_pct = (ns->memory_size != 0 && (ns->memory_size > used_memory)) ? + ((ns->memory_size - used_memory) * 100L) / ns->memory_size : 0; + + info_append_uint64(db, "memory_free_pct", free_pct); + + // Persistent memory block keys' namespace ID (enterprise only). + info_append_uint32(db, "xmem_id", ns->xmem_id); + + // Remaining bin-name slots (yes, this can be negative). + if (! ns->single_bin) { + info_append_int(db, "available_bin_names", BIN_NAMES_QUOTA - (int)cf_vmapx_count(ns->p_bin_name_vmap)); + } + + // Persistent storage stats. + + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + int available_pct = 0; + uint64_t inuse_disk_bytes = 0; + as_storage_stats(ns, &available_pct, &inuse_disk_bytes); + + info_append_uint64(db, "device_total_bytes", ns->ssd_size); + info_append_uint64(db, "device_used_bytes", inuse_disk_bytes); + + free_pct = (ns->ssd_size != 0 && (ns->ssd_size > inuse_disk_bytes)) ? + ((ns->ssd_size - inuse_disk_bytes) * 100L) / ns->ssd_size : 0; + + info_append_uint64(db, "device_free_pct", free_pct); + info_append_int(db, "device_available_pct", available_pct); + + if (! ns->storage_data_in_memory) { + info_append_int(db, "cache_read_pct", (int)(ns->cache_read_pct + 0.5)); + } + } + + // Migration stats. + + info_append_uint64(db, "migrate_tx_partitions_imbalance", ns->migrate_tx_partitions_imbalance); + + info_append_uint64(db, "migrate_tx_instances", ns->migrate_tx_instance_count); + info_append_uint64(db, "migrate_rx_instances", ns->migrate_rx_instance_count); + + info_append_uint64(db, "migrate_tx_partitions_active", ns->migrate_tx_partitions_active); + info_append_uint64(db, "migrate_rx_partitions_active", ns->migrate_rx_partitions_active); + + info_append_uint64(db, "migrate_tx_partitions_initial", ns->migrate_tx_partitions_initial); + info_append_uint64(db, "migrate_tx_partitions_remaining", ns->migrate_tx_partitions_remaining); + + info_append_uint64(db, "migrate_rx_partitions_initial", ns->migrate_rx_partitions_initial); + info_append_uint64(db, "migrate_rx_partitions_remaining", ns->migrate_rx_partitions_remaining); + + info_append_uint64(db, "migrate_records_skipped", ns->migrate_records_skipped); + info_append_uint64(db, "migrate_records_transmitted", ns->migrate_records_transmitted); + info_append_uint64(db, "migrate_record_retransmits", ns->migrate_record_retransmits); + info_append_uint64(db, "migrate_record_receives", ns->migrate_record_receives); + + info_append_uint64(db, "migrate_signals_active", ns->migrate_signals_active); + info_append_uint64(db, "migrate_signals_remaining", ns->migrate_signals_remaining); + + info_append_uint64(db, "appeals_tx_active", ns->appeals_tx_active); + info_append_uint64(db, "appeals_rx_active", ns->appeals_rx_active); + + info_append_uint64(db, "appeals_tx_remaining", ns->appeals_tx_remaining); + + info_append_uint64(db, "appeals_records_exonerated", ns->appeals_records_exonerated); + + // From-client transaction stats. 
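+	// Note on naming: most transaction families below report cumulative
+	// success (or complete), error and timeout counters; reads and deletes
+	// additionally count not_found.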
+ + info_append_uint64(db, "client_tsvc_error", ns->n_client_tsvc_error); + info_append_uint64(db, "client_tsvc_timeout", ns->n_client_tsvc_timeout); + + info_append_uint64(db, "client_proxy_complete", ns->n_client_proxy_complete); + info_append_uint64(db, "client_proxy_error", ns->n_client_proxy_error); + info_append_uint64(db, "client_proxy_timeout", ns->n_client_proxy_timeout); + + info_append_uint64(db, "client_read_success", ns->n_client_read_success); + info_append_uint64(db, "client_read_error", ns->n_client_read_error); + info_append_uint64(db, "client_read_timeout", ns->n_client_read_timeout); + info_append_uint64(db, "client_read_not_found", ns->n_client_read_not_found); + + info_append_uint64(db, "client_write_success", ns->n_client_write_success); + info_append_uint64(db, "client_write_error", ns->n_client_write_error); + info_append_uint64(db, "client_write_timeout", ns->n_client_write_timeout); + + // Subset of n_client_write_... above, respectively. + info_append_uint64(db, "xdr_write_success", ns->n_xdr_write_success); + info_append_uint64(db, "xdr_write_error", ns->n_xdr_write_error); + info_append_uint64(db, "xdr_write_timeout", ns->n_xdr_write_timeout); + + info_append_uint64(db, "client_delete_success", ns->n_client_delete_success); + info_append_uint64(db, "client_delete_error", ns->n_client_delete_error); + info_append_uint64(db, "client_delete_timeout", ns->n_client_delete_timeout); + info_append_uint64(db, "client_delete_not_found", ns->n_client_delete_not_found); + + // Subset of n_client_delete_... above, respectively. + info_append_uint64(db, "xdr_delete_success", ns->n_xdr_delete_success); + info_append_uint64(db, "xdr_delete_error", ns->n_xdr_delete_error); + info_append_uint64(db, "xdr_delete_timeout", ns->n_xdr_delete_timeout); + info_append_uint64(db, "xdr_delete_not_found", ns->n_xdr_delete_not_found); + + info_append_uint64(db, "client_udf_complete", ns->n_client_udf_complete); + info_append_uint64(db, "client_udf_error", ns->n_client_udf_error); + info_append_uint64(db, "client_udf_timeout", ns->n_client_udf_timeout); + + info_append_uint64(db, "client_lang_read_success", ns->n_client_lang_read_success); + info_append_uint64(db, "client_lang_write_success", ns->n_client_lang_write_success); + info_append_uint64(db, "client_lang_delete_success", ns->n_client_lang_delete_success); + info_append_uint64(db, "client_lang_error", ns->n_client_lang_error); + + // Batch sub-transaction stats. + + info_append_uint64(db, "batch_sub_tsvc_error", ns->n_batch_sub_tsvc_error); + info_append_uint64(db, "batch_sub_tsvc_timeout", ns->n_batch_sub_tsvc_timeout); + + info_append_uint64(db, "batch_sub_proxy_complete", ns->n_batch_sub_proxy_complete); + info_append_uint64(db, "batch_sub_proxy_error", ns->n_batch_sub_proxy_error); + info_append_uint64(db, "batch_sub_proxy_timeout", ns->n_batch_sub_proxy_timeout); + + info_append_uint64(db, "batch_sub_read_success", ns->n_batch_sub_read_success); + info_append_uint64(db, "batch_sub_read_error", ns->n_batch_sub_read_error); + info_append_uint64(db, "batch_sub_read_timeout", ns->n_batch_sub_read_timeout); + info_append_uint64(db, "batch_sub_read_not_found", ns->n_batch_sub_read_not_found); + + // Internal-UDF sub-transaction stats. 
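+	// Note: udf_sub_* transactions are generated internally by background UDF
+	// jobs (e.g. scan/query UDFs), as opposed to the client-initiated
+	// client_udf_* transactions above.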
+ + info_append_uint64(db, "udf_sub_tsvc_error", ns->n_udf_sub_tsvc_error); + info_append_uint64(db, "udf_sub_tsvc_timeout", ns->n_udf_sub_tsvc_timeout); + + info_append_uint64(db, "udf_sub_udf_complete", ns->n_udf_sub_udf_complete); + info_append_uint64(db, "udf_sub_udf_error", ns->n_udf_sub_udf_error); + info_append_uint64(db, "udf_sub_udf_timeout", ns->n_udf_sub_udf_timeout); + + info_append_uint64(db, "udf_sub_lang_read_success", ns->n_udf_sub_lang_read_success); + info_append_uint64(db, "udf_sub_lang_write_success", ns->n_udf_sub_lang_write_success); + info_append_uint64(db, "udf_sub_lang_delete_success", ns->n_udf_sub_lang_delete_success); + info_append_uint64(db, "udf_sub_lang_error", ns->n_udf_sub_lang_error); + + // Transaction retransmit stats. + + info_append_uint64(db, "retransmit_client_read_dup_res", ns->n_retransmit_client_read_dup_res); + + info_append_uint64(db, "retransmit_client_write_dup_res", ns->n_retransmit_client_write_dup_res); + info_append_uint64(db, "retransmit_client_write_repl_write", ns->n_retransmit_client_write_repl_write); + + info_append_uint64(db, "retransmit_client_delete_dup_res", ns->n_retransmit_client_delete_dup_res); + info_append_uint64(db, "retransmit_client_delete_repl_write", ns->n_retransmit_client_delete_repl_write); + + info_append_uint64(db, "retransmit_client_udf_dup_res", ns->n_retransmit_client_udf_dup_res); + info_append_uint64(db, "retransmit_client_udf_repl_write", ns->n_retransmit_client_udf_repl_write); + + info_append_uint64(db, "retransmit_batch_sub_dup_res", ns->n_retransmit_batch_sub_dup_res); + + info_append_uint64(db, "retransmit_udf_sub_dup_res", ns->n_retransmit_udf_sub_dup_res); + info_append_uint64(db, "retransmit_udf_sub_repl_write", ns->n_retransmit_udf_sub_repl_write); + + // Scan stats. + + info_append_uint64(db, "scan_basic_complete", ns->n_scan_basic_complete); + info_append_uint64(db, "scan_basic_error", ns->n_scan_basic_error); + info_append_uint64(db, "scan_basic_abort", ns->n_scan_basic_abort); + + info_append_uint64(db, "scan_aggr_complete", ns->n_scan_aggr_complete); + info_append_uint64(db, "scan_aggr_error", ns->n_scan_aggr_error); + info_append_uint64(db, "scan_aggr_abort", ns->n_scan_aggr_abort); + + info_append_uint64(db, "scan_udf_bg_complete", ns->n_scan_udf_bg_complete); + info_append_uint64(db, "scan_udf_bg_error", ns->n_scan_udf_bg_error); + info_append_uint64(db, "scan_udf_bg_abort", ns->n_scan_udf_bg_abort); + + // Query stats. 
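+	// Note: query_agg_avg_rec_count and query_lookup_avg_rec_count below are
+	// computed as total records returned divided by the number of queries of
+	// that type (0 when no such queries have run).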
+ + uint64_t agg = ns->n_aggregation; + uint64_t agg_success = ns->n_agg_success; + uint64_t agg_err = ns->n_agg_errs; + uint64_t agg_abort = ns->n_agg_abort; + uint64_t agg_records = ns->agg_num_records; + + uint64_t lkup = ns->n_lookup; + uint64_t lkup_success = ns->n_lookup_success; + uint64_t lkup_err = ns->n_lookup_errs; + uint64_t lkup_abort = ns->n_lookup_abort; + uint64_t lkup_records = ns->lookup_num_records; + + info_append_uint64(db, "query_reqs", ns->query_reqs); + info_append_uint64(db, "query_fail", ns->query_fail); + + info_append_uint64(db, "query_short_queue_full", ns->query_short_queue_full); + info_append_uint64(db, "query_long_queue_full", ns->query_long_queue_full); + info_append_uint64(db, "query_short_reqs", ns->query_short_reqs); + info_append_uint64(db, "query_long_reqs", ns->query_long_reqs); + + info_append_uint64(db, "query_agg", agg); + info_append_uint64(db, "query_agg_success", agg_success); + info_append_uint64(db, "query_agg_error", agg_err); + info_append_uint64(db, "query_agg_abort", agg_abort); + info_append_uint64(db, "query_agg_avg_rec_count", agg ? agg_records / agg : 0); + + info_append_uint64(db, "query_lookups", lkup); + info_append_uint64(db, "query_lookup_success", lkup_success); + info_append_uint64(db, "query_lookup_error", lkup_err); + info_append_uint64(db, "query_lookup_abort", lkup_abort); + info_append_uint64(db, "query_lookup_avg_rec_count", lkup ? lkup_records / lkup : 0); + + info_append_uint64(db, "query_udf_bg_success", ns->n_query_udf_bg_success); + info_append_uint64(db, "query_udf_bg_failure", ns->n_query_udf_bg_failure); + + // Geospatial query stats: + info_append_uint64(db, "geo_region_query_reqs", ns->geo_region_query_count); + info_append_uint64(db, "geo_region_query_cells", ns->geo_region_query_cells); + info_append_uint64(db, "geo_region_query_points", ns->geo_region_query_points); + info_append_uint64(db, "geo_region_query_falsepos", ns->geo_region_query_falsepos); + + // Re-replication stats - relevant only for enterprise edition. + + info_append_uint64(db, "re_repl_success", ns->n_re_repl_success); + info_append_uint64(db, "re_repl_error", ns->n_re_repl_error); + info_append_uint64(db, "re_repl_timeout", ns->n_re_repl_timeout); + + // Special errors that deserve their own counters: + + info_append_uint64(db, "fail_xdr_forbidden", ns->n_fail_xdr_forbidden); + info_append_uint64(db, "fail_key_busy", ns->n_fail_key_busy); + info_append_uint64(db, "fail_generation", ns->n_fail_generation); + info_append_uint64(db, "fail_record_too_big", ns->n_fail_record_too_big); + + // Special non-error counters: + + info_append_uint64(db, "deleted_last_bin", ns->n_deleted_last_bin); +} + +// +// Iterate through the current namespace list and cons up a string +// + +int +info_get_tree_namespace(char *name, char *subtree, cf_dyn_buf *db) +{ + as_namespace *ns = as_namespace_get_byname(subtree); + + if (! ns) { + cf_dyn_buf_append_string(db, "type=unknown"); // TODO - better message? + return 0; + } + + info_get_namespace_info(ns, db); + info_namespace_config_get(ns->name, db); + + cf_dyn_buf_chomp(db); + + return 0; +} + +int +info_get_tree_sets(char *name, char *subtree, cf_dyn_buf *db) +{ + char *set_name = NULL; + as_namespace *ns = NULL; + + // if there is a subtree, get the namespace + if (subtree && strlen(subtree) > 0) { + // see if subtree has a sep as well + set_name = strchr(subtree, TREE_SEP); + + // pull out namespace, and namespace name... 
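+	// Illustrative example (assuming TREE_SEP is '/'): a subtree of
+	// "test/demo" yields namespace "test" and set "demo"; "test" alone
+	// selects all sets in that namespace.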
+ if (set_name) { + int ns_name_len = (set_name - subtree); + char ns_name[ns_name_len + 1]; + memcpy(ns_name, subtree, ns_name_len); + ns_name[ns_name_len] = '\0'; + ns = as_namespace_get_byname(ns_name); + set_name++; // currently points to the TREE_SEP, which is not what we want. + } + else { + ns = as_namespace_get_byname(subtree); + } + + if (!ns) { + cf_dyn_buf_append_string(db, "ns_type=unknown"); + return(0); + } + } + + // format w/o namespace is ns1:set1:prop1=val1:prop2=val2:..propn=valn;ns1:set2...;ns2:set1...; + if (!ns) { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace_get_set_info(g_config.namespaces[i], set_name, db); + } + } + // format w namespace w/o set name is ns:set1:prop1=val1:prop2=val2...propn=valn;ns:set2...; + // format w namespace & set name is prop1=val1:prop2=val2...propn=valn; + else { + as_namespace_get_set_info(ns, set_name, db); + } + return(0); +} + +int +info_get_tree_statistics(char *name, char *subtree, cf_dyn_buf *db) +{ + if (strcmp(subtree, "xdr") == 0) { + as_xdr_get_stats(db); + cf_dyn_buf_chomp(db); + return 0; + } + + cf_dyn_buf_append_string(db, "error"); + return -1; +} + +int +info_get_tree_bins(char *name, char *subtree, cf_dyn_buf *db) +{ + as_namespace *ns = NULL; + + // if there is a subtree, get the namespace + if (subtree && strlen(subtree) > 0) { + ns = as_namespace_get_byname(subtree); + + if (!ns) { + cf_dyn_buf_append_string(db, "ns_type=unknown"); + return 0; + } + } + + // format w/o namespace is + // ns:num-bin-names=val1,bin-names-quota=val2,name1,name2,...;ns:... + if (!ns) { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace_get_bins_info(g_config.namespaces[i], db, true); + } + } + // format w/namespace is + // num-bin-names=val1,bin-names-quota=val2,name1,name2,... 
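+	// e.g. (hypothetical values): num-bin-names=2,bin-names-quota=65535,name,age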
+ else { + as_namespace_get_bins_info(ns, db, false); + } + + return 0; +} + +int +info_command_hist_dump(char *name, char *params, cf_dyn_buf *db) +{ + char value_str[128]; + int value_str_len = sizeof(value_str); + + if (0 != as_info_parameter_get(params, "ns", value_str, &value_str_len)) { + cf_info(AS_INFO, "hist-dump %s command: no namespace specified", name); + cf_dyn_buf_append_string(db, "error-no-namespace"); + return 0; + } + + as_namespace *ns = as_namespace_get_byname(value_str); + + if (!ns) { + cf_info(AS_INFO, "hist-dump %s command: unknown namespace: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-unknown-namespace"); + return 0; + } + + value_str_len = sizeof(value_str); + + if (0 != as_info_parameter_get(params, "hist", value_str, &value_str_len)) { + cf_info(AS_INFO, "hist-dump %s command:", name); + cf_dyn_buf_append_string(db, "error-no-hist-name"); + + return 0; + } + + // get optional set field + char set_name_str[AS_SET_NAME_MAX_SIZE]; + int set_name_str_len = sizeof(set_name_str); + set_name_str[0] = 0; + + as_info_parameter_get(params, "set", set_name_str, &set_name_str_len); + + // format is ns1:ns_hist1=bucket_count,offset,b1,b2,b3...; + as_namespace_get_hist_info(ns, set_name_str, value_str, db, true); + + return 0; +} + + +int +info_get_tree_log(char *name, char *subtree, cf_dyn_buf *db) +{ + // see if subtree has a sep as well + int sink_id; + char *context = strchr(subtree, TREE_SEP); + if (context) { // this means: log/id/context , + *context = 0; + context++; + + if (0 != cf_str_atoi(subtree, &sink_id)) return(-1); + + cf_fault_sink_context_strlist(sink_id, context, db); + } + else { // this means just: log/id , so get all contexts + if (0 != cf_str_atoi(subtree, &sink_id)) return(-1); + + cf_fault_sink_context_all_strlist(sink_id, db); + } + + return(0); +} + + +int +info_get_tree_sindexes(char *name, char *subtree, cf_dyn_buf *db) +{ + char *index_name = NULL; + as_namespace *ns = NULL; + + // if there is a subtree, get the namespace + if (subtree && strlen(subtree) > 0) { + // see if subtree has a sep as well + index_name = strchr(subtree, TREE_SEP); + + // pull out namespace, and namespace name... + if (index_name) { + int ns_name_len = (index_name - subtree); + char ns_name[ns_name_len + 1]; + memcpy(ns_name, subtree, ns_name_len); + ns_name[ns_name_len] = '\0'; + ns = as_namespace_get_byname(ns_name); + index_name++; // currently points to the TREE_SEP, which is not what we want. 
+ } + else { + ns = as_namespace_get_byname(subtree); + } + + if (!ns) { + cf_dyn_buf_append_string(db, "ns_type=unknown"); + return(0); + } + } + + // format w/o namespace is: + // ns=ns1:set=set1:indexname=index1:prop1=val1:...:propn=valn;ns=ns1:set=set2:indexname=index2:...;ns=ns2:set=set1:...; + if (!ns) { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_sindex_list_str(g_config.namespaces[i], db); + } + } + // format w namespace w/o index name is: + // ns=ns1:set=set1:indexname=index1:prop1=val1:...:propn=valn;ns=ns1:set=set2:indexname=indexname2:...; + else if (!index_name) { + as_sindex_list_str(ns, db); + } + else { + // format w namespace & index name is: + // prop1=val1;prop2=val2;...;propn=valn + int resp = as_sindex_stats_str(ns, index_name, db); + if (resp) { + cf_warning(AS_INFO, "Failed to get statistics for index %s: err = %d", index_name, resp); + INFO_COMMAND_SINDEX_FAILCODE( + as_sindex_err_to_clienterr(resp, __FILE__, __LINE__), + as_sindex_err_str(resp)); + } + } + return(0); +} + +int32_t +info_get_service(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_legacy != NULL ? g_serv_legacy : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +int32_t +info_get_service_clear_std(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_clear_std != NULL ? g_serv_clear_std : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +int32_t +info_get_service_tls_std(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_tls_std != NULL ? g_serv_tls_std : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +int32_t +info_get_service_clear_alt(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_clear_alt != NULL ? g_serv_clear_alt : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +int32_t +info_get_service_tls_alt(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_tls_alt != NULL ? g_serv_tls_alt : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +// SINDEX wire protocol examples: +// 1.) NUMERIC: sindex-create:ns=usermap;set=demo;indexname=um_age;indexdata=age,numeric +// 2.) STRING: sindex-create:ns=usermap;set=demo;indexname=um_state;indexdata=state,string +/* + * Parameters: + * params --- string passed to asinfo call + * imd -- parses the params and fills this sindex struct. + * + * Returns + * AS_SINDEX_OK if it successfully fills up imd + * AS_SINDEX_ERR_PARAM otherwise + * TODO REVIEW : send cmd as argument + */ +int +as_info_parse_params_to_sindex_imd(char* params, as_sindex_metadata *imd, cf_dyn_buf* db, + bool is_create, bool *is_smd_op, char * OP) +{ + if (! imd) { + cf_warning(AS_INFO, "%s : Failed. internal error.", OP); + return AS_SINDEX_ERR_PARAM; + } + + char indexname_str[AS_ID_INAME_SZ]; + int indname_len = sizeof(indexname_str); + int ret = as_info_parameter_get(params, STR_INDEXNAME, indexname_str, + &indname_len); + if ( ret == -1 ) { + cf_warning(AS_INFO, "%s : Failed. Missing Index name.", OP); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Missing Index name"); + return AS_SINDEX_ERR_PARAM; + } + else if ( ret == -2 ) { + cf_warning(AS_INFO, "%s : Failed. 
Index name longer than allowed %d.", + OP, AS_ID_INAME_SZ-1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Index name too long"); + return AS_SINDEX_ERR_PARAM; + } + + char cmd[128]; + snprintf(cmd, 128, "%s %s", OP, indexname_str); + + char ns_str[AS_ID_NAMESPACE_SZ]; + int ns_len = sizeof(ns_str); + ret = as_info_parameter_get(params, STR_NS, ns_str, &ns_len); + if ( ret == -1 ) { + cf_warning(AS_INFO, "%s : Failed. Missing Namespace name.", cmd); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Missing Namespace name"); + return AS_SINDEX_ERR_PARAM; + } + else if (ret == -2 ) { + cf_warning(AS_INFO, "%s : Failed. Namespace name longer than allowed %d.", + cmd, AS_ID_NAMESPACE_SZ - 1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Namespace name too long"); + return AS_SINDEX_ERR_PARAM; + } + + as_namespace *ns = as_namespace_get_byname(ns_str); + if (! ns) { + cf_warning(AS_INFO, "%s : Failed. Namespace '%s' not found %d", + cmd, ns_str, ns_len); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, "Namespace Not Found"); + return AS_SINDEX_ERR_PARAM; + } + if (ns->single_bin) { + cf_warning(AS_INFO, "%s : Failed. Secondary Index is not allowed on single bin " + "namespace '%s'.", cmd, ns_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Single bin namespace"); + return AS_SINDEX_ERR_PARAM; + } + + char set_str[AS_SET_NAME_MAX_SIZE]; + int set_len = sizeof(set_str); + if (imd->set) { + cf_free(imd->set); + imd->set = NULL; + } + ret = as_info_parameter_get(params, STR_SET, set_str, &set_len); + if (!ret && set_len != 0) { + if (as_namespace_get_create_set_w_len(ns, set_str, set_len, NULL, NULL) + != 0) { + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Set name quota full"); + return AS_SINDEX_ERR_PARAM; + } + imd->set = cf_strdup(set_str); + } else if (ret == -2) { + cf_warning(AS_INFO, "%s : Failed. Setname longer than %d for index.", + cmd, AS_SET_NAME_MAX_SIZE - 1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Set name too long"); + return AS_SINDEX_ERR_PARAM; + } + + char cluster_op[6]; + int cluster_op_len = sizeof(cluster_op); + if (as_info_parameter_get(params, "cluster_op", cluster_op, &cluster_op_len) + != 0) { + *is_smd_op = true; + } + else if (strcmp(cluster_op, "true") == 0) { + *is_smd_op = true; + } + else if (strcmp(cluster_op, "false") == 0) { + *is_smd_op = false; + } + + // Delete only need parsing till here + if (!is_create) { + imd->ns_name = cf_strdup(ns->name); + imd->iname = cf_strdup(indexname_str); + return 0; + } + + char indextype_str[AS_SINDEX_TYPE_STR_SIZE]; + int indtype_len = sizeof(indextype_str); + ret = as_info_parameter_get(params, STR_ITYPE, indextype_str, &indtype_len); + if (ret == -1) { + // if not specified the index type is DEFAULT + imd->itype = AS_SINDEX_ITYPE_DEFAULT; + } + else if (ret == -2) { + cf_warning(AS_INFO, "%s : Failed. 
Indextype str longer than allowed %d.", + cmd, AS_SINDEX_TYPE_STR_SIZE-1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Indextype is too long"); + return AS_SINDEX_ERR_PARAM; + + } + else { + if (strncasecmp(indextype_str, STR_ITYPE_DEFAULT, 7) == 0) { + imd->itype = AS_SINDEX_ITYPE_DEFAULT; + } + else if (strncasecmp(indextype_str, STR_ITYPE_LIST, 4) == 0) { + imd->itype = AS_SINDEX_ITYPE_LIST; + } + else if (strncasecmp(indextype_str, STR_ITYPE_MAPKEYS, 7) == 0) { + imd->itype = AS_SINDEX_ITYPE_MAPKEYS; + } + else if (strncasecmp(indextype_str, STR_ITYPE_MAPVALUES, 9) == 0) { + imd->itype = AS_SINDEX_ITYPE_MAPVALUES; + } + else { + cf_warning(AS_INFO, "%s : Failed. Invalid indextype '%s'.", cmd, + indextype_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Invalid indextype. Should be one of [DEFAULT, LIST, MAPKEYS, MAPVALUES]"); + return AS_SINDEX_ERR_PARAM; + } + } + + // Indexdata = binpath,keytype + char indexdata_str[AS_SINDEXDATA_STR_SIZE]; + int indexdata_len = sizeof(indexdata_str); + if (as_info_parameter_get(params, STR_INDEXDATA, indexdata_str, + &indexdata_len)) { + cf_warning(AS_INFO, "%s : Failed. Invalid indexdata '%s'.", cmd, + indexdata_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Invalid indexdata"); + return AS_SINDEX_ERR_PARAM; + } + + cf_vector *str_v = cf_vector_create(sizeof(void *), 10, VECTOR_FLAG_INITZERO); + cf_str_split(",", indexdata_str, str_v); + if ((cf_vector_size(str_v)) > 2) { + cf_warning(AS_INFO, "%s : Failed. >1 bins specified in indexdata.", + cmd); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Number of bins more than 1"); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + + char *path_str = NULL; + cf_vector_get(str_v, 0, &path_str); + if (! path_str) { + cf_warning(AS_INFO, "%s : Failed. Missing Bin Name.", cmd); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Missing Bin name"); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + + if (as_sindex_extract_bin_path(imd, path_str) + || ! imd->bname) { + cf_warning(AS_INFO, "%s : Failed. Invalid Bin Path '%s'.", cmd, path_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Invalid Bin path"); + return AS_SINDEX_ERR_PARAM; + } + + if (imd->bname && strlen(imd->bname) >= AS_ID_BIN_SZ) { + cf_warning(AS_INFO, "%s : Failed. Bin Name longer than allowed %d", + cmd, AS_ID_BIN_SZ - 1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, "Bin Name too long"); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + + char *type_str = NULL; + cf_vector_get(str_v, 1, &type_str); + if (! type_str) { + cf_warning(AS_INFO, "%s : Failed. Missing Bin type", cmd); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Missing Bin Type."); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + + as_sindex_ktype ktype = as_sindex_ktype_from_string(type_str); + if (ktype == COL_TYPE_INVALID) { + cf_warning(AS_INFO, "%s : Failed. Invalid Bin type '%s'.", cmd, type_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Invalid Bin type. 
Supported types [Numeric, String, Geo2dsphere]"); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + imd->sktype = ktype; + + + + cf_vector_destroy(str_v); + + if (is_create) { + imd->ns_name = cf_strdup(ns->name); + imd->iname = cf_strdup(indexname_str); + } + imd->path_str = cf_strdup(path_str); + return AS_SINDEX_OK; +} + +int info_command_sindex_create(char *name, char *params, cf_dyn_buf *db) +{ + as_sindex_metadata imd; + memset((void *)&imd, 0, sizeof(imd)); + bool is_smd_op = true; + + // Check info-command params for correctness. + int res = as_info_parse_params_to_sindex_imd(params, &imd, db, true, &is_smd_op, "SINDEX CREATE"); + + if (res != 0) { + goto ERR; + } + + as_namespace *ns = as_namespace_get_byname(imd.ns_name); + res = as_sindex_create_check_params(ns, &imd); + + if (res == AS_SINDEX_ERR_FOUND) { + cf_warning(AS_INFO, "SINDEX CREATE: Index already exists on namespace '%s', either with same name '%s' or same bin '%s' / type '%s' combination.", + imd.ns_name, imd.iname, imd.bname, + as_sindex_ktype_str(imd.sktype)); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_INDEX_FOUND, + "Index with the same name already exists or this bin has already been indexed."); + goto ERR; + } + else if (res == AS_SINDEX_ERR_MAXCOUNT) { + cf_warning(AS_INFO, "SINDEX CREATE : More than %d index are not allowed per namespace.", AS_SINDEX_MAX); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_INDEX_MAXCOUNT, + "Reached maximum number of sindex allowed"); + goto ERR; + } + + if (is_smd_op == true) + { + cf_info(AS_INFO, "SINDEX CREATE : Request received for %s:%s via SMD", imd.ns_name, imd.iname); + + char smd_key[SINDEX_SMD_KEY_SIZE]; + + as_sindex_imd_to_smd_key(&imd, smd_key); + res = as_smd_set_metadata(SINDEX_MODULE, smd_key, imd.iname); + + if (res != 0) { + cf_warning(AS_INFO, "SINDEX CREATE : Queuing the index %s metadata to SMD failed with error %s", + imd.iname, as_sindex_err_str(res)); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, as_sindex_err_str(res)); + goto ERR; + } + } + else if (is_smd_op == false) { + cf_info(AS_INFO, "SINDEX CREATE : Request received for %s:%s via info", imd.ns_name, imd.iname); + res = as_sindex_create(ns, &imd); + if (0 != res) { + cf_warning(AS_INFO, "SINDEX CREATE : Failed with error %s for index %s", + as_sindex_err_str(res), imd.iname); + INFO_COMMAND_SINDEX_FAILCODE(as_sindex_err_to_clienterr(res, __FILE__, __LINE__), + as_sindex_err_str(res)); + goto ERR; + } + } + cf_dyn_buf_append_string(db, "OK"); +ERR: + as_sindex_imd_free(&imd); + return(0); + +} + +int info_command_sindex_delete(char *name, char *params, cf_dyn_buf *db) { + as_sindex_metadata imd; + memset((void *)&imd, 0, sizeof(imd)); + bool is_smd_op = true; + int res = as_info_parse_params_to_sindex_imd(params, &imd, db, false, &is_smd_op, "SINDEX DROP"); + + if (res != 0) { + goto ERR; + } + + as_namespace *ns = as_namespace_get_byname(imd.ns_name); + + // Do not use as_sindex_exists_by_defn() here, it'll fail because bname is null. 
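+	// Illustrative wire example, mirroring the sindex-create examples above:
+	// sindex-delete:ns=usermap;set=demo;indexname=um_age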
+ if (!as_sindex_delete_checker(ns, &imd)) { + cf_warning(AS_INFO, "SINDEX DROP : Index %s:%s does not exist on the system", + imd.ns_name, imd.iname); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND, + "Index does not exist on the system."); + goto ERR; + } + + if (is_smd_op == true) + { + cf_info(AS_INFO, "SINDEX DROP : Request received for %s:%s via SMD", imd.ns_name, imd.iname); + + char smd_key[SINDEX_SMD_KEY_SIZE]; + + if (as_sindex_delete_imd_to_smd_key(ns, &imd, smd_key)) { + res = as_smd_delete_metadata(SINDEX_MODULE, smd_key); + } + else { + res = AS_SINDEX_ERR_NOTFOUND; + } + + if (0 != res) { + cf_warning(AS_INFO, "SINDEX DROP : Queuing the index %s metadata to SMD failed with error %s", + imd.iname, as_sindex_err_str(res)); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, as_sindex_err_str(res)); + goto ERR; + } + } + else if(is_smd_op == false) + { + cf_info(AS_INFO, "SINDEX DROP : Request received for %s:%s via info", imd.ns_name, imd.iname); + res = as_sindex_destroy(ns, &imd); + if (0 != res) { + cf_warning(AS_INFO, "SINDEX DROP : Failed with error %s for index %s", + as_sindex_err_str(res), imd.iname); + INFO_COMMAND_SINDEX_FAILCODE(as_sindex_err_to_clienterr(res, __FILE__, __LINE__), + as_sindex_err_str(res)); + goto ERR; + } + } + + cf_dyn_buf_append_string(db, "OK"); +ERR: + as_sindex_imd_free(&imd); + return 0; +} + +int +as_info_parse_ns_iname(char* params, as_namespace ** ns, char ** iname, cf_dyn_buf* db, char * sindex_cmd) +{ + char ns_str[AS_ID_NAMESPACE_SZ]; + int ns_len = sizeof(ns_str); + int ret = 0; + + ret = as_info_parameter_get(params, "ns", ns_str, &ns_len); + if (ret) { + if (ret == -2) { + cf_warning(AS_INFO, "%s : namespace name exceeds max length %d", + sindex_cmd, AS_ID_NAMESPACE_SZ); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Namespace name exceeds max length"); + } + else { + cf_warning(AS_INFO, "%s : invalid namespace %s", sindex_cmd, ns_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Namespace Not Specified"); + } + return -1; + } + + *ns = as_namespace_get_byname(ns_str); + if (!*ns) { + cf_warning(AS_INFO, "%s : namespace %s not found", sindex_cmd, ns_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Namespace Not Found"); + return -1; + } + + // get indexname + char index_name_str[AS_ID_INAME_SZ]; + int index_len = sizeof(index_name_str); + ret = as_info_parameter_get(params, "indexname", index_name_str, &index_len); + if (ret) { + if (ret == -2) { + cf_warning(AS_INFO, "%s : indexname exceeds max length %d", sindex_cmd, AS_ID_INAME_SZ); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Index Name exceeds max length"); + } + else { + cf_warning(AS_INFO, "%s : invalid indexname %s", sindex_cmd, index_name_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Index Name Not Specified"); + } + return -1; + } + + cf_info(AS_SINDEX, "%s : received request on index %s - namespace %s", + sindex_cmd, index_name_str, ns_str); + + *iname = cf_strdup(index_name_str); + + return 0; +} + +int info_command_abort_scan(char *name, char *params, cf_dyn_buf *db) { + char context[100]; + int context_len = sizeof(context); + int rv = -1; + if (0 == as_info_parameter_get(params, "id", context, &context_len)) { + uint64_t trid; + trid = strtoull(context, NULL, 10); + if (trid != 0) { + rv = as_scan_abort(trid); + } + } + + if (rv != 0) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, 
AS_PROTO_RESULT_FAIL_NOT_FOUND); + cf_dyn_buf_append_string(db, ":Transaction Not Found"); + } + else { + cf_dyn_buf_append_string(db, "OK"); + } + + return 0; +} + +int info_command_abort_all_scans(char *name, char *params, cf_dyn_buf *db) { + + int n_scans_killed = as_scan_abort_all(); + + cf_dyn_buf_append_string(db, "OK - number of scans killed: "); + cf_dyn_buf_append_int(db, n_scans_killed); + + return 0; +} + +int info_command_query_kill(char *name, char *params, cf_dyn_buf *db) { + char context[100]; + int context_len = sizeof(context); + int rv = AS_QUERY_ERR; + if (0 == as_info_parameter_get(params, "trid", context, &context_len)) { + uint64_t trid; + trid = strtoull(context, NULL, 10); + if (trid != 0) { + rv = as_query_kill(trid); + } + } + + if (AS_QUERY_OK != rv) { + cf_dyn_buf_append_string(db, "Transaction Not Found"); + } + else { + cf_dyn_buf_append_string(db, "Ok"); + } + + return 0; + + + +} +int info_command_sindex_stat(char *name, char *params, cf_dyn_buf *db) { + as_namespace *ns = NULL; + char * iname = NULL; + + if (as_info_parse_ns_iname(params, &ns, &iname, db, "SINDEX STAT")) { + return 0; + } + + int resp = as_sindex_stats_str(ns, iname, db); + if (resp) { + cf_warning(AS_INFO, "SINDEX STAT : for index %s - ns %s failed with error %d", + iname, ns->name, resp); + INFO_COMMAND_SINDEX_FAILCODE( + as_sindex_err_to_clienterr(resp, __FILE__, __LINE__), + as_sindex_err_str(resp)); + } + + if (iname) { + cf_free(iname); + } + return(0); +} + + +// sindex-histogram:ns=test_D;indexname=indname;enable=true/false +int info_command_sindex_histogram(char *name, char *params, cf_dyn_buf *db) +{ + as_namespace * ns = NULL; + char * iname = NULL; + if (as_info_parse_ns_iname(params, &ns, &iname, db, "SINDEX HISTOGRAM")) { + return 0; + } + + char op[10]; + int op_len = sizeof(op); + + if (as_info_parameter_get(params, "enable", op, &op_len)) { + cf_info(AS_INFO, "SINDEX HISTOGRAM : invalid OP"); + cf_dyn_buf_append_string(db, "Invalid Op"); + goto END; + } + + bool enable = false; + if (!strncmp(op, "true", 5) && op_len != 5) { + enable = true; + } + else if (!strncmp(op, "false", 6) && op_len != 6) { + enable = false; + } + else { + cf_info(AS_INFO, "SINDEX HISTOGRAM : invalid OP"); + cf_dyn_buf_append_string(db, "Invalid Op"); + goto END; + } + + int resp = as_sindex_histogram_enable(ns, iname, enable); + if (resp) { + cf_warning(AS_INFO, "SINDEX HISTOGRAM : for index %s - ns %s failed with error %d", + iname, ns->name, resp); + INFO_COMMAND_SINDEX_FAILCODE( + as_sindex_err_to_clienterr(resp, __FILE__, __LINE__), + as_sindex_err_str(resp)); + } else { + cf_dyn_buf_append_string(db, "Ok"); + cf_info(AS_INFO, "SINDEX HISTOGRAM : for index %s - ns %s histogram is set as %s", + iname, ns->name, op); + } + +END: + if (iname) { + cf_free(iname); + } + return(0); +} + +int info_command_sindex_list(char *name, char *params, cf_dyn_buf *db) { + bool listall = true; + char ns_str[128]; + int ns_len = sizeof(ns_str); + if (!as_info_parameter_get(params, "ns", ns_str, &ns_len)) { + listall = false; + } + + if (listall) { + bool found = false; + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + if (ns) { + if (!as_sindex_list_str(ns, db)) { + found = true; + } + else { + cf_detail(AS_INFO, "No indexes for namespace %s", ns->name); + } + } + } + + if (found) { + cf_dyn_buf_chomp(db); + } + else { + cf_dyn_buf_append_string(db, "Empty"); + } + } + else { + as_namespace *ns = as_namespace_get_byname(ns_str); + if (!ns) { + cf_warning(AS_INFO, 
"SINDEX LIST : ns %s not found", ns_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, "Namespace Not Found"); + return 0; + } else { + if (as_sindex_list_str(ns, db)) { + cf_info(AS_INFO, "ns not found"); + cf_dyn_buf_append_string(db, "Empty"); + } + return 0; + } + } + return(0); +} + +// Defined in "make_in/version.c" (auto-generated by the build system.) +extern const char aerospike_build_id[]; +extern const char aerospike_build_time[]; +extern const char aerospike_build_type[]; +extern const char aerospike_build_os[]; +extern const char aerospike_build_features[]; + +int +as_info_init() +{ + // g_info_node_info_history_hash is a hash of all nodes that have ever been + // recognized by this node - either via paxos or info messages. + g_info_node_info_history_hash = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), sizeof(info_node_info), 64, CF_SHASH_BIG_LOCK); + + // g_info_node_info_hash is a hash of all nodes *currently* in the cluster. + // This hash should *always* be a subset of g_info_node_info_history_hash - + // to ensure this, you should take the lock on the corresponding key in + // info_history_hash before modifying an element in this hash table. This + // hash is used to create the services list. + g_info_node_info_hash = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), sizeof(info_node_info), 64, CF_SHASH_BIG_LOCK); + + // create worker threads + g_info_work_q = cf_queue_create(sizeof(as_info_transaction), true); + + char vstr[64]; + sprintf(vstr, "%s build %s", aerospike_build_type, aerospike_build_id); + + // Set some basic values + as_info_set("version", vstr, true); // Returns the edition and build number. + as_info_set("build", aerospike_build_id, true); // Returns the build number for this server. + as_info_set("build_os", aerospike_build_os, true); // Return the OS used to create this build. + as_info_set("build_time", aerospike_build_time, true); // Return the creation time of this build. + as_info_set("edition", aerospike_build_type, true); // Return the edition of this build. + as_info_set("digests", "RIPEMD160", false); // Returns the hashing algorithm used by the server for key hashing. + as_info_set("status", "ok", false); // Always returns ok, used to verify service port is open. + as_info_set("STATUS", "OK", false); // Always returns OK, used to verify service port is open. + + char istr[1024]; + cf_str_itoa(AS_PARTITIONS, istr, 10); + as_info_set("partitions", istr, false); // Returns the number of partitions used to hash keys across. + + cf_str_itoa_u64(g_config.self_node, istr, 16); + as_info_set("node", istr, true); // Node ID. Unique 15 character hex string for each node based on the mac address and port. + as_info_set("name", istr, false); // Alias to 'node'. + // Returns list of features supported by this server + static char features[1024]; + strcat(features, "peers;cdt-list;cdt-map;pipelining;geo;float;batch-index;replicas;replicas-all;replicas-master;replicas-prole;udf"); + strcat(features, aerospike_build_features); + as_info_set("features", features, true); + as_hb_mode hb_mode; + as_hb_info_listen_addr_get(&hb_mode, istr, sizeof(istr)); + as_info_set( hb_mode == AS_HB_MODE_MESH ? 
"mesh" : "mcast", istr, false); + + // All commands accepted by asinfo/telnet + as_info_set("help", "alloc-info;asm;bins;build;build_os;build_time;cluster-name;config-get;config-set;" + "df;digests;dump-cluster;dump-fabric;dump-hb;dump-migrates;dump-msgs;dump-rw;" + "dump-si;dump-skew;dump-smd;dump-wb;dump-wb-summary;feature-key;get-config;get-sl;hist-dump;" + "hist-track-start;hist-track-stop;jem-stats;jobs;latency;log;log-set;" + "log-message;logs;mcast;mem;mesh;mstats;mtrace;name;namespace;namespaces;node;" + "racks;recluster;revive;roster;roster-set;service;services;services-alumni;services-alumni-reset;set-config;" + "set-log;sets;set-sl;show-devices;sindex;sindex-create;sindex-delete;" + "sindex-histogram;" + "smd;statistics;status;tip;tip-clear;truncate;truncate-undo;version;", + false); + /* + * help intentionally does not include the following: + * cluster-generation;features;objects; + * partition-generation;partition-info;partitions;replicas-master; + * replicas-prole;replicas-read;replicas-write;throughput + */ + + // Set up some dynamic functions + as_info_set_dynamic("alumni-clear-std", info_get_alumni_clear_std, false); // Supersedes "services-alumni" for non-TLS service. + as_info_set_dynamic("alumni-tls-std", info_get_alumni_tls_std, false); // Supersedes "services-alumni" for TLS service. + as_info_set_dynamic("bins", info_get_bins, false); // Returns bin usage information and used bin names. + as_info_set_dynamic("cluster-generation", info_get_cluster_generation, true); // Returns cluster generation. + as_info_set_dynamic("cluster-name", info_get_cluster_name, false); // Returns cluster name. + as_info_set_dynamic("endpoints", info_get_endpoints, false); // Returns the expanded bind / access address configuration. + as_info_set_dynamic("feature-key", info_get_features, false); // Returns the contents of the feature key (except signature). + as_info_set_dynamic("get-config", info_get_config, false); // Returns running config for specified context. + as_info_set_dynamic("logs", info_get_logs, false); // Returns a list of log file locations in use by this server. + as_info_set_dynamic("namespaces", info_get_namespaces, false); // Returns a list of namespace defined on this server. + as_info_set_dynamic("objects", info_get_objects, false); // Returns the number of objects stored on this server. + as_info_set_dynamic("partition-generation", info_get_partition_generation, true); // Returns the current partition generation. + as_info_set_dynamic("partition-info", info_get_partition_info, false); // Returns partition ownership information. + as_info_set_dynamic("peers-clear-alt", info_get_services_clear_alt, false); // Supersedes "services-alternate" for non-TLS, alternate addresses. + as_info_set_dynamic("peers-clear-std", info_get_services_clear_std, false); // Supersedes "services" for non-TLS, standard addresses. + as_info_set_dynamic("peers-generation", info_get_services_generation, false); // Returns the generation of the peers-*-* services lists. + as_info_set_dynamic("peers-tls-alt", info_get_services_tls_alt, false); // Supersedes "services-alternate" for TLS, alternate addresses. + as_info_set_dynamic("peers-tls-std", info_get_services_tls_std, false); // Supersedes "services" for TLS, standard addresses. + as_info_set_dynamic("replicas", info_get_replicas, false); // Same as replicas-all, but includes regime. + as_info_set_dynamic("replicas-all", info_get_replicas_all, false); // Base 64 encoded binary representation of partitions this node is replica for. 
+ as_info_set_dynamic("replicas-master", info_get_replicas_master, false); // Base 64 encoded binary representation of partitions this node is master (replica) for. + as_info_set_dynamic("replicas-prole", info_get_replicas_prole, false); // Base 64 encoded binary representation of partitions this node is prole (replica) for. + as_info_set_dynamic("service", info_get_service, false); // IP address and server port for this node, expected to be a single. + // address/port per node, may be multiple address if this node is configured. + // to listen on multiple interfaces (typically not advised). + as_info_set_dynamic("service-clear-alt", info_get_service_clear_alt, false); // Supersedes "service". The alternate address and port for this node's non-TLS + // client service. + as_info_set_dynamic("service-clear-std", info_get_service_clear_std, false); // Supersedes "service". The address and port for this node's non-TLS client service. + as_info_set_dynamic("service-tls-alt", info_get_service_tls_alt, false); // Supersedes "service". The alternate address and port for this node's TLS + // client service. + as_info_set_dynamic("service-tls-std", info_get_service_tls_std, false); // Supersedes "service". The address and port for this node's TLS client service. + as_info_set_dynamic("services", info_get_services, true); // List of addresses of neighbor cluster nodes to advertise for Application to connect. + as_info_set_dynamic("services-alternate", info_get_alt_addr, false); // IP address mapping from internal to public ones + as_info_set_dynamic("services-alumni", info_get_services_alumni, true); // All neighbor addresses (services) this server has ever know about. + as_info_set_dynamic("services-alumni-reset", info_services_alumni_reset, false); // Reset the services alumni to equal services. + as_info_set_dynamic("sets", info_get_sets, false); // Returns set statistics for all or a particular set. + as_info_set_dynamic("statistics", info_get_stats, true); // Returns system health and usage stats for this server. + +#ifdef INFO_SEGV_TEST + as_info_set_dynamic("segvtest", info_segv_test, true); +#endif + + // Tree-based names + as_info_set_tree("bins", info_get_tree_bins); // Returns bin usage information and used bin names for all or a particular namespace. + as_info_set_tree("log", info_get_tree_log); // + as_info_set_tree("namespace", info_get_tree_namespace); // Returns health and usage stats for a particular namespace. + as_info_set_tree("sets", info_get_tree_sets); // Returns set statistics for all or a particular set. + as_info_set_tree("statistics", info_get_tree_statistics); + + // Define commands + as_info_set_command("config-get", info_command_config_get, PERM_NONE); // Returns running config for specified context. + as_info_set_command("config-set", info_command_config_set, PERM_SET_CONFIG); // Set a configuration parameter at run time, configuration parameter must be dynamic. + as_info_set_command("dump-cluster", info_command_dump_cluster, PERM_LOGGING_CTRL); // Print debug information about clustering and exchange to the log file. + as_info_set_command("dump-fabric", info_command_dump_fabric, PERM_LOGGING_CTRL); // Print debug information about fabric to the log file. + as_info_set_command("dump-hb", info_command_dump_hb, PERM_LOGGING_CTRL); // Print debug information about heartbeat state to the log file. + as_info_set_command("dump-hlc", info_command_dump_hlc, PERM_LOGGING_CTRL); // Print debug information about Hybrid Logical Clock to the log file. 
+ as_info_set_command("dump-migrates", info_command_dump_migrates, PERM_LOGGING_CTRL); // Print debug information about migration. + as_info_set_command("dump-msgs", info_command_dump_msgs, PERM_LOGGING_CTRL); // Print debug information about existing 'msg' objects and queues to the log file. + as_info_set_command("dump-rw", info_command_dump_rw_request_hash, PERM_LOGGING_CTRL); // Print debug information about transaction hash table to the log file. + as_info_set_command("dump-si", info_command_dump_si, PERM_LOGGING_CTRL); // Print information about a Secondary Index + as_info_set_command("dump-skew", info_command_dump_skew, PERM_LOGGING_CTRL); // Print information about clock skew + as_info_set_command("dump-smd", info_command_dump_smd, PERM_LOGGING_CTRL); // Print information about System Metadata (SMD) to the log file. + as_info_set_command("dump-wb", info_command_dump_wb, PERM_LOGGING_CTRL); // Print debug information about Write Bocks (WB) to the log file. + as_info_set_command("dump-wb-summary", info_command_dump_wb_summary, PERM_LOGGING_CTRL); // Print summary information about all Write Blocks (WB) on a device to the log file. + as_info_set_command("get-config", info_command_config_get, PERM_NONE); // Returns running config for all or a particular context. + as_info_set_command("get-sl", info_command_get_sl, PERM_NONE); // Get the Paxos succession list. + as_info_set_command("hist-dump", info_command_hist_dump, PERM_NONE); // Returns a histogram snapshot for a particular histogram. + as_info_set_command("hist-track-start", info_command_hist_track, PERM_SERVICE_CTRL); // Start or Restart histogram tracking. + as_info_set_command("hist-track-stop", info_command_hist_track, PERM_SERVICE_CTRL); // Stop histogram tracking. + as_info_set_command("jem-stats", info_command_jem_stats, PERM_LOGGING_CTRL); // Print JEMalloc statistics to the log file. + as_info_set_command("latency", info_command_hist_track, PERM_NONE); // Returns latency and throughput information. + as_info_set_command("log-message", info_command_log_message, PERM_NONE); // Log a message. + as_info_set_command("log-set", info_command_log_set, PERM_LOGGING_CTRL); // Set values in the log system. + as_info_set_command("peers-clear-alt", info_get_services_clear_alt_delta, PERM_NONE); // The delta update version of "peers-clear-alt". + as_info_set_command("peers-clear-std", info_get_services_clear_std_delta, PERM_NONE); // The delta update version of "peers-clear-std". + as_info_set_command("peers-tls-alt", info_get_services_tls_alt_delta, PERM_NONE); // The delta update version of "peers-tls-alt". + as_info_set_command("peers-tls-std", info_get_services_tls_std_delta, PERM_NONE); // The delta update version of "peers-tls-std". + as_info_set_command("racks", info_command_racks, PERM_NONE); // Rack-aware information. + as_info_set_command("recluster", info_command_recluster, PERM_NONE); // Force cluster to re-form. FIXME - what permission? + as_info_set_command("revive", info_command_revive, PERM_NONE); // Mark all partitions as "trusted". + as_info_set_command("roster", info_command_roster, PERM_NONE); // Roster information. + as_info_set_command("roster-set", info_command_roster_set, PERM_NONE); // Set the entire roster. FIXME - what permission? + as_info_set_command("set-config", info_command_config_set, PERM_SET_CONFIG); // Set config values. + as_info_set_command("set-log", info_command_log_set, PERM_LOGGING_CTRL); // Set values in the log system. 
+ as_info_set_command("show-devices", info_command_show_devices, PERM_LOGGING_CTRL); // Print snapshot of wblocks to the log file. + as_info_set_command("throughput", info_command_hist_track, PERM_NONE); // Returns throughput info. + as_info_set_command("tip", info_command_tip, PERM_SERVICE_CTRL); // Add external IP to mesh-mode heartbeats. + as_info_set_command("tip-clear", info_command_tip_clear, PERM_SERVICE_CTRL); // Clear tip list from mesh-mode heartbeats. + as_info_set_command("truncate", info_command_truncate, PERM_TRUNCATE); // Truncate a namespace or set. + as_info_set_command("truncate-undo", info_command_truncate_undo, PERM_TRUNCATE); // Undo a truncate command. + as_info_set_command("xdr-command", as_info_command_xdr, PERM_SERVICE_CTRL); // Command to XDR module. + + // SINDEX + as_info_set_dynamic("sindex", info_get_sindexes, false); + as_info_set_tree("sindex", info_get_tree_sindexes); + as_info_set_command("sindex-create", info_command_sindex_create, PERM_INDEX_MANAGE); // Create a secondary index. + as_info_set_command("sindex-delete", info_command_sindex_delete, PERM_INDEX_MANAGE); // Delete a secondary index. + + // UDF + as_info_set_dynamic("udf-list", udf_cask_info_list, false); + as_info_set_command("udf-put", udf_cask_info_put, PERM_UDF_MANAGE); + as_info_set_command("udf-get", udf_cask_info_get, PERM_NONE); + as_info_set_command("udf-remove", udf_cask_info_remove, PERM_UDF_MANAGE); + as_info_set_command("udf-clear-cache", udf_cask_info_clear_cache, PERM_UDF_MANAGE); + + // JOBS + as_info_set_command("jobs", info_command_mon_cmd, PERM_JOB_MONITOR); // Manipulate the multi-key lookup monitoring infrastructure. + + // Undocumented Secondary Index Command + as_info_set_command("sindex-histogram", info_command_sindex_histogram, PERM_SERVICE_CTRL); + + as_info_set_dynamic("query-list", as_query_list, false); + as_info_set_command("query-kill", info_command_query_kill, PERM_QUERY_MANAGE); + as_info_set_command("scan-abort", info_command_abort_scan, PERM_SCAN_MANAGE); // Abort a scan with a given id. + as_info_set_command("scan-abort-all", info_command_abort_all_scans, PERM_SCAN_MANAGE); // Abort all scans. + as_info_set_dynamic("scan-list", as_scan_list, false); // List info for all scan jobs. + as_info_set_command("sindex-stat", info_command_sindex_stat, PERM_NONE); + as_info_set_command("sindex-list", info_command_sindex_list, PERM_NONE); + as_info_set_dynamic("sindex-builder-list", as_sbld_list, false); // List info for all secondary index builder jobs. + + as_xdr_info_init(); + + // Spin up the Info threads *after* all static and dynamic Info commands have been added + // so we can guarantee that the static and dynamic lists will never again be changed. + pthread_attr_t thr_attr; + pthread_attr_init(&thr_attr); + pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED); + + for (int i = 0; i < g_config.n_info_threads; i++) { + pthread_t tid; + if (0 != pthread_create(&tid, &thr_attr, thr_info_fn, (void *) 0 )) { + cf_crash(AS_INFO, "pthread_create: %s", cf_strerror(errno)); + } + } + + as_fabric_register_msg_fn(M_TYPE_INFO, info_mt, sizeof(info_mt), INFO_MSG_SCRATCH_SIZE, info_msg_fn, 0 /* udata */ ); + + as_exchange_register_listener(info_clustering_event_listener, NULL); + + // Initialize services info exchange machinery. 
+	set_static_services();
+
+	if (g_config.tls_service.tls_our_name != NULL) {
+		g_serv_tls_name = g_config.tls_service.tls_our_name;
+	}
+
+	++g_serv_gen;
+
+	pthread_t info_interfaces_th;
+	pthread_create(&info_interfaces_th, &thr_attr, info_interfaces_fn, 0);
+	return(0);
+}
diff --git a/as/src/base/thr_info_port.c b/as/src/base/thr_info_port.c
new file mode 100644
index 00000000..bdc4c4fc
--- /dev/null
+++ b/as/src/base/thr_info_port.c
@@ -0,0 +1,316 @@
+/*
+ * thr_info_port.c
+ *
+ * Copyright (C) 2008-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/thr_info_port.h"
+
+// (The system header names below were lost when this patch was extracted;
+// this is a plausible reconstruction based on the symbols this file uses.)
+#include <errno.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+
+#include "cf_str.h"
+#include "dynbuf.h"
+#include "fault.h"
+#include "socket.h"
+
+#include "base/cfg.h"
+#include "base/thr_info.h"
+
+#define POLL_SZ 1024
+
+// State for any open info port.
+typedef struct {
+	int recv_pos;
+	int recv_alloc;
+	uint8_t *recv_buf;
+
+	int xmit_pos; // where we're currently writing
+	int xmit_limit; // the end of the write buffer
+	int xmit_alloc;
+	uint8_t *xmit_buf;
+
+	cf_socket sock;
+
+} info_port_state;
+
+cf_serv_cfg g_info_bind = { .n_cfgs = 0 };
+cf_ip_port g_info_port = 0;
+
+static cf_sockets g_sockets;
+
+// Using int for 4-byte size, but maintaining bool semantics.
+static volatile int g_started = false;
+
+void
+info_port_state_free(info_port_state *ips)
+{
+	if (ips->recv_buf) cf_free(ips->recv_buf);
+	if (ips->xmit_buf) cf_free(ips->xmit_buf);
+	cf_socket_close(&ips->sock);
+	cf_socket_term(&ips->sock);
+	memset(ips, -1, sizeof(info_port_state));
+	cf_free(ips);
+}
+
+
+int
+thr_info_port_readable(info_port_state *ips)
+{
+	int sz = cf_socket_available(&ips->sock);
+
+	if (sz == 0) {
+		return 0;
+	}
+
+	// Make sure we've got some reasonable space in the read buffer.
+	if (ips->recv_alloc - ips->recv_pos < sz) {
+		int new_sz = sz + ips->recv_pos + 100;
+		ips->recv_buf = cf_realloc(ips->recv_buf, new_sz);
+		ips->recv_alloc = new_sz;
+	}
+
+	int n = cf_socket_recv(&ips->sock, ips->recv_buf + ips->recv_pos, ips->recv_alloc - ips->recv_pos, 0);
+	if (n < 0) {
+		if (errno != EAGAIN) {
+			cf_detail(AS_INFO_PORT, "info socket: read fail: error: rv %d sz was %d errno %d", n, ips->recv_alloc - ips->recv_pos, errno);
+		}
+		return -1;
+	}
+	ips->recv_pos += n;
+
+	// What about a control-c?
+	if (-1 != cf_str_strnchr(ips->recv_buf, ips->recv_pos, 0xFF)) {
+		cf_debug(AS_INFO_PORT, "received a control-c, aborting");
+		return -1;
+	}
+
+	// See if we've got a CR or LF in the buf yet.
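For context, the wire format parsed below is plain text: a client writes a command name terminated by CR or LF, and reads back a single newline-terminated response line. A minimal standalone client sketch, using ordinary POSIX sockets - the address and port are placeholders (3003 is the conventional default info port), and none of this code is part of the patch:

// Hypothetical standalone info-port client - illustration only.
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int
main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(3003) };

	inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);

	if (fd < 0 || connect(fd, (struct sockaddr *)&sa, sizeof(sa)) != 0) {
		return 1;
	}

	// One command per request, terminated by '\n' (or '\r'), as parsed below.
	(void)write(fd, "statistics\n", 11);

	char buf[8192];
	ssize_t n = read(fd, buf, sizeof(buf) - 1);

	if (n > 0) {
		buf[n] = '\0';
		printf("%s", buf); // response is a single '\n'-terminated line
	}

	close(fd);
	return 0;
}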
+	int cr = cf_str_strnchr(ips->recv_buf, ips->recv_pos, '\r');
+	int lf = cf_str_strnchr(ips->recv_buf, ips->recv_pos, '\n');
+	if ((cr >= 0) || (lf >= 0)) {
+		size_t len;
+		// Take the closest of cr or lf.
+		if (-1 == lf) {
+			len = cr;
+		}
+		else if (-1 == cr) {
+			len = lf;
+		}
+		else {
+			len = lf < cr ? lf : cr;
+		}
+
+		// We have a message. Process it.
+		cf_dyn_buf_define(db);
+
+		ips->recv_buf[len] = '\n';
+		len++;
+
+		// Fill out the db buffer with the response (always returns 0).
+		as_info_buffer(ips->recv_buf, len, &db);
+		if (db.used_sz == 0) cf_dyn_buf_append_char(&db, '\n');
+
+		// See if it has a tab, get that location. It probably does.
+		int tab = cf_str_strnchr(db.buf, db.used_sz , '\t');
+		tab++;
+
+		while (len < ips->recv_pos &&
+				((ips->recv_buf[len] == '\r') || (ips->recv_buf[len] == '\n'))) {
+			len ++ ;
+		}
+
+		// Move the receive buffer forward.
+		if (ips->recv_pos - len > 0) {
+			memmove(ips->recv_buf, ips->recv_buf + len, ips->recv_pos - len);
+			ips->recv_pos -= len;
+		}
+		else {
+			ips->recv_pos = 0;
+		}
+
+		// Queue the response - append it to the xmit buf.
+		if (ips->xmit_alloc - ips->xmit_limit < db.used_sz) {
+			ips->xmit_buf = cf_realloc(ips->xmit_buf, db.used_sz + ips->xmit_limit);
+			ips->xmit_alloc = db.used_sz + ips->xmit_limit;
+		}
+		memcpy(ips->xmit_buf + ips->xmit_limit, db.buf + tab, db.used_sz - tab);
+		ips->xmit_limit += db.used_sz - tab;
+
+		cf_dyn_buf_free(&db);
+	}
+
+	return 0;
+}
+
+
+int
+thr_info_port_writable(info_port_state *ips)
+{
+	// Do we have bytes to write?
+	if (ips->xmit_limit > 0) {
+
+		// Write them!
+		int rv = cf_socket_send(&ips->sock, ips->xmit_buf + ips->xmit_pos, ips->xmit_limit - ips->xmit_pos , MSG_NOSIGNAL);
+		if (rv < 0) {
+			if (errno != EAGAIN) {
+				return -1;
+			}
+		}
+		else if (rv == 0) {
+			cf_debug(AS_INFO_PORT, "send with return value 0");
+			return 0;
+		}
+		else {
+			ips->xmit_pos += rv;
+			if (ips->xmit_pos == ips->xmit_limit) {
+				ips->xmit_pos = ips->xmit_limit = 0;
+			}
+		}
+	}
+
+	return 0;
+}
+
+
+// Demarshal info socket connections.
+void *
+thr_info_port_fn(void *arg)
+{
+	cf_poll poll;
+	cf_debug(AS_INFO_PORT, "Info port process started");
+
+	// Start the listener socket. Note that because this is done after privilege
+	// de-escalation, we can't use privileged ports.
+
+	if (cf_socket_init_server(&g_info_bind, &g_sockets) < 0) {
+		cf_crash(AS_INFO_PORT, "Couldn't initialize service sockets");
+	}
+
+	cf_poll_create(&poll);
+	cf_poll_add_sockets(poll, &g_sockets, EPOLLIN | EPOLLERR | EPOLLHUP);
+	cf_socket_show_server(AS_INFO_PORT, "info", &g_sockets);
+
+	g_started = true;
+
+	while (true) {
+		cf_poll_event events[POLL_SZ];
+		int32_t n_ev = cf_poll_wait(poll, events, POLL_SZ, -1);
+
+		for (int32_t i = 0; i < n_ev; ++i) {
+			cf_socket *ssock = events[i].data;
+
+			if (cf_sockets_has_socket(&g_sockets, ssock)) {
+				cf_socket csock;
+				cf_sock_addr addr;
+
+				if (cf_socket_accept(ssock, &csock, &addr) < 0) {
+					// This means we're out of file descriptors.
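+					// (EMFILE means the per-process descriptor limit was hit;
+					// any other accept() failure is treated as fatal below.)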
+ if (errno == EMFILE) { + cf_warning(AS_INFO_PORT, "Too many file descriptors in use, consider raising limit"); + continue; + } + + cf_crash(AS_INFO_PORT, "cf_socket_accept() failed"); + } + + cf_detail(AS_INFO_PORT, "New connection: %s", cf_sock_addr_print(&addr)); + info_port_state *ips = cf_malloc(sizeof(info_port_state)); + + ips->recv_pos = 0; + ips->recv_alloc = 100; + ips->recv_buf = cf_malloc(100); + ips->xmit_limit = ips->xmit_pos = 0; + ips->xmit_alloc = 100; + ips->xmit_buf = cf_malloc(100); + cf_socket_copy(&csock, &ips->sock); + + cf_poll_add_socket(poll, &csock, EPOLLIN | EPOLLOUT | EPOLLET | EPOLLRDHUP, ips); + } + else { + info_port_state *ips = events[i].data; + + if (ips == NULL) { + cf_crash(AS_INFO_PORT, "Event with null handle"); + } + + cf_detail(AS_INFO_PORT, "Events %x on FD %d", events[i].events, CSFD(&ips->sock)); + + if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { + cf_detail(AS_INFO_PORT, "Remote close on FD %d", CSFD(&ips->sock)); + cf_poll_delete_socket(poll, &ips->sock); + info_port_state_free(ips); + continue; + } + + if ((events[i].events & EPOLLIN) != 0 && thr_info_port_readable(ips) < 0) { + cf_poll_delete_socket(poll, &ips->sock); + info_port_state_free(ips); + continue; + } + + if ((events[i].events & EPOLLOUT) != 0 && thr_info_port_writable(ips) < 0) { + cf_poll_delete_socket(poll, &ips->sock); + info_port_state_free(ips); + continue; + } + } + + pthread_testcancel(); + } + } + + return NULL; +} + + +void +as_info_port_start() +{ + if (g_info_port == 0) { + return; + } + + cf_info(AS_INFO_PORT, "starting info port thread"); + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + if (pthread_create(&thread, &attrs, thr_info_port_fn, NULL) != 0) { + cf_crash(AS_INFO_PORT, "failed to create info port thread"); + } + + // For orderly startup log, wait for endpoint setup. + while (! g_started) { + usleep(1000); + } +} diff --git a/as/src/base/thr_nsup.c b/as/src/base/thr_nsup.c new file mode 100644 index 00000000..525c9c91 --- /dev/null +++ b/as/src/base/thr_nsup.c @@ -0,0 +1,1276 @@ +/* + * thr_nsup.c + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * namespace supervisor
+ */
+
+// (The system header names below were lost when this patch was extracted;
+// this is a plausible reconstruction based on the symbols this file uses.)
+#include <errno.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/param.h> // for MIN and MAX
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "fault.h"
+#include "hardware.h"
+#include "linear_hist.h"
+#include "vmapx.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/thr_sindex.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction.h"
+#include "base/xdr_serverside.h"
+#include "fabric/partition.h"
+#include "storage/storage.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+#define EVAL_STOP_WRITES_PERIOD 10 // seconds
+
+
+//==========================================================
+// Forward declarations.
+//
+
+static bool eval_stop_writes(as_namespace *ns);
+static bool eval_hwm_breached(as_namespace *ns);
+
+
+//==========================================================
+// Eviction during cold start.
+//
+// No real need for this to be in thr_nsup.c, except maybe
+// for convenient comparison to run-time eviction.
+//
+
+#define EVAL_WRITE_STATE_FREQUENCY 1024
+#define COLD_START_HIST_MIN_BUCKETS 100000 // histogram memory is transient
+
+
+//------------------------------------------------
+// Reduce callback prepares for cold start eviction.
+// - builds cold start eviction histogram
+//
+typedef struct cold_start_evict_prep_info_s {
+	as_namespace* ns;
+	linear_hist* hist;
+	bool* sets_not_evicting;
+} cold_start_evict_prep_info;
+
+static void
+cold_start_evict_prep_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	as_index* r = r_ref->r;
+	cold_start_evict_prep_info* p_info = (cold_start_evict_prep_info*)udata;
+	uint32_t set_id = as_index_get_set_id(r);
+	uint32_t void_time = r->void_time;
+
+	if (void_time != 0 &&
+			! p_info->sets_not_evicting[set_id]) {
+		linear_hist_insert_data_point(p_info->hist, void_time);
+	}
+
+	as_record_done(r_ref, p_info->ns);
+}
+
+//------------------------------------------------
+// Threads prepare for cold start eviction.
+//
+typedef struct evict_prep_thread_info_s {
+	as_namespace* ns;
+	cf_atomic32* p_pid;
+	uint32_t i_cpu;
+	linear_hist* hist;
+	bool* sets_not_evicting;
+} evict_prep_thread_info;
+
+void*
+run_cold_start_evict_prep(void* udata)
+{
+	evict_prep_thread_info* p_info = (evict_prep_thread_info*)udata;
+
+	cf_topo_pin_to_cpu((cf_topo_cpu_index)p_info->i_cpu);
+
+	as_namespace *ns = p_info->ns;
+
+	cold_start_evict_prep_info cb_info;
+
+	cb_info.ns = ns;
+	cb_info.hist = p_info->hist;
+	cb_info.sets_not_evicting = p_info->sets_not_evicting;
+
+	int pid;
+
+	while ((pid = (int)cf_atomic32_incr(p_info->p_pid)) < AS_PARTITIONS) {
+		// Don't bother with partition reservations - it's startup.
+		as_index_reduce_live(ns->partitions[pid].vp, cold_start_evict_prep_reduce_cb, &cb_info);
+	}
+
+	return NULL;
+}
+
+//------------------------------------------------
+// Reduce callback evicts records on cold start.
+// - evicts based on calculated threshold +// +typedef struct cold_start_evict_info_s { + as_namespace* ns; + as_partition* p_partition; + bool* sets_not_evicting; + uint32_t num_evicted; + uint32_t num_0_void_time; +} cold_start_evict_info; + +static void +cold_start_evict_reduce_cb(as_index_ref* r_ref, void* udata) +{ + as_index* r = r_ref->r; + cold_start_evict_info* p_info = (cold_start_evict_info*)udata; + as_namespace* ns = p_info->ns; + as_partition* p_partition = p_info->p_partition; + uint32_t set_id = as_index_get_set_id(r); + uint32_t void_time = r->void_time; + + if (void_time != 0) { + if (! p_info->sets_not_evicting[set_id] && + void_time < ns->cold_start_threshold_void_time) { + as_index_delete(p_partition->vp, &r->keyd); + p_info->num_evicted++; + } + } + else { + p_info->num_0_void_time++; + } + + as_record_done(r_ref, ns); +} + +//------------------------------------------------ +// Threads do cold start eviction. +// +typedef struct evict_thread_info_s { + as_namespace* ns; + cf_atomic32 pid; + cf_atomic32 i_cpu; + bool* sets_not_evicting; + cf_atomic32 total_evicted; + cf_atomic32 total_0_void_time; +} evict_thread_info; + +void* +run_cold_start_evict(void* udata) +{ + evict_thread_info* p_info = (evict_thread_info*)udata; + + cf_topo_pin_to_cpu((cf_topo_cpu_index)cf_atomic32_incr(&p_info->i_cpu)); + + as_namespace* ns = p_info->ns; + + cold_start_evict_info cb_info; + + cb_info.ns = ns; + cb_info.sets_not_evicting = p_info->sets_not_evicting; + cb_info.num_evicted = 0; + cb_info.num_0_void_time = 0; + + int pid; + + while ((pid = (int)cf_atomic32_incr(&p_info->pid)) < AS_PARTITIONS) { + // Don't bother with partition reservations - it's startup. + as_partition* p_partition = &ns->partitions[pid]; + + cb_info.p_partition = p_partition; + as_index_reduce_live(p_partition->vp, cold_start_evict_reduce_cb, &cb_info); + } + + cf_atomic32_add(&p_info->total_evicted, cb_info.num_evicted); + cf_atomic32_add(&p_info->total_0_void_time, cb_info.num_0_void_time); + + return NULL; +} + +//------------------------------------------------ +// Get the cold start histogram's TTL range. +// +// TODO - ttl_range to 32 bits? +static uint64_t +get_cold_start_ttl_range(as_namespace* ns, uint32_t now) +{ + uint64_t max_void_time = 0; + + for (int n = 0; n < AS_PARTITIONS; n++) { + uint64_t partition_max_void_time = cf_atomic64_get(ns->partitions[n].max_void_time); + + if (partition_max_void_time > max_void_time) { + max_void_time = partition_max_void_time; + } + } + + // Use max-ttl to cap the namespace maximum void-time. + uint64_t cap = now + ns->max_ttl; + + if (max_void_time > cap) { + max_void_time = cap; + } + + // Convert to TTL - used for cold start histogram range. + return max_void_time > now ? max_void_time - now : 0; +} + +//------------------------------------------------ +// Set cold start eviction threshold. 
+// +static uint64_t +set_cold_start_threshold(as_namespace* ns, linear_hist* hist) +{ + linear_hist_threshold threshold; + uint64_t subtotal = linear_hist_get_threshold_for_fraction(hist, ns->evict_tenths_pct, &threshold); + bool all_buckets = threshold.value == 0xFFFFffff; + + if (subtotal == 0) { + if (all_buckets) { + cf_warning(AS_NSUP, "{%s} cold start found no records eligible for eviction", ns->name); + } + else { + cf_warning(AS_NSUP, "{%s} cold start found no records below eviction void-time %u - threshold bucket %u, width %u sec, count %lu > target %lu (%.1f pct)", + ns->name, threshold.value, threshold.bucket_index, + threshold.bucket_width, threshold.bucket_count, + threshold.target_count, (float)ns->evict_tenths_pct / 10.0); + } + + return 0; + } + + if (all_buckets) { + cf_warning(AS_NSUP, "{%s} cold start would evict all %lu records eligible - not evicting!", ns->name, subtotal); + return 0; + } + + cf_atomic32_set(&ns->cold_start_threshold_void_time, threshold.value); + + return subtotal; +} + +//------------------------------------------------ +// Cold start eviction, called by drv_ssd.c. +// Returns false if a serious problem occurred and +// we can't proceed. +// +bool +as_cold_start_evict_if_needed(as_namespace* ns) +{ + pthread_mutex_lock(&ns->cold_start_evict_lock); + + // Only go further than here every thousand record add attempts. + if (ns->cold_start_record_add_count++ % EVAL_WRITE_STATE_FREQUENCY != 0) { + pthread_mutex_unlock(&ns->cold_start_evict_lock); + return true; + } + + uint32_t now = as_record_void_time_get(); + + // Update threshold void-time if we're past it. + if (now > cf_atomic32_get(ns->cold_start_threshold_void_time)) { + cf_atomic32_set(&ns->cold_start_threshold_void_time, now); + } + + // Are we out of control? + if (eval_stop_writes(ns)) { + cf_warning(AS_NSUP, "{%s} hit stop-writes limit", ns->name); + pthread_mutex_unlock(&ns->cold_start_evict_lock); + return false; + } + + // If we don't need to evict, we're done. + if (! eval_hwm_breached(ns)) { + pthread_mutex_unlock(&ns->cold_start_evict_lock); + return true; + } + + // We want to evict, but are we allowed to do so? + if (! g_config.nsup_startup_evict) { + cf_warning(AS_NSUP, "{%s} hwm breached but not allowed to evict", ns->name); + pthread_mutex_unlock(&ns->cold_start_evict_lock); + return true; + } + + // We may evict - set up the cold start eviction histogram. + cf_info(AS_NSUP, "{%s} cold start building eviction histogram ...", ns->name); + + uint32_t ttl_range = (uint32_t)get_cold_start_ttl_range(ns, now); + uint32_t n_buckets = MAX(ns->evict_hist_buckets, COLD_START_HIST_MIN_BUCKETS); + + uint32_t num_sets = cf_vmapx_count(ns->p_sets_vmap); + bool sets_not_evicting[AS_SET_MAX_COUNT + 1]; + + memset(sets_not_evicting, 0, sizeof(sets_not_evicting)); + + for (uint32_t j = 0; j < num_sets; j++) { + uint32_t set_id = j + 1; + as_set* p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, j, (void**)&p_set) != CF_VMAPX_OK) { + cf_crash(AS_NSUP, "failed to get set index %u from vmap", j); + } + + if (IS_SET_EVICTION_DISABLED(p_set)) { + sets_not_evicting[set_id] = true; + } + } + + // Split these tasks across multiple threads. + uint32_t n_cpus = cf_topo_count_cpus(); + pthread_t evict_threads[n_cpus]; + + // Reduce all partitions to build the eviction histogram. 
+ evict_prep_thread_info prep_thread_infos[n_cpus]; + cf_atomic32 pid = -1; + + for (uint32_t n = 0; n < n_cpus; n++) { + prep_thread_infos[n].ns = ns; + prep_thread_infos[n].p_pid = &pid; + prep_thread_infos[n].i_cpu = n; + prep_thread_infos[n].hist = linear_hist_create("thread-hist", now, ttl_range, n_buckets); + prep_thread_infos[n].sets_not_evicting = sets_not_evicting; + + if (pthread_create(&evict_threads[n], NULL, run_cold_start_evict_prep, (void*)&prep_thread_infos[n]) != 0) { + cf_crash(AS_NSUP, "{%s} failed to create evict-prep thread %u", ns->name, n); + } + } + + for (uint32_t n = 0; n < n_cpus; n++) { + pthread_join(evict_threads[n], NULL); + + if (n == 0) { + continue; + } + + linear_hist_merge(prep_thread_infos[0].hist, prep_thread_infos[n].hist); + linear_hist_destroy(prep_thread_infos[n].hist); + } + // Now we're single-threaded again. + + // Calculate the eviction threshold. + uint64_t n_evictable = set_cold_start_threshold(ns, prep_thread_infos[0].hist); + + linear_hist_destroy(prep_thread_infos[0].hist); + + if (n_evictable == 0) { + cf_warning(AS_NSUP, "{%s} hwm breached but no records to evict", ns->name); + pthread_mutex_unlock(&ns->cold_start_evict_lock); + return true; + } + + cf_info(AS_NSUP, "{%s} cold start found %lu records eligible for eviction, evict ttl %u", ns->name, n_evictable, cf_atomic32_get(ns->cold_start_threshold_void_time) - now); + + // Reduce all partitions to evict based on the thresholds. + evict_thread_info thread_info = { + .ns = ns, + .pid = -1, + .i_cpu = -1, + .sets_not_evicting = sets_not_evicting, + .total_evicted = 0, + .total_0_void_time = 0 + }; + + for (uint32_t n = 0; n < n_cpus; n++) { + if (pthread_create(&evict_threads[n], NULL, run_cold_start_evict, (void*)&thread_info) != 0) { + cf_crash(AS_NSUP, "{%s} failed to create evict thread %u", ns->name, n); + } + } + + for (uint32_t n = 0; n < n_cpus; n++) { + pthread_join(evict_threads[n], NULL); + } + // Now we're single-threaded again. + + cf_info(AS_NSUP, "{%s} cold start evicted %u records, found %u 0-void-time records", ns->name, thread_info.total_evicted, thread_info.total_0_void_time); + + pthread_mutex_unlock(&ns->cold_start_evict_lock); + return true; +} + +// +// END - Eviction during cold start. +//========================================================== + +//========================================================== +// Temporary dangling prole garbage collection. +// + +typedef struct garbage_collect_info_s { + as_namespace* ns; + as_index_tree* p_tree; + uint32_t now; + uint32_t num_deleted; +} garbage_collect_info; + +static void +garbage_collect_reduce_cb(as_index_ref* r_ref, void* udata) +{ + garbage_collect_info* p_info = (garbage_collect_info*)udata; + uint32_t void_time = r_ref->r->void_time; + + // If we're past void-time plus safety margin, delete the record. + if (void_time != 0 && p_info->now > void_time + g_config.prole_extra_ttl) { + as_index_delete(p_info->p_tree, &r_ref->r->keyd); + p_info->num_deleted++; + } + + as_record_done(r_ref, p_info->ns); +} + +static int +garbage_collect_next_prole_partition(as_namespace* ns, int pid) +{ + as_partition_reservation rsv; + + // Look for the next non-master partition past pid, but loop only once over + // all partitions. + for (int n = 0; n < AS_PARTITIONS; n++) { + // Increment pid and wrap if necessary. + if (++pid == AS_PARTITIONS) { + pid = 0; + } + + // Note - may want a new method to get these under a single partition + // lock, but for now just do the two separate reserve calls. 
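+		// (as_partition_reserve_write() succeeds only when this node is the
+		// partition's master, so a non-zero return is how a prole/non-master
+		// partition is detected here.)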
+ if (as_partition_reserve_write(ns, pid, &rsv, NULL) == 0) { + // This is a master partition - continue. + as_partition_release(&rsv); + } + else { + as_partition_reserve(ns, pid, &rsv); + + // This is a non-master partition - garbage collect and break. + garbage_collect_info cb_info; + + cb_info.ns = ns; + cb_info.p_tree = rsv.tree; + cb_info.now = as_record_void_time_get(); + cb_info.num_deleted = 0; + + // Reduce the partition, deleting long-expired records. + as_index_reduce_live(rsv.tree, garbage_collect_reduce_cb, &cb_info); + + if (cb_info.num_deleted != 0) { + cf_info(AS_NSUP, "namespace %s pid %d: %u expired non-masters", + ns->name, pid, cb_info.num_deleted); + } + + as_partition_release(&rsv); + + // Do only one partition per nsup loop. + break; + } + } + + return pid; +} + +// +// END - Temporary dangling prole garbage collection. +//========================================================== + + +static cf_queue* g_p_nsup_delete_q = NULL; + +int +as_nsup_queue_get_size() +{ + return g_p_nsup_delete_q ? cf_queue_sz(g_p_nsup_delete_q) : 0; +} + +// Make sure a huge nsup deletion wave won't blow delete queue up. +#define DELETE_Q_SAFETY_THRESHOLD 10000 +#define DELETE_Q_SAFETY_SLEEP_us 1000 // 1 millisecond + +// Wait for delete queue to clear. +#define DELETE_Q_CLEAR_SLEEP_us 1000 // 1 millisecond + +typedef struct record_delete_info_s { + as_namespace* ns; + cf_digest digest; +} record_delete_info; + + +//------------------------------------------------ +// Run thread to handle delete queue. +// +void* +run_nsup_delete(void* pv_data) +{ + while (true) { + record_delete_info q_item; + + if (CF_QUEUE_OK != cf_queue_pop(g_p_nsup_delete_q, (void*)&q_item, CF_QUEUE_FOREVER)) { + cf_crash(AS_NSUP, "nsup delete queue pop failed"); + } + + // Generate a delete transaction for this digest, and hand it to tsvc. + + uint8_t info2 = AS_MSG_INFO2_WRITE | AS_MSG_INFO2_DELETE; + + cl_msg *msgp = as_msg_create_internal(q_item.ns->name, &q_item.digest, + 0, info2, 0); + + as_transaction tr; + as_transaction_init_head(&tr, NULL, msgp); + + as_transaction_set_msg_field_flag(&tr, AS_MSG_FIELD_TYPE_NAMESPACE); + as_transaction_set_msg_field_flag(&tr, AS_MSG_FIELD_TYPE_DIGEST_RIPE); + tr.origin = FROM_NSUP; + tr.start_time = cf_getns(); + + as_tsvc_enqueue(&tr); + + // Throttle - don't overwhelm tsvc queue. + if (g_config.nsup_delete_sleep != 0) { + usleep(g_config.nsup_delete_sleep); + } + } + + return NULL; +} + +//------------------------------------------------ +// Queue a record for deletion. +// +static void +queue_for_delete(as_namespace* ns, cf_digest* p_digest) +{ + record_delete_info q_item; + + q_item.ns = ns; // not bothering with namespace reservation + q_item.digest = *p_digest; + + cf_queue_push(g_p_nsup_delete_q, (void*)&q_item); +} + +//------------------------------------------------ +// Insert data into object size histograms. +// +static void +add_to_obj_size_histograms(as_namespace* ns, as_index* r) +{ + uint32_t set_id = as_index_get_set_id(r); + linear_hist* set_obj_size_hist = ns->set_obj_size_hists[set_id]; + uint64_t n_rblocks = r->n_rblocks; + + linear_hist_insert_data_point(ns->obj_size_hist, n_rblocks); + + if (set_obj_size_hist) { + linear_hist_insert_data_point(set_obj_size_hist, n_rblocks); + } +} + +//------------------------------------------------ +// Insert data into TTL histograms. 
+// +static void +add_to_ttl_histograms(as_namespace* ns, as_index* r) +{ + uint32_t set_id = as_index_get_set_id(r); + linear_hist* set_ttl_hist = ns->set_ttl_hists[set_id]; + uint32_t void_time = r->void_time; + + linear_hist_insert_data_point(ns->ttl_hist, void_time); + + if (set_ttl_hist) { + linear_hist_insert_data_point(set_ttl_hist, void_time); + } +} + +//------------------------------------------------ +// Reduce callback prepares for eviction. +// - builds object size, eviction & TTL histograms +// - counts 0-void-time records +// +typedef struct evict_prep_info_s { + as_namespace* ns; + bool* sets_not_evicting; + uint64_t num_0_void_time; +} evict_prep_info; + +static void +evict_prep_reduce_cb(as_index_ref* r_ref, void* udata) +{ + as_index* r = r_ref->r; + evict_prep_info* p_info = (evict_prep_info*)udata; + as_namespace* ns = p_info->ns; + uint32_t set_id = as_index_get_set_id(r); + uint32_t void_time = r->void_time; + + add_to_obj_size_histograms(ns, r); + + if (void_time != 0) { + if (! p_info->sets_not_evicting[set_id]) { + linear_hist_insert_data_point(ns->evict_hist, void_time); + } + + add_to_ttl_histograms(ns, r); + } + else { + p_info->num_0_void_time++; + } + + as_record_done(r_ref, ns); +} + +//------------------------------------------------ +// Reduce callback evicts records. +// - evicts based on general threshold +// - does expiration on eviction-disabled sets +// +typedef struct evict_info_s { + as_namespace* ns; + uint32_t now; + bool* sets_not_evicting; + uint32_t evict_void_time; + uint64_t num_evicted; +} evict_info; + +static void +evict_reduce_cb(as_index_ref* r_ref, void* udata) +{ + as_index* r = r_ref->r; + evict_info* p_info = (evict_info*)udata; + as_namespace* ns = p_info->ns; + uint32_t set_id = as_index_get_set_id(r); + uint32_t void_time = r->void_time; + + if (void_time != 0) { + if (p_info->sets_not_evicting[set_id]) { + if (p_info->now > void_time) { + queue_for_delete(ns, &r->keyd); + p_info->num_evicted++; + } + } + else if (void_time < p_info->evict_void_time) { + queue_for_delete(ns, &r->keyd); + p_info->num_evicted++; + } + } + + as_record_done(r_ref, ns); +} + +//------------------------------------------------ +// Reduce callback expires records. +// - does expiration +// - builds object size & TTL histograms +// - counts 0-void-time records +// +typedef struct expire_info_s { + as_namespace* ns; + uint32_t now; + uint64_t num_expired; + uint64_t num_0_void_time; +} expire_info; + +static void +expire_reduce_cb(as_index_ref* r_ref, void* udata) +{ + as_index* r = r_ref->r; + expire_info* p_info = (expire_info*)udata; + as_namespace* ns = p_info->ns; + uint32_t void_time = r->void_time; + + if (void_time != 0) { + if (p_info->now > void_time) { + queue_for_delete(ns, &r->keyd); + p_info->num_expired++; + } + else { + add_to_obj_size_histograms(ns, r); + add_to_ttl_histograms(ns, r); + } + } + else { + add_to_obj_size_histograms(ns, r); + p_info->num_0_void_time++; + } + + as_record_done(r_ref, ns); +} + +//------------------------------------------------ +// Reduce all master partitions, using specified +// functionality. Throttle to make sure deletions +// generated by reducing each partition don't blow +// up the delete queue. 
+// +static void +reduce_master_partitions(as_namespace* ns, as_index_reduce_fn cb, void* udata, uint32_t* p_n_waits, const char* tag) +{ + as_partition_reservation rsv; + + for (int n = 0; n < AS_PARTITIONS; n++) { + if (as_partition_reserve_write(ns, n, &rsv, NULL) != 0) { + continue; + } + + as_index_reduce_live(rsv.tree, cb, udata); + + as_partition_release(&rsv); + + while (cf_queue_sz(g_p_nsup_delete_q) > DELETE_Q_SAFETY_THRESHOLD) { + usleep(DELETE_Q_SAFETY_SLEEP_us); + (*p_n_waits)++; + } + + cf_debug(AS_NSUP, "{%s} %s done partition index %d, waits %u", ns->name, tag, n, *p_n_waits); + } +} + +//------------------------------------------------ +// Lazily create and clear a set's size histogram. +// +static void +clear_set_obj_size_hist(as_namespace* ns, uint32_t set_id) +{ + if (! ns->set_obj_size_hists[set_id]) { + char hist_name[HISTOGRAM_NAME_SIZE]; + + sprintf(hist_name, "%s set %u object size histogram", ns->name, set_id); + ns->set_obj_size_hists[set_id] = linear_hist_create(hist_name, 0, 0, OBJ_SIZE_HIST_NUM_BUCKETS); + } + + linear_hist_clear(ns->set_obj_size_hists[set_id], 0, cf_atomic32_get(ns->obj_size_hist_max)); +} + +//------------------------------------------------ +// Lazily create and clear a set's TTL histogram. +// +static void +clear_set_ttl_hist(as_namespace* ns, uint32_t set_id, uint32_t now, uint64_t ttl_range) +{ + if (! ns->set_ttl_hists[set_id]) { + char hist_name[HISTOGRAM_NAME_SIZE]; + + sprintf(hist_name, "%s set %u ttl histogram", ns->name, set_id); + ns->set_ttl_hists[set_id] = linear_hist_create(hist_name, 0, 0, TTL_HIST_NUM_BUCKETS); + } + + linear_hist_clear(ns->set_ttl_hists[set_id], now, ttl_range); +} + +//------------------------------------------------ +// Get the TTL range for histograms. +// +// TODO - ttl_range to 32 bits? +static uint64_t +get_ttl_range(as_namespace* ns, uint32_t now) +{ + uint64_t max_master_void_time = 0; + as_partition_reservation rsv; + + for (int n = 0; n < AS_PARTITIONS; n++) { + if (as_partition_reserve_write(ns, n, &rsv, NULL) != 0) { + continue; + } + + as_partition_release(&rsv); + + uint64_t partition_max_void_time = cf_atomic64_get(ns->partitions[n].max_void_time); + + if (partition_max_void_time > max_master_void_time) { + max_master_void_time = partition_max_void_time; + } + } + + // Use max-ttl to cap the namespace maximum void-time. + uint64_t cap = now + ns->max_ttl; + + if (max_master_void_time > cap) { + max_master_void_time = cap; + } + + // Convert to TTL - used for histogram ranges. + return max_master_void_time > now ? max_master_void_time - now : 0; +} + +//------------------------------------------------ +// Get general eviction threshold. 
+// +static bool +get_threshold(as_namespace* ns, uint32_t* p_evict_void_time) +{ + linear_hist_threshold threshold; + uint64_t subtotal = linear_hist_get_threshold_for_fraction(ns->evict_hist, ns->evict_tenths_pct, &threshold); + bool all_buckets = threshold.value == 0xFFFFffff; + + *p_evict_void_time = threshold.value; + + if (subtotal == 0) { + if (all_buckets) { + cf_warning(AS_NSUP, "{%s} no records eligible for eviction", ns->name); + } + else { + cf_warning(AS_NSUP, "{%s} no records below eviction void-time %u - threshold bucket %u, width %u sec, count %lu > target %lu (%.1f pct)", + ns->name, threshold.value, threshold.bucket_index, + threshold.bucket_width, threshold.bucket_count, + threshold.target_count, (float)ns->evict_tenths_pct / 10.0); + } + + return false; + } + + if (all_buckets) { + cf_warning(AS_NSUP, "{%s} would evict all %lu records eligible - not evicting!", ns->name, subtotal); + return false; + } + + cf_info(AS_NSUP, "{%s} found %lu records eligible for eviction", ns->name, subtotal); + + return true; +} + +//------------------------------------------------ +// Stats per namespace at the end of an nsup lap. +// +static void +update_stats(as_namespace* ns, uint64_t n_master, uint64_t n_0_void_time, + uint64_t n_expired_objects, uint64_t n_evicted_objects, + uint32_t evict_ttl, uint32_t n_general_waits, uint32_t n_clear_waits, + uint64_t start_ms) +{ + ns->non_expirable_objects = n_0_void_time; + + cf_atomic64_add(&ns->n_expired_objects, n_expired_objects); + cf_atomic64_add(&ns->n_evicted_objects, n_evicted_objects); + + cf_atomic64_set(&ns->evict_ttl, evict_ttl); + + uint64_t total_duration_ms = cf_getms() - start_ms; + + ns->nsup_cycle_duration = (uint32_t)(total_duration_ms / 1000); + ns->nsup_cycle_sleep_pct = total_duration_ms == 0 ? 0 : (uint32_t)((n_general_waits * 100) / total_duration_ms); + + cf_info(AS_NSUP, "{%s} nsup-done: master-objects (%lu,%lu) expired (%lu,%lu) evicted (%lu,%lu) evict-ttl %d waits (%u,%u) total-ms %lu", + ns->name, + n_master, n_0_void_time, + ns->n_expired_objects, n_expired_objects, + ns->n_evicted_objects, n_evicted_objects, + evict_ttl, + n_general_waits, n_clear_waits, + total_duration_ms); +} + +//------------------------------------------------ +// Namespace supervisor thread "run" function. +// +void * +run_nsup(void *arg) +{ + // Garbage-collect long-expired proles, one partition per loop. + int prole_pids[g_config.n_namespaces]; + + for (int n = 0; n < g_config.n_namespaces; n++) { + prole_pids[n] = -1; + } + + uint64_t last_time = cf_get_seconds(); + + for ( ; ; ) { + // Wake up every 1 second to check the nsup timeout. + struct timespec delay = { 1, 0 }; + nanosleep(&delay, NULL); + + uint64_t curr_time = cf_get_seconds(); + + if ((curr_time - last_time) < g_config.nsup_period) { + continue; // period has not been reached for running eviction check + } + + last_time = curr_time; + + // Iterate over every namespace. + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + uint64_t start_ms = cf_getms(); + + cf_info(AS_NSUP, "{%s} nsup-start", ns->name); + + linear_hist_clear(ns->obj_size_hist, 0, cf_atomic32_get(ns->obj_size_hist_max)); + + // The "now" used for all expiration and eviction. + uint32_t now = as_record_void_time_get(); + + // Get the histogram range - used by all histograms. 
+ uint32_t ttl_range = (uint32_t)get_ttl_range(ns, now); + + linear_hist_clear(ns->ttl_hist, now, ttl_range); + + uint64_t n_expired_records = 0; + uint64_t n_0_void_time_records = 0; + + uint32_t num_sets = cf_vmapx_count(ns->p_sets_vmap); + + bool sets_protected = false; + + // Giving this max possible size to spare us checking each record's + // set-id during index reduce. + bool sets_not_evicting[AS_SET_MAX_COUNT + 1]; + + memset(sets_not_evicting, 0, sizeof(sets_not_evicting)); + + for (uint32_t j = 0; j < num_sets; j++) { + uint32_t set_id = j + 1; + + clear_set_obj_size_hist(ns, set_id); + clear_set_ttl_hist(ns, set_id, now, ttl_range); + + as_set* p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, j, (void**)&p_set) != CF_VMAPX_OK) { + cf_crash(AS_NSUP, "failed to get set index %u from vmap", j); + } + + if (IS_SET_EVICTION_DISABLED(p_set)) { + sets_not_evicting[set_id] = true; + sets_protected = true; + } + } + + uint64_t n_evicted_records = 0; + uint32_t evict_ttl = 0; + uint32_t n_general_waits = 0; + + // Check whether or not we need to do general eviction. + + if (eval_hwm_breached(ns)) { + // Eviction is necessary. + + linear_hist_clear(ns->obj_size_hist, 0, cf_atomic32_get(ns->obj_size_hist_max)); + linear_hist_reset(ns->evict_hist, now, ttl_range, ns->evict_hist_buckets); + linear_hist_clear(ns->ttl_hist, now, ttl_range); + + for (uint32_t j = 0; j < num_sets; j++) { + uint32_t set_id = j + 1; + + linear_hist_clear(ns->set_obj_size_hists[set_id], 0, cf_atomic32_get(ns->obj_size_hist_max)); + linear_hist_clear(ns->set_ttl_hists[set_id], now, ttl_range); + } + + evict_prep_info cb_info1; + + memset(&cb_info1, 0, sizeof(cb_info1)); + cb_info1.ns = ns; + cb_info1.sets_not_evicting = sets_not_evicting; + + // Reduce master partitions, building histograms to calculate + // general eviction threshold. + reduce_master_partitions(ns, evict_prep_reduce_cb, &cb_info1, &n_general_waits, "evict-prep"); + + n_0_void_time_records = cb_info1.num_0_void_time; + + evict_info cb_info2; + + memset(&cb_info2, 0, sizeof(cb_info2)); + cb_info2.ns = ns; + cb_info2.now = now; + cb_info2.sets_not_evicting = sets_not_evicting; + + // Determine general eviction threshold. + if (get_threshold(ns, &cb_info2.evict_void_time)) { + // Save the eviction depth in the device header(s) so it can + // be used to speed up cold start, etc. + as_storage_save_evict_void_time(ns, cb_info2.evict_void_time); + + // Reduce master partitions, deleting records up to + // threshold. (This automatically deletes expired records.) + reduce_master_partitions(ns, evict_reduce_cb, &cb_info2, &n_general_waits, "evict"); + + evict_ttl = cb_info2.evict_void_time - now; + n_evicted_records = cb_info2.num_evicted; + } + else if (sets_protected || cb_info2.evict_void_time == now) { + // Convert eviction into expiration. + cb_info2.evict_void_time = now; + + // Reduce master partitions, deleting expired records, + // including those in eviction-protected sets. + reduce_master_partitions(ns, evict_reduce_cb, &cb_info2, &n_general_waits, "expire-protected-sets"); + + // Count these as expired rather than evicted, since we can. + n_expired_records = cb_info2.num_evicted; + } + + // For now there's no get_info() call for evict_hist. + //linear_hist_save_info(ns->evict_hist); + } + else { + // Eviction is not necessary, only expiration. + + expire_info cb_info; + + memset(&cb_info, 0, sizeof(cb_info)); + cb_info.ns = ns; + cb_info.now = now; + + // Reduce master partitions, deleting expired records. 
+ reduce_master_partitions(ns, expire_reduce_cb, &cb_info, &n_general_waits, "expire"); + + n_expired_records = cb_info.num_expired; + n_0_void_time_records = cb_info.num_0_void_time; + } + + linear_hist_dump(ns->obj_size_hist); + linear_hist_save_info(ns->obj_size_hist); + linear_hist_dump(ns->ttl_hist); + linear_hist_save_info(ns->ttl_hist); + + for (uint32_t j = 0; j < num_sets; j++) { + uint32_t set_id = j + 1; + + linear_hist_dump(ns->set_obj_size_hists[set_id]); + linear_hist_save_info(ns->set_obj_size_hists[set_id]); + linear_hist_dump(ns->set_ttl_hists[set_id]); + linear_hist_save_info(ns->set_ttl_hists[set_id]); + } + + // Wait for delete queue to clear. + uint32_t n_clear_waits = 0; + + while (cf_queue_sz(g_p_nsup_delete_q) > 0) { + usleep(DELETE_Q_CLEAR_SLEEP_us); + n_clear_waits++; + } + + update_stats(ns, linear_hist_get_total(ns->ttl_hist) + n_0_void_time_records, n_0_void_time_records, + n_expired_records, n_evicted_records, evict_ttl, + n_general_waits, n_clear_waits, start_ms); + + // Garbage-collect long-expired proles, one partition per loop. + if (g_config.prole_extra_ttl != 0) { + prole_pids[i] = garbage_collect_next_prole_partition(ns, prole_pids[i]); + } + } + } + + return NULL; +} + +//------------------------------------------------ +// Namespace stop-writes thread "run" function. +// +void * +run_stop_writes(void *arg) +{ + while (true) { + sleep(EVAL_STOP_WRITES_PERIOD); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + eval_stop_writes(g_config.namespaces[ns_ix]); + } + } + + return NULL; +} + +//------------------------------------------------ +// Start supervisor threads. +// +void +as_nsup_start() +{ + // Seed the random number generator. + srand(time(NULL)); + + // Create queue for nsup-generated deletions. + g_p_nsup_delete_q = cf_queue_create(sizeof(record_delete_info), true); + + cf_info(AS_NSUP, "starting namespace supervisor threads"); + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + // Start thread to handle all nsup-generated deletions. + if (0 != pthread_create(&thread, &attrs, run_nsup_delete, NULL)) { + cf_crash(AS_NSUP, "nsup delete thread create failed"); + } + + // Start namespace supervisor thread to do expiration & eviction. + if (0 != pthread_create(&thread, &attrs, run_nsup, NULL)) { + cf_crash(AS_NSUP, "nsup thread create failed"); + } + + // Start thread to do stop-writes evaluation. + if (0 != pthread_create(&thread, &attrs, run_stop_writes, NULL)) { + cf_crash(AS_NSUP, "nsup stop-writes thread create failed"); + } +} + + +//========================================================== +// Local helpers. +// + +static bool +eval_stop_writes(as_namespace *ns) +{ + // Compute the high-watermark. + uint64_t mem_stop_writes = (ns->memory_size * ns->stop_writes_pct) / 100; + + // Compute device available percent for namespace. + int device_avail_pct = 0; + + as_storage_stats(ns, &device_avail_pct, NULL); + + // Compute memory usage for namespace. + uint64_t index_sz = ns->n_objects * as_index_size_get(ns); + uint64_t tombstone_index_sz = ns->n_tombstones * as_index_size_get(ns); + uint64_t sindex_sz = ns->n_bytes_sindex_memory; + uint64_t data_in_memory_sz = ns->n_bytes_memory; + uint64_t memory_sz = index_sz + tombstone_index_sz + data_in_memory_sz + sindex_sz; + + // Possible reasons for eviction or stopping writes. 
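+	// (The table below is indexed directly by the why_stopped bitmask set
+	// further down: bit 0x1 = memory, 0x2 = device-avail-pct, 0x4 = xdr-log,
+	// so e.g. reasons[0x5] reads "(memory & xdr-log)".)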
+ static const char* reasons[] = { + NULL, // 0x0 + "(memory)", // 0x1 + "(device-avail-pct)", // 0x2 + "(memory & device-avail-pct)", // 0x3 (0x1 | 0x2) + "(xdr-log)", // 0x4 + "(memory & xdr-log)", // 0x5 (0x1 | 0x4) + "(device-avail-pct & xdr-log)", // 0x6 (0x2 | 0x4) + "(memory & device-avail-pct & xdr-log)" // 0x7 (0x1 | 0x2 | 0x4) + }; + + // Check if the writes should be stopped. + bool stop_writes = false; + uint32_t why_stopped = 0x0; + + if (memory_sz > mem_stop_writes) { + stop_writes = true; + why_stopped = 0x1; + } + + if (device_avail_pct < (int)ns->storage_min_avail_pct) { + stop_writes = true; + why_stopped |= 0x2; + } + + if (is_xdr_digestlog_low(ns)) { + stop_writes = true; + why_stopped |= 0x4; + } + + if (stop_writes) { + cf_warning(AS_NSUP, "{%s} breached stop-writes limit %s, memory sz:%lu (%lu + %lu) limit:%lu, disk avail-pct:%d", + ns->name, reasons[why_stopped], + memory_sz, index_sz, data_in_memory_sz, mem_stop_writes, + device_avail_pct); + } + else { + cf_debug(AS_NSUP, "{%s} stop-writes limit not breached, memory sz:%lu (%lu + %lu) limit:%lu, disk avail-pct:%d", + ns->name, + memory_sz, index_sz, data_in_memory_sz, mem_stop_writes, + device_avail_pct); + } + + cf_atomic32_set(&ns->stop_writes, stop_writes ? 1 : 0); + + return stop_writes; +} + +static bool +eval_hwm_breached(as_namespace *ns) +{ + // Compute the high-watermark - memory. + uint64_t mem_hwm = (ns->memory_size * ns->hwm_memory_pct) / 100; + + // Compute the high-watermark - disk. + uint64_t ssd_hwm = (ns->ssd_size * ns->hwm_disk_pct) / 100; + + // Compute disk usage for namespace. + uint64_t used_disk_sz = 0; + + as_storage_stats(ns, NULL, &used_disk_sz); + + // Compute memory usage for namespace. + uint64_t index_sz = ns->n_objects * as_index_size_get(ns); + uint64_t tombstone_index_sz = ns->n_tombstones * as_index_size_get(ns); + uint64_t sindex_sz = ns->n_bytes_sindex_memory; + uint64_t data_in_memory_sz = ns->n_bytes_memory; + uint64_t memory_sz = index_sz + tombstone_index_sz + data_in_memory_sz + sindex_sz; + + // Possible reasons for eviction. + // (We don't use all combinations, but in case we change our minds...) + static const char* reasons[] = { + NULL, "(memory)", "(disk)", "(memory & disk)" + }; + + // Check if either high water mark is breached. + bool hwm_breached = false; + uint32_t how_breached = 0x0; + + if (memory_sz > mem_hwm) { + hwm_breached = true; + how_breached = 0x1; + } + + if (used_disk_sz > ssd_hwm) { + hwm_breached = true; + how_breached |= 0x2; + } + + if (hwm_breached) { + cf_warning(AS_NSUP, "{%s} breached eviction hwm %s, memory sz:%lu (%lu + %lu) hwm:%lu, disk sz:%lu hwm:%lu", + ns->name, reasons[how_breached], + memory_sz, index_sz, data_in_memory_sz, mem_hwm, + used_disk_sz, ssd_hwm); + } + else { + cf_debug(AS_NSUP, "{%s} neither eviction hwm breached, memory sz:%lu (%lu + %lu) hwm:%lu, disk sz:%lu hwm:%lu", + ns->name, + memory_sz, index_sz, data_in_memory_sz, mem_hwm, + used_disk_sz, ssd_hwm); + } + + cf_atomic32_set(&ns->hwm_breached, hwm_breached ? 1 : 0); + + return hwm_breached; +} diff --git a/as/src/base/thr_query.c b/as/src/base/thr_query.c new file mode 100644 index 00000000..e9973f7b --- /dev/null +++ b/as/src/base/thr_query.c @@ -0,0 +1,3383 @@ +/* + * thr_query.c + * + * Copyright (C) 2012-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * This code is responsible for query execution. Each query arrives as a
+ * query transaction (qtr) for the query threads to execute. A query has two
+ * parts:
+ * a) Generator :  Walks the Aerospike secondary index B-tree, creates the
+ *                 digest list and queues it up for LOOKUP / UDF / AGGREGATION.
+ * b) Aggregator:  Does the required processing of each record and sends the
+ *                 response back to the client.
+ *      LOOKUP:      Reads the record from disk and, for the records selected
+ *                   by the query, packs them into the buffer and returns it
+ *                   to the client.
+ *      UDF:         Reads the record from disk, applies the UDF per the
+ *                   query, packs the result into the buffer and returns it
+ *                   to the client.
+ *      AGGREGATION: Creates an istream (over the digest list) and an ostream
+ *                   (over the network buffer) and applies the aggregator
+ *                   functions. For a single query this can be called multiple
+ *                   times. The istream interface takes care of partition
+ *                   reservation, record opening/closing and object lock
+ *                   synchronization, all of which is driven by
+ *                   as_stream_read / as_stream_write from inside the
+ *                   aggregation UDF. The ostream sends batched results to
+ *                   the client.
+ *
+ * Note that all these parts can be performed in a single thread context or
+ * by different sets of threads. For namespaces with data on disk, I/O is
+ * performed separately in different I/O pools.
+ *
+ * Flow of code looks like
+ *
+ * 1. thr_tsvc()
+ *
+ *                ---------------------------------> query_generator
+ *               /                     /|\      |
+ *  as_query ----                       |       |   qtr released
+ *  (sets up qtr) \    qtr reserved     |      \|/
+ *                 ----------------> g_query_q ------> query_th
+ *
+ *
+ * 2. Query Threads
+ *                          ---------------------------------> qwork_process
+ *                         /                       /|\      |
+ *  query_generator -------                         |       |  qtr released
+ *  (sets up qwork)        \    qtr reserved        |      \|/
+ *                          --------------> g_query_work_queue -> query_th
+ *
+ *
+ *
+ * 3. I/O threads
+ *                                query_process_ioreq  --> query_io
+ *                               /
+ *  qwork_process ----------------query_process_udfreq --> internal txn
+ *                               \
+ *                                query_process_aggreq --> ag_aggr_process
+ *
+ *  (Releases all the resources qtr and qwork if allocated)
+ *
+ * A query may execute single-threaded or multi-threaded. In single-threaded
+ * execution all the functions are called in one thread context and no queue
+ * is involved. In the multi-threaded case the qtr is set up by thr_tsvc and
+ * picked up by the query threads, which either service it in a single thread
+ * or queue it up for the I/O worker threads (generally done when data is on
+ * SSD).
+ */
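A minimal sketch of the generator/worker handoff described above, using the cf_queue create/push/pop calls that appear later in this file. The work type, struct and processing are stand-ins for illustration, not this patch's actual qwork routines:

// Illustrative two-stage pipeline, assuming cf_queue's blocking-pop
// semantics (cf_queue_pop with CF_QUEUE_FOREVER returns CF_QUEUE_OK).
typedef struct example_qwork_s {
	int type;  // stand-in for QUERY_WORK_TYPE_*
	void *qtr; // reserved query transaction
} example_qwork;

static cf_queue *g_example_work_q;

// At startup: g_example_work_q = cf_queue_create(sizeof(example_qwork), true);

// Generator side: reserve the qtr, package a work item, enqueue it.
static void
example_generate(void *qtr)
{
	example_qwork work = { .type = 0, .qtr = qtr };

	cf_queue_push(g_example_work_q, &work);
}

// Worker side: block on the queue, process, release the reservation.
static void *
example_worker(void *unused)
{
	for (;;) {
		example_qwork work;

		if (cf_queue_pop(g_example_work_q, &work, CF_QUEUE_FOREVER) != CF_QUEUE_OK) {
			break;
		}
		// ... process work.qtr according to work.type, then release it.
	}

	return NULL;
}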
+
+#include "base/thr_query.h"
+
+// (The system header names below were lost when this patch was extracted;
+// this is a plausible reconstruction based on the symbols this file uses.)
+#include <errno.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aerospike/as_buffer.h"
+#include "aerospike/as_integer.h"
+#include "aerospike/as_list.h"
+#include "aerospike/as_map.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_serializer.h"
+#include "aerospike/as_stream.h"
+#include "aerospike/as_string.h"
+#include "aerospike/as_rec.h"
+#include "aerospike/as_val.h"
+#include "aerospike/mod_lua.h"
+#include "citrusleaf/cf_ll.h"
+#include "citrusleaf/cf_rchash.h"
+
+#include "ai_btree.h"
+#include "bt.h"
+#include "bt_iterator.h"
+
+#include "base/aggr.h"
+#include "base/as_stap.h"
+#include "base/datamodel.h"
+#include "base/predexp.h"
+#include "base/proto.h"
+#include "base/secondary_index.h"
+#include "base/stats.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction.h"
+#include "base/udf_memtracker.h"
+#include "base/udf_record.h"
+#include "fabric/fabric.h"
+#include "fabric/partition.h"
+#include "geospatial/geospatial.h"
+#include "transaction/udf.h"
+
+
+/*
+ * Query Transaction State
+ */
+// **************************************************************************************************
+typedef enum {
+	AS_QTR_STATE_INIT = 0,
+	AS_QTR_STATE_RUNNING = 1,
+	AS_QTR_STATE_ABORT = 2,
+	AS_QTR_STATE_ERR = 3,
+	AS_QTR_STATE_DONE = 4,
+} qtr_state;
+// **************************************************************************************************
+
+/*
+ * Query Transaction Type
+ */
+// **************************************************************************************************
+typedef enum {
+	QUERY_TYPE_LOOKUP = 0,
+	QUERY_TYPE_AGGR = 1,
+	QUERY_TYPE_UDF_BG = 2,
+	QUERY_TYPE_UDF_FG = 3,
+
+	QUERY_TYPE_UNKNOWN = -1
+} query_type;
+
+
+
+/*
+ * Query Transaction Structure
+ */
+// **************************************************************************************************
+typedef struct as_query_transaction_s {
+
+	/*
+	 * MT (Read Only) No protection required
+	 */
+	/************************** Query Parameter ********************************/
+	uint64_t trid;
+	as_namespace * ns;
+	char * setname;
+	as_sindex * si;
+	as_sindex_range * srange;
+	query_type job_type; // Job type [LOOKUP/AGG/UDF]
+	bool no_bin_data;
+	predexp_eval_t * predexp_eval;
+	cf_vector * binlist;
+	as_file_handle * fd_h; // ref counted nonetheless
+	/************************** Run Time Data *********************************/
+	bool blocking;
+	uint32_t priority;
+	uint64_t start_time; // Start time
+	uint64_t end_time; // timeout value
+
+	/*
+	 * MT (Single Writer / Single Threaded / Multiple Readers)
+	 * Atomics or no Protection
+	 */
+	/****************** Stats (only generator) ***********************/
+	uint64_t querying_ai_time_ns; // Time spent by the query looking up secondary index trees.
+	uint32_t n_digests; // Digests picked from the secondary index,
+						// including records read
+	bool short_running;
+	bool track;
+
+	/*
+	 * MT (Multiple Writers)
+	 * These fields either need to be atomic or protected by a lock.
+	 */
+	/****************** Stats (worker threads) ***********************/
+	cf_atomic64 n_result_records; // Number of records returned as result -
+								  // if an aggregation returns 1 record, the
+								  // count is 1, irrespective of the number
+								  // of records touched.
+	cf_atomic64 net_io_bytes;
+	cf_atomic64 n_read_success;
+
+	/********************** Query Progress ***********************************/
+	cf_atomic32 n_qwork_active;
+	cf_atomic32 n_io_outstanding;
+	cf_atomic32 n_udf_tr_queued; // Throttling: number of UDF transactions in flight
+
+	/********************* Net IO packet order *******************************/
+	cf_atomic32 netio_push_seq;
+	cf_atomic32 netio_pop_seq;
+
+	/********************** IO Buf Builder ***********************************/
+	pthread_mutex_t buf_mutex;
+	cf_buf_builder * bb_r;
+	/****************** Query State and Result Code **************************/
+	pthread_mutex_t slock;
+	bool do_requeue;
+	qtr_state state;
+	int result_code;
+
+	/********************* Fields Not Memzeroed **********************
+	 *
+	 * Empirically, some of the following fields *still* require memzero
+	 * initialization. Before moving a field into this uninitialized
+	 * section, please test with a memset(qtr, 0xff, sizeof(*qtr)) right
+	 * after allocation, before any initialization.
+	 *
+	 * NB: Read Only or Single threaded
+	 */
+	struct ai_obj bkey;
+	as_aggr_call agg_call; // Stream UDF Details
+	iudf_origin origin; // Record UDF Details
+	bool is_durable_delete; // enterprise only
+	as_sindex_qctx qctx; // Secondary Index details
+	as_partition_reservation * rsv;
+} as_query_transaction;
+// **************************************************************************************************
+
+
+
+/*
+ * Query Request Type
+ */
+// **************************************************************************************************
+typedef enum {
+	QUERY_WORK_TYPE_NONE = -1, // No work type
+	QUERY_WORK_TYPE_LOOKUP = 0, // Request for I/O
+	QUERY_WORK_TYPE_AGG = 1, // Request for Aggregation
+	QUERY_WORK_TYPE_UDF_BG = 2, // Request for running UDF on query result
+} query_work_type;
+// **************************************************************************************************
+
+
+/*
+ * Query Request
+ */
+// **************************************************************************************************
+typedef struct query_work_s {
+	query_work_type type;
+	as_query_transaction * qtr;
+	cf_ll * recl;
+	uint64_t queued_time_ns;
+} query_work;
+// **************************************************************************************************
+
+
+/*
+ * Job Monitoring
+ */
+// **************************************************************************************************
+typedef struct query_jobstat_s {
+	int index;
+	as_mon_jobstat ** jobstat;
+	int max_size;
+} query_jobstat;
+// **************************************************************************************************
+
+/*
+ * Skey list
+ */
+// **************************************************************************************************
+typedef struct qtr_skey_s {
+	as_query_transaction * qtr;
+	as_sindex_key * skey;
+} qtr_skey;
+// **************************************************************************************************
+
+
+/*
+ * Query Engine Global
+ */
+// **************************************************************************************************
+static int g_current_queries_count = 0;
+static pthread_rwlock_t g_query_lock
+		= PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP;
+static cf_rchash * g_query_job_hash = NULL;
+// Buf Builder Pool
+static cf_queue * g_query_response_bb_pool = 0;
+static cf_queue * g_query_qwork_pool = 0;
+pthread_mutex_t g_query_pool_mutex = PTHREAD_MUTEX_INITIALIZER;
+as_query_transaction * g_query_pool_head = NULL;
+//
+// GENERATOR
+static pthread_t      g_query_threads[AS_QUERY_MAX_THREADS];
+static pthread_attr_t g_query_th_attr;
+static cf_queue     * g_query_short_queue = 0;
+static cf_queue     * g_query_long_queue = 0;
+static cf_atomic32    g_query_threadcnt = 0;
+
+cf_atomic32           g_query_short_running = 0;
+cf_atomic32           g_query_long_running = 0;
+
+// I/O & AGGREGATOR
+static pthread_t      g_query_worker_threads[AS_QUERY_MAX_WORKER_THREADS];
+static pthread_attr_t g_query_worker_th_attr;
+static cf_queue     * g_query_work_queue = 0;
+static cf_atomic32    g_query_worker_threadcnt = 0;
+// **************************************************************************************************
+
+/*
+ * Extern Functions
+ */
+// **************************************************************************************************
+
+extern cf_vector * as_sindex_binlist_from_msg(as_namespace *ns, as_msg *msgp, int * numbins);
+
+// **************************************************************************************************
+
+/*
+ * Forward Declaration
+ */
+// **************************************************************************************************
+
+static void qtr_finish_work(as_query_transaction *qtr, cf_atomic32 *stat, char *fname, int lineno, bool release);
+
+// **************************************************************************************************
+
+/*
+ * Histograms
+ */
+// **************************************************************************************************
+histogram * query_txn_q_wait_hist;      // Histogram to track time spent in the transaction queue. If this backs up,
+                                        // the transaction queue is busy. Check whether query-in-transaction is
+                                        // set, from the query perspective.
+histogram * query_query_q_wait_hist;    // Histogram to track time spent waiting in the queue for a query thread.
+                                        // If the query queue is backing up, try increasing query threads, in case
+                                        // the CPU is not fully utilized or the system is not I/O bound.
+histogram * query_prepare_batch_hist;   // Histogram to track time spent preparing batches. If high, the secondary
+                                        // index is slow. Check whether the batch is too big.
+histogram * query_batch_io_q_wait_hist; // Histogram to track time spent waiting in the queue for a worker thread.
+histogram * query_batch_io_hist;        // Histogram to track time spent doing I/O per batch. This includes the
+                                        // priority-based sleep after n units of work.
+                                        // For the above two: the query worker thread is busy; if not I/O bound,
+                                        // try bumping up the priority. The query thread may be yielding too much.
+histogram * query_net_io_hist;          // Histogram to track time spent sending results to the client. A network problem,
+                                        // or the client is too slow.
+
+#define QUERY_HIST_INSERT_DATA_POINT(type, start_time_ns)        \
+do {                                                             \
+	if (g_config.query_enable_histogram && start_time_ns != 0) { \
+		if (type) {                                              \
+			histogram_insert_data_point(type, start_time_ns);    \
+		}                                                        \
+	}                                                            \
+} while (0)
+
+#define QUERY_HIST_INSERT_RAW(type, time_ns)                     \
+do {                                                             \
+	if (g_config.query_enable_histogram && time_ns != 0) {       \
+		if (type) {                                              \
+			histogram_insert_raw(type, time_ns);                 \
+		}                                                        \
+	}                                                            \
+} while (0)
+
+// **************************************************************************************************
+
+
+/*
+ * Query Locks
+ */
+// **************************************************************************************************
+static void
+qtr_lock(as_query_transaction *qtr) {
+	if (qtr) {
+		pthread_mutex_lock(&qtr->slock);
+	}
+}
+static void
+qtr_unlock(as_query_transaction *qtr) {
+	if (qtr) {
+		pthread_mutex_unlock(&qtr->slock);
+	}
+}
+// **************************************************************************************************
+
+
+/*
+ * Query Transaction Pool
+ */
+// **************************************************************************************************
+static as_query_transaction *
+qtr_alloc()
+{
+	pthread_mutex_lock(&g_query_pool_mutex);
+
+	as_query_transaction * qtr;
+
+	if (!g_query_pool_head) {
+		qtr = cf_rc_alloc(sizeof(as_query_transaction));
+	} else {
+		qtr = g_query_pool_head;
+		g_query_pool_head = * (as_query_transaction **) qtr;
+		--g_query_pool_count;
+		cf_rc_reserve(qtr);
+	}
+
+	pthread_mutex_unlock(&g_query_pool_mutex);
+	return qtr;
+}
+
+static void
+qtr_free(as_query_transaction * qtr)
+{
+	pthread_mutex_lock(&g_query_pool_mutex);
+
+	if (g_query_pool_count >= AS_QUERY_MAX_QTR_POOL) {
+		cf_rc_free(qtr);
+	}
+	else {
+		// Use the initial location as a next pointer.
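+		// Illustrative note: the pool is an intrusive LIFO free list,
+		// threaded through the first pointer-sized bytes of each freed
+		// qtr, so no separate list nodes are ever allocated:
+		//
+		//   g_query_pool_head -> [qtr A] -> [qtr B] -> NULL
+		//
+		// qtr_alloc() above pops from the head; the lines below push onto it.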
+		* (as_query_transaction **) qtr = g_query_pool_head;
+		g_query_pool_head = qtr;
+		++g_query_pool_count;
+	}
+
+	pthread_mutex_unlock(&g_query_pool_mutex);
+}
+// **************************************************************************************************
+
+
+/*
+ * Bufbuilder buffer pool
+ */
+// **************************************************************************************************
+static int
+bb_poolrelease(cf_buf_builder *bb_r)
+{
+	int ret = AS_QUERY_OK;
+	if ((cf_queue_sz(g_query_response_bb_pool) > g_config.query_bufpool_size)
+			|| g_config.query_buf_size != cf_buf_builder_size(bb_r)) {
+		cf_detail(AS_QUERY, "Freed buffer of size %zu", bb_r->alloc_sz + sizeof(as_msg));
+		cf_buf_builder_free(bb_r);
+	} else {
+		cf_detail(AS_QUERY, "Pushed %p %"PRIu64" %d ", bb_r, g_config.query_buf_size, cf_buf_builder_size(bb_r));
+		cf_queue_push(g_query_response_bb_pool, &bb_r);
+	}
+	return ret;
+}
+
+static cf_buf_builder *
+bb_poolrequest()
+{
+	cf_buf_builder *bb_r;
+	int rv = cf_queue_pop(g_query_response_bb_pool, &bb_r, CF_QUEUE_NOWAIT);
+	if (rv == CF_QUEUE_EMPTY) {
+		bb_r = cf_buf_builder_create_size(g_config.query_buf_size);
+		if (!bb_r) {
+			cf_crash(AS_QUERY, "Allocation Error in Buf builder Pool !!");
+		}
+	} else if (rv == CF_QUEUE_OK) {
+		bb_r->used_sz = 0;
+		cf_detail(AS_QUERY, "Popped %p", bb_r);
+	} else {
+		cf_warning(AS_QUERY, "Failed to find response buffer in the pool %d", rv);
+		return NULL;
+	}
+	return bb_r;
+}
+// **************************************************************************************************
+
+/*
+ * Query Request Pool
+ */
+// **************************************************************************************************
+static int
+qwork_poolrelease(query_work *qwork)
+{
+	if (!qwork) return AS_QUERY_OK;
+	qwork->qtr  = 0;
+	qwork->type = QUERY_WORK_TYPE_NONE;
+
+	if (cf_queue_sz(g_query_qwork_pool) < AS_QUERY_MAX_QREQ) {
+		cf_detail(AS_QUERY, "Pushed qwork %p", qwork);
+		cf_queue_push(g_query_qwork_pool, &qwork);
+	} else {
+		cf_detail(AS_QUERY, "Freed qwork %p", qwork);
+		cf_free(qwork);
+	}
+	return AS_QUERY_OK;
+}
+
+static query_work *
+qwork_poolrequest()
+{
+	query_work *qwork = NULL;
+	int rv = cf_queue_pop(g_query_qwork_pool, &qwork, CF_QUEUE_NOWAIT);
+	if (rv == CF_QUEUE_EMPTY) {
+		qwork = cf_malloc(sizeof(query_work));
+		memset(qwork, 0, sizeof(query_work));
+	} else if (rv != CF_QUEUE_OK) {
+		cf_warning(AS_QUERY, "Failed to find query work in the pool");
+		return NULL;
+	}
+	qwork->qtr  = 0;
+	qwork->type = QUERY_WORK_TYPE_NONE;
+	return qwork;
+}
+// **************************************************************************************************
+
+
+/*
+ * Query State set/get function
+ */
+// **************************************************************************************************
+static void
+qtr_set_running(as_query_transaction *qtr) {
+	qtr_lock(qtr);
+	if (qtr->state == AS_QTR_STATE_INIT) {
+		qtr->state = AS_QTR_STATE_RUNNING;
+	} else {
+		cf_crash(AS_QUERY, "Invalid Query state %d while moving to running state ...", qtr->state);
+	}
+	qtr_unlock(qtr);
+}
+
+/*
+ * Query in non-init state (picked up by the generator) means it is
+ * running. Could be RUNNING/ABORT/ERR/DONE.
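+ *
+ * State transitions, as implied by the setters below:
+ *   INIT -> RUNNING          (qtr_set_running)
+ *   RUNNING -> DONE | ERR    (qtr_set_done / qtr_set_err)
+ *   RUNNING | DONE -> ABORT  (qtr_set_abort)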
+ */
+static bool
+qtr_started(as_query_transaction *qtr) {
+	qtr_lock(qtr);
+	bool started = false;
+	if (qtr->state != AS_QTR_STATE_INIT) {
+		started = true;
+	}
+	qtr_unlock(qtr);
+	return started;
+}
+
+static void
+qtr_set_abort(as_query_transaction *qtr, int result_code, char *fname, int lineno)
+{
+	qtr_lock(qtr);
+	if (qtr->state == AS_QTR_STATE_RUNNING
+			|| qtr->state == AS_QTR_STATE_DONE) {
+		cf_debug(AS_QUERY, "Query %p Aborted at %s:%d", qtr, fname, lineno);
+		qtr->state       = AS_QTR_STATE_ABORT;
+		qtr->result_code = result_code;
+	}
+	qtr_unlock(qtr);
+}
+
+static void
+qtr_set_err(as_query_transaction *qtr, int result_code, char *fname, int lineno)
+{
+	qtr_lock(qtr);
+	if (qtr->state == AS_QTR_STATE_RUNNING) {
+		cf_debug(AS_QUERY, "Query %p Error at %s:%d", qtr, fname, lineno);
+		qtr->state       = AS_QTR_STATE_ERR;
+		qtr->result_code = result_code;
+	}
+	qtr_unlock(qtr);
+}
+
+static void
+qtr_set_done(as_query_transaction *qtr, int result_code, char *fname, int lineno)
+{
+	qtr_lock(qtr);
+	if (qtr->state == AS_QTR_STATE_RUNNING) {
+		cf_debug(AS_QUERY, "Query %p Done at %s:%d", qtr, fname, lineno);
+		qtr->state       = AS_QTR_STATE_DONE;
+		qtr->result_code = result_code;
+	}
+	qtr_unlock(qtr);
+}
+
+static bool
+qtr_failed(as_query_transaction *qtr)
+{
+	qtr_lock(qtr);
+	bool abort = false;
+	if ((qtr->state == AS_QTR_STATE_ABORT)
+			|| (qtr->state == AS_QTR_STATE_ERR)) {
+		abort = true;
+	}
+	qtr_unlock(qtr);
+	return abort;
+}
+
+static bool
+qtr_is_abort(as_query_transaction *qtr)
+{
+	qtr_lock(qtr);
+	bool abort = false;
+	if (qtr->state == AS_QTR_STATE_ABORT) {
+		abort = true;
+	}
+	qtr_unlock(qtr);
+	return abort;
+}
+
+
+static bool
+qtr_finished(as_query_transaction *qtr)
+{
+	qtr_lock(qtr);
+	bool finished = false;
+	if ((qtr->state == AS_QTR_STATE_DONE)
+			|| (qtr->state == AS_QTR_STATE_ERR)
+			|| (qtr->state == AS_QTR_STATE_ABORT)) {
+		finished = true;
+	}
+	qtr_unlock(qtr);
+	return finished;
+}
+
+static void
+query_check_timeout(as_query_transaction *qtr)
+{
+	if ((qtr)
+			&& (qtr->end_time != 0)
+			&& (cf_getns() > qtr->end_time)) {
+		cf_debug(AS_QUERY, "Query Timed-out %lu %lu", cf_getns(), qtr->end_time);
+		qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_TIMEOUT, __FILE__, __LINE__);
+	}
+}
+// **************************************************************************************************
+
+
+/*
+ * Query Destructor Function
+ */
+// **************************************************************************************************
+static void
+query_release_prereserved_partitions(as_query_transaction * qtr)
+{
+	if (!qtr) {
+		cf_warning(AS_QUERY, "qtr is NULL");
+		return;
+	}
+	if (qtr->qctx.partitions_pre_reserved) {
+		for (int i = 0; i < AS_PARTITIONS; i++) {
+			if (qtr->qctx.can_partition_query[i]) {
+				as_partition_release(&qtr->rsv[i]);
+			}
+		}
+		if (qtr->rsv) {
+			cf_free(qtr->rsv);
+		}
+	}
+}
+
+/*
+ * NB: These stats come into the picture only if the query actually started
+ * running.
If it fails before even running it is accounted in + * fail + */ +static inline void +query_update_stats(as_query_transaction *qtr) +{ + uint64_t rows = cf_atomic64_get(qtr->n_result_records); + + switch (qtr->job_type) { + case QUERY_TYPE_LOOKUP: + if (qtr->state == AS_QTR_STATE_ABORT) { + cf_atomic64_incr(&qtr->ns->n_lookup_abort); + } else if (qtr->state == AS_QTR_STATE_ERR) { + cf_atomic64_incr(&(qtr->si->stats.lookup_errs)); + cf_atomic64_incr(&qtr->ns->n_lookup_errs); + } + if (!qtr_failed(qtr)) + cf_atomic64_incr(&qtr->ns->n_lookup_success); + cf_atomic64_incr(&qtr->si->stats.n_lookup); + cf_atomic64_add(&qtr->si->stats.lookup_response_size, qtr->net_io_bytes); + cf_atomic64_add(&qtr->si->stats.lookup_num_records, rows); + cf_atomic64_add(&qtr->ns->lookup_response_size, qtr->net_io_bytes); + cf_atomic64_add(&qtr->ns->lookup_num_records, rows); + break; + + case QUERY_TYPE_AGGR: + if (qtr->state == AS_QTR_STATE_ABORT) { + cf_atomic64_incr(&qtr->ns->n_agg_abort); + } else if (qtr->state == AS_QTR_STATE_ERR) { + cf_atomic64_incr(&(qtr->si->stats.agg_errs)); + cf_atomic64_incr(&qtr->ns->n_agg_errs); + } + if (!qtr_failed(qtr)) + cf_atomic64_incr(&qtr->ns->n_agg_success); + cf_atomic64_incr(&qtr->si->stats.n_aggregation); + cf_atomic64_add(&qtr->si->stats.agg_response_size, qtr->net_io_bytes); + cf_atomic64_add(&qtr->si->stats.agg_num_records, rows); + cf_atomic64_add(&qtr->ns->agg_response_size, qtr->net_io_bytes); + cf_atomic64_add(&qtr->ns->agg_num_records, rows); + break; + + case QUERY_TYPE_UDF_BG: + if (qtr_failed(qtr)) { + cf_atomic64_incr(&qtr->ns->n_query_udf_bg_failure); + } else { + cf_atomic64_incr(&qtr->ns->n_query_udf_bg_success); + } + break; + + default: + cf_crash(AS_QUERY, "Unknown Query Type !!"); + break; + } + + // Can't use macro that tr and rw use. + qtr->ns->query_hist_active = true; + cf_hist_track_insert_data_point(qtr->ns->query_hist, qtr->start_time); + + SINDEX_HIST_INSERT_DATA_POINT(qtr->si, query_hist, qtr->start_time); + + if (qtr->querying_ai_time_ns) { + QUERY_HIST_INSERT_RAW(query_prepare_batch_hist, qtr->querying_ai_time_ns); + } + + if (qtr->n_digests) { + SINDEX_HIST_INSERT_RAW(qtr->si, query_rcnt_hist, qtr->n_digests); + if (rows) { + // Can't use macro that tr and rw use. + qtr->ns->query_rec_count_hist_active = true; + histogram_insert_raw(qtr->ns->query_rec_count_hist, rows); + + SINDEX_HIST_INSERT_RAW(qtr->si, query_diff_hist, qtr->n_digests - rows); + } + } + + + + uint64_t query_stop_time = cf_getns(); + uint64_t elapsed_us = (query_stop_time - qtr->start_time) / 1000; + cf_detail(AS_QUERY, + "Total time elapsed %"PRIu64" us, %"PRIu64" of %d read operations avg latency %"PRIu64" us", + elapsed_us, rows, qtr->n_digests, rows > 0 ? 
elapsed_us / rows : 0);
+}
+
+static void
+query_run_teardown(as_query_transaction *qtr)
+{
+	query_update_stats(qtr);
+
+	if (qtr->n_udf_tr_queued != 0) {
+		cf_warning(AS_QUERY, "QUEUED UDF not equal to zero when query transaction is done");
+	}
+
+	if (qtr->qctx.recl) {
+		cf_ll_reduce(qtr->qctx.recl, true /*forward*/, as_index_keys_ll_reduce_fn, NULL);
+		cf_free(qtr->qctx.recl);
+		qtr->qctx.recl = NULL;
+	}
+
+	if (qtr->short_running) {
+		cf_atomic32_decr(&g_query_short_running);
+	} else {
+		cf_atomic32_decr(&g_query_long_running);
+	}
+
+	// Release all the partitions
+	query_release_prereserved_partitions(qtr);
+
+
+	if (qtr->bb_r) {
+		bb_poolrelease(qtr->bb_r);
+		qtr->bb_r = NULL;
+	}
+
+	pthread_mutex_destroy(&qtr->buf_mutex);
+}
+
+static void
+query_teardown(as_query_transaction *qtr)
+{
+	if (qtr->srange)       as_sindex_range_free(&qtr->srange);
+	if (qtr->si)           AS_SINDEX_RELEASE(qtr->si);
+	if (qtr->binlist)      cf_vector_destroy(qtr->binlist);
+	if (qtr->setname)      cf_free(qtr->setname);
+	if (qtr->predexp_eval) predexp_destroy(qtr->predexp_eval);
+	if (qtr->job_type == QUERY_TYPE_AGGR && qtr->agg_call.def.arglist) {
+		as_list_destroy(qtr->agg_call.def.arglist);
+	}
+	else if (qtr->job_type == QUERY_TYPE_UDF_BG) {
+		iudf_origin_destroy(&qtr->origin);
+	}
+	pthread_mutex_destroy(&qtr->slock);
+}
+
+static void
+query_release_fd(as_file_handle *fd_h, bool force_close)
+{
+	if (fd_h) {
+		fd_h->fh_info &= ~FH_INFO_DONOT_REAP;
+		fd_h->last_used = cf_getms();
+		as_end_of_transaction(fd_h, force_close);
+	}
+}
+
+static void
+query_transaction_done(as_query_transaction *qtr)
+{
+
+#if defined(USE_SYSTEMTAP)
+	uint64_t nodeid = g_config.self_node;
+#endif
+
+	if (!qtr)
+		return;
+
+	ASD_QUERY_TRANS_DONE(nodeid, qtr->trid, (void *) qtr);
+
+	if (qtr_started(qtr)) {
+		query_run_teardown(qtr);
+	}
+
+
+	// If the query was aborted, force-close the connection -
+	// it is not to be reused.
+	query_release_fd(qtr->fd_h, qtr_is_abort(qtr));
+	qtr->fd_h = NULL;
+	query_teardown(qtr);
+
+	ASD_QUERY_QTR_FREE(nodeid, qtr->trid, (void *) qtr);
+
+	qtr_free(qtr);
+}
+// **************************************************************************************************
+
+
+/*
+ * Query Transaction Ref Counts
+ */
+// **************************************************************************************************
+int
+qtr_release(as_query_transaction *qtr, char *fname, int lineno)
+{
+	if (qtr) {
+		int val = cf_rc_release(qtr);
+		if (val == 0) {
+			cf_detail(AS_QUERY, "Released qtr [%s:%d] %p %d ", fname, lineno, qtr, val);
+			query_transaction_done(qtr);
+		}
+		cf_detail(AS_QUERY, "Released qtr [%s:%d] %p %d ", fname, lineno, qtr, val);
+	}
+	return AS_QUERY_OK;
+}
+
+static int
+qtr_reserve(as_query_transaction *qtr, char *fname, int lineno)
+{
+	if (!qtr) {
+		return AS_QUERY_ERR;
+	}
+	int val = cf_rc_reserve(qtr);
+	cf_detail(AS_QUERY, "Reserved qtr [%s:%d] %p %d ", fname, lineno, qtr, val);
+	return AS_QUERY_OK;
+}
+// **************************************************************************************************
+
+
+/*
+ * Async Network IO Entry Point
+ */
+// **************************************************************************************************
+/* Callback function to determine whether the I/O should go ahead or not.
+ * Purpose:
+ * 1. If our sequence number does not match, requeue.
+ * 2. If the query was aborted, fail the I/O.
+ * 3. In all other cases let the I/O go through. That means that
+ *    if an I/O is queued, it will be done before the fin with error
+ *    result_code is sent!
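+ *
+ * Illustrative example of the ordering (based on the code below):
+ * netio_pop_seq starts at 1; an I/O carrying seq 2 or 3 keeps returning
+ * AS_NETIO_CONTINUE (requeue) until the I/O with seq 1 finishes and bumps
+ * netio_pop_seq, so response packets always reach the client in push order.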
+ */
+int
+query_netio_start_cb(void *udata, int seq)
+{
+	as_netio *io = (as_netio *)udata;
+	as_query_transaction *qtr = (as_query_transaction *)io->data;
+	cf_detail(AS_QUERY, "Netio Started_CB %d %d %d %d ", io->offset, io->seq, qtr->netio_pop_seq, qtr->state);
+
+	// All the packets must be sent in sequence -
+	// a packet can be requeued after being half sent.
+	if (seq > cf_atomic32_get(qtr->netio_pop_seq)) {
+		return AS_NETIO_CONTINUE;
+	}
+
+	if (qtr_is_abort(qtr)) {
+		return AS_QUERY_ERR;
+	}
+
+	return AS_NETIO_OK;
+}
+
+/*
+ * Called after the I/O on the network has been done.
+ * 1. If the I/O completed successfully, bump up the sequence number and
+ *    update stats.
+ * 2. Release the qtr if something fails ... which triggers the fin
+ *    packet send and eventually frees up the qtr.
+ *    Abort is set if something goes wrong.
+ */
+int
+query_netio_finish_cb(void *data, int retcode)
+{
+	as_netio *io = (as_netio *)data;
+	cf_detail(AS_QUERY, "Query Finish Callback io seq %d with retCode %d", io->seq, retcode);
+	as_query_transaction *qtr = (as_query_transaction *)io->data;
+	if (qtr && (retcode != AS_NETIO_CONTINUE)) {
+		// If the send succeeded, update the stat.
+		if (retcode == AS_NETIO_OK) {
+			cf_atomic64_add(&qtr->net_io_bytes, io->bb_r->used_sz + 8);
+		} else {
+			qtr_set_abort(qtr, AS_PROTO_RESULT_FAIL_QUERY_NETIO_ERR, __FILE__, __LINE__);
+		}
+		QUERY_HIST_INSERT_DATA_POINT(query_net_io_hist, io->start_time);
+
+		// Undo the increment from query_netio(). Cannot reach zero here: the
+		// increment owned by the transaction will only be undone after all netio
+		// is complete.
+		cf_rc_release(io->fd_h);
+		io->fd_h = NULL;
+		bb_poolrelease(io->bb_r);
+
+		cf_atomic32_incr(&qtr->netio_pop_seq);
+
+		qtr_finish_work(qtr, &qtr->n_io_outstanding, __FILE__, __LINE__, true);
+	}
+	return retcode;
+}
+
+#define MAX_OUTSTANDING_IO_REQ 2
+static int
+query_netio_wait(as_query_transaction *qtr)
+{
+	return (cf_atomic32_get(qtr->n_io_outstanding) > MAX_OUTSTANDING_IO_REQ) ?
AS_QUERY_ERR : AS_QUERY_OK;
+}
+
+// Returns AS_NETIO_OK always
+static int
+query_netio(as_query_transaction *qtr)
+{
+#if defined(USE_SYSTEMTAP)
+	uint64_t nodeid = g_config.self_node;
+#endif
+
+	ASD_QUERY_NETIO_STARTING(nodeid, qtr->trid);
+
+	as_netio io;
+
+	io.finish_cb = query_netio_finish_cb;
+	io.start_cb  = query_netio_start_cb;
+
+	qtr_reserve(qtr, __FILE__, __LINE__);
+	io.data      = qtr;
+
+	io.bb_r      = qtr->bb_r;
+	qtr->bb_r    = NULL;
+
+	cf_rc_reserve(qtr->fd_h);
+	io.fd_h      = qtr->fd_h;
+
+	io.offset    = 0;
+
+	cf_atomic32_incr(&qtr->n_io_outstanding);
+	io.seq       = cf_atomic32_incr(&qtr->netio_push_seq);
+	io.start_time = cf_getns();
+
+	int ret      = as_netio_send(&io, false, qtr->blocking);
+	qtr->bb_r    = bb_poolrequest();
+	cf_buf_builder_reserve(&qtr->bb_r, 8, NULL);
+
+	ASD_QUERY_NETIO_FINISHED(nodeid, qtr->trid);
+
+	return ret;
+}
+// **************************************************************************************************
+
+
+/*
+ * Query Reservation Abstraction
+ */
+// **************************************************************************************************
+// Returns NULL if the partition with id 'pid' is not query-able. Otherwise,
+// if all the partitions were reserved upfront, returns the rsv used for reserving the partition;
+// else reserves the partition and returns rsv.
+as_partition_reservation *
+query_reserve_partition(as_namespace * ns, as_query_transaction * qtr, uint32_t pid, as_partition_reservation * rsv)
+{
+	if (qtr->qctx.partitions_pre_reserved) {
+		if (!qtr->qctx.can_partition_query[pid]) {
+			cf_debug(AS_QUERY, "Got a digest in the rec list which does not belong to a query-able partition.");
+			return NULL;
+		}
+		return &qtr->rsv[pid];
+	}
+
+	// Works for scan aggregation
+	if (!rsv) {
+		cf_warning(AS_QUERY, "rsv is null while reserving partition.");
+		return NULL;
+	}
+
+	if (0 != as_partition_reserve_query(ns, pid, rsv)) {
+		return NULL;
+	}
+
+	return rsv;
+}
+
+void
+query_release_partition(as_query_transaction * qtr, as_partition_reservation * rsv)
+{
+	if (!qtr->qctx.partitions_pre_reserved) {
+		as_partition_release(rsv);
+	}
+}
+
+// Pre-reserves query-able partitions
+void
+as_query_pre_reserve_partitions(as_query_transaction * qtr)
+{
+	if (!qtr) {
+		cf_warning(AS_QUERY, "qtr is NULL");
+		return;
+	}
+	if (qtr->qctx.partitions_pre_reserved) {
+		qtr->rsv = cf_malloc(sizeof(as_partition_reservation) * AS_PARTITIONS);
+		as_partition_prereserve_query(qtr->ns, qtr->qctx.can_partition_query, qtr->rsv);
+	} else {
+		qtr->rsv = NULL;
+	}
+}
+
+// **************************************************************************************************
+
+
+/*
+ * Query tracking
+ */
+// **************************************************************************************************
+// Put the qtr in a global hash
+static int
+hash_put_qtr(as_query_transaction * qtr)
+{
+	if (!qtr->track) {
+		return AS_QUERY_CONTINUE;
+	}
+
+	int rc = cf_rchash_put_unique(g_query_job_hash, &qtr->trid, sizeof(qtr->trid), qtr);
+	if (rc) {
+		cf_warning(AS_SINDEX, "QTR Put in hash failed with error %d", rc);
+	}
+
+	return rc;
+}
+
+// Get the qtr from the global hash
+static int
+hash_get_qtr(uint64_t trid, as_query_transaction ** qtr)
+{
+	int rv = cf_rchash_get(g_query_job_hash, &trid, sizeof(trid), (void **) qtr);
+	if (CF_RCHASH_OK != rv) {
+		cf_info(AS_SINDEX, "Query job with transaction id [%"PRIu64"] does not exist", trid );
+	}
+	return rv;
+}
+
+// Delete the qtr from the global hash
+static int
+hash_delete_qtr(as_query_transaction *qtr)
+{
+	if (!qtr->track) {
+		return AS_QUERY_CONTINUE;
+	}
+
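+	// Note: the hash's own reference on the qtr is presumably dropped by
+	// the rchash on delete - the matching qtr_reserve() was taken in
+	// hash_track_qtr() when the qtr was put into the hash.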
+	int rv = cf_rchash_delete(g_query_job_hash, &qtr->trid, sizeof(qtr->trid));
+	if (CF_RCHASH_OK != rv) {
+		cf_warning(AS_SINDEX, "Failed to delete qtr from query hash.");
+	}
+	return rv;
+}
+// If a query has been running for more than g_config.query_untracked_time_ms,
+// we track it;
+// otherwise we don't.
+int
+hash_track_qtr(as_query_transaction *qtr)
+{
+	if (!qtr->track) {
+		if ((cf_getns() - qtr->start_time) > (g_config.query_untracked_time_ms * 1000000)) {
+			qtr->track = true;
+			qtr_reserve(qtr, __FILE__, __LINE__);
+			int ret = hash_put_qtr(qtr);
+			if (ret != 0 && ret != AS_QUERY_CONTINUE) {
+				// Tracking should be disabled here - otherwise, at the
+				// qtr cleanup stage, some other qtr with the same
+				// trid could get cleaned up.
+				qtr->track = false;
+				qtr_release(qtr, __FILE__, __LINE__);
+				return AS_QUERY_ERR;
+			}
+		}
+	}
+	return AS_QUERY_OK;
+}
+// **************************************************************************************************
+
+
+
+/*
+ * Query Request IO functions
+ */
+// **************************************************************************************************
+/*
+ * Function query_add_response
+ *
+ * Returns -
+ *		AS_QUERY_OK  - On success.
+ *		AS_QUERY_ERR - On failure.
+ *
+ * Notes -
+ *	Basic query callback function. Fills up the client response buffer;
+ *	when the buffer is full, sends it out and then
+ *	reinitializes the buf for the next set of results.
+ *	Bails out quickly if unable to send the response back to the client.
+ *
+ *	On success, qtr->n_result_records is incremented by 1.
+ *
+ * Synchronization -
+ *		Takes a lock over qtr->buf_mutex.
+ */
+static int
+query_add_response(void *void_qtr, as_storage_rd *rd)
+{
+	as_query_transaction *qtr = (as_query_transaction *)void_qtr;
+
+	// TODO - check and handle error result (< 0 - drive IO) explicitly?
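+	// Two-pass packing, as coded below: the first
+	// as_msg_make_response_bufbuilder() call, with a NULL buf builder,
+	// only computes the response size; if that size does not fit in the
+	// space left in bb_r, the partially filled buffer is flushed via
+	// query_netio() before the second call packs the record for real.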
+	size_t msg_sz = (size_t)as_msg_make_response_bufbuilder(NULL, rd,
+			qtr->no_bin_data, true, true, qtr->binlist);
+	int ret = 0;
+
+	pthread_mutex_lock(&qtr->buf_mutex);
+	cf_buf_builder *bb_r = qtr->bb_r;
+	if (bb_r == NULL) {
+		// Assert that the query is aborted if bb_r is found to be null.
+		pthread_mutex_unlock(&qtr->buf_mutex);
+		return AS_QUERY_ERR;
+	}
+
+	if (msg_sz > (bb_r->alloc_sz - bb_r->used_sz) && bb_r->used_sz != 0) {
+		query_netio(qtr);
+	}
+
+	int32_t result = as_msg_make_response_bufbuilder(&qtr->bb_r, rd,
+			qtr->no_bin_data, true, true, qtr->binlist);
+
+	if (result < 0) {
+		ret = result;
+		cf_warning(AS_QUERY, "Weird, there is space but the packing still failed, "
+				"available = %zd msg size = %zu",
+				bb_r->alloc_sz - bb_r->used_sz, msg_sz);
+	}
+	cf_atomic64_incr(&qtr->n_result_records);
+	pthread_mutex_unlock(&qtr->buf_mutex);
+	return ret;
+}
+
+
+static int
+query_add_fin(as_query_transaction *qtr)
+{
+
+#if defined(USE_SYSTEMTAP)
+	uint64_t nodeid = g_config.self_node;
+#endif
+	cf_detail(AS_QUERY, "Adding fin %p", qtr);
+	uint8_t *b;
+	// In case of an aborted query, the bb_r has already been released.
+	if (qtr->bb_r == NULL) {
+		// Assert that the query is aborted if bb_r is found to be null.
+		return AS_QUERY_ERR;
+	}
+	cf_buf_builder_reserve(&qtr->bb_r, sizeof(as_msg), &b);
+
+	ASD_QUERY_ADDFIN(nodeid, qtr->trid);
+	// Set up the header.
+	uint8_t *buf          = b;
+	as_msg *msgp          = (as_msg *) buf;
+	msgp->header_sz       = sizeof(as_msg);
+	msgp->info1           = 0;
+	msgp->info2           = 0;
+	msgp->info3           = AS_MSG_INFO3_LAST;
+	msgp->unused          = 0;
+	msgp->result_code     = qtr->result_code;
+	msgp->generation      = 0;
+	msgp->record_ttl      = 0;
+	msgp->n_fields        = 0;
+	msgp->n_ops           = 0;
+	msgp->transaction_ttl = 0;
+	as_msg_swap_header(msgp);
+	return AS_QUERY_OK;
+}
+
+static int
+query_send_fin(as_query_transaction *qtr) {
+	// Send out the final data back.
+	if (qtr->fd_h) {
+		query_add_fin(qtr);
+		query_netio(qtr);
+	}
+	return AS_QUERY_OK;
+}
+
+static void
+query_send_bg_udf_response(as_transaction *tr)
+{
+	cf_detail(AS_QUERY, "Send Fin for Background UDF");
+	bool force_close = ! as_msg_send_fin(&tr->from.proto_fd_h->sock, AS_PROTO_RESULT_OK);
+	query_release_fd(tr->from.proto_fd_h, force_close);
+	tr->from.proto_fd_h = NULL;
+}
+
+static bool
+query_match_integer_fromval(as_query_transaction * qtr, as_val *v, as_sindex_key *skey)
+{
+	as_sindex_bin_data *start = &qtr->srange->start;
+	as_sindex_bin_data *end   = &qtr->srange->end;
+
+	if ((AS_PARTICLE_TYPE_INTEGER != as_sindex_pktype(qtr->si->imd))
+			|| (AS_PARTICLE_TYPE_INTEGER != start->type)
+			|| (AS_PARTICLE_TYPE_INTEGER != end->type)) {
+		cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s",
+				AS_PARTICLE_TYPE_INTEGER, start->type, end->type, as_sindex_pktype(qtr->si->imd),
+				qtr->si->imd->bname, qtr->si->imd->iname);
+		return false;
+	}
+	as_integer * i = as_integer_fromval(v);
+	int64_t value  = as_integer_get(i);
+	if (skey->key.int_key != value) {
+		cf_debug(AS_QUERY, "query_record_matches: sindex key does "
+				"not match bin value in record. skey %ld bin value %ld", skey->key.int_key, value);
+		return false;
+	}
+
+	return true;
+}
+
+static bool
+query_match_string_fromval(as_query_transaction * qtr, as_val *v, as_sindex_key *skey)
+{
+	as_sindex_bin_data *start = &qtr->srange->start;
+	as_sindex_bin_data *end   = &qtr->srange->end;
+
+	if ((AS_PARTICLE_TYPE_STRING != as_sindex_pktype(qtr->si->imd))
+			|| (AS_PARTICLE_TYPE_STRING != start->type)
+			|| (AS_PARTICLE_TYPE_STRING != end->type)) {
+		cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s",
+				AS_PARTICLE_TYPE_STRING, start->type, end->type, as_sindex_pktype(qtr->si->imd),
+				qtr->si->imd->bname, qtr->si->imd->iname);
+		return false;
+	}
+
+	char * str_val = as_string_get(as_string_fromval(v));
+	cf_digest str_digest;
+	cf_digest_compute(str_val, strlen(str_val), &str_digest);
+
+	if (memcmp(&str_digest, &skey->key.str_key, AS_DIGEST_KEY_SZ)) {
+		return false;
+	}
+	return true;
+}
+
+static bool
+query_match_geojson_fromval(as_query_transaction * qtr, as_val *v, as_sindex_key *skey)
+{
+	as_sindex_bin_data *start = &qtr->srange->start;
+	as_sindex_bin_data *end   = &qtr->srange->end;
+
+	if ((AS_PARTICLE_TYPE_GEOJSON != as_sindex_pktype(qtr->si->imd))
+			|| (AS_PARTICLE_TYPE_GEOJSON != start->type)
+			|| (AS_PARTICLE_TYPE_GEOJSON != end->type)) {
+		cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s",
+				AS_PARTICLE_TYPE_GEOJSON, start->type, end->type,
+				as_sindex_pktype(qtr->si->imd), qtr->si->imd->bname,
+				qtr->si->imd->iname);
+		return false;
+	}
+
+	return as_particle_geojson_match_asval(v, qtr->srange->cellid,
+			qtr->srange->region, qtr->ns->geo2dsphere_within_strict);
+}
+
+// If the value matches, the foreach should stop iterating and return false.
+bool
+query_match_mapkeys_foreach(const as_val * key, const as_val * val, void * udata)
+{
+	qtr_skey * q_s = (qtr_skey *)udata;
+	switch (key->type) {
+		case AS_STRING:
+			// If it matches, return false.
+			return !query_match_string_fromval(q_s->qtr, (as_val *)key, q_s->skey);
+		case AS_INTEGER:
+			// If it matches, return false.
+			return !query_match_integer_fromval(q_s->qtr,(as_val *) key, q_s->skey);
+		case AS_GEOJSON:
+			// If it matches, return false.
+			return !query_match_geojson_fromval(q_s->qtr,(as_val *) key, q_s->skey);
+		default:
+			// All others don't match.
+			return true;
+	}
+}
+
+static bool
+query_match_mapvalues_foreach(const as_val * key, const as_val * val, void * udata)
+{
+	qtr_skey * q_s = (qtr_skey *)udata;
+	switch (val->type) {
+		case AS_STRING:
+			// If it matches, return false.
+			return !query_match_string_fromval(q_s->qtr, (as_val *)val, q_s->skey);
+		case AS_INTEGER:
+			// If it matches, return false.
+			return !query_match_integer_fromval(q_s->qtr, (as_val *)val, q_s->skey);
+		case AS_GEOJSON:
+			// If it matches, return false.
+			return !query_match_geojson_fromval(q_s->qtr, (as_val *)val, q_s->skey);
+		default:
+			// All others don't match.
+			return true;
+	}
+}
+
+static bool
+query_match_listele_foreach(as_val * val, void * udata)
+{
+	qtr_skey * q_s = (qtr_skey *)udata;
+	switch (val->type) {
+		case AS_STRING:
+			// If it matches, return false.
+			return !query_match_string_fromval(q_s->qtr, val, q_s->skey);
+		case AS_INTEGER:
+			// If it matches, return false.
+			return !query_match_integer_fromval(q_s->qtr, val, q_s->skey);
+		case AS_GEOJSON:
+			// If it matches, return false.
+			return !query_match_geojson_fromval(q_s->qtr, val, q_s->skey);
+		default:
+			// All others don't match.
+			return true;
+	}
+}
+/*
+ * Validate the record based on its content and the query, to make sure it
+ * indeed should be
selected. The secondary index does lazy delete of the entries for records
+ * whose data is on SSD; see the sindex design doc for details. Hence it is
+ * possible that it returns digests for which the record has since changed.
+ * Do the validation before returning the row.
+ */
+static bool
+query_record_matches(as_query_transaction *qtr, as_storage_rd *rd, as_sindex_key * skey)
+{
+	// TODO: Add counters and make sure it is not a performance hit.
+	as_sindex_bin_data *start = &qtr->srange->start;
+	as_sindex_bin_data *end   = &qtr->srange->end;
+
+	// TODO: Make it more general, to support a sindex over multiple bins.
+	as_bin * b = as_bin_get_by_id(rd, qtr->si->imd->binid);
+
+	if (!b) {
+		cf_debug(AS_QUERY , "as_query_record_validation: "
+				"Bin name %s not found ", qtr->si->imd->bname);
+		// The bin may not be there anymore - the classic case of
+		// a bin delete.
+		return false;
+	}
+	uint8_t type = as_bin_get_particle_type(b);
+
+	// If the bin is of a CDT type, we need to see whether any one of the
+	// values within the CDT matches the query.
+	// This can be a performance hit for big lists and maps.
+	as_val * res_val = NULL;
+	as_val * val     = NULL;
+	bool matches     = false;
+	bool from_cdt    = false;
+	switch (type) {
+		case AS_PARTICLE_TYPE_INTEGER : {
+			if ((type != as_sindex_pktype(qtr->si->imd))
+					|| (type != start->type)
+					|| (type != end->type)) {
+				cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s",
+						type, start->type, end->type, as_sindex_pktype(qtr->si->imd),
+						qtr->si->imd->bname, qtr->si->imd->iname);
+				matches = false;
+				break;
+			}
+
+			int64_t i = as_bin_particle_integer_value(b);
+			if (skey->key.int_key != i) {
+				cf_debug(AS_QUERY, "query_record_matches: sindex key does "
+						"not match bin value in record. bin value %ld skey value %ld", i, skey->key.int_key);
+				matches = false;
+				break;
+			}
+			matches = true;
+			break;
+		}
+		case AS_PARTICLE_TYPE_STRING : {
+			if ((type != as_sindex_pktype(qtr->si->imd))
+					|| (type != start->type)
+					|| (type != end->type)) {
+				cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s",
+						type, start->type, end->type, as_sindex_pktype(qtr->si->imd),
+						qtr->si->imd->bname, qtr->si->imd->iname);
+				matches = false;
+				break;
+			}
+
+			char * buf;
+			uint32_t psz = as_bin_particle_string_ptr(b, &buf);
+			cf_digest bin_digest;
+			cf_digest_compute(buf, psz, &bin_digest);
+			if (memcmp(&skey->key.str_key, &bin_digest, AS_DIGEST_KEY_SZ)) {
+				matches = false;
+				break;
+			}
+			matches = true;
+			break;
+		}
+		case AS_PARTICLE_TYPE_GEOJSON : {
+			if ((type != as_sindex_pktype(qtr->si->imd))
+					|| (type != start->type)
+					|| (type != end->type)) {
+				cf_debug(AS_QUERY, "as_query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s",
+						type, start->type, end->type, as_sindex_pktype(qtr->si->imd),
+						qtr->si->imd->bname, qtr->si->imd->iname);
+				return false;
+			}
+
+			bool iswithin = as_particle_geojson_match(b->particle,
+					qtr->srange->cellid, qtr->srange->region,
+					qtr->ns->geo2dsphere_within_strict);
+
+			// We either found a valid point or a false positive.
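+			// The geo index yields cell-level candidates, so a digest can
+			// reach this point even though its point/region does not satisfy
+			// the query; the two counters below track hits vs. false positives.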
+			if (iswithin) {
+				cf_atomic64_incr(&qtr->ns->geo_region_query_points);
+			}
+			else {
+				cf_atomic64_incr(&qtr->ns->geo_region_query_falsepos);
+			}
+
+			return iswithin;
+		}
+		case AS_PARTICLE_TYPE_MAP : {
+			val     = as_bin_particle_to_asval(b);
+			res_val = as_sindex_extract_val_from_path(qtr->si->imd, val);
+			if (!res_val) {
+				matches = false;
+				break;
+			}
+			from_cdt = true;
+			break;
+		}
+		case AS_PARTICLE_TYPE_LIST : {
+			val     = as_bin_particle_to_asval(b);
+			res_val = as_sindex_extract_val_from_path(qtr->si->imd, val);
+			if (!res_val) {
+				matches = false;
+				break;
+			}
+			from_cdt = true;
+			break;
+		}
+		default: {
+			break;
+		}
+	}
+
+	if (from_cdt) {
+		if (res_val->type == AS_INTEGER) {
+			// Defensive check.
+			if (qtr->si->imd->itype == AS_SINDEX_ITYPE_DEFAULT) {
+				matches = query_match_integer_fromval(qtr, res_val, skey);
+			}
+			else {
+				matches = false;
+			}
+		}
+		else if (res_val->type == AS_STRING) {
+			// Defensive check.
+			if (qtr->si->imd->itype == AS_SINDEX_ITYPE_DEFAULT) {
+				matches = query_match_string_fromval(qtr, res_val, skey);
+			}
+			else {
+				matches = false;
+			}
+		}
+		else if (res_val->type == AS_MAP) {
+			qtr_skey q_s;
+			q_s.qtr  = qtr;
+			q_s.skey = skey;
+			// Defensive check.
+			if (qtr->si->imd->itype == AS_SINDEX_ITYPE_MAPKEYS) {
+				as_map * map = as_map_fromval(res_val);
+				matches = !as_map_foreach(map, query_match_mapkeys_foreach, &q_s);
+			}
+			else if (qtr->si->imd->itype == AS_SINDEX_ITYPE_MAPVALUES){
+				as_map * map = as_map_fromval(res_val);
+				matches = !as_map_foreach(map, query_match_mapvalues_foreach, &q_s);
+			}
+			else {
+				matches = false;
+			}
+		}
+		else if (res_val->type == AS_LIST) {
+			qtr_skey q_s;
+			q_s.qtr  = qtr;
+			q_s.skey = skey;
+
+			// Defensive check.
+			if (qtr->si->imd->itype == AS_SINDEX_ITYPE_LIST) {
+				as_list * list = as_list_fromval(res_val);
+				matches = !as_list_foreach(list, query_match_listele_foreach, &q_s);
+			}
+			else {
+				matches = false;
+			}
+		}
+	}
+
+	if (val) {
+		as_val_destroy(val);
+	}
+	return matches;
+}
+
+
+
+static int
+query_io(as_query_transaction *qtr, cf_digest *dig, as_sindex_key * skey)
+{
+#if defined(USE_SYSTEMTAP)
+	uint64_t nodeid = g_config.self_node;
+#endif
+
+	as_namespace * ns = qtr->ns;
+	as_partition_reservation rsv_stack;
+	as_partition_reservation * rsv = &rsv_stack;
+
+	// While making the digest list, we make sure the current partition is
+	// query-able. Attempt the query reservation here as well: if this
+	// partition is not query-able anymore, there is no need to return anything.
+	// Since we reserve all the partitions upfront, this is a defensive check.
+	uint32_t pid = as_partition_getid(dig);
+	rsv = query_reserve_partition(ns, qtr, pid, rsv);
+	if (!rsv) {
+		return AS_QUERY_OK;
+	}
+
+	ASD_QUERY_IO_STARTING(nodeid, qtr->trid);
+
+	as_index_ref r_ref;
+	r_ref.skip_lock = false;
+	int rec_rv = as_record_get_live(rsv->tree, dig, &r_ref, ns);
+
+	if (rec_rv == 0) {
+		as_index *r = r_ref.r;
+
+		predexp_args_t predargs = { .ns = ns, .md = r, .vl = NULL, .rd = NULL };
+
+		if (qtr->predexp_eval &&
+				! predexp_matches_metadata(qtr->predexp_eval, &predargs)) {
+			as_record_done(&r_ref, ns);
+			goto CLEANUP;
+		}
+
+		// Check to see whether this is a record waiting to die.
+		if (as_record_is_doomed(r, ns)) {
+			as_record_done(&r_ref, ns);
+			cf_debug(AS_QUERY,
+					"build_response: record expired. treat as not found");
+			// Not sending an error message to the client, per the agreement
+			// that the server will never send an error result code to the query client.
+ goto CLEANUP; + } + + // make sure it's brought in from storage if necessary + as_storage_rd rd; + as_storage_record_open(ns, r, &rd); + qtr->n_read_success += 1; + + // TODO - even if qtr->no_bin_data is true, we still read bins in order + // to check via query_record_matches() below. If sindex evolves to not + // have to do that, optimize this case and bypass reading bins. + + as_storage_rd_load_n_bins(&rd); // TODO - handle error returned + + // Note: This array must stay in scope until the response + // for this record has been built, since in the get + // data w/ record on device case, it's copied by + // reference directly into the record descriptor! + as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins]; + + // Figure out which bins you want - for now, all + as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned + rd.n_bins = as_bin_inuse_count(&rd); + + // Now we have a record. + predargs.rd = &rd; + + if (qtr->predexp_eval && + ! predexp_matches_record(qtr->predexp_eval, &predargs)) { + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + goto CLEANUP; + } + + // Call Back + if (!query_record_matches(qtr, &rd, skey)) { + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + query_release_partition(qtr, rsv); + cf_atomic64_incr(&g_stats.query_false_positives); + ASD_QUERY_IO_NOTMATCH(nodeid, qtr->trid); + return AS_QUERY_OK; + } + + int ret = query_add_response(qtr, &rd); + if (ret != 0) { + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_CBERROR, __FILE__, __LINE__); + query_release_partition(qtr, rsv); + ASD_QUERY_IO_ERROR(nodeid, qtr->trid); + return AS_QUERY_ERR; + } + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + } else { + // What do we do about empty records? + // 1. Should gin up an empty record + // 2. Current error is returned back to the client. 
+ cf_detail(AS_QUERY, "query_generator: " + "as_record_get returned %d : key %"PRIx64, rec_rv, + *(uint64_t *)dig); + } +CLEANUP : + query_release_partition(qtr, rsv); + + ASD_QUERY_IO_FINISHED(nodeid, qtr->trid); + + return AS_QUERY_OK; +} +// ************************************************************************************************** + +/* + * Query Aggregation Request Workhorse Function + */ +// ************************************************************************************************** +static int +query_add_val_response(void *void_qtr, const as_val *val, bool success) +{ + as_query_transaction *qtr = (as_query_transaction *)void_qtr; + if (!qtr) { + return AS_QUERY_ERR; + } + + uint32_t msg_sz = as_particle_asval_client_value_size(val); + if (0 == msg_sz) { + cf_warning(AS_PROTO, "particle to buf: could not copy data!"); + } + + pthread_mutex_lock(&qtr->buf_mutex); + cf_buf_builder *bb_r = qtr->bb_r; + if (bb_r == NULL) { + // Assert that query is aborted if bb_r is found to be null + pthread_mutex_unlock(&qtr->buf_mutex); + return AS_QUERY_ERR; + } + + if (msg_sz > (bb_r->alloc_sz - bb_r->used_sz) && bb_r->used_sz != 0) { + query_netio(qtr); + } + + as_msg_make_val_response_bufbuilder(val, &qtr->bb_r, msg_sz, success); + cf_atomic64_incr(&qtr->n_result_records); + + pthread_mutex_unlock(&qtr->buf_mutex); + return 0; +} + + +static void +query_add_result(char *res, as_query_transaction *qtr, bool success) +{ + const as_val * v = (as_val *) as_string_new (res, false); + query_add_val_response((void *) qtr, v, success); + as_val_destroy(v); +} + + +static int +query_process_aggreq(query_work *qagg) +{ + as_query_transaction *qtr = qagg->qtr; + if (!qtr) { + return AS_QUERY_ERR; + } + + if (!cf_ll_size(qagg->recl)) { + return AS_QUERY_ERR; + } + + as_result *res = as_result_new(); + int ret = as_aggr_process(qtr->ns, &qtr->agg_call, qagg->recl, (void *)qtr, res); + + if (ret != 0) { + char *rs = as_module_err_string(ret); + if (res->value != NULL) { + as_string * lua_s = as_string_fromval(res->value); + char * lua_err = (char *) as_string_tostring(lua_s); + if (lua_err != NULL) { + int l_rs_len = strlen(rs); + rs = cf_realloc(rs,l_rs_len + strlen(lua_err) + 4); + sprintf(&rs[l_rs_len]," : %s",lua_err); + } + } + query_add_result(rs, qtr, false); + cf_free(rs); + } + as_result_destroy(res); + return ret; +} +// ************************************************************************************************** + + +/* + * Aggregation HOOKS + */ +// ************************************************************************************************** +as_stream_status +agg_ostream_write(void *udata, as_val *v) +{ + as_query_transaction *qtr = (as_query_transaction *)udata; + if (!v) { + return AS_STREAM_OK; + } + int ret = AS_STREAM_OK; + if (query_add_val_response((void *)qtr, v, true)) { + ret = AS_STREAM_ERR; + } + as_val_destroy(v); + return ret; +} + +static as_partition_reservation * +agg_reserve_partition(void *udata, as_namespace *ns, uint32_t pid, as_partition_reservation *rsv) +{ + return query_reserve_partition(ns, (as_query_transaction *)udata, pid, rsv); +} + +static void +agg_release_partition(void *udata, as_partition_reservation *rsv) +{ + query_release_partition((as_query_transaction *)udata, rsv); +} + +static void +agg_set_error(void * udata, int err) +{ + qtr_set_err((as_query_transaction *)udata, AS_PROTO_RESULT_FAIL_QUERY_CBERROR, __FILE__, __LINE__); +} + +// true if matches +static bool +agg_record_matches(void *udata, udf_record *urecord, void *key_data) +{ + 
as_query_transaction * qtr = (as_query_transaction*)udata;
+	as_sindex_key *skey = (void *)key_data;
+	qtr->n_read_success++;
+	if (query_record_matches(qtr, urecord->rd, skey) == false) {
+		cf_atomic64_incr(&g_stats.query_false_positives); // TODO: put this inside pre_check
+		return false;
+	}
+	return true;
+}
+
+const as_aggr_hooks query_aggr_hooks = {
+	.ostream_write = agg_ostream_write,
+	.set_error     = agg_set_error,
+	.ptn_reserve   = agg_reserve_partition,
+	.ptn_release   = agg_release_partition,
+	.pre_check     = agg_record_matches
+};
+// **************************************************************************************************
+
+
+
+
+
+/*
+ * Query Request UDF functions
+ */
+// **************************************************************************************************
+// NB: The caller holds a write hash lock - _BE_CAREFUL_ if you intend to take
+// a lock inside this function.
+int
+query_udf_bg_tr_complete(void *udata, int retcode)
+{
+	as_query_transaction *qtr = (as_query_transaction *)udata;
+	if (!qtr) {
+		cf_warning(AS_QUERY, "Complete called with invalid job id");
+		return AS_QUERY_ERR;
+	}
+
+	qtr_finish_work(qtr, &qtr->n_udf_tr_queued, __FILE__, __LINE__, true);
+	return AS_QUERY_OK;
+}
+
+// Creates an internal transaction for per-record UDF execution triggered
+// from inside the generator. The generator could be a scan job generating
+// digests or a query generating digests.
+int
+query_udf_bg_tr_start(as_query_transaction *qtr, cf_digest *keyd)
+{
+	if (qtr->origin.predexp) {
+		as_partition_reservation rsv_stack;
+		as_partition_reservation *rsv = &rsv_stack;
+		uint32_t pid = as_partition_getid(keyd);
+
+		if (! (rsv = query_reserve_partition(qtr->ns, qtr, pid, rsv))) {
+			return AS_QUERY_OK;
+		}
+
+		as_index_ref r_ref;
+		r_ref.skip_lock = false;
+
+		if (as_record_get_live(rsv->tree, keyd, &r_ref, qtr->ns) != 0) {
+			query_release_partition(qtr, rsv);
+			return AS_QUERY_OK;
+		}
+
+		predexp_args_t predargs = {
+				.ns = qtr->ns, .md = r_ref.r, .vl = NULL, .rd = NULL
+		};
+
+		if (qtr->origin.predexp &&
+				!
predexp_matches_metadata(qtr->origin.predexp, &predargs)) { + as_record_done(&r_ref, qtr->ns); + query_release_partition(qtr, rsv); + return AS_QUERY_OK; + } + + as_record_done(&r_ref, qtr->ns); + query_release_partition(qtr, rsv); + } + + as_transaction tr; + + as_transaction_init_iudf(&tr, qtr->ns, keyd, &qtr->origin, qtr->is_durable_delete); + + qtr_reserve(qtr, __FILE__, __LINE__); + cf_atomic32_incr(&qtr->n_udf_tr_queued); + + as_tsvc_enqueue(&tr); + + return AS_QUERY_OK; +} + +static int +query_process_udfreq(query_work *qudf) +{ + int ret = AS_QUERY_OK; + cf_ll_element * ele = NULL; + cf_ll_iterator * iter = NULL; + as_query_transaction *qtr = qudf->qtr; + if (!qtr) return AS_QUERY_ERR; + cf_detail(AS_QUERY, "Performing UDF"); + iter = cf_ll_getIterator(qudf->recl, true /*forward*/); + if (!iter) { + ret = AS_QUERY_ERR; + qtr_set_err(qtr, AS_SINDEX_ERR_NO_MEMORY, __FILE__, __LINE__); + goto Cleanup; + } + + while ((ele = cf_ll_getNext(iter))) { + as_index_keys_ll_element * node; + node = (as_index_keys_ll_element *) ele; + as_index_keys_arr * keys_arr = node->keys_arr; + if (!keys_arr) { + continue; + } + node->keys_arr = NULL; + + for (int i = 0; i < keys_arr->num; i++) { + + while (cf_atomic32_get(qtr->n_udf_tr_queued) >= (AS_QUERY_MAX_UDF_TRANSACTIONS * (qtr->priority / 10 + 1))) { + usleep(g_config.query_sleep_us); + query_check_timeout(qtr); + if (qtr_failed(qtr)) { + ret = AS_QUERY_ERR; + goto Cleanup; + } + } + + if (AS_QUERY_ERR == query_udf_bg_tr_start(qtr, &keys_arr->pindex_digs[i])) { + as_index_keys_release_arr_to_queue(keys_arr); + ret = AS_QUERY_ERR; + goto Cleanup; + } + } + as_index_keys_release_arr_to_queue(keys_arr); + } +Cleanup: + if (iter) { + cf_ll_releaseIterator(iter); + iter = NULL; + } + return ret; +} +// ************************************************************************************************** + + + + +static int +query_process_ioreq(query_work *qio) +{ + +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + + as_query_transaction *qtr = qio->qtr; + if (!qtr) { + return AS_QUERY_ERR; + } + + ASD_QUERY_IOREQ_STARTING(nodeid, qtr->trid); + + cf_ll_element * ele = NULL; + cf_ll_iterator * iter = NULL; + + cf_detail(AS_QUERY, "Performing IO"); + uint64_t time_ns = 0; + if (g_config.query_enable_histogram || qtr->si->enable_histogram) { + time_ns = cf_getns(); + } + iter = cf_ll_getIterator(qio->recl, true /*forward*/); + if (!iter) { + cf_crash(AS_QUERY, "Cannot allocate iterator... 
out of memory !!"); + } + + while ((ele = cf_ll_getNext(iter))) { + as_index_keys_ll_element * node; + node = (as_index_keys_ll_element *) ele; + as_index_keys_arr *keys_arr = node->keys_arr; + if (!keys_arr) { + continue; + } + node->keys_arr = NULL; + for (int i = 0; i < keys_arr->num; i++) { + if (AS_QUERY_OK != query_io(qtr, &keys_arr->pindex_digs[i], &keys_arr->sindex_keys[i])) { + as_index_keys_release_arr_to_queue(keys_arr); + goto Cleanup; + } + + int64_t nresults = cf_atomic64_get(qtr->n_result_records); + if (nresults > 0 && (nresults % qtr->priority == 0)) + { + usleep(g_config.query_sleep_us); + query_check_timeout(qtr); + if (qtr_failed(qtr)) { + as_index_keys_release_arr_to_queue(keys_arr); + goto Cleanup; + } + } + } + as_index_keys_release_arr_to_queue(keys_arr); + } +Cleanup: + + if (iter) { + cf_ll_releaseIterator(iter); + iter = NULL; + } + QUERY_HIST_INSERT_DATA_POINT(query_batch_io_hist, time_ns); + SINDEX_HIST_INSERT_DATA_POINT(qtr->si, query_batch_io, time_ns); + + ASD_QUERY_IOREQ_FINISHED(nodeid, qtr->trid); + + return AS_QUERY_OK; +} + +// ************************************************************************************************** + + +/* + * Query Request Processing + */ +// ************************************************************************************************** +static int +qwork_process(query_work *qworkp) +{ + QUERY_HIST_INSERT_DATA_POINT(query_batch_io_q_wait_hist, qworkp->queued_time_ns); + cf_detail(AS_QUERY, "Processing Request %d", qworkp->type); + if (qtr_failed(qworkp->qtr)) { + return AS_QUERY_ERR; + } + int ret = AS_QUERY_OK; + switch (qworkp->type) { + case QUERY_WORK_TYPE_LOOKUP: + ret = query_process_ioreq(qworkp); + break; + case QUERY_WORK_TYPE_UDF_BG: // Does it need different call ?? + ret = query_process_udfreq(qworkp); + break; + case QUERY_WORK_TYPE_AGG: + ret = query_process_aggreq(qworkp); + break; + default: + cf_warning(AS_QUERY, "Unsupported query type %d.. 
Dropping it", qworkp->type);
+			break;
+	}
+	return ret;
+}
+
+static void
+qwork_setup(query_work *qworkp, as_query_transaction *qtr)
+{
+	qtr_reserve(qtr, __FILE__, __LINE__);
+	qworkp->qtr            = qtr;
+	qworkp->recl           = qtr->qctx.recl;
+	qtr->qctx.recl         = NULL;
+	qworkp->queued_time_ns = cf_getns();
+	qtr->n_digests        += qtr->qctx.n_bdigs;
+	qtr->qctx.n_bdigs      = 0;
+
+	switch (qtr->job_type) {
+		case QUERY_TYPE_LOOKUP:
+			qworkp->type = QUERY_WORK_TYPE_LOOKUP;
+			break;
+		case QUERY_TYPE_AGGR:
+			qworkp->type = QUERY_WORK_TYPE_AGG;
+			break;
+		case QUERY_TYPE_UDF_BG:
+			qworkp->type = QUERY_WORK_TYPE_UDF_BG;
+			break;
+		default:
+			cf_crash(AS_QUERY, "Unknown Query Type !!");
+	}
+}
+
+static void
+qwork_teardown(query_work *qworkp)
+{
+	if (qworkp->recl) {
+		cf_ll_reduce(qworkp->recl, true /*forward*/, as_index_keys_ll_reduce_fn, NULL);
+		cf_free(qworkp->recl);
+		qworkp->recl = NULL;
+	}
+	qtr_release(qworkp->qtr, __FILE__, __LINE__);
+	qworkp->qtr = NULL;
+}
+// **************************************************************************************************
+
+
+void *
+qwork_th(void *q_to_wait_on)
+{
+	unsigned int thread_id = cf_atomic32_incr(&g_query_worker_threadcnt);
+	cf_detail(AS_QUERY, "Created Query Worker Thread %d", thread_id);
+	query_work * qworkp = NULL;
+	int ret = AS_QUERY_OK;
+
+	while (1) {
+		// Kill self if this thread id is greater than the number of
+		// configured worker threads. A config change should set a flag
+		// for a quick check.
+		if (thread_id > g_config.query_worker_threads) {
+			pthread_rwlock_rdlock(&g_query_lock);
+			if (thread_id > g_config.query_worker_threads) {
+				cf_atomic32_decr(&g_query_worker_threadcnt);
+				pthread_rwlock_unlock(&g_query_lock);
+				cf_detail(AS_QUERY, "Query Worker thread %d exited", thread_id);
+				return NULL;
+			}
+			pthread_rwlock_unlock(&g_query_lock);
+		}
+		if (cf_queue_pop(g_query_work_queue, &qworkp, CF_QUEUE_FOREVER) != 0) {
+			cf_crash(AS_QUERY, "Failed to pop from Query worker queue.");
+		}
+		cf_detail(AS_QUERY, "Popped I/O work [%p,%p]", qworkp, qworkp->qtr);
+
+		ret = qwork_process(qworkp);
+
+		as_query_transaction *qtr = qworkp->qtr;
+		if ((ret != AS_QUERY_OK) && !qtr_failed(qtr)) {
+			cf_warning(AS_QUERY, "Request processing failed but query is not qtr_failed .... ret %d", ret);
+		}
+		qtr_finish_work(qtr, &qtr->n_qwork_active, __FILE__, __LINE__, false);
+		qwork_teardown(qworkp);
+		qwork_poolrelease(qworkp);
+	}
+
+	return NULL;
+}
+
+/*
+ * Query Generator
+ */
+// **************************************************************************************************
+/*
+ * Function query_get_nextbatch
+ *
+ * Notes -
+ *	The function generates the next batch of digests after looking up the
+ *	secondary index tree. It populates qctx->recl with the digest list.
+ *
+ * Returns
+ *	AS_QUERY_OK: If the batch is full, i.e. qctx->n_bdigs == qctx->bsize. The caller
+ *	then processes the batch and resets qctx->recl and qctx->n_bdigs.
+ *
+ *	AS_QUERY_CONTINUE: If the caller should continue calling this function.
+ * + * AS_QUERY_ERR: In case of error + */ +int +query_get_nextbatch(as_query_transaction *qtr) +{ + int ret = AS_QUERY_OK; + as_sindex *si = qtr->si; + as_sindex_qctx *qctx = &qtr->qctx; + uint64_t time_ns = 0; + if (g_config.query_enable_histogram + || qtr->si->enable_histogram) { + time_ns = cf_getns(); + } + + as_sindex_range *srange = &qtr->srange[qctx->range_index]; + + if (qctx->pimd_idx == -1) { + if (!srange->isrange) { + qctx->pimd_idx = ai_btree_key_hash_from_sbin(si->imd, &srange->start); + } else { + qctx->pimd_idx = 0; + } + } + + if (!qctx->recl) { + qctx->recl = cf_malloc(sizeof(cf_ll)); + cf_ll_init(qctx->recl, as_index_keys_ll_destroy_fn, false /*no lock*/); + qctx->n_bdigs = 0; + } else { + // Following condition may be true if the + // query has moved from short query pool to + // long running query pool + if (qctx->n_bdigs >= qctx->bsize) + return ret; + } + + // Query Aerospike Index + int qret = as_sindex_query(qtr->si, srange, &qtr->qctx); + cf_detail(AS_QUERY, "start %ld end %ld @ %d pimd found %"PRIu64, srange->start.u.i64, srange->end.u.i64, qctx->pimd_idx, qctx->n_bdigs); + + qctx->new_ibtr = false; + if (qret < 0) { // [AS_SINDEX_OK, AS_SINDEX_CONTINUE] -> OK + qtr_set_err(qtr, as_sindex_err_to_clienterr(qret, __FILE__, __LINE__), __FILE__, __LINE__); + ret = AS_QUERY_ERR; + goto batchout; + } + + if (time_ns) { + if (g_config.query_enable_histogram) { + qtr->querying_ai_time_ns += cf_getns() - time_ns; + } else if (qtr->si->enable_histogram) { + SINDEX_HIST_INSERT_DATA_POINT(qtr->si, query_batch_lookup, time_ns); + } + } + if (qctx->n_bdigs < qctx->bsize) { + qctx->new_ibtr = true; + qctx->nbtr_done = false; + qctx->pimd_idx++; + cf_detail(AS_QUERY, "All the Data finished moving to next tree %d", qctx->pimd_idx); + if (!srange->isrange) { + qtr->result_code = AS_PROTO_RESULT_OK; + ret = AS_QUERY_DONE; + goto batchout; + } + if (qctx->pimd_idx == si->imd->nprts) { + + // Geospatial queries need to search multiple ranges. The + // srange object is a vector of MAX_REGION_CELLS elements. + // We iterate over ranges until we encounter an empty + // srange (num_binval == 0). 
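+			// Illustrative example: a region covered by three cells would
+			// populate srange[0..2] and leave srange[3].num_binval == 0, so
+			// range_index advances 0 -> 1 -> 2 and the query completes when
+			// it reaches the empty slot.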
+			//
+			if (qctx->range_index == (MAX_REGION_CELLS - 1) ||
+					qtr->srange[qctx->range_index+1].num_binval == 0) {
+				qtr->result_code = AS_PROTO_RESULT_OK;
+				ret              = AS_QUERY_DONE;
+				goto batchout;
+			}
+			qctx->range_index++;
+			qctx->pimd_idx = -1;
+		}
+		ret = AS_QUERY_CONTINUE;
+		goto batchout;
+	}
+batchout:
+	return ret;
+}
+
+
+/*
+ * Phase II setup, just after the generator picks up the query for
+ * the first time
+ */
+static int
+query_run_setup(as_query_transaction *qtr)
+{
+
+#if defined(USE_SYSTEMTAP)
+	uint64_t nodeid = g_config.self_node;
+#endif
+
+	QUERY_HIST_INSERT_DATA_POINT(query_query_q_wait_hist, qtr->start_time);
+	cf_atomic64_set(&qtr->n_result_records, 0);
+	qtr->track               = false;
+	qtr->querying_ai_time_ns = 0;
+	qtr->n_io_outstanding    = 0;
+	qtr->netio_push_seq      = 0;
+	qtr->netio_pop_seq       = 1;
+	qtr->blocking            = false;
+	pthread_mutex_init(&qtr->buf_mutex, NULL);
+
+	// Aerospike Index object initialization
+	qtr->result_code = AS_PROTO_RESULT_OK;
+
+	// Initialize the qctx -
+	// start with the threshold value.
+	qtr->qctx.bsize       = g_config.query_threshold;
+	qtr->qctx.new_ibtr    = true;
+	qtr->qctx.nbtr_done   = false;
+	qtr->qctx.pimd_idx    = -1;
+	qtr->qctx.recl        = NULL;
+	qtr->qctx.n_bdigs     = 0;
+	qtr->qctx.range_index = 0;
+	qtr->qctx.partitions_pre_reserved = g_config.partitions_pre_reserved;
+	qtr->qctx.bkey = &qtr->bkey;
+	init_ai_obj(qtr->qctx.bkey);
+	bzero(&qtr->qctx.bdig, sizeof(cf_digest));
+	// Pre-reserve all the partitions on which this query is query-able.
+	as_query_pre_reserve_partitions(qtr);
+
+	qtr->priority = g_config.query_priority;
+	qtr->bb_r     = bb_poolrequest();
+	cf_buf_builder_reserve(&qtr->bb_r, 8, NULL);
+
+	qtr_set_running(qtr);
+	cf_atomic64_incr(&qtr->ns->query_short_reqs);
+	cf_atomic32_incr(&g_query_short_running);
+
+	// This needs to be distant from the initialization of nodeid to
+	// work around a lame systemtap/compiler interaction.
+	ASD_QUERY_INIT(nodeid, qtr->trid);
+
+	return AS_QUERY_OK;
+}
+
+static int
+query_qtr_enqueue(as_query_transaction *qtr, bool is_requeue)
+{
+	uint64_t limit = 0;
+	uint32_t size  = 0;
+	cf_queue    * q;
+	cf_atomic64 * queue_full_err;
+	if (qtr->short_running) {
+		limit          = g_config.query_short_q_max_size;
+		size           = cf_atomic32_get(g_query_short_running);
+		q              = g_query_short_queue;
+		queue_full_err = &qtr->ns->query_short_queue_full;
+	}
+	else {
+		limit          = g_config.query_long_q_max_size;
+		size           = cf_atomic32_get(g_query_long_running);
+		q              = g_query_long_queue;
+		queue_full_err = &qtr->ns->query_long_queue_full;
+	}
+
+	// Allow a requeue without the limit check, to cover a dynamic
+	// config change while the query is running.
+	if (!is_requeue && (size > limit)) {
+		cf_atomic64_incr(queue_full_err);
+		return AS_QUERY_ERR;
+	} else {
+		cf_queue_push(q, &qtr);
+		cf_detail(AS_QUERY, "Logged query ");
+	}
+
+	return AS_QUERY_OK;
+}
+
+int
+query_requeue(as_query_transaction *qtr)
+{
+	int ret = AS_QUERY_OK;
+	if (query_qtr_enqueue(qtr, true) != 0) {
+		cf_warning(AS_QUERY, "Queuing Error... continue!!");
continue!!"); + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_QUEUEFULL, __FILE__, __LINE__); + ret = AS_QUERY_ERR; + } else { + cf_detail(AS_QUERY, "Query Queued Due to Network"); + ret = AS_QUERY_OK; + } + return ret; +} + +static void +qtr_finish_work(as_query_transaction *qtr, cf_atomic32 *stat, char *fname, int lineno, bool release) +{ + qtr_lock(qtr); + uint32_t val = cf_atomic32_decr(stat); + if ((val == 0) && qtr->do_requeue) { + query_requeue(qtr); + cf_detail(AS_QUERY, "(%s:%d) Job Requeued %p", fname, lineno, qtr); + qtr->do_requeue = false; + } + qtr_unlock(qtr); + if (release) { + qtr_release(qtr, fname, lineno); + } +} + +// +// 0: Successfully requeued +// -1: Query Err +// 1: Not requeued continue +// 2: Query finished +// +static int +query_qtr_check_and_requeue(as_query_transaction *qtr) +{ + bool do_enqueue = false; + // Step 1: If the query batch is done then wait for number of outstanding qwork to + // finish. This may slow down query responses get the better model + if (qtr_finished(qtr)) { + if ((cf_atomic32_get(qtr->n_qwork_active) == 0) + && (cf_atomic32_get(qtr->n_io_outstanding) == 0) + && (cf_atomic32_get(qtr->n_udf_tr_queued) == 0)) { + cf_detail(AS_QUERY, "Request is finished"); + return AS_QUERY_DONE; + } + do_enqueue = true; + cf_detail(AS_QUERY, "Request not finished qwork(%d) io(%d)", cf_atomic32_get(qtr->n_qwork_active), cf_atomic32_get(qtr->n_io_outstanding)); + } + + // Step 2: Client is slow requeue + if (query_netio_wait(qtr) != AS_QUERY_OK) { + do_enqueue = true; + } + + // Step 3: Check to see if this is long running query. This is determined by + // checking number of records read. Please note that it makes sure the false + // entries in secondary index does not effect this decision. All short running + // queries perform I/O in the batch thread context. + if ((cf_atomic64_get(qtr->n_result_records) >= g_config.query_threshold) + && qtr->short_running) { + qtr->short_running = false; + // Change batch size to the long running job batch size value + qtr->qctx.bsize = g_config.query_bsize; + cf_atomic32_decr(&g_query_short_running); + cf_atomic32_incr(&g_query_long_running); + cf_atomic64_incr(&qtr->ns->query_long_reqs); + cf_atomic64_decr(&qtr->ns->query_short_reqs); + cf_detail(AS_QUERY, "Query Queued Into Long running thread pool %ld %d", cf_atomic64_get(qtr->n_result_records), qtr->short_running); + do_enqueue = true; + } + + if (do_enqueue) { + int ret = AS_QUERY_OK; + qtr_lock(qtr); + if ((cf_atomic32_get(qtr->n_qwork_active) != 0) + || (cf_atomic32_get(qtr->n_io_outstanding) != 0) + || (cf_atomic32_get(qtr->n_udf_tr_queued) != 0)) { + cf_detail(AS_QUERY, "Job Setup for Requeue %p", qtr); + + // Release of one of the above will perform requeue... 
+			qtr->do_requeue = true;
+			ret = AS_QUERY_OK;
+		} else {
+			ret = query_requeue(qtr);
+		}
+		qtr_unlock(qtr);
+		return ret;
+	}
+
+	return AS_QUERY_CONTINUE;
+}
+
+static bool
+query_process_inline(as_query_transaction *qtr)
+{
+	// Note - qtr is always non-NULL here; it is dereferenced unconditionally.
+	if (g_config.query_req_in_query_thread
+			|| (cf_atomic32_get(qtr->n_qwork_active) > g_config.query_req_max_inflight)
+			|| qtr->short_running
+			|| qtr_finished(qtr)) {
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Process the query work either inline or pass it on to the
+ * worker threads.
+ *
+ * Returns -
+ *		AS_QUERY_OK  - success
+ *		AS_QUERY_ERR - failure
+ */
+static int
+qtr_process(as_query_transaction *qtr)
+{
+	if (query_process_inline(qtr)) {
+		query_work qwork;
+		qwork_setup(&qwork, qtr);
+
+		int ret = qwork_process(&qwork);
+
+		qwork_teardown(&qwork);
+		return ret;
+	} else {
+		query_work *qworkp = qwork_poolrequest();
+		if (!qworkp) {
+			cf_warning(AS_QUERY, "Could not allocate query "
+					"request structure - out of memory. Aborting!");
+			return AS_QUERY_ERR;
+		}
+		// Successfully queued.
+		cf_atomic32_incr(&qtr->n_qwork_active);
+		qwork_setup(qworkp, qtr);
+		cf_queue_push(g_query_work_queue, &qworkp);
+	}
+	return AS_QUERY_OK;
+}
+
+static int
+query_check_bound(as_query_transaction *qtr)
+{
+	if (cf_atomic64_get(qtr->n_result_records) > g_config.query_rec_count_bound) {
+		return AS_QUERY_ERR;
+	}
+	return AS_QUERY_OK;
+}
+
+/*
+ * Function query_generator
+ *
+ * Does the following:
+ * 1. Calls the sindex layer to fetch the digest list.
+ * 2. For a short running query, performs I/O inline; for a long running
+ *    query, queues the work up for the worker threads to execute.
+ * 3. If the query is short_running and has hit the threshold, requeues it
+ *    for the long running generator threads.
+ *
+ * Returns -
+ *		Nothing; sets the qtr status accordingly.
+ */
+static void
+query_generator(as_query_transaction *qtr)
+{
+#if defined(USE_SYSTEMTAP)
+	uint64_t nodeid = g_config.self_node;
+	uint64_t trid = qtr->trid;
+	size_t nrecs = 0;
+#endif
+
+	// A query can get requeued for many different reasons. Check whether it
+	// has already started before setting it up to run.
+	if (!qtr_started(qtr)) {
+		query_run_setup(qtr);
+	}
+
+	int loop = 0;
+	while (true) {
+
+		// Step 1: Check for requeue.
+		int ret = query_qtr_check_and_requeue(qtr);
+		if (ret == AS_QUERY_ERR) {
+			cf_warning(AS_QUERY, "Unexpected requeue failure .. shutdown connection.. 
abort!!"); + qtr_set_abort(qtr, AS_PROTO_RESULT_FAIL_QUERY_NETIO_ERR, __FILE__, __LINE__); + break; + } else if (ret == AS_QUERY_DONE) { + break; + } else if (ret == AS_QUERY_OK) { + return; + } + // Step 2: Check for timeout + query_check_timeout(qtr); + if (qtr_failed(qtr)) { + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_TIMEOUT, __FILE__, __LINE__); + continue; + } + // Step 3: Conditionally track + if (hash_track_qtr(qtr)) { + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_DUPLICATE, __FILE__, __LINE__); + continue; + } + + // Step 4: If needs user based abort + if (query_check_bound(qtr)) { + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_USERABORT, __FILE__, __LINE__); + continue; + } + + // Step 5: Get Next Batch + loop++; + int qret = query_get_nextbatch(qtr); + + cf_detail(AS_QUERY, "Loop=%d, Selected=%"PRIu64", ret=%d", loop, qtr->qctx.n_bdigs, qret); + switch (qret) { + case AS_QUERY_OK: + case AS_QUERY_DONE: + break; + case AS_QUERY_ERR: + continue; + case AS_QUERY_CONTINUE: + continue; + default: + cf_warning(AS_QUERY, "Unexpected return type"); + continue; + } + + if (qret == AS_QUERY_DONE) { + // In case all physical tree is done return. if not range loop + // till less than batch size results are returned +#if defined(USE_SYSTEMTAP) + nrecs = qtr->n_result_records; +#endif + qtr_set_done(qtr, AS_PROTO_RESULT_OK, __FILE__, __LINE__); + } + + // Step 6: Prepare Query Request either to process inline or for + // queueing up for offline processing + if (qtr_process(qtr)) { + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_CBERROR, __FILE__, __LINE__); + continue; + } + } + + if (!qtr_is_abort(qtr)) { + // Send the fin packet in it is NOT a shutdown + query_send_fin(qtr); + } + // deleting it from the global hash. + hash_delete_qtr(qtr); + qtr_release(qtr, __FILE__, __LINE__); + ASD_QUERY_DONE(nodeid, trid, nrecs); +} + +/* + * Function as_query_worker + * + * Notes - + * Process one queue's Query requests. + * - Immediately fail if query has timed out + * - Maximum queries that can be served is number of threads + * + * Releases the qtr, which will call as_query_trasaction_done + * + * Synchronization - + * Takes a global query lock while + */ +void* +query_th(void* q_to_wait_on) +{ + cf_queue * query_queue = (cf_queue*)q_to_wait_on; + unsigned int thread_id = cf_atomic32_incr(&g_query_threadcnt); + cf_detail(AS_QUERY, "Query Thread Created %d", thread_id); + as_query_transaction *qtr = NULL; + + while (1) { + // Kill self if thread id is greater than that of number of configured + // thread + if (thread_id > g_config.query_threads) { + pthread_rwlock_rdlock(&g_query_lock); + if (thread_id > g_config.query_threads) { + cf_atomic32_decr(&g_query_threadcnt); + pthread_rwlock_unlock(&g_query_lock); + cf_detail(AS_QUERY, "Query thread %d exited", thread_id); + return NULL; + } + pthread_rwlock_unlock(&g_query_lock); + } + if (cf_queue_pop(query_queue, &qtr, CF_QUEUE_FOREVER) != 0) { + cf_crash(AS_QUERY, "Failed to pop from Query worker queue."); + } + + query_generator(qtr); + } + return AS_QUERY_OK; +} + +/* + * Parse the UDF OP type to find what type of UDF this is or otherwise not even + * UDF + */ +query_type +query_get_type(as_transaction* tr) +{ + if (! as_transaction_is_udf(tr)) { + return QUERY_TYPE_LOOKUP; + } + + as_msg_field *udf_op_f = as_transaction_has_udf_op(tr) ? 
+ as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_OP) : NULL; + + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_AGGREGATE) { + return QUERY_TYPE_AGGR; + } + + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_BACKGROUND) { + return QUERY_TYPE_UDF_BG; + } +/* + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_FOREGROUND) { + return QUERY_TYPE_UDF_FG; + } +*/ + return QUERY_TYPE_UNKNOWN; +} + +/* + * Function aggr_query_init + */ +int +aggr_query_init(as_aggr_call * call, as_transaction *tr) +{ + if (! udf_def_init_from_msg(&call->def, tr)) { + return AS_QUERY_ERR; + } + + call->aggr_hooks = &query_aggr_hooks; + return AS_QUERY_OK; +} + +static int +query_setup_udf_call(as_query_transaction *qtr, as_transaction *tr) +{ + switch (qtr->job_type) { + case QUERY_TYPE_LOOKUP: + cf_atomic64_incr(&qtr->ns->n_lookup); + break; + case QUERY_TYPE_AGGR: + if (aggr_query_init(&qtr->agg_call, tr) != AS_QUERY_OK) { + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + return AS_QUERY_ERR; + } + cf_atomic64_incr(&qtr->ns->n_aggregation); + break; + case QUERY_TYPE_UDF_BG: + if (! udf_def_init_from_msg(&qtr->origin.def, tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + return AS_QUERY_ERR; + } + break; + default: + cf_crash(AS_QUERY, "Invalid QUERY TYPE %d !!!", qtr->job_type); + break; + } + return AS_QUERY_OK; +} + +static void +query_setup_fd(as_query_transaction *qtr, as_transaction *tr) +{ + switch (qtr->job_type) { + case QUERY_TYPE_LOOKUP: + case QUERY_TYPE_AGGR: + qtr->fd_h = tr->from.proto_fd_h; + qtr->fd_h->fh_info |= FH_INFO_DONOT_REAP; + break; + case QUERY_TYPE_UDF_BG: + qtr->fd_h = NULL; + break; + default: + cf_crash(AS_QUERY, "Invalid QUERY TYPE %d !!!", qtr->job_type); + break; + } +} +/* + * Phase I query setup which happens just before query is queued for generator + * Populates valid qtrp in case of success and NULL in case of failure. + * All the query related parsing code sits here + * + * Returns: + * AS_QUERY_OK in case of successful + * AS_QUERY_DONE in case nothing to be like scan on non-existent set + * AS_QUERY_ERR in case of parsing failure + * + */ +static int +query_setup(as_transaction *tr, as_namespace *ns, as_query_transaction **qtrp) +{ + +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; + uint64_t trid = tr ? as_transaction_trid(tr) : 0; +#endif + + int rv = AS_QUERY_ERR; + *qtrp = NULL; + + ASD_QUERY_STARTING(nodeid, trid); + + uint64_t start_time = cf_getns(); + as_sindex *si = NULL; + cf_vector *binlist = 0; + as_sindex_range *srange = 0; + predexp_eval_t *predexp_eval = NULL; + char *setname = NULL; + as_query_transaction *qtr = NULL; + + bool has_sindex = as_sindex_ns_has_sindex(ns); + if (!has_sindex) { + tr->result_code = AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND; + cf_debug(AS_QUERY, "No Secondary Index on namespace %s", ns->name); + goto Cleanup; + } + + as_msg *m = &tr->msgp->msg; + + // TODO - still lots of redundant msg field parsing (e.g. for set) - fix. + if ((si = as_sindex_from_msg(ns, m)) == NULL) { + cf_debug(AS_QUERY, "No Index Defined in the Query"); + } + + ASD_SINDEX_MSGRANGE_STARTING(nodeid, trid); + int ret = as_sindex_rangep_from_msg(ns, m, &srange); + if (AS_QUERY_OK != ret) { + cf_debug(AS_QUERY, "Could not instantiate index range metadata... " + "Err, %s", as_sindex_err_str(ret)); + tr->result_code = as_sindex_err_to_clienterr(ret, __FILE__, __LINE__); + goto Cleanup; + } + + ASD_SINDEX_MSGRANGE_FINISHED(nodeid, trid); + // get optional set + as_msg_field *sfp = as_transaction_has_set(tr) ? 
+ as_msg_field_get(m, AS_MSG_FIELD_TYPE_SET) : NULL; + + if (sfp) { + uint32_t setname_len = as_msg_field_get_value_sz(sfp); + + if (setname_len >= AS_SET_NAME_MAX_SIZE) { + cf_warning(AS_QUERY, "set name too long"); + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + goto Cleanup; + } + + if (setname_len != 0) { + setname = cf_strndup((const char *)sfp->data, setname_len); + } + } + + if (si) { + + if (! as_sindex_can_query(si)) { + tr->result_code = as_sindex_err_to_clienterr( + AS_SINDEX_ERR_NOT_READABLE, __FILE__, __LINE__); + goto Cleanup; + } + } else { + // Look up sindex by bin in the query in case not + // specified in query + si = as_sindex_from_range(ns, setname, srange); + } + + if (as_transaction_has_predexp(tr)) { + as_msg_field * pfp = as_msg_field_get(m, AS_MSG_FIELD_TYPE_PREDEXP); + predexp_eval = predexp_build(pfp); + if (! predexp_eval) { + cf_warning(AS_QUERY, "Failed to build predicate expression"); + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + goto Cleanup; + } + } + + int numbins = 0; + // Populate binlist to be Projected by the Query + binlist = as_sindex_binlist_from_msg(ns, m, &numbins); + + // If anyone of the bin in the bin is bad, fail the query + if (numbins != 0 && !binlist) { + tr->result_code = AS_PROTO_RESULT_FAIL_INDEX_GENERIC; + goto Cleanup; + } + + if (!has_sindex || !si) { + tr->result_code = AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND; + goto Cleanup; + } + + // quick check if there is any data with the certain set name + if (setname && as_namespace_get_set_id(ns, setname) == INVALID_SET_ID) { + cf_info(AS_QUERY, "Query on non-existent set %s", setname); + tr->result_code = AS_PROTO_RESULT_OK; + rv = AS_QUERY_DONE; + goto Cleanup; + } + cf_detail(AS_QUERY, "Query on index %s ", + ((as_sindex_metadata *)si->imd)->iname); + + query_type qtype = query_get_type(tr); + if (qtype == QUERY_TYPE_UNKNOWN) { + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + rv = AS_QUERY_ERR; + goto Cleanup; + } + + if (qtype == QUERY_TYPE_AGGR && as_transaction_has_predexp(tr)) { + cf_warning(AS_QUERY, "aggregation queries do not support predexp filters"); + tr->result_code = AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE; + rv = AS_QUERY_ERR; + goto Cleanup; + } + + ASD_QUERY_QTRSETUP_STARTING(nodeid, trid); + qtr = qtr_alloc(); + if (!qtr) { + tr->result_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + goto Cleanup; + } + ASD_QUERY_QTR_ALLOC(nodeid, trid, (void *) qtr); + // Be aware of the size of qtr + // Memset it partial + memset(qtr, 0, offsetof(as_query_transaction, bkey)); + + ASD_QUERY_QTRSETUP_FINISHED(nodeid, trid); + + qtr->ns = ns; + qtr->job_type = qtype; + + if (query_setup_udf_call(qtr, tr)) { + rv = AS_QUERY_ERR; + cf_free(qtr); + goto Cleanup; + } + + query_setup_fd(qtr, tr); + + if (qtr->job_type == QUERY_TYPE_LOOKUP) { + qtr->predexp_eval = predexp_eval; + qtr->no_bin_data = (m->info1 & AS_MSG_INFO1_GET_NO_BINS) != 0; + } + else if (qtr->job_type == QUERY_TYPE_UDF_BG) { + qtr->origin.predexp = predexp_eval; + qtr->origin.cb = query_udf_bg_tr_complete; + qtr->origin.udata = (void *)qtr; + qtr->is_durable_delete = as_transaction_is_durable_delete(tr); + } + + // Consume everything from tr rest will be picked up in init + qtr->trid = as_transaction_trid(tr); + qtr->setname = setname; + qtr->si = si; + qtr->srange = srange; + qtr->binlist = binlist; + qtr->start_time = start_time; + qtr->end_time = tr->end_time; + qtr->rsv = NULL; + + rv = AS_QUERY_OK; + + pthread_mutex_init(&qtr->slock, NULL); + qtr->state = AS_QTR_STATE_INIT; + qtr->do_requeue = false; + 
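+	// A sketch of the qtr lifecycle as driven in this file, via
+	// qtr_set_running(), qtr_set_done(), qtr_set_err() and qtr_set_abort():
+	//
+	//   INIT -> RUNNING -> DONE
+	//              \-----> ERR / ABORT
+	//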
qtr->short_running = true;
+
+	*qtrp = qtr;
+	return rv;
+
+Cleanup:
+	// Pre query setup failure.
+	if (setname) cf_free(setname);
+	if (si) AS_SINDEX_RELEASE(si);
+	if (predexp_eval) predexp_destroy(predexp_eval);
+	if (srange) as_sindex_range_free(&srange);
+	if (binlist) cf_vector_destroy(binlist);
+	return rv;
+}
+
+/*
+ * Arguments -
+ *		tr - transaction coming from the client.
+ *
+ * Returns -
+ *		AS_QUERY_OK  - on success. Responds, frees msgp and proto_fd.
+ *		AS_QUERY_ERR - on failure. The query was not even started;
+ *		               frees msgp, response is the caller's responsibility.
+ *
+ * Notes -
+ *		Allocates and reserves the qtr if query_in_transaction_thr is set to
+ *		false or the data is not in memory.
+ *		Has the responsibility to free tr->msgp - either query_transaction_done
+ *		or the Cleanup path frees the msgp.
+ */
+int
+as_query(as_transaction *tr, as_namespace *ns)
+{
+	if (tr) {
+		QUERY_HIST_INSERT_DATA_POINT(query_txn_q_wait_hist, tr->start_time);
+	}
+
+	as_query_transaction *qtr;
+	int rv = query_setup(tr, ns, &qtr);
+
+	if (rv == AS_QUERY_DONE) {
+		// Send a FIN packet to the client - there is nothing to return.
+		bool force_close = ! as_msg_send_fin(&tr->from.proto_fd_h->sock, AS_PROTO_RESULT_OK);
+		query_release_fd(tr->from.proto_fd_h, force_close);
+		tr->from.proto_fd_h = NULL; // Paranoid
+		return AS_QUERY_OK;
+	} else if (rv == AS_QUERY_ERR) {
+		// tsvc takes care of managing the fd.
+		return AS_QUERY_ERR;
+	}
+
+	if (g_config.query_in_transaction_thr) {
+		if (qtr->job_type == QUERY_TYPE_UDF_BG) {
+			query_send_bg_udf_response(tr);
+		}
+		query_generator(qtr);
+	} else {
+		if (query_qtr_enqueue(qtr, false)) {
+			// This error will be accounted for by the thr_tsvc layer. Thus
+			// reset fd_h before calling qtr release, and let the
+			// transaction handler deal with the failure.
+			qtr->fd_h = NULL;
+			qtr_release(qtr, __FILE__, __LINE__);
+			tr->result_code = AS_PROTO_RESULT_FAIL_QUERY_QUEUEFULL;
+			return AS_QUERY_ERR;
+		}
+		// Respond only after queuing is successful.
+		if (qtr->job_type == QUERY_TYPE_UDF_BG) {
+			query_send_bg_udf_response(tr);
+		}
+	}
+
+	// Query engine will reply to the queued query as needed.
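+	// Note that for lookup and aggregation queries, query_setup_fd() has
+	// already pointed qtr->fd_h at this socket, so the transaction must drop
+	// its own reference here.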
+	tr->from.proto_fd_h = NULL;
+	return AS_QUERY_OK;
+}
+// **************************************************************************************************
+
+
+/*
+ * Query Utility and Monitoring functions
+ */
+// **************************************************************************************************
+
+// Find matching trid and kill the query.
+int
+as_query_kill(uint64_t trid)
+{
+	as_query_transaction *qtr;
+	int rv = hash_get_qtr(trid, &qtr);
+
+	if (rv != AS_QUERY_OK) {
+		cf_warning(AS_QUERY, "Cannot kill query with trid [%"PRIu64"]", trid);
+	} else {
+		qtr_set_abort(qtr, AS_PROTO_RESULT_FAIL_QUERY_USERABORT, __FILE__, __LINE__);
+		rv = AS_QUERY_OK;
+		qtr_release(qtr, __FILE__, __LINE__);
+	}
+
+	return rv;
+}
+
+// Find matching trid and set priority.
+int
+as_query_set_priority(uint64_t trid, uint32_t priority)
+{
+	as_query_transaction *qtr;
+	int rv = hash_get_qtr(trid, &qtr);
+
+	if (rv != AS_QUERY_OK) {
+		cf_warning(AS_QUERY, "Cannot set priority for query with trid [%"PRIu64"]", trid);
+	} else {
+		uint32_t old_priority = qtr->priority;
+		qtr->priority = priority;
+		cf_info(AS_QUERY, "Query priority changed from %d to %d", old_priority, priority);
+		rv = AS_QUERY_OK;
+		qtr_release(qtr, __FILE__, __LINE__);
+	}
+	return rv;
+}
+
+int
+as_query_list_job_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+	as_query_transaction *qtr = (as_query_transaction *)object;
+	cf_dyn_buf *db = (cf_dyn_buf *)udata;
+
+	cf_dyn_buf_append_string(db, "trid=");
+	cf_dyn_buf_append_uint64(db, qtr->trid);
+	cf_dyn_buf_append_string(db, ":job_type=");
+	cf_dyn_buf_append_int(db, qtr->job_type);
+	cf_dyn_buf_append_string(db, ":n_result_records=");
+	cf_dyn_buf_append_uint64(db, cf_atomic_int_get(qtr->n_result_records));
+	cf_dyn_buf_append_string(db, ":run_time=");
+	cf_dyn_buf_append_uint64(db, (cf_getns() - qtr->start_time) / 1000);
+	cf_dyn_buf_append_string(db, ":state=");
+	if (qtr_failed(qtr)) {
+		cf_dyn_buf_append_string(db, "ABORTED");
+	} else {
+		cf_dyn_buf_append_string(db, "RUNNING");
+	}
+	cf_dyn_buf_append_string(db, ";");
+	return AS_QUERY_OK;
+}
+
+// Lists the currently running queries.
+int
+as_query_list(char *name, cf_dyn_buf *db)
+{
+	uint32_t size = cf_rchash_get_size(g_query_job_hash);
+	// No queries in the job hash.
+	if (!size) {
+		cf_dyn_buf_append_string(db, "No running queries");
+	}
+	// Else go through all the jobs in the hash and list their statistics.
+	else {
+		cf_rchash_reduce(g_query_job_hash, as_query_list_job_reduce_fn, db);
+		cf_dyn_buf_chomp(db);
+	}
+	return AS_QUERY_OK;
+}
+
+
+// Fill the monitoring jobstat from a query transaction.
+void
+as_query_fill_jobstat(as_query_transaction *qtr, as_mon_jobstat *stat)
+{
+	stat->trid = qtr->trid;
+	stat->cpu = 0; // not implemented
+	stat->run_time = (cf_getns() - qtr->start_time) / 1000000;
+	stat->recs_read = qtr->n_read_success;
+	stat->net_io_bytes = qtr->net_io_bytes;
+	stat->priority = qtr->priority;
+
+	// Not implemented:
+	stat->progress_pct = 0;
+	stat->time_since_done = 0;
+	stat->job_type[0] = '\0';
+
+	strcpy(stat->ns, qtr->ns->name);
+
+	if (qtr->setname) {
+		strcpy(stat->set, qtr->setname);
+	} else {
+		strcpy(stat->set, "NULL");
+	}
+
+	strcpy(stat->status, "active");
+
+	char *specific_data = stat->jdata;
+	sprintf(specific_data, ":sindex-name=%s:", qtr->si->imd->iname);
+}
+
+/*
+ * Populates the as_mon_jobstat and returns it to the multi-key lookup
+ * monitoring infrastructure. Serves as a callback function.
+ *
+ * Returns -
+ *		NULL - In case of failure.
+ *		as_mon_jobstat - On success.
+ */
+as_mon_jobstat *
+as_query_get_jobstat(uint64_t trid)
+{
+	as_mon_jobstat *stat;
+	as_query_transaction *qtr;
+	int rv = hash_get_qtr(trid, &qtr);
+
+	if (rv != AS_QUERY_OK) {
+		cf_warning(AS_MON, "No query was found with trid [%"PRIu64"]", trid);
+		stat = NULL;
+	}
+	else {
+		stat = cf_malloc(sizeof(as_mon_jobstat));
+		as_query_fill_jobstat(qtr, stat);
+		qtr_release(qtr, __FILE__, __LINE__);
+	}
+	return stat;
+}
+
+
+int
+as_mon_query_jobstat_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+	as_query_transaction *qtr = (as_query_transaction *)object;
+	query_jobstat *job_pool = (query_jobstat *)udata;
+
+	if (job_pool->index >= job_pool->max_size) {
+		return AS_QUERY_OK;
+	}
+
+	as_mon_jobstat *stat = *(job_pool->jobstat) + job_pool->index;
+	as_query_fill_jobstat(qtr, stat);
+	(job_pool->index)++;
+	return AS_QUERY_OK;
+}
+
+as_mon_jobstat *
+as_query_get_jobstat_all(int *size)
+{
+	*size = cf_rchash_get_size(g_query_job_hash);
+	if (*size == 0) {
+		return NULL;
+	}
+
+	as_mon_jobstat *job_stats =
+			(as_mon_jobstat *)cf_malloc(sizeof(as_mon_jobstat) * (*size));
+	query_jobstat job_pool;
+
+	job_pool.jobstat = &job_stats;
+	job_pool.index = 0;
+	job_pool.max_size = *size;
+	cf_rchash_reduce(g_query_job_hash, as_mon_query_jobstat_reduce_fn, &job_pool);
+	*size = job_pool.index;
+	return job_stats;
+}
+
+void
+as_query_histogram_dumpall()
+{
+	if (! g_config.query_enable_histogram) {
+		return;
+	}
+
+	if (query_txn_q_wait_hist) {
+		histogram_dump(query_txn_q_wait_hist);
+	}
+	if (query_query_q_wait_hist) {
+		histogram_dump(query_query_q_wait_hist);
+	}
+	if (query_prepare_batch_hist) {
+		histogram_dump(query_prepare_batch_hist);
+	}
+	if (query_batch_io_q_wait_hist) {
+		histogram_dump(query_batch_io_q_wait_hist);
+	}
+	if (query_batch_io_hist) {
+		histogram_dump(query_batch_io_hist);
+	}
+	if (query_net_io_hist) {
+		histogram_dump(query_net_io_hist);
+	}
+}
+
+
+/*
+ * Query Subsystem Initialization function
+ */
+// **************************************************************************************************
+void
+as_query_gconfig_default(as_config *c)
+{
+	// NB: Do not change the query_threads default to an odd value -
+	// as_query_reinit cannot handle it, and handling it there would be
+	// unnecessarily complicated, so the default is kept even.
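+	// For example, with query_threads = 6, as_query_init() and
+	// as_query_reinit() create the threads in pairs - 3 serving the short
+	// query queue and 3 serving the long query queue (see the i += 2 loop
+	// in as_query_init()).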
+	c->query_threads = 6;
+	c->query_worker_threads = 15;
+	c->query_priority = 10;
+	c->query_sleep_us = 1;
+	c->query_bsize = QUERY_BATCH_SIZE;
+	c->query_in_transaction_thr = 0;
+	c->query_req_max_inflight = AS_QUERY_MAX_QREQ_INFLIGHT;
+	c->query_bufpool_size = AS_QUERY_MAX_BUFS;
+	c->query_short_q_max_size = AS_QUERY_MAX_SHORT_QUEUE_SZ;
+	c->query_long_q_max_size = AS_QUERY_MAX_LONG_QUEUE_SZ;
+	c->query_buf_size = AS_QUERY_BUF_SIZE;
+	c->query_threshold = 10; // records read before a query is considered
+	                         // long running - 10 is an arbitrary choice
+	c->query_rec_count_bound = UINT64_MAX; // unlimited
+	c->query_req_in_query_thread = 0;
+	c->query_untracked_time_ms = AS_QUERY_UNTRACKED_TIME;
+
+	c->partitions_pre_reserved = false;
+}
+
+
+void
+as_query_init()
+{
+	g_current_queries_count = 0;
+	cf_detail(AS_QUERY, "Initialize %d query threads.", g_config.query_threads);
+
+	// Global job hash to keep track of the query jobs.
+	cf_rchash_create(&g_query_job_hash, cf_rchash_fn_u32, NULL, sizeof(uint64_t), 64, CF_RCHASH_MANY_LOCK);
+
+	// I/O threads.
+	g_query_qwork_pool = cf_queue_create(sizeof(query_work *), true);
+	g_query_response_bb_pool = cf_queue_create(sizeof(void *), true);
+	g_query_work_queue = cf_queue_create(sizeof(query_work *), true);
+
+	// Create the query worker threads detached so we don't need to join with them.
+	if (pthread_attr_init(&g_query_worker_th_attr)) {
+		cf_crash(AS_QUERY, "failed to initialize the query worker thread attributes");
+	}
+	if (pthread_attr_setdetachstate(&g_query_worker_th_attr, PTHREAD_CREATE_DETACHED)) {
+		cf_crash(AS_QUERY, "failed to set the query worker thread attributes to the detached state");
+	}
+	int max = g_config.query_worker_threads;
+	for (int i = 0; i < max; i++) {
+		pthread_create(&g_query_worker_threads[i], &g_query_worker_th_attr,
+				qwork_th, (void *)g_query_work_queue);
+	}
+
+	g_query_short_queue = cf_queue_create(sizeof(as_query_transaction *), true);
+	g_query_long_queue = cf_queue_create(sizeof(as_query_transaction *), true);
+
+	// Create the query threads detached so we don't need to join with them.
+ if (pthread_attr_init(&g_query_th_attr)) { + cf_crash(AS_SINDEX, "failed to initialize the query thread attributes"); + } + if (pthread_attr_setdetachstate(&g_query_th_attr, PTHREAD_CREATE_DETACHED)) { + cf_crash(AS_SINDEX, "failed to set the query thread attributes to the detached state"); + } + + max = g_config.query_threads; + for (int i = 0; i < max; i += 2) { + if (pthread_create(&g_query_threads[i], &g_query_th_attr, + query_th, (void*)g_query_short_queue) + || pthread_create(&g_query_threads[i + 1], &g_query_th_attr, + query_th, (void*)g_query_long_queue)) { + cf_crash(AS_QUERY, "Failed to create query transaction threads for query short queue"); + } + } + + char hist_name[64]; + + sprintf(hist_name, "query_txn_q_wait_us"); + query_txn_q_wait_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_query_q_wait_us"); + query_query_q_wait_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_prepare_batch_us"); + query_prepare_batch_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_batch_io_q_wait_us"); + query_batch_io_q_wait_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_batch_io_us"); + query_batch_io_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_net_io_us"); + query_net_io_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + g_config.query_enable_histogram = false; +} + +/* + * Description - + * It tries to set the query_worker_threads to the given value. + * + * Synchronization - + * Takes a global query lock to protect the config of + * + * Arguments - + * set_size - Value which one want to assign to query_threads. + * + * Returns - + * AS_QUERY_OK - On successful resize of query threads. + * AS_QUERY_ERR - Either the set_size exceeds AS_QUERY_MAX_THREADS + * OR Query threads were not initialized on the first place. + */ +int +as_query_worker_reinit(int set_size, int *actual_size) +{ + if (set_size > AS_QUERY_MAX_WORKER_THREADS) { + cf_warning(AS_QUERY, "Cannot increase query threads more than %d", + AS_QUERY_MAX_WORKER_THREADS); + //unlock + return AS_QUERY_ERR; + } + + pthread_rwlock_wrlock(&g_query_lock); + // Add threads if count is increased + int i = cf_atomic32_get(g_query_worker_threadcnt); + g_config.query_worker_threads = set_size; + if (set_size > g_query_worker_threadcnt) { + for (; i < set_size; i++) { + cf_detail(AS_QUERY, "Creating thread %d", i); + if (0 != pthread_create(&g_query_worker_threads[i], &g_query_worker_th_attr, + qwork_th, (void*)g_query_work_queue)) { + break; + } + } + g_config.query_worker_threads = i; + } + *actual_size = g_config.query_worker_threads; + + pthread_rwlock_unlock(&g_query_lock); + + return AS_QUERY_OK; +} + +/* + * Description - + * It tries to set the query_threads to the given value. + * + * Synchronization - + * Takes a global query lock to protect the config of + * + * Arguments - + * set_size - Value which one want to assign to query_threads. + * + * Returns - + * AS_QUERY_OK - On successful resize of query threads. + * AS_QUERY_ERR - Either the set_size exceeds AS_QUERY_MAX_THREADS + * OR Query threads were not initialized on the first place. 
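+ *
+ * Illustrative usage (a sketch - the caller and log message are
+ * hypothetical):
+ *
+ *		int actual = 0;
+ *		if (as_query_reinit(8, &actual) == AS_QUERY_OK) {
+ *			cf_info(AS_QUERY, "query threads resized to %d", actual);
+ *		}
+ *
+ * Note that an odd set_size is rounded up to the next even value.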
+ */ +int +as_query_reinit(int set_size, int *actual_size) +{ + if (set_size > AS_QUERY_MAX_THREADS) { + cf_warning(AS_QUERY, "Cannot increase query threads more than %d", + AS_QUERY_MAX_THREADS); + return AS_QUERY_ERR; + } + + pthread_rwlock_wrlock(&g_query_lock); + // Add threads if count is increased + int i = cf_atomic32_get(g_query_threadcnt); + + // make it multiple of 2 + if (set_size % 2 != 0) + set_size++; + + g_config.query_threads = set_size; + if (set_size > g_query_threadcnt) { + for (; i < set_size; i++) { + cf_detail(AS_QUERY, "Creating thread %d", i); + if (0 != pthread_create(&g_query_threads[i], &g_query_th_attr, + query_th, (void*)g_query_short_queue)) { + break; + } + i++; + if (0 != pthread_create(&g_query_threads[i], &g_query_th_attr, + query_th, (void*)g_query_long_queue)) { + break; + } + } + g_config.query_threads = i; + } + *actual_size = g_config.query_threads; + + pthread_rwlock_unlock(&g_query_lock); + + return AS_QUERY_OK; +} +// ************************************************************************************************** diff --git a/as/src/base/thr_sindex.c b/as/src/base/thr_sindex.c new file mode 100644 index 00000000..a838ee7d --- /dev/null +++ b/as/src/base/thr_sindex.c @@ -0,0 +1,841 @@ +/* + * thr_sindex.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + /* + * SYNOPSIS + * This file implements supporting threads for the secondary index implementation. + * Currently following two main threads are implemented here + * + * - Secondary index gc thread which walks sweeps through secondary indexes + * and cleanup the stale entries by looking up digest in the primary index. 
+ * + * - Secondary index thread which cleans up secondary index entry for a particular + * partitions + * + */ + +#include "base/thr_sindex.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_ll.h" +#include "citrusleaf/cf_queue.h" + +#include "ai_obj.h" +#include "ai_btree.h" +#include "fault.h" +#include "shash.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/job_manager.h" +#include "base/monitor.h" +#include "base/secondary_index.h" +#include "base/stats.h" +#include "fabric/partition.h" + + +int as_sbld_build(as_sindex* si); + +// All this is global because Aerospike Index is single threaded +pthread_rwlock_t g_sindex_rwlock = PTHREAD_RWLOCK_INITIALIZER; +pthread_rwlock_t g_ai_rwlock = PTHREAD_RWLOCK_INITIALIZER; +pthread_t g_sindex_populate_th; +pthread_t g_sindex_destroy_th; +pthread_t g_sindex_gc_th; + +cf_queue *g_sindex_populate_q; +cf_queue *g_sindex_destroy_q; +cf_queue *g_sindex_populateall_done_q; +cf_queue *g_q_objs_to_defrag; +bool g_sindex_boot_done; + +typedef struct as_sindex_set_s { + as_namespace * ns; + as_set * set; +} as_sindex_set; + +int +ll_sindex_gc_reduce_fn(cf_ll_element *ele, void *udata) +{ + return CF_LL_REDUCE_DELETE; +} + +void +as_sindex_gc_release_gc_arr_to_queue(void *v) +{ + objs_to_defrag_arr *dt = (objs_to_defrag_arr *)v; + if (cf_queue_sz(g_q_objs_to_defrag) < SINDEX_GC_QUEUE_HIGHWATER) { + cf_queue_push(g_q_objs_to_defrag, &dt); + } + else { + cf_free(dt); + } +} + +void +ll_sindex_gc_destroy_fn(cf_ll_element *ele) +{ + ll_sindex_gc_element * node = (ll_sindex_gc_element *) ele; + if (node) { + as_sindex_gc_release_gc_arr_to_queue((void *)(node->objs_to_defrag)); + cf_free(node); + } +} + +objs_to_defrag_arr * +as_sindex_gc_get_defrag_arr(void) +{ + objs_to_defrag_arr *dt; + if (cf_queue_pop(g_q_objs_to_defrag, &dt, CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) { + dt = cf_malloc(sizeof(objs_to_defrag_arr)); + } + dt->num = 0; + return dt; +} + +// Main thread which looks at the request of the populating index +void * +as_sindex__populate_fn(void *param) +{ + while(1) { + as_sindex *si; + cf_queue_pop(g_sindex_populate_q, &si, CF_QUEUE_FOREVER); + // TODO should check flag under a lock + // conflict with as_sindex_repair + if (si->flag & AS_SINDEX_FLAG_POPULATING) { + // Earlier job to populate index is still going on, push it back + // into the queue to look at it later. this is problem only when + // there are multiple populating threads currently there is only 1. + cf_queue_push(g_sindex_populate_q, &si); + } else { + cf_debug(AS_SINDEX, "Populating index %s", si->imd->iname); + // should set under a lock + si->flag |= AS_SINDEX_FLAG_POPULATING; + si->stats.recs_pending = si->ns->n_objects; + as_sbld_build(si); + } + } + return NULL; +} + + +// Main thread which looks at the request of the destroy of index +void * +as_sindex__destroy_fn(void *param) +{ + while(1) { + as_sindex *si; + cf_queue_pop(g_sindex_destroy_q, &si, CF_QUEUE_FOREVER); + + SINDEX_GWLOCK(); + cf_assert((si->state == AS_SINDEX_DESTROY), + AS_SINDEX, " Invalid state %d at cleanup expected %d for %p and %s", si->state, AS_SINDEX_DESTROY, si, (si) ? ((si->imd) ? 
si->imd->iname : NULL) : NULL); + int rv = as_sindex__delete_from_set_binid_hash(si->ns, si->imd); + if (rv) { + cf_warning(AS_SINDEX, "Delete from set_binid hash fails with error %d", rv); + } + // Free entire usage counter before tree destroy + cf_atomic64_sub(&si->ns->n_bytes_sindex_memory, + ai_btree_get_isize(si->imd) + ai_btree_get_nsize(si->imd)); + + // Cache the ibtr pointers + uint16_t nprts = si->imd->nprts; + struct btree *ibtr[nprts]; + for (int i = 0; i < nprts; i++) { + as_sindex_pmetadata *pimd = &si->imd->pimd[i]; + ibtr[i] = pimd->ibtr; + ai_btree_reset_pimd(pimd); + } + + as_sindex_destroy_pmetadata(si); + si->state = AS_SINDEX_INACTIVE; + si->flag = 0; + + si->ns->sindex_cnt--; + + if (si->imd->set) { + as_set *p_set = as_namespace_get_set_by_name(si->ns, si->imd->set); + p_set->n_sindexes--; + } else { + si->ns->n_setless_sindexes--; + } + + as_sindex_metadata *imd = si->imd; + si->imd = NULL; + + char iname[AS_ID_INAME_SZ]; + memset(iname, 0, AS_ID_INAME_SZ); + snprintf(iname, strlen(imd->iname) + 1, "%s", imd->iname); + cf_shash_delete(si->ns->sindex_iname_hash, (void *)iname); + + + as_namespace *ns = si->ns; + si->ns = NULL; + si->simatch = -1; + + as_sindex_metadata *recreate_imd = NULL; + if (si->recreate_imd) { + recreate_imd = si->recreate_imd; + si->recreate_imd = NULL; + } + + // remember this is going to release the write lock + // of meta-data first. This is the only special case + // where both GLOCK and LOCK is called together + SINDEX_GWUNLOCK(); + + // Destroy cached ibtr pointer + for (int i = 0; i < imd->nprts; i++) { + ai_btree_delete_ibtr(ibtr[i]); + } + as_sindex_imd_free(imd); + cf_rc_free(imd); + + if (recreate_imd) { + as_sindex_create(ns, recreate_imd); + as_sindex_imd_free(recreate_imd); + cf_rc_free(recreate_imd); + } + } + return NULL; +} + +void +as_sindex_update_gc_stat(as_sindex *si, uint64_t r, uint64_t start_time_ms) +{ + cf_atomic64_add(&si->stats.n_deletes, r); + cf_atomic64_add(&si->stats.n_objects, -r); + cf_atomic64_add(&si->stats.n_defrag_records, r); + cf_atomic64_add(&si->stats.defrag_time, cf_getms() - start_time_ms); +} + +typedef struct gc_stat_s { + uint64_t processed; + uint64_t found; + uint64_t deleted; + uint64_t creation_time; + uint64_t deletion_time; +} gc_stat; + +typedef struct gc_ctx_s { + uint32_t ns_id; + as_sindex *si; + uint16_t pimd_idx; + + // stat + gc_stat stat; + + // config + uint64_t start_time; + uint32_t gc_max_rate; +} gc_ctx; + +typedef struct gc_offset_s { + ai_obj i_col; + uint64_t pos; // uint actually + bool done; +} gc_offset; + +static bool +can_gc_si(as_sindex *si, uint16_t pimd_idx) +{ + if (! as_sindex_isactive(si)) { + return false; + } + + if (si->state == AS_SINDEX_DESTROY) { + return false; + } + + // pimd_idx we are iterating does not + // exist in this sindex. + if (pimd_idx >= si->imd->nprts) { + return false; + } + + return true; +} + +static bool +gc_getnext_si(gc_ctx *ctx) +{ + int16_t si_idx; + as_namespace *ns = g_config.namespaces[ctx->ns_id]; + + // From previous si_idx or 0 + if (ctx->si) { + si_idx = ctx->si->simatch; + AS_SINDEX_RELEASE(ctx->si); + ctx->si = NULL; + } else { + si_idx = -1; + } + + SINDEX_GRLOCK(); + + while (true) { + + si_idx++; + if (si_idx == AS_SINDEX_MAX) { + SINDEX_GRUNLOCK(); + return false; + } + + as_sindex *si = &ns->sindex[si_idx]; + + if (! 
can_gc_si(si, ctx->pimd_idx)) { + continue; + } + + AS_SINDEX_RESERVE(si); + ctx->si = si; + SINDEX_GRUNLOCK(); + return true; + } +} + +static void +gc_print_ctx(gc_ctx *ctx) +{ + cf_detail(AS_SINDEX, "%s %s[%d]", g_config.namespaces[ctx->ns_id]->name, + ctx->si ? ctx->si->imd->iname : "NULL", ctx->pimd_idx); +} + +// TODO - Find the correct values +#define CREATE_LIST_PER_ITERATION_LIMIT 10000 +#define PROCESS_LIST_PER_ITERATION_LIMIT 10 + +// true if tree is done +// false if more in tree +static bool +gc_create_list(as_sindex *si, as_sindex_pmetadata *pimd, cf_ll *gc_list, + gc_offset *offsetp, gc_stat *statp) +{ + uint64_t processed = 0; + uint64_t found = 0; + uint64_t limit_per_iteration = CREATE_LIST_PER_ITERATION_LIMIT; + + uint64_t start_time = cf_getms(); + + PIMD_RLOCK(&pimd->slock); + as_sindex_status ret = ai_btree_build_defrag_list(si->imd, pimd, + &offsetp->i_col, &offsetp->pos, limit_per_iteration, + &processed, &found, gc_list); + + PIMD_RUNLOCK(&pimd->slock); + + statp->creation_time += (cf_getms() - start_time); + statp->processed += processed; + statp->found += found; + + if (ret == AS_SINDEX_DONE) { + offsetp->done = true; + } + + if (ret == AS_SINDEX_ERR) { + return false; + } + + return true; +} + +static void +gc_process_list(as_sindex *si, as_sindex_pmetadata *pimd, cf_ll *gc_list, + gc_offset *offsetp, gc_stat *statp) +{ + uint64_t deleted = 0; + uint64_t start_time = cf_getms(); + uint64_t limit_per_iteration = PROCESS_LIST_PER_ITERATION_LIMIT; + + bool more = true; + + while (more) { + + PIMD_WLOCK(&pimd->slock); + more = ai_btree_defrag_list(si->imd, pimd, gc_list, + limit_per_iteration, &deleted); + PIMD_WUNLOCK(&pimd->slock); + } + + // Update secondary index object count + // statistics aggressively. + as_sindex_update_gc_stat(si, deleted, start_time); + + statp->deletion_time = cf_getms() - start_time; + statp->deleted += deleted; +} + +static void +gc_throttle(gc_ctx *ctx) +{ + while (true) { + uint64_t expected_processed = + (cf_get_seconds() - ctx->start_time) * ctx->gc_max_rate; + + // processed less than expected + // no throttling needed. + if (ctx->stat.processed <= expected_processed) { + break; + } + + usleep(10000); // 10 ms + } +} + +static void +do_gc(gc_ctx *ctx) +{ + // SKEY + Digest offset + gc_offset offset; + init_ai_obj(&offset.i_col); + offset.pos = 0; + offset.done = false; + + as_sindex *si = ctx->si; + as_sindex_pmetadata *pimd = &si->imd->pimd[ctx->pimd_idx]; + + cf_ll gc_list; + cf_ll_init(&gc_list, &ll_sindex_gc_destroy_fn, false); + + while (true) { + + if (! gc_create_list(si, pimd, &gc_list, &offset, &ctx->stat)) { + break; + } + + if (cf_ll_size(&gc_list) > 0) { + gc_process_list(si, pimd, &gc_list, &offset, &ctx->stat); + cf_ll_reduce(&gc_list, true /*forward*/, ll_sindex_gc_reduce_fn, NULL); + } + + if (offset.done) { + break; + } + } + + cf_ll_reduce(&gc_list, true /*forward*/, ll_sindex_gc_reduce_fn, NULL); +} + +static void +update_gc_stat(gc_stat *statp) +{ + g_stats.sindex_gc_objects_validated += statp->processed; + g_stats.sindex_gc_garbage_found += statp->found; + g_stats.sindex_gc_garbage_cleaned += statp->deleted; + g_stats.sindex_gc_list_deletion_time += statp->deletion_time; + g_stats.sindex_gc_list_creation_time += statp->creation_time; +} + +void * +as_sindex__gc_fn(void *udata) +{ + while (! 
g_sindex_boot_done) { + sleep(10); + continue; + } + + cf_debug(AS_SINDEX, "Secondary index gc thread started !!"); + + uint64_t last_time = cf_get_seconds(); + + for ( ; ; ) { + // Wake up every 1 second to check the gc timeout. + struct timespec delay = { 1, 0 }; + nanosleep(&delay, NULL); + + uint64_t curr_time = cf_get_seconds(); + + if ((curr_time - last_time) < g_config.sindex_gc_period) { + continue; // period has not been reached for running gc check + } + + last_time = curr_time; + + for (int i = 0; i < g_config.n_namespaces; i++) { + + as_namespace *ns = g_config.namespaces[i]; + + if (ns->sindex_cnt == 0) { + continue; + } + + cf_info(AS_NSUP, "{%s} sindex-gc start", ns->name); + + uint64_t start_time_ms = cf_getms(); + + // gc_max_rate change at the namespace boundary + gc_ctx ctx = { + .ns_id = i, + .si = NULL, + .stat = { 0 }, + .start_time = cf_get_seconds(), + .gc_max_rate = g_config.sindex_gc_max_rate + }; + + // Give one pimd quata of chance for every sindex + // in a namespace in round robin manner. + for (uint16_t pimd_idx = 0; pimd_idx < MAX_PARTITIONS_PER_INDEX; + pimd_idx++) { + + ctx.pimd_idx = pimd_idx; + + while (gc_getnext_si(&ctx)) { + gc_print_ctx(&ctx); + do_gc(&ctx); + + // throttle after every quota (1 pimd) + gc_throttle(&ctx); + } + } + + cf_info(AS_NSUP, "{%s} sindex-gc: Processed: %ld, found:%ld, deleted: %ld: Total time: %ld ms", + ns->name, ctx.stat.processed, ctx.stat.found, ctx.stat.deleted, + cf_getms() - start_time_ms); + + update_gc_stat(&ctx.stat); + } + } +} + + +/* + * Secondary index main gc thread, it keeps watching out for request to + * the gc, Client API to set up aerospike facing meta data for the secondary index + * and setting all the initial things + * + * Parameter: + * sindex_metadata: (in/out) Index meta-data structure + * + * Caller: + * aerospike + * Return: + * 0: On success + * -1: On failure + * Synchronization: + * Acquires the meta lock. + */ +void +as_sindex_thr_init() +{ + // Thread request read lock on this recursively could possibly cause deadlock. Caller + // should be careful with that + pthread_rwlockattr_t rwattr; + if (!g_q_objs_to_defrag) { + g_q_objs_to_defrag = cf_queue_create(sizeof(void *), true); + } + if (0 != pthread_rwlockattr_init(&rwattr)) + cf_crash(AS_SINDEX, "pthread_rwlockattr_init: %s", cf_strerror(errno)); + if (0 != pthread_rwlockattr_setkind_np(&rwattr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP)) + cf_crash( AS_SINDEX, "pthread_rwlockattr_setkind_np: %s", cf_strerror(errno)); + + // Aerospike Index Metadata lock + if (0 != pthread_rwlock_init(&g_ai_rwlock, &rwattr)) { + cf_crash(AS_SINDEX, " Could not create secondary index ddl mutex "); + } + + // Sindex Metadata lock + if (0 != pthread_rwlock_init(&g_sindex_rwlock, &rwattr)) { + cf_crash(AS_SINDEX, " Could not create secondary index ddl mutex "); + } + + g_sindex_populate_q = cf_queue_create(sizeof(as_sindex *), true); + if (0 != pthread_create(&g_sindex_populate_th, 0, as_sindex__populate_fn, 0)) { + cf_crash(AS_SINDEX, " Could not create sindex populate thread "); + } + + g_sindex_destroy_q = cf_queue_create(sizeof(as_sindex *), true); + if (0 != pthread_create(&g_sindex_destroy_th, 0, as_sindex__destroy_fn, 0)) { + cf_crash(AS_SINDEX, " Could not create sindex destroy thread "); + } + + if (0 != pthread_create(&g_sindex_gc_th, 0, as_sindex__gc_fn, 0)) { + cf_crash(AS_SINDEX, " Could not create sindex gc thread "); + } + + g_sindex_populateall_done_q = cf_queue_create(sizeof(int), true); + // At the beginning it is false. 
It is set to true when all the sindex + // are populated. + g_sindex_boot_done = false; +} + + +//============================================================================== +// Secondary index builder. +// + +// sbld_job - derived class header: +typedef struct sbld_job_s { + // Base object must be first: + as_job _base; + + // Derived class data: + as_sindex* si; + + char* si_name; + cf_atomic64 n_reduced; +} sbld_job; + +sbld_job* sbld_job_create(as_namespace* ns, uint16_t set_id, as_sindex* si); + +// as_job_manager instance for secondary index builder: +static as_job_manager g_sbld_manager; + + +//------------------------------------------------ +// Sindex builder public API. +// + +void +as_sbld_init() +{ + // TODO - config for max done? + // Initialize with maximum threads since first use is always build-all at + // startup. The thread pool will be down-sized right after that. + as_job_manager_init(&g_sbld_manager, UINT_MAX, 100, MAX_SINDEX_BUILDER_THREADS); +} + +int +as_sbld_build(as_sindex* si) +{ + as_sindex_metadata *imd = si->imd; + as_namespace *ns = as_namespace_get_byname(imd->ns_name); + + if (! ns) { + cf_warning(AS_SINDEX, "sindex build %s ns %s - unrecognized namespace", imd->iname, imd->ns_name); + as_sindex_populate_done(si); + AS_SINDEX_RELEASE(si); + return -1; + } + + uint16_t set_id = INVALID_SET_ID; + + if (imd->set && (set_id = as_namespace_get_set_id(ns, imd->set)) == INVALID_SET_ID) { + cf_info(AS_SINDEX, "sindex build %s ns %s - set %s not found - assuming empty", imd->iname, imd->ns_name, imd->set); + as_sindex_populate_done(si); + AS_SINDEX_RELEASE(si); + return -3; + } + + sbld_job* job = sbld_job_create(ns, set_id, si); + + // Can't fail for this kind of job. + as_job_manager_start_job(&g_sbld_manager, (as_job*)job); + + return 0; +} + +void +as_sbld_build_all(as_namespace* ns) +{ + sbld_job* job = sbld_job_create(ns, INVALID_SET_ID, NULL); + + // Can't fail for this kind of job. + as_job_manager_start_job(&g_sbld_manager, (as_job*)job); +} + +void +as_sbld_resize_thread_pool(uint32_t n_threads) +{ + as_job_manager_resize_thread_pool(&g_sbld_manager, n_threads); +} + +int +as_sbld_list(char* name, cf_dyn_buf* db) +{ + as_mon_info_cmd(AS_MON_MODULES[SBLD_MOD], NULL, 0, 0, db); + return 0; +} + +as_mon_jobstat* +as_sbld_get_jobstat(uint64_t trid) +{ + return as_job_manager_get_job_info(&g_sbld_manager, trid); +} + +as_mon_jobstat* +as_sbld_get_jobstat_all(int* size) +{ + return as_job_manager_get_info(&g_sbld_manager, size); +} + +int +as_sbld_abort(uint64_t trid) +{ + return as_job_manager_abort_job(&g_sbld_manager, trid) ? 0 : -1; +} + + +//------------------------------------------------ +// sbld_job derived class implementation. +// + +void sbld_job_slice(as_job* _job, as_partition_reservation* rsv); +void sbld_job_finish(as_job* _job); +void sbld_job_destroy(as_job* _job); +void sbld_job_info(as_job* _job, as_mon_jobstat* stat); + +const as_job_vtable sbld_job_vtable = { + sbld_job_slice, + sbld_job_finish, + sbld_job_destroy, + sbld_job_info +}; + +void sbld_job_reduce_cb(as_index_ref* r_ref, void* udata); + +// +// sbld_job creation. +// + +sbld_job* +sbld_job_create(as_namespace* ns, uint16_t set_id, as_sindex* si) +{ + sbld_job* job = cf_malloc(sizeof(sbld_job)); + + as_job_init((as_job*)job, &sbld_job_vtable, &g_sbld_manager, + RSV_MIGRATE, 0, ns, set_id, AS_JOB_PRIORITY_MEDIUM); + + job->si = si; + job->si_name = si ? cf_strdup(si->imd->iname) : NULL; + job->n_reduced = 0; + + return job; +} + +// +// sbld_job mandatory as_job interface. 
+// + +void +sbld_job_slice(as_job* _job, as_partition_reservation* rsv) +{ + as_index_reduce_live(rsv->tree, sbld_job_reduce_cb, (void*)_job); +} + +void +sbld_job_finish(as_job* _job) +{ + sbld_job* job = (sbld_job*)_job; + + as_sindex_ticker_done(_job->ns, job->si, _job->start_ms); + + if (job->si) { + as_sindex_populate_done(job->si); + job->si->stats.loadtime = cf_getms() - _job->start_ms; + AS_SINDEX_RELEASE(job->si); + } + else { + as_sindex_boot_populateall_done(_job->ns); + } +} + +void +sbld_job_destroy(as_job* _job) +{ + sbld_job* job = (sbld_job*)_job; + + if (job->si_name) { + cf_free(job->si_name); + } +} + +void +sbld_job_info(as_job* _job, as_mon_jobstat* stat) +{ + sbld_job* job = (sbld_job*)_job; + + if (job->si_name) { + strcpy(stat->job_type, "sindex-build"); + + char *extra = stat->jdata + strlen(stat->jdata); + + sprintf(extra, ":sindex-name=%s", job->si_name); + } + else { + strcpy(stat->job_type, "sindex-build-all"); + } +} + +// +// sbld_job utilities. +// + +void +sbld_job_reduce_cb(as_index_ref* r_ref, void* udata) +{ + as_job* _job = (as_job*)udata; + sbld_job* job = (sbld_job*)_job; + as_namespace* ns = _job->ns; + + if (_job->abandoned != 0) { + as_record_done(r_ref, ns); + return; + } + + if (job->si) { + cf_atomic64_decr(&job->si->stats.recs_pending); + } + + as_sindex_ticker(ns, job->si, cf_atomic64_incr(&job->n_reduced), _job->start_ms); + + as_index *r = r_ref->r; + + if ((_job->set_id != INVALID_SET_ID && _job->set_id != as_index_get_set_id(r)) || + as_record_is_doomed(r, ns)) { + as_record_done(r_ref, ns); + return; + } + + as_storage_rd rd; + as_storage_record_open(ns, r, &rd); + as_storage_rd_load_n_bins(&rd); // TODO - handle error returned + as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins]; + as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned + + if (job->si) { + if (as_sindex_put_rd(job->si, &rd)) { + as_record_done(r_ref, ns); + as_job_manager_abandon_job(_job->mgr, _job, AS_JOB_FAIL_UNKNOWN); + return; + } + } + else { + as_sindex_putall_rd(ns, &rd); + } + + as_storage_record_close(&rd); + as_record_done(r_ref, ns); + + cf_atomic64_incr(&_job->n_records_read); +} diff --git a/as/src/base/thr_tsvc.c b/as/src/base/thr_tsvc.c new file mode 100644 index 00000000..50ddc4a3 --- /dev/null +++ b/as/src/base/thr_tsvc.c @@ -0,0 +1,580 @@ +/* + * thr_tsvc.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include "base/thr_tsvc.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" + +#include "fault.h" +#include "hardware.h" +#include "node.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/scan.h" +#include "base/secondary_index.h" +#include "base/security.h" +#include "base/stats.h" +#include "base/thr_batch.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "base/xdr_serverside.h" +#include "fabric/fabric.h" +#include "fabric/partition.h" +#include "fabric/partition_balance.h" +#include "storage/storage.h" +#include "transaction/delete.h" +#include "transaction/proxy.h" +#include "transaction/re_replicate.h" +#include "transaction/read.h" +#include "transaction/udf.h" +#include "transaction/write.h" + + +//========================================================== +// Globals. +// + +static cf_queue* g_transaction_queues[MAX_TRANSACTION_QUEUES] = { NULL }; + +// Track number of threads for each queue independently. +static uint32_t g_queues_n_threads[MAX_TRANSACTION_QUEUES] = { 0 }; + +// It's ok for this to not be atomic - might not round-robin perfectly, but will +// be cache friendly. +static uint32_t g_current_q = 0; + + +//========================================================== +// Forward declarations. +// + +void tsvc_add_threads(uint32_t qid, uint32_t n_threads); +void tsvc_remove_threads(uint32_t qid, uint32_t n_threads); +void *run_tsvc(void *arg); + + +//========================================================== +// Inlines & macros. +// + +static inline bool +should_security_check_data_op(const as_transaction *tr) +{ + return tr->origin == FROM_CLIENT || tr->origin == FROM_BATCH; +} + +static inline bool +read_would_duplicate_resolve(const as_namespace* ns, const as_msg* m) +{ + return READ_CONSISTENCY_LEVEL(ns, *m) == AS_READ_CONSISTENCY_LEVEL_ALL; +} + +static const char* +write_type_tag(const as_transaction *tr) +{ + return as_transaction_is_delete(tr) ? "delete" : + (as_transaction_is_udf(tr) ? "udf" : "write"); +} + +static inline void +detail_unique_client_rw(const as_transaction *tr, bool is_write) +{ + if (tr->origin == FROM_CLIENT) { + cf_detail_digest(AS_RW_CLIENT, &tr->keyd, "{%s} client %s %s ", + tr->rsv.ns->name, tr->from.proto_fd_h->client, + is_write ? write_type_tag(tr) : "read"); + } +} + + +//========================================================== +// Public API. +// + +void +as_tsvc_init() +{ + cf_info(AS_TSVC, "%u transaction queues: starting %u threads per queue", + g_config.n_transaction_queues, + g_config.n_transaction_threads_per_queue); + + // Create the transaction queues. + for (uint32_t qid = 0; qid < g_config.n_transaction_queues; qid++) { + g_transaction_queues[qid] = + cf_queue_create(AS_TRANSACTION_HEAD_SIZE, true); + } + + // Start all the transaction threads. + for (uint32_t qid = 0; qid < g_config.n_transaction_queues; qid++) { + tsvc_add_threads(qid, g_config.n_transaction_threads_per_queue); + } +} + + +// Decide which queue to use, and enqueue transaction. +void +as_tsvc_enqueue(as_transaction *tr) +{ + uint32_t qid; + + if (g_config.auto_pin == CF_TOPO_AUTO_PIN_NONE || + g_config.n_namespaces_not_inlined == 0) { + cf_debug(AS_TSVC, "no CPU pinning - dispatching transaction round-robin"); + // Transaction can go on any queue - distribute evenly. 
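+		// For example, with 4 transaction queues this yields qid 0, 1, 2, 3,
+		// 0, ... - only approximately under load, since g_current_q is
+		// deliberately not atomic (see the comment at its definition above).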
+ qid = (g_current_q++) % g_config.n_transaction_queues; + } + else { + qid = cf_topo_current_cpu(); + cf_debug(AS_TSVC, "transaction on CPU %u", qid); + } + + cf_queue_push(g_transaction_queues[qid], tr); +} + + +// Triggered via dynamic configuration change. +void +as_tsvc_set_threads_per_queue(uint32_t target_n_threads) +{ + for (uint32_t qid = 0; qid < g_config.n_transaction_queues; qid++) { + uint32_t current_n_threads = g_queues_n_threads[qid]; + + if (target_n_threads > current_n_threads) { + tsvc_add_threads(qid, target_n_threads - current_n_threads); + } + else { + tsvc_remove_threads(qid, current_n_threads - target_n_threads); + } + } + + g_config.n_transaction_threads_per_queue = target_n_threads; +} + + +// Total transactions currently queued, for ticker and info statistics. +int +as_tsvc_queue_get_size() +{ + int current_total = 0; + + for (uint32_t qid = 0; qid < g_config.n_transaction_queues; qid++) { + current_total += cf_queue_sz(g_transaction_queues[qid]); + } + + return current_total; +} + + +// Handle the transaction, including proxy to another node if necessary. +void +as_tsvc_process_transaction(as_transaction *tr) +{ + if (tr->msgp->proto.type == PROTO_TYPE_INTERNAL_XDR) { + as_xdr_read_txn(tr); + return; + } + + int rv; + bool free_msgp = true; + cl_msg *msgp = tr->msgp; + as_msg *m = &msgp->msg; + + as_transaction_init_body(tr); + + // Check that the socket is authenticated. + if (tr->origin == FROM_CLIENT) { + uint8_t result = as_security_check(tr->from.proto_fd_h, PERM_NONE); + + if (result != AS_PROTO_RESULT_OK) { + as_security_log(tr->from.proto_fd_h, result, PERM_NONE, NULL, NULL); + as_transaction_error(tr, NULL, (uint32_t)result); + goto Cleanup; + } + } + + // All transactions must have a namespace. + as_msg_field *nf = as_msg_field_get(m, AS_MSG_FIELD_TYPE_NAMESPACE); + + if (! nf) { + cf_warning(AS_TSVC, "no namespace in protocol request"); + as_transaction_error(tr, NULL, AS_PROTO_RESULT_FAIL_NAMESPACE); + goto Cleanup; + } + + as_namespace *ns = as_namespace_get_bymsgfield(nf); + + if (! ns) { + uint32_t ns_sz = as_msg_field_get_value_sz(nf); + CF_ZSTR_DEFINE(ns_name, AS_ID_NAMESPACE_SZ, nf->data, ns_sz); + + cf_warning(AS_TSVC, "unknown namespace %s (%u) in protocol request - check configuration file", + ns_name, ns_sz); + + as_transaction_error(tr, NULL, AS_PROTO_RESULT_FAIL_NAMESPACE); + goto Cleanup; + } + + // Have we finished the very first partition balance? + if (! as_partition_balance_is_init_resolved()) { + cf_debug(AS_TSVC, "rejecting transaction - initial partition balance unresolved"); + as_transaction_error(tr, NULL, AS_PROTO_RESULT_FAIL_UNAVAILABLE); + // Note that we forfeited namespace info above so scan & query don't get + // counted as single-record error. + goto Cleanup; + } + + //------------------------------------------------------ + // Multi-record transaction. + // + + if (as_transaction_is_multi_record(tr)) { + if (m->transaction_ttl != 0) { + // Old batch and queries may specify transaction_ttl, but don't use + // g_config.transaction_max_ns as a default. Assuming specified TTL + // is large enough that it's not worth checking for timeout here. + tr->end_time = tr->start_time + + ((uint64_t)m->transaction_ttl * 1000000); + } + + if (as_transaction_is_batch_direct(tr)) { + // Old batch. + if (! 
as_security_check_data_op(tr, ns, PERM_READ)) {
+				as_multi_rec_transaction_error(tr, tr->result_code);
+				goto Cleanup;
+			}
+
+			if ((rv = as_batch_direct_queue_task(tr, ns)) != 0) {
+				as_multi_rec_transaction_error(tr, rv);
+				cf_atomic64_incr(&g_stats.batch_errors);
+			}
+		}
+		else if (as_transaction_is_query(tr)) {
+			// Query.
+			cf_atomic64_incr(&ns->query_reqs);
+
+			if (! as_security_check_data_op(tr, ns,
+					as_transaction_is_udf(tr) ? PERM_UDF_QUERY : PERM_QUERY)) {
+				as_multi_rec_transaction_error(tr, tr->result_code);
+				goto Cleanup;
+			}
+
+			if (as_query(tr, ns) != 0) {
+				cf_atomic64_incr(&ns->query_fail);
+				as_multi_rec_transaction_error(tr, tr->result_code);
+			}
+		}
+		else {
+			// Scan.
+			if (! as_security_check_data_op(tr, ns,
+					as_transaction_is_udf(tr) ? PERM_UDF_SCAN : PERM_SCAN)) {
+				as_multi_rec_transaction_error(tr, tr->result_code);
+				goto Cleanup;
+			}
+
+			if ((rv = as_scan(tr, ns)) != 0) {
+				as_multi_rec_transaction_error(tr, rv);
+			}
+		}
+
+		goto Cleanup;
+	}
+
+	//------------------------------------------------------
+	// Single-record transaction.
+	//
+
+	// Calculate end_time based on message transaction TTL. May recalculate
+	// for re-queued transactions, but this way end_time need not be copied
+	// on/off the queue.
+	if (m->transaction_ttl != 0) {
+		tr->end_time = tr->start_time +
+				((uint64_t)m->transaction_ttl * 1000000);
+	}
+	else {
+		// Incorporate g_config.transaction_max_ns if appropriate.
+		// TODO - should g_config.transaction_max_ns = 0 be special?
+		tr->end_time = tr->start_time + g_config.transaction_max_ns;
+	}
+
+	// Did the transaction time out while on the queue?
+	if (cf_getns() > tr->end_time) {
+		cf_debug(AS_TSVC, "transaction timed out in queue");
+		as_transaction_error(tr, ns, AS_PROTO_RESULT_FAIL_TIMEOUT);
+		goto Cleanup;
+	}
+
+	// All single-record transactions must have a digest, or a key from which
+	// to calculate it.
+	if (as_transaction_has_digest(tr)) {
+		// Modern client - just copy digest into tr.
+
+		as_msg_field *df = as_msg_field_get(m, AS_MSG_FIELD_TYPE_DIGEST_RIPE);
+		uint32_t digest_sz = as_msg_field_get_value_sz(df);
+
+		if (digest_sz != sizeof(cf_digest)) {
+			cf_warning(AS_TSVC, "digest msg field size %u", digest_sz);
+			as_transaction_error(tr, ns, AS_PROTO_RESULT_FAIL_PARAMETER);
+			goto Cleanup;
+		}
+
+		tr->keyd = *(cf_digest *)df->data;
+	}
+	else if (! as_transaction_is_batch_sub(tr)) {
+		// Old client - calculate digest from key & set, directly into tr.
+
+		as_msg_field *kf = as_msg_field_get(m, AS_MSG_FIELD_TYPE_KEY);
+		uint32_t key_sz = as_msg_field_get_value_sz(kf);
+
+		as_msg_field *sf = as_transaction_has_set(tr) ?
+				as_msg_field_get(m, AS_MSG_FIELD_TYPE_SET) : NULL;
+		uint32_t set_sz = sf ? as_msg_field_get_value_sz(sf) : 0;
+
+		// Guard sf - a set field may legitimately be absent.
+		cf_digest_compute2(sf ? sf->data : NULL, set_sz, kf->data, key_sz,
+				&tr->keyd);
+	}
+	// else - batch sub-transactions already (and only) have digest in tr.
+
+	// Process the transaction.
+
+	bool is_write = (m->info2 & AS_MSG_INFO2_WRITE) != 0;
+	bool is_read = (m->info1 & AS_MSG_INFO1_READ) != 0;
+	// Both can be set together, but is_write puts us on the 'write path' -
+	// write reservation, replica writes, etc. Writes quickly get split into
+	// write, delete, or UDF after the reservation.
+
+	uint32_t pid = as_partition_getid(&tr->keyd);
+	cf_node dest;
+
+	if (is_write) {
+		if (should_security_check_data_op(tr) &&
+				!
as_security_check_data_op(tr, ns, PERM_WRITE)) { + as_transaction_error(tr, ns, tr->result_code); + goto Cleanup; + } + + rv = as_partition_reserve_write(ns, pid, &tr->rsv, &dest); + } + else if (is_read) { + if (should_security_check_data_op(tr) && + ! as_security_check_data_op(tr, ns, PERM_READ)) { + as_transaction_error(tr, ns, tr->result_code); + goto Cleanup; + } + + rv = as_partition_reserve_read(ns, pid, &tr->rsv, + read_would_duplicate_resolve(ns, m), &dest); + } + else { + cf_warning(AS_TSVC, "transaction is neither read nor write - unexpected"); + as_transaction_error(tr, ns, AS_PROTO_RESULT_FAIL_PARAMETER); + goto Cleanup; + } + + if (rv == -2) { + // Partition is unavailable. + as_transaction_error(tr, ns, AS_PROTO_RESULT_FAIL_UNAVAILABLE); + goto Cleanup; + } + + if (dest == 0) { + cf_crash(AS_TSVC, "invalid destination while reserving partition"); + } + + if (rv == 0) { + // <><><><><><> Reservation Succeeded <><><><><><> + + if (! as_transaction_is_restart(tr)) { + tr->benchmark_time = 0; + detail_unique_client_rw(tr, is_write); + } + + transaction_status status; + + if (is_write) { + if (as_transaction_is_delete(tr)) { + status = as_delete_start(tr); + } + else if (tr->origin == FROM_IUDF || as_transaction_is_udf(tr)) { + status = as_udf_start(tr); + } + else if (tr->origin == FROM_RE_REPL) { + status = as_re_replicate_start(tr); + } + else { + status = as_write_start(tr); + } + } + else { + status = as_read_start(tr); + } + + switch (status) { + case TRANS_DONE_ERROR: + case TRANS_DONE_SUCCESS: + // Done, response already sent - free msg & release reservation. + as_partition_release(&tr->rsv); + break; + case TRANS_IN_PROGRESS: + // Don't free msg or release reservation - both owned by rw_request. + free_msgp = false; + break; + case TRANS_WAITING: + // Will be re-queued - don't free msg, but release reservation. + free_msgp = false; + as_partition_release(&tr->rsv); + break; + default: + cf_crash(AS_TSVC, "invalid transaction status %d", status); + break; + } + } + else { + // <><><><><><> Reservation Failed <><><><><><> + + switch (tr->origin) { + case FROM_CLIENT: + case FROM_BATCH: + as_proxy_divert(dest, tr, ns); + // CLIENT: fabric owns msgp, BATCH: it's shared, don't free it. + free_msgp = false; + break; + case FROM_PROXY: + as_proxy_return_to_sender(tr, ns); + tr->from.proxy_node = 0; // pattern, not needed + break; + case FROM_IUDF: + tr->from.iudf_orig->cb(tr->from.iudf_orig->udata, + AS_PROTO_RESULT_FAIL_UNKNOWN); + tr->from.iudf_orig = NULL; // pattern, not needed + break; + case FROM_NSUP: + break; + case FROM_RE_REPL: + tr->from.re_repl_orig_cb(tr); + tr->from.re_repl_orig_cb = NULL; // pattern, not needed + break; + default: + cf_crash(AS_PROTO, "unexpected transaction origin %u", tr->origin); + break; + } + } + +Cleanup: + + if (free_msgp && tr->origin != FROM_BATCH) { + cf_free(msgp); + } +} // end process_transaction() + + +//========================================================== +// Local helpers. 
+// + +void +tsvc_add_threads(uint32_t qid, uint32_t n_threads) +{ + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + for (uint32_t n = 0; n < n_threads; n++) { + if (pthread_create(&thread, &attrs, run_tsvc, + (void*)(uint64_t)qid) == 0) { + g_queues_n_threads[qid]++; + } + else { + cf_warning(AS_TSVC, "tsvc queue %u failed thread create", qid); + } + } +} + + +void +tsvc_remove_threads(uint32_t qid, uint32_t n_threads) +{ + as_transaction death_tr = { .msgp = NULL }; + + for (uint32_t n = 0; n < n_threads; n++) { + // Send terminator (transaction with NULL msgp). + cf_queue_push(g_transaction_queues[qid], &death_tr); + g_queues_n_threads[qid]--; + } +} + + +// Service transactions - arg is the queue we're to service. +void * +run_tsvc(void *arg) +{ + uint32_t qid = (uint32_t)(uint64_t)arg; + + if (g_config.auto_pin != CF_TOPO_AUTO_PIN_NONE && + g_config.n_namespaces_not_inlined != 0) { + cf_detail(AS_TSVC, "pinning thread to CPU %u", qid); + cf_topo_pin_to_cpu((cf_topo_cpu_index)qid); + } + + cf_queue *q = g_transaction_queues[qid]; + + while (true) { + as_transaction tr; + + if (cf_queue_pop(q, &tr, CF_QUEUE_FOREVER) != CF_QUEUE_OK) { + cf_crash(AS_TSVC, "unable to pop from transaction queue"); + } + + if (! tr.msgp) { + break; // thread termination via configuration change + } + + cf_debug(AS_TSVC, "running on CPU %hu", cf_topo_current_cpu()); + + if (g_config.svc_benchmarks_enabled && + tr.benchmark_time != 0 && ! as_transaction_is_restart(&tr)) { + histogram_insert_data_point(g_stats.svc_queue_hist, + tr.benchmark_time); + } + + as_tsvc_process_transaction(&tr); + } + + return NULL; +} diff --git a/as/src/base/ticker.c b/as/src/base/ticker.c new file mode 100644 index 00000000..ba38fc3b --- /dev/null +++ b/as/src/base/ticker.c @@ -0,0 +1,919 @@ +/* + * ticker.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include "base/ticker.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "dynbuf.h" +#include "fault.h" +#include "hist.h" +#include "hist_track.h" +#include "meminfo.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/secondary_index.h" +#include "base/stats.h" +#include "base/thr_info.h" +#include "base/thr_sindex.h" +#include "base/thr_tsvc.h" +#include "fabric/clustering.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/hb.h" +#include "fabric/partition.h" +#include "fabric/skew_monitor.h" +#include "storage/storage.h" +#include "transaction/proxy.h" +#include "transaction/rw_request_hash.h" + + +//========================================================== +// Forward declarations. +// + +extern int as_nsup_queue_get_size(); +extern bool g_shutdown_started; + +void* run_ticker(void* arg); +void log_ticker_frame(uint64_t delta_time); + +void log_line_clock(); +void log_line_system_memory(); +void log_line_in_progress(); +void log_line_fds(); +void log_line_heartbeat(); +void log_fabric_rate(uint64_t delta_time); +void log_line_early_fail(); +void log_line_batch_index(); + +void log_line_objects(as_namespace* ns, uint64_t n_objects, + repl_stats* mp); +void log_line_tombstones(as_namespace* ns, uint64_t n_tombstones, + repl_stats* mp); +void log_line_appeals(as_namespace* ns); +void log_line_migrations(as_namespace* ns); +void log_line_memory_usage(as_namespace* ns, size_t total_mem, size_t index_mem, + size_t sindex_mem, size_t data_mem); +void log_line_device_usage(as_namespace* ns); + +void log_line_client(as_namespace* ns); +void log_line_xdr_client(as_namespace* ns); +void log_line_batch_sub(as_namespace* ns); +void log_line_scan(as_namespace* ns); +void log_line_query(as_namespace* ns); +void log_line_udf_sub(as_namespace* ns); +void log_line_retransmits(as_namespace* ns); +void log_line_re_repl(as_namespace* ns); +void log_line_special_errors(as_namespace* ns); + +void dump_global_histograms(); +void dump_namespace_histograms(as_namespace* ns); + + +//========================================================== +// Public API. +// + +void +as_ticker_start() +{ + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + if (pthread_create(&thread, &attrs, run_ticker, NULL) != 0) { + cf_crash(AS_INFO, "failed to create ticker thread"); + } +} + + +//========================================================== +// Local helpers. +// + +void* +run_ticker(void* arg) +{ + uint64_t last_time = cf_getns(); + + while (true) { + // Wake up every 1 second to check the ticker interval. + struct timespec delay = { 1, 0 }; + nanosleep(&delay, NULL); + + uint64_t curr_time = cf_getns(); + uint64_t delta_time = curr_time - last_time; + + if (delta_time < (uint64_t)g_config.ticker_interval * 1000000000) { + continue; // period has not been reached for showing a frame + } + + last_time = curr_time; + + // Reduce likelihood of ticker frames showing after shutdown signal. 
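+		// (Best effort - a frame already past this check may still log
+		// during shutdown.)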
+ if (g_shutdown_started) { + break; + } + + log_ticker_frame(delta_time); + } + + return NULL; +} + + +void +log_ticker_frame(uint64_t delta_time) +{ + cf_info(AS_INFO, "NODE-ID %lx CLUSTER-SIZE %u", + g_config.self_node, + as_exchange_cluster_size() + ); + + log_line_clock(); + log_line_system_memory(); + log_line_in_progress(); + log_line_fds(); + log_line_heartbeat(); + log_fabric_rate(delta_time); + log_line_early_fail(); + log_line_batch_index(); + + dump_global_histograms(); + + size_t total_ns_memory_inuse = 0; + + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + uint64_t n_objects = ns->n_objects; + uint64_t n_tombstones = ns->n_tombstones; + + size_t index_mem = as_index_size_get(ns) * (n_objects + n_tombstones); + size_t sindex_mem = ns->n_bytes_sindex_memory; + size_t data_mem = ns->n_bytes_memory; + size_t total_mem = index_mem + sindex_mem + data_mem; + + total_ns_memory_inuse += total_mem; + + repl_stats mp; + as_partition_get_replica_stats(ns, &mp); + + log_line_objects(ns, n_objects, &mp); + log_line_tombstones(ns, n_tombstones, &mp); + log_line_appeals(ns); + log_line_migrations(ns); + log_line_memory_usage(ns, total_mem, index_mem, sindex_mem, data_mem); + log_line_device_usage(ns); + + log_line_client(ns); + log_line_xdr_client(ns); + log_line_batch_sub(ns); + log_line_scan(ns); + log_line_query(ns); + log_line_udf_sub(ns); + log_line_retransmits(ns); + log_line_re_repl(ns); + log_line_special_errors(ns); + + dump_namespace_histograms(ns); + } + + if (g_config.fabric_dump_msgs) { + as_fabric_msg_queue_dump(); + } + + cf_dump_ticker_cache(); +} + + +void +log_line_clock() +{ + cf_dyn_buf_define_size(outliers_db, 17 * AS_CLUSTER_SZ); + uint32_t num_outliers = as_skew_monitor_outliers_append(&outliers_db); + + if (num_outliers != 0) { + cf_dyn_buf_append_char(&outliers_db, 0); + + cf_info(AS_INFO, " cluster-clock: skew-ms %lu outliers (%s)", + as_skew_monitor_skew(), + outliers_db.buf + ); + } + else { + cf_info(AS_INFO, " cluster-clock: skew-ms %lu", + as_skew_monitor_skew() + ); + } + + cf_dyn_buf_free(&outliers_db); +} + + +void +log_line_system_memory() +{ + uint64_t freemem; + int freepct; + bool swapping; + + cf_meminfo(NULL, &freemem, &freepct, &swapping); + + size_t allocated_kbytes; + size_t active_kbytes; + size_t mapped_kbytes; + double efficiency_pct; + + cf_alloc_heap_stats(&allocated_kbytes, &active_kbytes, &mapped_kbytes, + &efficiency_pct, NULL); + + cf_info(AS_INFO, " system-memory: free-kbytes %lu free-pct %d%s heap-kbytes (%lu,%lu,%lu) heap-efficiency-pct %.1lf", + freemem / 1024, + freepct, + swapping ? " SWAPPING!" 
: "", + allocated_kbytes, active_kbytes, mapped_kbytes, + efficiency_pct + ); +} + + +void +log_line_in_progress() +{ + cf_info(AS_INFO, " in-progress: tsvc-q %d info-q %d nsup-delete-q %d rw-hash %u proxy-hash %u tree-gc-q %d", + as_tsvc_queue_get_size(), + as_info_queue_get_size(), + as_nsup_queue_get_size(), + rw_request_hash_count(), + as_proxy_hash_count(), + as_index_tree_gc_queue_size() + ); +} + + +void +log_line_fds() +{ + uint64_t n_proto_fds_opened = g_stats.proto_connections_opened; + uint64_t n_proto_fds_closed = g_stats.proto_connections_closed; + uint64_t n_hb_fds_opened = g_stats.heartbeat_connections_opened; + uint64_t n_hb_fds_closed = g_stats.heartbeat_connections_closed; + uint64_t n_fabric_fds_opened = g_stats.fabric_connections_opened; + uint64_t n_fabric_fds_closed = g_stats.fabric_connections_closed; + + uint64_t n_proto_fds_open = n_proto_fds_opened - n_proto_fds_closed; + uint64_t n_hb_fds_open = n_hb_fds_opened - n_hb_fds_closed; + uint64_t n_fabric_fds_open = n_fabric_fds_opened - n_fabric_fds_closed; + + cf_info(AS_INFO, " fds: proto (%lu,%lu,%lu) heartbeat (%lu,%lu,%lu) fabric (%lu,%lu,%lu)", + n_proto_fds_open, n_proto_fds_opened, n_proto_fds_closed, + n_hb_fds_open, n_hb_fds_opened, n_hb_fds_closed, + n_fabric_fds_open, n_fabric_fds_opened, n_fabric_fds_closed + ); +} + + +void +log_line_heartbeat() +{ + cf_info(AS_INFO, " heartbeat-received: self %lu foreign %lu", + g_stats.heartbeat_received_self, g_stats.heartbeat_received_foreign + ); +} + + +void +log_fabric_rate(uint64_t delta_time) +{ + fabric_rate rate = { { 0 } }; + + as_fabric_rate_capture(&rate); + + uint64_t dt_sec = delta_time / 1000000000; + + if (dt_sec < 1) { + dt_sec = 1; + } + + g_stats.fabric_bulk_s_rate = rate.s_bytes[AS_FABRIC_CHANNEL_BULK] / dt_sec; + g_stats.fabric_bulk_r_rate = rate.r_bytes[AS_FABRIC_CHANNEL_BULK] / dt_sec; + g_stats.fabric_ctrl_s_rate = rate.s_bytes[AS_FABRIC_CHANNEL_CTRL] / dt_sec; + g_stats.fabric_ctrl_r_rate = rate.r_bytes[AS_FABRIC_CHANNEL_CTRL] / dt_sec; + g_stats.fabric_meta_s_rate = rate.s_bytes[AS_FABRIC_CHANNEL_META] / dt_sec; + g_stats.fabric_meta_r_rate = rate.r_bytes[AS_FABRIC_CHANNEL_META] / dt_sec; + g_stats.fabric_rw_s_rate = rate.s_bytes[AS_FABRIC_CHANNEL_RW] / dt_sec; + g_stats.fabric_rw_r_rate = rate.r_bytes[AS_FABRIC_CHANNEL_RW] / dt_sec; + + cf_info(AS_INFO, " fabric-bytes-per-second: bulk (%lu,%lu) ctrl (%lu,%lu) meta (%lu,%lu) rw (%lu,%lu)", + g_stats.fabric_bulk_s_rate, g_stats.fabric_bulk_r_rate, + g_stats.fabric_ctrl_s_rate, g_stats.fabric_ctrl_r_rate, + g_stats.fabric_meta_s_rate, g_stats.fabric_meta_r_rate, + g_stats.fabric_rw_s_rate, g_stats.fabric_rw_r_rate + ); +} + + +void +log_line_early_fail() +{ + uint64_t n_demarshal = g_stats.n_demarshal_error; + uint64_t n_tsvc_client = g_stats.n_tsvc_client_error; + uint64_t n_tsvc_batch_sub = g_stats.n_tsvc_batch_sub_error; + uint64_t n_tsvc_udf_sub = g_stats.n_tsvc_udf_sub_error; + + if ((n_demarshal | + n_tsvc_client | + n_tsvc_batch_sub | + n_tsvc_udf_sub) == 0) { + return; + } + + cf_info(AS_INFO, " early-fail: demarshal %lu tsvc-client %lu tsvc-batch-sub %lu tsvc-udf-sub %lu", + n_demarshal, + n_tsvc_client, + n_tsvc_batch_sub, + n_tsvc_udf_sub + ); +} + + +void +log_line_batch_index() +{ + uint64_t n_complete = g_stats.batch_index_complete; + uint64_t n_error = g_stats.batch_index_errors; + uint64_t n_timeout = g_stats.batch_index_timeout; + + if ((n_complete | n_error | n_timeout) == 0) { + return; + } + + cf_info(AS_INFO, " batch-index: batches (%lu,%lu,%lu)", + n_complete, n_error, 
n_timeout + ); +} + + +void +log_line_objects(as_namespace* ns, uint64_t n_objects, repl_stats* mp) +{ + // TODO - show if all 0's ??? + cf_info(AS_INFO, "{%s} objects: all %lu master %lu prole %lu non-replica %lu", + ns->name, + n_objects, + mp->n_master_objects, + mp->n_prole_objects, + mp->n_non_replica_objects + ); +} + + +void +log_line_tombstones(as_namespace* ns, uint64_t n_tombstones, repl_stats* mp) +{ + if ((n_tombstones | + mp->n_master_tombstones | + mp->n_prole_tombstones | + mp->n_non_replica_tombstones) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} tombstones: all %lu master %lu prole %lu non-replica %lu", + ns->name, + n_tombstones, + mp->n_master_tombstones, + mp->n_prole_tombstones, + mp->n_non_replica_tombstones + ); +} + + +void +log_line_appeals(as_namespace* ns) +{ + int64_t remaining_tx = (int64_t)ns->appeals_tx_remaining; + int64_t active_tx = (int64_t)ns->appeals_tx_active; + int64_t active_rx = (int64_t)ns->appeals_rx_active; + + if (remaining_tx > 0 || active_tx > 0 || active_rx > 0) { + cf_info(AS_INFO, "{%s} appeals: remaining-tx %ld active (%ld,%ld)", + ns->name, + remaining_tx, active_tx, active_rx + ); + } +} + + +void +log_line_migrations(as_namespace* ns) +{ + int64_t initial_tx = (int64_t)ns->migrate_tx_partitions_initial; + int64_t initial_rx = (int64_t)ns->migrate_rx_partitions_initial; + int64_t remaining_tx = (int64_t)ns->migrate_tx_partitions_remaining; + int64_t remaining_rx = (int64_t)ns->migrate_rx_partitions_remaining; + int64_t initial = initial_tx + initial_rx; + int64_t remaining = remaining_tx + remaining_rx; + + if (initial > 0 && remaining > 0) { + float complete_pct = (1 - ((float)remaining / (float)initial)) * 100; + + cf_info(AS_INFO, "{%s} migrations: remaining (%ld,%ld,%ld) active (%ld,%ld,%ld) complete-pct %0.2f", + ns->name, + remaining_tx, remaining_rx, ns->migrate_signals_remaining, + ns->migrate_tx_partitions_active, ns->migrate_rx_partitions_active, ns->migrate_signals_active, + complete_pct + ); + } + else { + cf_info(AS_INFO, "{%s} migrations: complete", ns->name); + } +} + + +void +log_line_memory_usage(as_namespace* ns, size_t total_mem, size_t index_mem, + size_t sindex_mem, size_t data_mem) +{ + double mem_used_pct = (double)(total_mem * 100) / (double)ns->memory_size; + + if (ns->storage_data_in_memory) { + cf_info(AS_INFO, "{%s} memory-usage: total-bytes %lu index-bytes %lu sindex-bytes %lu data-bytes %lu used-pct %.2lf", + ns->name, + total_mem, + index_mem, + sindex_mem, + data_mem, + mem_used_pct + ); + } + else { + cf_info(AS_INFO, "{%s} memory-usage: total-bytes %lu index-bytes %lu sindex-bytes %lu used-pct %.2lf", + ns->name, + total_mem, + index_mem, + sindex_mem, + mem_used_pct + ); + } +} + + +void +log_line_device_usage(as_namespace* ns) +{ + if (ns->storage_type != AS_STORAGE_ENGINE_SSD) { + return; + } + + int available_pct; + uint64_t inuse_disk_bytes; + as_storage_stats(ns, &available_pct, &inuse_disk_bytes); + + if (ns->storage_data_in_memory) { + cf_info(AS_INFO, "{%s} device-usage: used-bytes %lu avail-pct %d", + ns->name, + inuse_disk_bytes, + available_pct + ); + } + else { + uint32_t n_reads_from_cache = ns->n_reads_from_cache; + uint32_t n_total_reads = ns->n_reads_from_device + n_reads_from_cache; + + cf_atomic32_set(&ns->n_reads_from_device, 0); + cf_atomic32_set(&ns->n_reads_from_cache, 0); + + ns->cache_read_pct = + (float)(100 * n_reads_from_cache) / + (float)(n_total_reads == 0 ? 
1 : n_total_reads); + + cf_info(AS_INFO, "{%s} device-usage: used-bytes %lu avail-pct %d cache-read-pct %.2f", + ns->name, + inuse_disk_bytes, + available_pct, + ns->cache_read_pct + ); + } +} + + +void +log_line_client(as_namespace* ns) +{ + uint64_t n_tsvc_error = ns->n_client_tsvc_error; + uint64_t n_tsvc_timeout = ns->n_client_tsvc_timeout; + uint64_t n_proxy_complete = ns->n_client_proxy_complete; + uint64_t n_proxy_error = ns->n_client_proxy_error; + uint64_t n_proxy_timeout = ns->n_client_proxy_timeout; + uint64_t n_read_success = ns->n_client_read_success; + uint64_t n_read_error = ns->n_client_read_error; + uint64_t n_read_timeout = ns->n_client_read_timeout; + uint64_t n_read_not_found = ns->n_client_read_not_found; + uint64_t n_write_success = ns->n_client_write_success; + uint64_t n_write_error = ns->n_client_write_error; + uint64_t n_write_timeout = ns->n_client_write_timeout; + uint64_t n_delete_success = ns->n_client_delete_success; + uint64_t n_delete_error = ns->n_client_delete_error; + uint64_t n_delete_timeout = ns->n_client_delete_timeout; + uint64_t n_delete_not_found = ns->n_client_delete_not_found; + uint64_t n_udf_complete = ns->n_client_udf_complete; + uint64_t n_udf_error = ns->n_client_udf_error; + uint64_t n_udf_timeout = ns->n_client_udf_timeout; + uint64_t n_lang_read_success = ns->n_client_lang_read_success; + uint64_t n_lang_write_success = ns->n_client_lang_write_success; + uint64_t n_lang_delete_success = ns->n_client_lang_delete_success; + uint64_t n_lang_error = ns->n_client_lang_error; + + if ((n_tsvc_error | n_tsvc_timeout | + n_proxy_complete | n_proxy_error | n_proxy_timeout | + n_read_success | n_read_error | n_read_timeout | n_read_not_found | + n_write_success | n_write_error | n_write_timeout | + n_delete_success | n_delete_error | n_delete_timeout | n_delete_not_found | + n_udf_complete | n_udf_error | n_udf_timeout | + n_lang_read_success | n_lang_write_success | n_lang_delete_success | n_lang_error) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} client: tsvc (%lu,%lu) proxy (%lu,%lu,%lu) read (%lu,%lu,%lu,%lu) write (%lu,%lu,%lu) delete (%lu,%lu,%lu,%lu) udf (%lu,%lu,%lu) lang (%lu,%lu,%lu,%lu)", + ns->name, + n_tsvc_error, n_tsvc_timeout, + n_proxy_complete, n_proxy_error, n_proxy_timeout, + n_read_success, n_read_error, n_read_timeout, n_read_not_found, + n_write_success, n_write_error, n_write_timeout, + n_delete_success, n_delete_error, n_delete_timeout, n_delete_not_found, + n_udf_complete, n_udf_error, n_udf_timeout, + n_lang_read_success, n_lang_write_success, n_lang_delete_success, n_lang_error + ); +} + + +void +log_line_xdr_client(as_namespace* ns) +{ + uint64_t n_write_success = ns->n_xdr_write_success; + uint64_t n_write_error = ns->n_xdr_write_error; + uint64_t n_write_timeout = ns->n_xdr_write_timeout; + uint64_t n_delete_success = ns->n_xdr_delete_success; + uint64_t n_delete_error = ns->n_xdr_delete_error; + uint64_t n_delete_timeout = ns->n_xdr_delete_timeout; + uint64_t n_delete_not_found = ns->n_xdr_delete_not_found; + + if ((n_write_success | n_write_error | n_write_timeout | + n_delete_success | n_delete_error | n_delete_timeout | n_delete_not_found) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} xdr-client: write (%lu,%lu,%lu) delete (%lu,%lu,%lu,%lu)", + ns->name, + n_write_success, n_write_error, n_write_timeout, + n_delete_success, n_delete_error, n_delete_timeout, n_delete_not_found + ); +} + + +void +log_line_batch_sub(as_namespace* ns) +{ + uint64_t n_tsvc_error = ns->n_batch_sub_tsvc_error; + uint64_t 
n_tsvc_timeout = ns->n_batch_sub_tsvc_timeout; + uint64_t n_proxy_complete = ns->n_batch_sub_proxy_complete; + uint64_t n_proxy_error = ns->n_batch_sub_proxy_error; + uint64_t n_proxy_timeout = ns->n_batch_sub_proxy_timeout; + uint64_t n_read_success = ns->n_batch_sub_read_success; + uint64_t n_read_error = ns->n_batch_sub_read_error; + uint64_t n_read_timeout = ns->n_batch_sub_read_timeout; + uint64_t n_read_not_found = ns->n_batch_sub_read_not_found; + + if ((n_tsvc_error | n_tsvc_timeout | + n_proxy_complete | n_proxy_error | n_proxy_timeout | + n_read_success | n_read_error | n_read_timeout | n_read_not_found) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} batch-sub: tsvc (%lu,%lu) proxy (%lu,%lu,%lu) read (%lu,%lu,%lu,%lu)", + ns->name, + n_tsvc_error, n_tsvc_timeout, + n_proxy_complete, n_proxy_error, n_proxy_timeout, + n_read_success, n_read_error, n_read_timeout, n_read_not_found + ); +} + + +void +log_line_scan(as_namespace* ns) +{ + uint64_t n_basic_complete = ns->n_scan_basic_complete; + uint64_t n_basic_error = ns->n_scan_basic_error; + uint64_t n_basic_abort = ns->n_scan_basic_abort; + uint64_t n_aggr_complete = ns->n_scan_aggr_complete; + uint64_t n_aggr_error = ns->n_scan_aggr_error; + uint64_t n_aggr_abort = ns->n_scan_aggr_abort; + uint64_t n_udf_bg_complete = ns->n_scan_udf_bg_complete; + uint64_t n_udf_bg_error = ns->n_scan_udf_bg_error; + uint64_t n_udf_bg_abort = ns->n_scan_udf_bg_abort; + + if ((n_basic_complete | n_basic_error | n_basic_abort | + n_aggr_complete | n_aggr_error | n_aggr_abort | + n_udf_bg_complete | n_udf_bg_error | n_udf_bg_abort) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} scan: basic (%lu,%lu,%lu) aggr (%lu,%lu,%lu) udf-bg (%lu,%lu,%lu)", + ns->name, + n_basic_complete, n_basic_error, n_basic_abort, + n_aggr_complete, n_aggr_error, n_aggr_abort, + n_udf_bg_complete, n_udf_bg_error, n_udf_bg_abort + ); +} + + +void +log_line_query(as_namespace* ns) +{ + uint64_t n_basic_success = ns->n_lookup_success; + uint64_t n_basic_failure = ns->n_lookup_errs + ns->n_lookup_abort; + uint64_t n_aggr_success = ns->n_agg_success; + uint64_t n_aggr_failure = ns->n_agg_errs + ns->n_agg_abort; + uint64_t n_udf_bg_success = ns->n_query_udf_bg_success; + uint64_t n_udf_bg_failure = ns->n_query_udf_bg_failure; + + if ((n_basic_success | n_basic_failure | + n_aggr_success | n_aggr_failure | + n_udf_bg_success | n_udf_bg_failure) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} query: basic (%lu,%lu) aggr (%lu,%lu) udf-bg (%lu,%lu)", + ns->name, + n_basic_success, n_basic_failure, + n_aggr_success, n_aggr_failure, + n_udf_bg_success, n_udf_bg_failure + ); +} + + +void +log_line_udf_sub(as_namespace* ns) +{ + uint64_t n_tsvc_error = ns->n_udf_sub_tsvc_error; + uint64_t n_tsvc_timeout = ns->n_udf_sub_tsvc_timeout; + uint64_t n_udf_complete = ns->n_udf_sub_udf_complete; + uint64_t n_udf_error = ns->n_udf_sub_udf_error; + uint64_t n_udf_timeout = ns->n_udf_sub_udf_timeout; + uint64_t n_lang_read_success = ns->n_udf_sub_lang_read_success; + uint64_t n_lang_write_success = ns->n_udf_sub_lang_write_success; + uint64_t n_lang_delete_success = ns->n_udf_sub_lang_delete_success; + uint64_t n_lang_error = ns->n_udf_sub_lang_error; + + if ((n_tsvc_error | n_tsvc_timeout | + n_udf_complete | n_udf_error | n_udf_timeout | + n_lang_read_success | n_lang_write_success | n_lang_delete_success | n_lang_error) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} udf-sub: tsvc (%lu,%lu) udf (%lu,%lu,%lu) lang (%lu,%lu,%lu,%lu)", + ns->name, + n_tsvc_error, n_tsvc_timeout, + 
n_udf_complete, n_udf_error, n_udf_timeout, + n_lang_read_success, n_lang_write_success, n_lang_delete_success, n_lang_error + ); +} + + +void +log_line_retransmits(as_namespace* ns) +{ + uint64_t n_migrate_record_retransmits = ns->migrate_record_retransmits; + uint64_t n_client_read_dup_res = ns->n_retransmit_client_read_dup_res; + uint64_t n_client_write_dup_res = ns->n_retransmit_client_write_dup_res; + uint64_t n_client_write_repl_write = ns->n_retransmit_client_write_repl_write; + uint64_t n_client_delete_dup_res = ns->n_retransmit_client_delete_dup_res; + uint64_t n_client_delete_repl_write = ns->n_retransmit_client_delete_repl_write; + uint64_t n_client_udf_dup_res = ns->n_retransmit_client_udf_dup_res; + uint64_t n_client_udf_repl_write = ns->n_retransmit_client_udf_repl_write; + uint64_t n_batch_sub_dup_res = ns->n_retransmit_batch_sub_dup_res; + uint64_t n_udf_sub_dup_res = ns->n_retransmit_udf_sub_dup_res; + uint64_t n_udf_sub_repl_write = ns->n_retransmit_udf_sub_repl_write; + + if ((n_migrate_record_retransmits | + n_client_read_dup_res | + n_client_write_dup_res | n_client_write_repl_write | + n_client_delete_dup_res | n_client_delete_repl_write | + n_client_udf_dup_res | n_client_udf_repl_write | + n_batch_sub_dup_res | + n_udf_sub_dup_res | n_udf_sub_repl_write) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} retransmits: migration %lu client-read %lu client-write (%lu,%lu) client-delete (%lu,%lu) client-udf (%lu,%lu) batch-sub %lu udf-sub (%lu,%lu)", + ns->name, + n_migrate_record_retransmits, + n_client_read_dup_res, + n_client_write_dup_res, n_client_write_repl_write, + n_client_delete_dup_res, n_client_delete_repl_write, + n_client_udf_dup_res, n_client_udf_repl_write, + n_batch_sub_dup_res, + n_udf_sub_dup_res, n_udf_sub_repl_write + ); +} + + +void +log_line_re_repl(as_namespace* ns) +{ + uint64_t n_re_repl_success = ns->n_re_repl_success; + uint64_t n_re_repl_error = ns->n_re_repl_error; + uint64_t n_re_repl_timeout = ns->n_re_repl_timeout; + + if ((n_re_repl_success | n_re_repl_error | n_re_repl_timeout) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} re-repl: all-triggers (%lu,%lu,%lu)", + ns->name, + n_re_repl_success, n_re_repl_error, n_re_repl_timeout + ); +} + + +void +log_line_special_errors(as_namespace* ns) +{ + uint64_t n_fail_key_busy = ns->n_fail_key_busy; + uint64_t n_fail_record_too_big = ns->n_fail_record_too_big; + + if ((n_fail_key_busy | + n_fail_record_too_big) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} special-errors: key-busy %lu record-too-big %lu", + ns->name, + n_fail_key_busy, + n_fail_record_too_big + ); +} + + +void +dump_global_histograms() +{ + if (g_stats.batch_index_hist_active) { + histogram_dump(g_stats.batch_index_hist); + } + + if (g_config.info_hist_enabled) { + histogram_dump(g_stats.info_hist); + } + + if (g_config.svc_benchmarks_enabled) { + histogram_dump(g_stats.svc_demarshal_hist); + histogram_dump(g_stats.svc_queue_hist); + } + + if (g_config.fabric_benchmarks_enabled) { + histogram_dump(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_dump(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_dump(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_dump(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_dump(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_dump(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_dump(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_CTRL]); + 
histogram_dump(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_dump(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_META]); + histogram_dump(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_META]); + histogram_dump(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_META]); + histogram_dump(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_META]); + histogram_dump(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_RW]); + histogram_dump(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_RW]); + histogram_dump(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_RW]); + histogram_dump(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_RW]); + } + + as_query_histogram_dumpall(); +} + + +void +dump_namespace_histograms(as_namespace* ns) +{ + if (ns->read_hist_active) { + cf_hist_track_dump(ns->read_hist); + } + + if (ns->read_benchmarks_enabled) { + histogram_dump(ns->read_start_hist); + histogram_dump(ns->read_restart_hist); + histogram_dump(ns->read_dup_res_hist); + histogram_dump(ns->read_repl_ping_hist); + histogram_dump(ns->read_local_hist); + histogram_dump(ns->read_response_hist); + } + + if (ns->write_hist_active) { + cf_hist_track_dump(ns->write_hist); + } + + if (ns->write_benchmarks_enabled) { + histogram_dump(ns->write_start_hist); + histogram_dump(ns->write_restart_hist); + histogram_dump(ns->write_dup_res_hist); + histogram_dump(ns->write_master_hist); + histogram_dump(ns->write_repl_write_hist); + histogram_dump(ns->write_response_hist); + } + + if (ns->udf_hist_active) { + cf_hist_track_dump(ns->udf_hist); + } + + if (ns->udf_benchmarks_enabled) { + histogram_dump(ns->udf_start_hist); + histogram_dump(ns->udf_restart_hist); + histogram_dump(ns->udf_dup_res_hist); + histogram_dump(ns->udf_master_hist); + histogram_dump(ns->udf_repl_write_hist); + histogram_dump(ns->udf_response_hist); + } + + if (ns->query_hist_active) { + cf_hist_track_dump(ns->query_hist); + } + + if (ns->query_rec_count_hist_active) { + histogram_dump(ns->query_rec_count_hist); + } + + if (ns->proxy_hist_enabled) { + histogram_dump(ns->proxy_hist); + } + + if (ns->batch_sub_benchmarks_enabled) { + histogram_dump(ns->batch_sub_start_hist); + histogram_dump(ns->batch_sub_restart_hist); + histogram_dump(ns->batch_sub_dup_res_hist); + histogram_dump(ns->batch_sub_repl_ping_hist); + histogram_dump(ns->batch_sub_read_local_hist); + histogram_dump(ns->batch_sub_response_hist); + } + + if (ns->udf_sub_benchmarks_enabled) { + histogram_dump(ns->udf_sub_start_hist); + histogram_dump(ns->udf_sub_restart_hist); + histogram_dump(ns->udf_sub_dup_res_hist); + histogram_dump(ns->udf_sub_master_hist); + histogram_dump(ns->udf_sub_repl_write_hist); + histogram_dump(ns->udf_sub_response_hist); + } + + if (ns->re_repl_hist_active) { + histogram_dump(ns->re_repl_hist); + } + + if (ns->storage_benchmarks_enabled) { + as_storage_ticker_stats(ns); + } + + as_sindex_histogram_dumpall(ns); +} diff --git a/as/src/base/transaction.c b/as/src/base/transaction.c new file mode 100644 index 00000000..b1f6d294 --- /dev/null +++ b/as/src/base/transaction.c @@ -0,0 +1,480 @@ +/* + * transaction.c + * + * Copyright (C) 2008-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * Operations on transactions + */ + +#include "base/transaction.h" + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" + +#include "fault.h" +#include "socket.h" + +#include "base/batch.h" +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/scan.h" +#include "base/security.h" +#include "base/stats.h" +#include "base/thr_demarshal.h" +#include "fabric/partition.h" +#include "transaction/proxy.h" +#include "transaction/rw_request.h" +#include "transaction/rw_utils.h" +#include "transaction/udf.h" + + +void +as_transaction_init_head(as_transaction *tr, cf_digest *keyd, cl_msg *msgp) +{ + tr->msgp = msgp; + tr->msg_fields = 0; + + tr->origin = 0; + tr->from_flags = 0; + + tr->from.any = NULL; + tr->from_data.any = 0; + + tr->keyd = keyd ? *keyd : cf_digest_zero; + + tr->start_time = 0; + tr->benchmark_time = 0; +} + +void +as_transaction_init_body(as_transaction *tr) +{ + AS_PARTITION_RESERVATION_INIT(tr->rsv); + + tr->end_time = 0; + tr->result_code = AS_PROTO_RESULT_OK; + tr->flags = 0; + tr->generation = 0; + tr->void_time = 0; + tr->last_update_time = 0; +} + +void +as_transaction_copy_head(as_transaction *to, const as_transaction *from) +{ + to->msgp = from->msgp; + to->msg_fields = from->msg_fields; + + to->origin = from->origin; + to->from_flags = from->from_flags; + + to->from.any = from->from.any; + to->from_data.any = from->from_data.any; + + to->keyd = from->keyd; + + to->start_time = from->start_time; + to->benchmark_time = from->benchmark_time; +} + +void +as_transaction_init_from_rw(as_transaction *tr, rw_request *rw) +{ + as_transaction_init_head_from_rw(tr, rw); + // Note - we don't clear rw->msgp, destructor will free it. + + as_partition_reservation_copy(&tr->rsv, &rw->rsv); + // Note - destructor will still release the reservation. + + tr->end_time = rw->end_time; + tr->result_code = rw->result_code; + tr->flags = rw->flags; + tr->generation = rw->generation; + tr->void_time = rw->void_time; + tr->last_update_time = rw->last_update_time; +} + +void +as_transaction_init_head_from_rw(as_transaction *tr, rw_request *rw) +{ + tr->msgp = rw->msgp; + tr->msg_fields = rw->msg_fields; + tr->origin = rw->origin; + tr->from_flags = rw->from_flags; + tr->from.any = rw->from.any; + tr->from_data.any = rw->from_data.any; + tr->keyd = rw->keyd; + tr->start_time = rw->start_time; + tr->benchmark_time = rw->benchmark_time; + + rw->from.any = NULL; + // Note - we don't clear rw->msgp, destructor will free it. 
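+	// (From here the live 'from' handle belongs to tr, not rw.)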
+} + +bool +as_transaction_set_msg_field_flag(as_transaction *tr, uint8_t type) +{ + switch (type) { + case AS_MSG_FIELD_TYPE_NAMESPACE: + tr->msg_fields |= AS_MSG_FIELD_BIT_NAMESPACE; + break; + case AS_MSG_FIELD_TYPE_SET: + tr->msg_fields |= AS_MSG_FIELD_BIT_SET; + break; + case AS_MSG_FIELD_TYPE_KEY: + tr->msg_fields |= AS_MSG_FIELD_BIT_KEY; + break; + case AS_MSG_FIELD_TYPE_DIGEST_RIPE: + tr->msg_fields |= AS_MSG_FIELD_BIT_DIGEST_RIPE; + break; + case AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY: + tr->msg_fields |= AS_MSG_FIELD_BIT_DIGEST_RIPE_ARRAY; + break; + case AS_MSG_FIELD_TYPE_TRID: + tr->msg_fields |= AS_MSG_FIELD_BIT_TRID; + break; + case AS_MSG_FIELD_TYPE_SCAN_OPTIONS: + tr->msg_fields |= AS_MSG_FIELD_BIT_SCAN_OPTIONS; + break; + case AS_MSG_FIELD_TYPE_SOCKET_TIMEOUT: + tr->msg_fields |= AS_MSG_FIELD_BIT_SOCKET_TIMEOUT; + break; + case AS_MSG_FIELD_TYPE_INDEX_NAME: + tr->msg_fields |= AS_MSG_FIELD_BIT_INDEX_NAME; + break; + case AS_MSG_FIELD_TYPE_INDEX_RANGE: + tr->msg_fields |= AS_MSG_FIELD_BIT_INDEX_RANGE; + break; + case AS_MSG_FIELD_TYPE_INDEX_TYPE: + tr->msg_fields |= AS_MSG_FIELD_BIT_INDEX_TYPE; + break; + case AS_MSG_FIELD_TYPE_UDF_FILENAME: + tr->msg_fields |= AS_MSG_FIELD_BIT_UDF_FILENAME; + break; + case AS_MSG_FIELD_TYPE_UDF_FUNCTION: + tr->msg_fields |= AS_MSG_FIELD_BIT_UDF_FUNCTION; + break; + case AS_MSG_FIELD_TYPE_UDF_ARGLIST: + tr->msg_fields |= AS_MSG_FIELD_BIT_UDF_ARGLIST; + break; + case AS_MSG_FIELD_TYPE_UDF_OP: + tr->msg_fields |= AS_MSG_FIELD_BIT_UDF_OP; + break; + case AS_MSG_FIELD_TYPE_QUERY_BINLIST: + tr->msg_fields |= AS_MSG_FIELD_BIT_QUERY_BINLIST; + break; + case AS_MSG_FIELD_TYPE_BATCH: // shouldn't get here - batch parent handles this + tr->msg_fields |= AS_MSG_FIELD_BIT_BATCH; + break; + case AS_MSG_FIELD_TYPE_BATCH_WITH_SET: // shouldn't get here - batch parent handles this + tr->msg_fields |= AS_MSG_FIELD_BIT_BATCH_WITH_SET; + break; + case AS_MSG_FIELD_TYPE_PREDEXP: + tr->msg_fields |= AS_MSG_FIELD_BIT_PREDEXP; + break; + default: + return false; + } + + return true; +} + +bool +as_transaction_prepare(as_transaction *tr, bool swap) +{ + uint64_t size = tr->msgp->proto.sz; + + if (size < sizeof(as_msg)) { + cf_warning(AS_PROTO, "proto body size %lu smaller than as_msg", size); + return false; + } + + // The proto data is not smaller than an as_msg - safe to swap header. + as_msg *m = &tr->msgp->msg; + + if (swap) { + as_msg_swap_header(m); + } + + uint8_t* p_end = (uint8_t*)m + size; + uint8_t* p_read = m->data; + + // Parse and swap fields first. + for (uint16_t n = 0; n < m->n_fields; n++) { + if (p_read + sizeof(as_msg_field) > p_end) { + cf_warning(AS_PROTO, "incomplete as_msg_field"); + return false; + } + + as_msg_field* p_field = (as_msg_field*)p_read; + + if (swap) { + as_msg_swap_field(p_field); + } + + p_read = as_msg_field_skip(p_field); + + if (! p_read) { + cf_warning(AS_PROTO, "bad as_msg_field"); + return false; + } + + if (p_read > p_end) { + cf_warning(AS_PROTO, "incomplete as_msg_field value"); + return false; + } + + // Store which message fields are present - prevents lots of re-parsing. + if (! as_transaction_set_msg_field_flag(tr, p_field->type)) { + cf_debug(AS_PROTO, "skipping as_msg_field type %u", p_field->type); + } + } + + // Parse and swap bin-ops, if any. 
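+	// Layout sketch (not authoritative): each op is an as_msg_op header
+	// followed by the bin name and then the value bytes; as_msg_op_skip()
+	// returns the address just past the op, or NULL if the header's sizes
+	// are inconsistent.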
+ for (uint16_t n = 0; n < m->n_ops; n++) { + if (p_read + sizeof(as_msg_op) > p_end) { + cf_warning(AS_PROTO, "incomplete as_msg_op"); + return false; + } + + as_msg_op* op = (as_msg_op*)p_read; + + if (swap) { + as_msg_swap_op(op); + } + + p_read = as_msg_op_skip(op); + + if (! p_read) { + cf_warning(AS_PROTO, "bad as_msg_op"); + return false; + } + + if (p_read > p_end) { + cf_warning(AS_PROTO, "incomplete as_msg_op data"); + return false; + } + } + + if (p_read != p_end) { + cf_warning(AS_PROTO, "extra bytes follow fields and bin-ops"); + return false; + } + + return true; +} + +// Initialize an internal UDF transaction (for a UDF scan/query). Allocates a +// message with namespace and digest - no set for now, since these transactions +// won't get security checked, and they can't create a record. +void +as_transaction_init_iudf(as_transaction *tr, as_namespace *ns, cf_digest *keyd, + iudf_origin* iudf_orig, bool is_durable_delete) +{ + uint8_t info2 = AS_MSG_INFO2_WRITE | + (is_durable_delete ? AS_MSG_INFO2_DURABLE_DELETE : 0); + + cl_msg *msgp = as_msg_create_internal(ns->name, keyd, 0, info2, 0); + + as_transaction_init_head(tr, NULL, msgp); + + as_transaction_set_msg_field_flag(tr, AS_MSG_FIELD_TYPE_NAMESPACE); + as_transaction_set_msg_field_flag(tr, AS_MSG_FIELD_TYPE_DIGEST_RIPE); + + tr->origin = FROM_IUDF; + tr->from.iudf_orig = iudf_orig; + + // Do this last, to exclude the setup time in this function. + tr->start_time = cf_getns(); +} + +void +as_transaction_demarshal_error(as_transaction* tr, uint32_t error_code) +{ + as_msg_send_reply(tr->from.proto_fd_h, error_code, 0, 0, NULL, NULL, 0, NULL, 0); + tr->from.proto_fd_h = NULL; + + cf_free(tr->msgp); + tr->msgp = NULL; + + cf_atomic64_incr(&g_stats.n_demarshal_error); +} + +#define UPDATE_ERROR_STATS(name) \ + if (ns) { \ + if (error_code == AS_PROTO_RESULT_FAIL_TIMEOUT) { \ + cf_atomic64_incr(&ns->n_##name##_tsvc_timeout); \ + } \ + else { \ + cf_atomic64_incr(&ns->n_##name##_tsvc_error); \ + } \ + } \ + else { \ + cf_atomic64_incr(&g_stats.n_tsvc_##name##_error); \ + } + +void +as_transaction_error(as_transaction* tr, as_namespace* ns, uint32_t error_code) +{ + if (error_code == 0) { + cf_warning(AS_PROTO, "converting error code 0 to 1 (unknown)"); + error_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // The 'from' checks below are unnecessary, only paranoia. 
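+	// Each origin answers through its own channel - client and proxy send a
+	// reply message, batch records the error in the shared batch response,
+	// and internal origins (IUDF, re-replication) fire their callbacks.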
+ switch (tr->origin) { + case FROM_CLIENT: + if (tr->from.proto_fd_h) { + as_msg_send_reply(tr->from.proto_fd_h, error_code, 0, 0, NULL, NULL, 0, NULL, as_transaction_trid(tr)); + tr->from.proto_fd_h = NULL; // pattern, not needed + } + UPDATE_ERROR_STATS(client); + break; + case FROM_PROXY: + if (tr->from.proxy_node != 0) { + as_proxy_send_response(tr->from.proxy_node, tr->from_data.proxy_tid, error_code, 0, 0, NULL, NULL, 0, NULL, as_transaction_trid(tr)); + tr->from.proxy_node = 0; // pattern, not needed + } + break; + case FROM_BATCH: + if (tr->from.batch_shared) { + as_batch_add_error(tr->from.batch_shared, tr->from_data.batch_index, error_code); + tr->from.batch_shared = NULL; // pattern, not needed + tr->msgp = NULL; // pattern, not needed + } + UPDATE_ERROR_STATS(batch_sub); + break; + case FROM_IUDF: + if (tr->from.iudf_orig) { + tr->from.iudf_orig->cb(tr->from.iudf_orig->udata, error_code); + tr->from.iudf_orig = NULL; // pattern, not needed + } + UPDATE_ERROR_STATS(udf_sub); + break; + case FROM_NSUP: + break; + case FROM_RE_REPL: + if (tr->from.re_repl_orig_cb) { + tr->result_code = error_code; + tr->from.re_repl_orig_cb(tr); + tr->from.re_repl_orig_cb = NULL; // pattern, not needed + } + // Re-replications take care of stats independently. + break; + default: + cf_crash(AS_PROTO, "unexpected transaction origin %u", tr->origin); + break; + } +} + +// TODO - temporary, until scan & query can do their own synchronous failure +// responses. (Here we forfeit namespace info and add to global-scope error.) +void +as_multi_rec_transaction_error(as_transaction* tr, uint32_t error_code) +{ + if (error_code == 0) { + cf_warning(AS_PROTO, "converting error code 0 to 1 (unknown)"); + error_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + switch (tr->origin) { + case FROM_CLIENT: + if (tr->from.proto_fd_h) { + as_msg_send_reply(tr->from.proto_fd_h, error_code, 0, 0, NULL, NULL, 0, NULL, as_transaction_trid(tr)); + tr->from.proto_fd_h = NULL; // pattern, not needed + } + cf_atomic64_incr(&g_stats.n_tsvc_client_error); + break; + default: + cf_crash(AS_PROTO, "unexpected transaction origin %u", tr->origin); + break; + } +} + +// Helper to release transaction file handles. 
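+// The handle is shared via reference count - only the release that drops the
+// count to zero closes the socket and frees the handle and its attachments
+// (pending proto buffer, security filter).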
+void +as_release_file_handle(as_file_handle *proto_fd_h) +{ + int rc = cf_rc_release(proto_fd_h); + + if (rc > 0) { + return; + } + else if (rc < 0) { + cf_warning(AS_PROTO, "release file handle: negative ref-count %d", rc); + return; + } + + cf_socket_close(&proto_fd_h->sock); + cf_socket_term(&proto_fd_h->sock); + proto_fd_h->fh_info &= ~FH_INFO_DONOT_REAP; + + if (proto_fd_h->proto) { + as_proto *p = proto_fd_h->proto; + + if ((p->version != PROTO_VERSION) || (p->type >= PROTO_TYPE_MAX)) { + cf_warning(AS_PROTO, "release file handle: bad proto buf, corruption"); + } + else { + cf_free(proto_fd_h->proto); + proto_fd_h->proto = NULL; + } + } + + if (proto_fd_h->security_filter) { + as_security_filter_destroy(proto_fd_h->security_filter); + proto_fd_h->security_filter = NULL; + } + + cf_rc_free(proto_fd_h); + cf_atomic64_incr(&g_stats.proto_connections_closed); +} + +void +as_end_of_transaction(as_file_handle *proto_fd_h, bool force_close) +{ + thr_demarshal_rearm(proto_fd_h); + + if (force_close) { + cf_socket_shutdown(&proto_fd_h->sock); + } + + as_release_file_handle(proto_fd_h); +} + +void +as_end_of_transaction_ok(as_file_handle *proto_fd_h) +{ + as_end_of_transaction(proto_fd_h, false); +} + +void +as_end_of_transaction_force_close(as_file_handle *proto_fd_h) +{ + as_end_of_transaction(proto_fd_h, true); +} diff --git a/as/src/base/truncate.c b/as/src/base/truncate.c new file mode 100644 index 00000000..10c00044 --- /dev/null +++ b/as/src/base/truncate.c @@ -0,0 +1,621 @@ +/* + * truncate.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "base/truncate.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "fault.h" +#include "shash.h" +#include "vmapx.h" + +#include "base/datamodel.h" +#include "base/index.h" +#include "base/system_metadata.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Typedefs & constants. +// + +typedef struct truncate_reduce_cb_info_s { + as_namespace* ns; + as_index_tree* tree; + int64_t n_deleted; +} truncate_reduce_cb_info; + +static const uint32_t NUM_TRUNCATE_THREADS = 4; + +// Truncate system metadata module name. +const char AS_TRUNCATE_MODULE[] = "truncate"; +#define TRUNCATE_MODULE ((char*)AS_TRUNCATE_MODULE) +// TODO - change smd API to take const char* module names? + +// Includes 1 for delimiter and 1 for null-terminator. +#define TRUNCATE_KEY_SIZE (AS_ID_NAMESPACE_SZ + AS_SET_NAME_MAX_SIZE) + +// System metadata key format token. 
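+// e.g. "test" (whole namespace) or "test|demo-set" (one set) - names are
+// illustrative only.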
+#define TOK_DELIMITER ('|') + +// Detect excessive clock skew for warning purposes only. +static const uint64_t WARN_CLOCK_SKEW_MS = 1000UL * 5; + + +//========================================================== +// Globals. +// + +static cf_shash* g_truncate_filter_hash = NULL; +static bool g_truncate_smd_loaded = false; + + +//========================================================== +// Forward declarations. +// + +bool filter_hash_put(const as_smd_item_t* item); +void filter_hash_delete(const as_smd_item_t* item); + +bool truncate_smd_conflict_cb(char* module, as_smd_item_t* existing_item, as_smd_item_t* new_item, void* udata); +int truncate_smd_accept_cb(char* module, as_smd_item_list_t* items, void* udata, uint32_t accept_opt); +int truncate_smd_can_accept_cb(char* module, as_smd_item_t *item, void *udata); + +void truncate_action_do(as_namespace* ns, const char* set_name, uint64_t lut); +void truncate_action_undo(as_namespace* ns, const char* set_name); +void truncate_all(as_namespace* ns); +void* run_truncate(void* arg); +void truncate_finish(as_namespace* ns); +void truncate_reduce_cb(as_index_ref* r_ref, void* udata); + + +//========================================================== +// Inlines & macros. +// + +static inline uint64_t +lut_from_smd(const as_smd_item_t* item) +{ + return strtoul(item->value, NULL, 10); +} + + +//========================================================== +// Public API. +// + +void +as_truncate_init(as_namespace* ns) +{ + truncate_startup_hash_init(ns); + + ns->truncate.state = TRUNCATE_IDLE; + pthread_mutex_init(&ns->truncate.state_lock, 0); +} + + +void +as_truncate_init_smd() +{ + // Create the global filter shash used on the SMD principal. + g_truncate_filter_hash = cf_shash_create(cf_shash_fn_zstr, + TRUNCATE_KEY_SIZE, sizeof(truncate_hval), + 1024 * g_config.n_namespaces, 0); + + // Register the system metadata custom callbacks. + if (as_smd_create_module(TRUNCATE_MODULE, + NULL, NULL, + truncate_smd_conflict_cb, NULL, + truncate_smd_accept_cb, NULL, + truncate_smd_can_accept_cb, NULL) != 0) { + cf_crash(AS_TRUNCATE, "truncate init - failed smd create module"); + } + + while (! g_truncate_smd_loaded) { + usleep(1000); + } +} + + +// SMD key is "ns-name|set-name" or "ns-name". +// SMD value is last-update-time as decimal string. +bool +as_truncate_cmd(const char* ns_name, const char* set_name, const char* lut_str) +{ + char smd_key[TRUNCATE_KEY_SIZE]; + + strcpy(smd_key, ns_name); + + if (set_name) { + char* p_write = smd_key + strlen(ns_name); + + *p_write++ = TOK_DELIMITER; + strcpy(p_write, set_name); + } + + uint64_t now = cf_clepoch_milliseconds(); + uint64_t lut; + + if (lut_str) { + uint64_t utc_nanosec = strtoul(lut_str, NULL, 0); + + // Last update time as human-readable UTC seconds. + // TODO - make generic utility? 
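+		// e.g. lut_str "1483228800000000000" (UTC nanoseconds) should format
+		// as roughly "Jan 01 2017 00:00:00 GMT".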
+ char utc_sec[64] = { 0 }; + time_t utc_time = utc_nanosec / 1000000000; + struct tm utc_tm; + + if (cf_fault_is_using_local_time()) { + localtime_r(&utc_time, &utc_tm); + strftime(utc_sec, sizeof(utc_sec), "%b %d %Y %T GMT%z", &utc_tm); + } + else { + gmtime_r(&utc_time, &utc_tm); + strftime(utc_sec, sizeof(utc_sec), "%b %d %Y %T %Z", &utc_tm); + } + + lut = cf_clepoch_ms_from_utc_ns(utc_nanosec); + + if (lut == 0) { + cf_warning(AS_TRUNCATE, "command lut %s (%s) would truncate to 0", + lut_str, utc_sec); + return false; + } + + if (lut > now) { + cf_warning(AS_TRUNCATE, "command lut %s (%s) is in the future", + lut_str, utc_sec); + return false; + } + + cf_info(AS_TRUNCATE, "{%s} got command to truncate to %s (%lu)", + smd_key, utc_sec, lut); + } + else { + // Use a last-update-time threshold of now. + lut = now; + + cf_info(AS_TRUNCATE, "{%s} got command to truncate to now (%lu)", + smd_key, lut); + } + + char smd_value[13 + 1]; // 0xFFffffFFFF (40 bits) is 13 decimal characters + + sprintf(smd_value, "%lu", lut); + + // Broadcast the truncate command to all nodes (including this one). + as_smd_set_metadata(TRUNCATE_MODULE, smd_key, smd_value); + + return true; +} + + +// SMD key is "ns-name|set-name" or "ns-name". +void +as_truncate_undo_cmd(const char* ns_name, const char* set_name) +{ + char smd_key[TRUNCATE_KEY_SIZE]; + + strcpy(smd_key, ns_name); + + if (set_name) { + char* p_write = smd_key + strlen(ns_name); + + *p_write++ = TOK_DELIMITER; + strcpy(p_write, set_name); + } + + cf_info(AS_TRUNCATE, "{%s} got command to undo truncate", smd_key); + + // Broadcast the truncate-undo command to all nodes (including this one). + as_smd_delete_metadata(TRUNCATE_MODULE, smd_key); +} + + +bool +as_truncate_now_is_truncated(struct as_namespace_s* ns, uint16_t set_id) +{ + uint64_t now = cf_clepoch_milliseconds(); + + if (now < ns->truncate.lut) { + return true; + } + + as_set* p_set = as_namespace_get_set_by_id(ns, set_id); + + return p_set ? now < p_set->truncate_lut : false; +} + + +bool +as_truncate_record_is_truncated(const as_record* r, as_namespace* ns) +{ + if (r->last_update_time < ns->truncate.lut) { + return true; + } + + as_set* p_set = as_namespace_get_record_set(ns, r); + + return p_set ? r->last_update_time < p_set->truncate_lut : false; +} + + +//========================================================== +// Local helpers - generic. +// + +bool +filter_hash_put(const as_smd_item_t* item) +{ + char hkey[TRUNCATE_KEY_SIZE] = { 0 }; // pad for consistent shash key + + strcpy(hkey, item->key); + + truncate_hval new_hval = { .lut = lut_from_smd(item) }; + truncate_hval ex_hval; + + if (cf_shash_get(g_truncate_filter_hash, hkey, &ex_hval) != CF_SHASH_OK || + new_hval.lut > ex_hval.lut) { + cf_shash_put(g_truncate_filter_hash, hkey, &new_hval); + + return true; + } + + // This is normal on principal, from truncate_smd_accept_cb(). + cf_detail(AS_TRUNCATE, "{%s} truncate lut %lu <= filter lut %lu", item->key, + (uint64_t)new_hval.lut, (uint64_t)ex_hval.lut); + + return false; +} + + +void +filter_hash_delete(const as_smd_item_t* item) +{ + char hkey[TRUNCATE_KEY_SIZE] = { 0 }; // pad for consistent shash key + + strcpy(hkey, item->key); + + if (cf_shash_delete(g_truncate_filter_hash, hkey) != CF_SHASH_OK) { + cf_warning(AS_TRUNCATE, "{%s} failed filter-hash delete", item->key); + } +} + + +//========================================================== +// Local helpers - SMD callbacks. 
+// + +bool +truncate_smd_conflict_cb(char* module, as_smd_item_t* existing_item, + as_smd_item_t* new_item, void* udata) +{ + return lut_from_smd(existing_item) >= lut_from_smd(new_item); +} + + +int +truncate_smd_accept_cb(char* module, as_smd_item_list_t* items, void* udata, + uint32_t accept_opt) +{ + if ((accept_opt & AS_SMD_ACCEPT_OPT_CREATE) != 0) { + g_truncate_smd_loaded = true; + return 0; + } + + bool is_merge = (accept_opt & AS_SMD_ACCEPT_OPT_MERGE) != 0; + + for (int i = 0; i < (int)items->num_items; i++) { + as_smd_item_t* item = items->item[i]; + + if (item->action == AS_SMD_ACTION_SET) { + // If we're here via SMD API command (as opposed to via merge), SMD + // principal's hash will already have this item - ignore filter + // result, let as_set/as_namespace cached value do the filtering. + if (! filter_hash_put(item) && is_merge) { + continue; + } + } + else if (item->action == AS_SMD_ACTION_DELETE) { + filter_hash_delete(item); + } + else { + cf_warning(AS_TRUNCATE, "smd accept cb - unknown action"); + continue; + } + + const char* ns_name = item->key; + const char* tok = strchr(ns_name, TOK_DELIMITER); + + uint32_t ns_len = tok ? (uint32_t)(tok - ns_name) : strlen(ns_name); + as_namespace* ns = as_namespace_get_bybuf((uint8_t*)ns_name, ns_len); + + if (! ns) { + cf_detail(AS_TRUNCATE, "skipping invalid ns"); + continue; + } + + const char* set_name = tok ? tok + 1 : NULL; + + if (item->action == AS_SMD_ACTION_SET) { + uint64_t lut = lut_from_smd(item); + + if (g_truncate_smd_loaded) { + truncate_action_do(ns, set_name, lut); + } + else { + truncate_action_startup(ns, set_name, lut); + } + } + else { + truncate_action_undo(ns, set_name); + } + } + + return 0; +} + + +int +truncate_smd_can_accept_cb(char* module, as_smd_item_t* item, void* udata) +{ + if (item->action == AS_SMD_ACTION_SET) { + if (filter_hash_put(item)) { + return 0; + } + + cf_info(AS_TRUNCATE, "{%s} ignoring redundant truncate lut", item->key); + + return -1; + } + else if (item->action == AS_SMD_ACTION_DELETE) { + return 0; + } + else { + cf_warning(AS_TRUNCATE, "smd can accept cb - unknown action"); + return -1; + } +} + + +//========================================================== +// Local helpers - SMD callbacks' helpers. +// + +void +truncate_action_do(as_namespace* ns, const char* set_name, uint64_t lut) +{ + uint64_t now = cf_clepoch_milliseconds(); + + if (lut > now + WARN_CLOCK_SKEW_MS) { + cf_warning(AS_TRUNCATE, "lut is %lu ms in the future - clock skew?", + lut - now); + } + + if (set_name) { + as_set* p_set = as_namespace_get_set_by_name(ns, set_name); + + if (! p_set) { + cf_info(AS_TRUNCATE, "{%s|%s} truncate for nonexistent set", + ns->name, set_name); + return; + } + + if (lut <= p_set->truncate_lut) { + cf_info(AS_TRUNCATE, "{%s|%s} truncate lut %lu <= vmap lut %lu", + ns->name, set_name, lut, p_set->truncate_lut); + return; + } + + cf_info(AS_TRUNCATE, "{%s|%s} truncating to %lu", ns->name, set_name, + lut); + + p_set->truncate_lut = lut; + } + else { + if (lut <= ns->truncate.lut) { + cf_info(AS_TRUNCATE, "{%s} truncate lut %lu <= ns lut %lu", + ns->name, lut, ns->truncate.lut); + return; + } + + cf_info(AS_TRUNCATE, "{%s} truncating to %lu", ns->name, lut); + + ns->truncate.lut = lut; + } + + // Truncate to new last-update-time. 
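+	// State machine (under state_lock): IDLE -> RUNNING starts the truncate
+	// threads; a command landing while RUNNING flags RESTART, so a fresh
+	// pass with the raised lut begins once the current pass finishes.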
+ + pthread_mutex_lock(&ns->truncate.state_lock); + + switch (ns->truncate.state) { + case TRUNCATE_IDLE: + cf_info(AS_TRUNCATE, "{%s} starting truncate", ns->name); + truncate_all(ns); + break; + case TRUNCATE_RUNNING: + cf_info(AS_TRUNCATE, "{%s} flagging truncate to restart", ns->name); + ns->truncate.state = TRUNCATE_RESTART; + break; + case TRUNCATE_RESTART: + cf_info(AS_TRUNCATE, "{%s} truncate already will restart", ns->name); + break; + default: + cf_crash(AS_TRUNCATE, "bad truncate state %d", ns->truncate.state); + break; + } + + pthread_mutex_unlock(&ns->truncate.state_lock); +} + + +void +truncate_action_undo(as_namespace* ns, const char* set_name) +{ + if (set_name) { + as_set* p_set = as_namespace_get_set_by_name(ns, set_name); + + if (! p_set) { + cf_info(AS_TRUNCATE, "{%s|%s} undo truncate for nonexistent set", + ns->name, set_name); + return; + } + + cf_info(AS_TRUNCATE, "{%s|%s} undoing truncate - was to %lu", ns->name, + set_name, p_set->truncate_lut); + + p_set->truncate_lut = 0; + } + else { + cf_info(AS_TRUNCATE, "{%s} undoing truncate - was to %lu", ns->name, + ns->truncate.lut); + + ns->truncate.lut = 0; + } +} + + +// Called under truncate lock. +void +truncate_all(as_namespace* ns) +{ + // TODO - skipping sindex deletion shortcut - can't do that if we want to + // keep writing through set truncates. Is this ok? + + ns->truncate.state = TRUNCATE_RUNNING; + cf_atomic32_set(&ns->truncate.n_threads_running, NUM_TRUNCATE_THREADS); + cf_atomic32_set(&ns->truncate.pid, -1); + + cf_atomic64_set(&ns->truncate.n_records_this_run, 0); + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + for (uint32_t i = 0; i < NUM_TRUNCATE_THREADS; i++) { + if (pthread_create(&thread, &attrs, run_truncate, (void*)ns) != 0) { + cf_crash(AS_TRUNCATE, "failed to create truncate thread"); + // TODO - be forgiving? Is there any point? 
+ } + } +} + + +void* +run_truncate(void* arg) +{ + as_namespace* ns = (as_namespace*)arg; + uint32_t pid; + + while ((pid = (uint32_t)cf_atomic32_incr(&ns->truncate.pid)) < + AS_PARTITIONS) { + as_partition_reservation rsv; + as_partition_reserve(ns, pid, &rsv); + + truncate_reduce_cb_info cb_info = { .ns = ns, .tree = rsv.tree }; + + as_index_reduce(rsv.tree, truncate_reduce_cb, (void*)&cb_info); + as_partition_release(&rsv); + + cf_atomic64_add(&ns->truncate.n_records_this_run, cb_info.n_deleted); + } + + truncate_finish(ns); + + return NULL; +} + + +void +truncate_finish(as_namespace* ns) +{ + if (cf_atomic32_decr(&ns->truncate.n_threads_running) == 0) { + pthread_mutex_lock(&ns->truncate.state_lock); + + ns->truncate.n_records += ns->truncate.n_records_this_run; + + cf_info(AS_TRUNCATE, "{%s} truncated records (%lu,%lu)", ns->name, + ns->truncate.n_records_this_run, ns->truncate.n_records); + + switch (ns->truncate.state) { + case TRUNCATE_RUNNING: + cf_info(AS_TRUNCATE, "{%s} done truncate", ns->name); + ns->truncate.state = TRUNCATE_IDLE; + break; + case TRUNCATE_RESTART: + cf_info(AS_TRUNCATE, "{%s} restarting truncate", ns->name); + truncate_all(ns); + break; + case TRUNCATE_IDLE: + default: + cf_crash(AS_TRUNCATE, "bad truncate state %d", ns->truncate.state); + break; + } + + pthread_mutex_unlock(&ns->truncate.state_lock); + } +} + + +void +truncate_reduce_cb(as_index_ref* r_ref, void* udata) +{ + as_record* r = r_ref->r; + truncate_reduce_cb_info* cb_info = (truncate_reduce_cb_info*)udata; + as_namespace* ns = cb_info->ns; + + if (r->last_update_time < ns->truncate.lut) { + cb_info->n_deleted++; + record_delete_adjust_sindex(r, ns); + as_index_delete(cb_info->tree, &r->keyd); + as_record_done(r_ref, ns); + return; + } + + as_set* p_set = as_namespace_get_record_set(ns, r); + + // Delete records not updated since their set's threshold last-update-time. + if (p_set && r->last_update_time < p_set->truncate_lut) { + cb_info->n_deleted++; + record_delete_adjust_sindex(r, ns); + as_index_delete(cb_info->tree, &r->keyd); + } + + as_record_done(r_ref, ns); +} diff --git a/as/src/base/truncate_ce.c b/as/src/base/truncate_ce.c new file mode 100644 index 00000000..25a21562 --- /dev/null +++ b/as/src/base/truncate_ce.c @@ -0,0 +1,62 @@ +/* + * truncate_ce.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "base/truncate.h" + +#include "base/datamodel.h" + + +//========================================================== +// Public API. 
+//
+
+void
+as_truncate_done_startup(as_namespace* ns)
+{
+}
+
+
+void
+as_truncate_list_cenotaphs(as_namespace* ns)
+{
+}
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+//
+
+void
+truncate_startup_hash_init(as_namespace* ns)
+{
+}
+
+
+void
+truncate_action_startup(as_namespace* ns, const char* set_name, uint64_t lut)
+{
+}
+
diff --git a/as/src/base/udf_aerospike.c b/as/src/base/udf_aerospike.c
new file mode 100644
index 00000000..d543bea4
--- /dev/null
+++ b/as/src/base/udf_aerospike.c
@@ -0,0 +1,971 @@
+/*
+ * udf_aerospike.c
+ *
+ * Copyright (C) 2012-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/udf_aerospike.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aerospike/as_aerospike.h"
+#include "aerospike/as_boolean.h"
+#include "aerospike/as_buffer.h"
+#include "aerospike/as_bytes.h"
+#include "aerospike/as_integer.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_serializer.h"
+#include "aerospike/as_string.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_clock.h"
+
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/secondary_index.h"
+#include "base/transaction.h"
+#include "base/truncate.h"
+#include "base/udf_record.h"
+#include "base/xdr_serverside.h"
+#include "storage/storage.h"
+#include "transaction/rw_utils.h"
+#include "transaction/udf.h"
+
+
+static int udf_aerospike_rec_remove(const as_aerospike *, const as_rec *);
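+
+// [Editor's sketch - orientation only, not part of this change.] The static
+// helpers below are reached from Lua via the as_aerospike hooks at the bottom
+// of this file, roughly:
+//
+//   aerospike:update(rec)                 -- Lua
+//     -> udf_aerospike_rec_update()
+//       -> udf_aerospike__execute_updates()
+//         -> udf_aerospike__apply_update_atomic()
+//           -> udf_aerospike_setbin() / udf_aerospike_delbin()
+//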
+/*
+ * Internal Function: udf_aerospike_delbin
+ *
+ * Parameters:
+ * r - udf_record to be manipulated
+ * bname - name of the bin to be deleted
+ *
+ * Return value:
+ * 0 on success
+ * -1 on failure
+ *
+ * Description:
+ * The function deletes the bin with the name
+ * passed in as a parameter. The as_bin_destroy function
+ * called here only frees the data; the bin is then
+ * marked as not in use and can be reused later.
+ *
+ * Synchronization : object lock acquired by the transaction thread executing UDF.
+ * Partition reservation takes place just before the transaction starts executing
+ * ( look for as_partition_reserve_udf in thr_tsvc.c )
+ *
+ * Callers:
+ * udf_aerospike__apply_update_atomic
+ * In this function, if an update fails, the record is set
+ * to roll back all the updates up to this point. The case where the
+ * rollback itself fails is not handled.
+ *
+ * Side Notes:
+ * i. write_to_device will be set to true on a successful bin destroy.
+ * If all the updates from udf_aerospike__apply_update_atomic (including this) are
+ * successful, the record will be written to disk and reopened so that the rest of
+ * the sets of updates can be applied.
+ *
+ * ii. If delete from sindex fails, we do not handle it.
+ */
+static int
+udf_aerospike_delbin(udf_record * urecord, const char * bname)
+{
+	as_storage_rd *rd = urecord->rd;
+	as_namespace *ns = rd->ns;
+
+	// Check that bname is not completely invalid
+	if (bname == NULL || (ns->single_bin && bname[0] != 0) || (! ns->single_bin && bname[0] == 0)) {
+		cf_warning(AS_UDF, "udf_aerospike_delbin: Invalid Parameters: [Invalid bin name supplied]... Fail");
+		return -1;
+	}
+
+	// Check quality of bname -- check that it is proper length, then make sure
+	// that the bin exists.
+	if (strlen(bname) >= AS_ID_BIN_SZ) {
+		// Can't read bin if name too large.
+		cf_warning(AS_UDF, "udf_aerospike_delbin: Invalid Parameters [bin name(%s) too big]... Fail", bname);
+		return -1;
+	}
+
+	as_bin * b = as_bin_get(rd, bname);
+	if ( !b ) {
+		cf_debug(AS_UDF, "udf_aerospike_delbin: Invalid Operation [Bin name(%s) not found for delete]... Fail", bname);
+		return -1;
+	}
+
+	const char * set_name = as_index_get_set_name(rd->r, ns);
+
+	bool has_sindex = record_has_sindex(rd->r, ns);
+	SINDEX_BINS_SETUP(sbins, ns->sindex_cnt);
+	as_sindex * si_arr[ns->sindex_cnt];
+	int si_arr_index = 0;
+	int sbins_populated = 0;
+	if (has_sindex) {
+		si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, b->id, &si_arr[si_arr_index]);
+		sbins_populated += as_sindex_sbins_from_bin(ns, set_name, b, sbins, AS_SINDEX_OP_DELETE);
+	}
+
+	int32_t i = as_bin_get_index(rd, bname);
+	if (i != -1) {
+		if (has_sindex) {
+			if (sbins_populated > 0) {
+				urecord->tr->flags |= AS_TRANSACTION_FLAG_SINDEX_TOUCHED;
+				as_sindex_update_by_sbin(ns, as_index_get_set_name(rd->r, ns), sbins, sbins_populated, &rd->r->keyd);
+			}
+		}
+		as_bin_destroy(rd, i);
+	} else {
+		cf_warning(AS_UDF, "udf_aerospike_delbin: Internal Error [Deleting non-existing bin %s]... Fail", bname);
+	}
+
+	if (has_sindex) {
+		as_sindex_sbin_freeall(sbins, sbins_populated);
+		as_sindex_release_arr(si_arr, si_arr_index);
+	}
+
+	return 0;
+}
+/*
+ * Internal function: udf__aerospike_get_particle_buf
+ *
+ * Parameters:
+ * r -- udf_record_bin for which particle buf is requested
+ * type -- bin type
+ * pbytes -- current space required
+ *
+ * Return value:
+ * NULL on failure
+ * valid buf pointer on success
+ *
+ * Description:
+ * The function finds space in the preallocated particle_data for the
+ * requested size. If there is no space left there, it tries to allocate
+ * space for the bin independently.
+ * Returns a pointer to the offset in the preallocated particle_data, or to
+ * the newly allocated space.
+ *
+ * Returns NULL if both fail.
+ *
+ * Note: ubin->particle_buf will be set if new per bin memory is allocated.
+ *
+ * Callers:
+ * udf_aerospike_setbin
+ */
+uint8_t *
+udf__aerospike_get_particle_buf(udf_record *urecord, udf_record_bin *ubin, uint32_t pbytes)
+{
+	if (pbytes > urecord->rd->ns->storage_write_block_size) {
+		cf_warning(AS_UDF, "udf__aerospike_get_particle_buf: Invalid Operation [Bin %s data too big size=%u]... Fail", ubin->name, pbytes);
+		return NULL;
+	}
+
+	uint32_t alloc_size = pbytes == 0 ? 0 : urecord->rd->ns->storage_write_block_size;
+	uint8_t *buf = NULL;
+
+	if (ubin->particle_buf) {
+		buf = ubin->particle_buf;
+	} else {
+		// Disable dynamic shifting from the flat allocator to dynamic
+		// allocation.
+		if ((urecord->cur_particle_data + pbytes) < urecord->end_particle_data) {
+			buf = urecord->cur_particle_data;
+			urecord->cur_particle_data += pbytes;
+		} else if (alloc_size) {
+			// If there is no space in the preallocated buffer then go
+			// ahead and allocate space per bin. This may happen
+			// if the user keeps executing updates, exhausting
+			// the buffer. After this point the record size check will
+			// trip, instead of at the code where the bin value is set.
+			ubin->particle_buf = cf_malloc(alloc_size);
+			buf = ubin->particle_buf;
+		}
+	}
+	return buf;
+}
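+
+// [Editor's sketch - hypothetical numbers, illustration only.] The
+// preallocated particle_data behaves as a simple bump allocator. With a
+// 1 MiB write block and two setbin() calls that need 300 KiB each:
+//
+//   cur = particle_data;          // end = particle_data + 1 MiB
+//   buf1 = cur; cur += 300 KiB;   // first bin - fits in the flat buffer
+//   buf2 = cur; cur += 300 KiB;   // second bin - still fits
+//   // a later request that doesn't fit falls back to a per-bin cf_malloc()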
+/*
+ * Internal function: udf_aerospike_setbin
+ *
+ * Parameters:
+ * offset -- offset of udf bin in updates array
+ * r -- udf_record to be manipulated
+ * bname -- name of the bin to be set
+ * val -- value the bin is to be updated with
+ *
+ * Return value:
+ * 0 on success
+ * -1 on failure
+ *
+ * Description:
+ * The function sets the bin with the name passed in as a parameter
+ * to the value passed as the third parameter.
+ * Before updating the bin, it is checked whether the value can fit in the storage.
+ *
+ * Synchronization : object lock acquired by the transaction thread executing UDF.
+ * Partition reservation takes place just before the transaction starts executing
+ * ( look for as_partition_reserve_udf in thr_tsvc.c )
+ *
+ * Callers:
+ * udf_aerospike__apply_update_atomic
+ * In this function, if an update fails, the record is set
+ * to roll back all the updates up to this point. The case where the
+ * rollback itself fails is not handled.
+ *
+ * Side Notes:
+ * i. write_to_device will be set to true on a successful bin update.
+ * If all the updates from udf_aerospike__apply_update_atomic (including this) are
+ * successful, the record will be written to disk and reopened so that the rest of
+ * the sets of updates can be applied.
+ *
+ * ii. If put in sindex fails, we do not handle it.
+ *
+ * TODO make sure anything goes into setbin only if the bin value is
+ * changed
+ */
+static int
+udf_aerospike_setbin(udf_record * urecord, int offset, const char * bname, const as_val * val)
+{
+	as_storage_rd *rd = urecord->rd;
+	as_namespace *ns = rd->ns;
+
+	if (bname == NULL || (ns->single_bin && bname[0] != 0) || (! ns->single_bin && bname[0] == 0)) {
+		cf_warning(AS_UDF, "udf_aerospike_setbin: Invalid Parameters: [Invalid bin name supplied]... Fail");
+		return -1;
+	}
+
+	if (as_particle_type_from_asval(val) == AS_PARTICLE_TYPE_NULL) {
+		cf_warning(AS_UDF, "udf_aerospike_setbin: [%s] called with unusable as_val", bname);
+		return -3;
+	}
+
+	uint8_t type = as_val_type(val);
+
+	as_bin * b = as_bin_get_or_create(rd, bname);
+
+	if ( !b ) {
+		cf_warning(AS_UDF, "udf_aerospike_setbin: Internal Error [Bin %s not found.. Possibly ran out of bins]... Fail", bname);
+		return -1;
+	}
+
+	bool has_sindex = record_has_sindex(rd->r, ns);
+	SINDEX_BINS_SETUP(sbins, 2 * ns->sindex_cnt);
+	as_sindex * si_arr[2 * ns->sindex_cnt];
+	int sbins_populated = 0;
+	int si_arr_index = 0;
+	const char * set_name = as_index_get_set_name(rd->r, ns);
+
+	if (has_sindex ) {
+		si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, b->id, &si_arr[si_arr_index]);
+		sbins_populated += as_sindex_sbins_from_bin(ns, set_name, b, &sbins[sbins_populated], AS_SINDEX_OP_DELETE);
+	}
+
+	// We know we are doing an update now - make sure there is particle data,
+	// set to be 1 wblock size for now. @TODO!
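+	// [Editor's note - illustrative summary, matching the code in this
+	// function:] sindex maintenance is a delete-then-insert pair - sbins for
+	// the old bin value were collected above with AS_SINDEX_OP_DELETE, and
+	// after the particle is replaced below, AS_SINDEX_OP_INSERT sbins are
+	// appended, so one as_sindex_update_by_sbin() call applies both.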
+ int ret = 0; + + cf_detail(AS_UDF, "udf_setbin: bin %s type %d ", bname, type ); + + if (ns->storage_data_in_memory) { + if (as_bin_particle_replace_from_asval(b, val) != 0) { + cf_warning(AS_UDF, "udf_aerospike_setbin: [%s] failed to replace particle", bname); + ret = -4; + } + } + else { + uint32_t size = as_particle_size_from_asval(val); + uint8_t *particle_buf = udf__aerospike_get_particle_buf(urecord, &urecord->updates[offset], size); + + if (particle_buf) { + as_bin_particle_stack_from_asval(b, particle_buf, val); + } + else { + cf_warning(AS_UDF, "udf_aerospike_setbin: [%s] failed to get space for particle size %u", bname, size); + ret = -4; + } + } + + // Update sindex if required + if (has_sindex) { + if (ret) { + if (sbins_populated > 0) { + as_sindex_sbin_freeall(sbins, sbins_populated); + } + as_sindex_release_arr(si_arr, si_arr_index); + return ret; + } + + si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, b->id, &si_arr[si_arr_index]); + sbins_populated += as_sindex_sbins_from_bin(ns, set_name, b, &sbins[sbins_populated], AS_SINDEX_OP_INSERT); + if (sbins_populated > 0) { + urecord->tr->flags |= AS_TRANSACTION_FLAG_SINDEX_TOUCHED; + as_sindex_update_by_sbin(ns, as_index_get_set_name(rd->r, ns), sbins, sbins_populated, &rd->r->keyd); + as_sindex_sbin_freeall(sbins, sbins_populated); + } + as_sindex_release_arr(si_arr, si_arr_index); + } + + return ret; +} // end udf_aerospike_setbin() + +/* + * Check and validate parameter before performing operation + * + * return: + * UDF_ERR * in case of failure + * 0 in case of success + */ +static int +udf_aerospike_param_check(const as_aerospike *as, const as_rec *rec, char *fname, int lineno) +{ + if (!as) { + cf_debug(AS_UDF, "Invalid Parameters: aerospike=%p", as); + return UDF_ERR_INTERNAL_PARAMETER; + } + + int ret = udf_record_param_check(rec, fname, lineno); + if (ret) { + return ret; + } + return 0; +} + +/* + * Internal function: udf_aerospike__apply_update_atomic + * + * Parameters: + * rec -- udf_record to be updated + * + * Return Values: + * 0 success + * -1 failure + * + * Description: + * This function applies all the updates atomically. That is, + * if one of the bin update/delete/create fails, the entire function + * will fail. If the nth update fails, all the n-1 updates are rolled + * back to their initial values + * + * Special Notes: + * i. The basic checks of bin name being too long or if there is enough space + * on the disk for the bin values is done before allocating space for any + * of the bins. + * + * ii. If one of the updates to be rolled back is a bin creation, + * udf_aerospike_delbin is called. This will not free up the bin metadata. + * So there will be a small memory mismatch b/w replica (which did not get the + * record at all and hence no memory is accounted) and the master will be seen. + * To avoid such cases, we are doing checks upfront. + * + * Callers: + * udf_aerospike__execute_updates + * In this function, if udf_aerospike__apply_update_atomic fails, the record + * is not committed to the storage. On success, record is closed which commits to + * the storage and reopened for the next set of udf updates. + * The return value from udf_aerospike__apply_update_atomic is passed on to the + * callers of this function. 
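+ *
+ * [Editor's sketch - pseudocode summary of the two passes below, for
+ * illustration only:]
+ *
+ *   pass 1: for each dirty update, count bins that don't exist yet;
+ *           pre-check the bin limit, pre-allocate bin space and the
+ *           particle buffer
+ *   pass 2: for each dirty update, stash oldvalue, then
+ *           NIL value   -> udf_aerospike_delbin()
+ *           other value -> udf_aerospike_setbin(); on failure -> Rollback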
+ */
+int
+udf_aerospike__apply_update_atomic(udf_record *urecord)
+{
+	int rc = 0;
+	int failmax = 0;
+	int new_bins = 0;	// How many new bins have to be created in this update
+	as_storage_rd * rd = urecord->rd;
+	as_namespace * ns = rd->ns;
+	bool has_sindex = record_has_sindex(rd->r, ns);
+	bool is_record_dirty = false;
+
+	// This will iterate over all the updates and apply them to storage.
+	// The items will remain, and be used as cache values. If an error
+	// occurs during setbin(), we roll back all the operations applied so
+	// far and return failure.
+	cf_detail(AS_UDF, "execute updates: %d updates", urecord->nupdates);
+
+	// Loop twice, so that the updates are performed first and can be rolled
+	// back in case something goes wrong. The deletes generally go through
+	// successfully.
+
+	// In first iteration, just calculate how many new bins need to be created
+	for(uint32_t i = 0; i < urecord->nupdates; i++ ) {
+		if ( urecord->updates[i].dirty ) {
+			char * k = urecord->updates[i].name;
+			if ( k != NULL ) {
+				if ( !as_bin_get(rd, k) ) {
+					new_bins++;
+				}
+			}
+		}
+	}
+	// Free bins - total bins not in use in the record
+	// Delta bins - new bins that need to be created
+	int inuse_bins = as_bin_inuse_count(rd);
+	int free_bins = rd->n_bins - inuse_bins;
+	int delta_bins = new_bins - free_bins;
+	cf_detail(AS_UDF, "Total bins %d, In use bins %d, Free bins %d , New bins %d, Delta bins %d",
+			rd->n_bins, as_bin_inuse_count(urecord->rd), free_bins, new_bins, delta_bins);
+
+	// Check bin usage limit.
+	if ((inuse_bins + new_bins > UDF_RECORD_BIN_ULIMIT) ||
+			(urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS)) {
+		cf_warning(AS_UDF, "bin limit of %d for UDF exceeded: %d bins in use, %d bins free, %s%d new bins needed",
+				(int)UDF_RECORD_BIN_ULIMIT, inuse_bins, free_bins,
+				(urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS) ? ">" : "", new_bins);
+		goto Rollback;
+	}
+
+	// Allocate space for all the new bins that need to be created beforehand
+	if (delta_bins > 0 && rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
+		as_bin_allocate_bin_space(rd, delta_bins);
+	}
+
+	if (!rd->ns->storage_data_in_memory && !urecord->particle_data) {
+		urecord->particle_data = cf_malloc(rd->ns->storage_write_block_size);
+		urecord->cur_particle_data = urecord->particle_data;
+		urecord->end_particle_data = urecord->particle_data + rd->ns->storage_write_block_size;
+	}
+
+	if (has_sindex) {
+		SINDEX_GRLOCK();
+	}
+
+	// In second iteration apply updates.
+	for(uint32_t i = 0; i < urecord->nupdates; i++ ) {
+		urecord->updates[i].oldvalue = NULL;
+		if ( urecord->updates[i].dirty && rc == 0) {
+
+			char * k = urecord->updates[i].name;
+			as_val * v = urecord->updates[i].value;
+
+			if ( k != NULL ) {
+				if ( v == NULL || v->type == AS_NIL ) {
+					// If the value is NIL, then do a delete.
+					cf_detail(AS_UDF, "execute update: position %d deletes bin %s", i, k);
+					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
+					// The only case where delete fails is when the bin is not
+					// found - that is as good as a delete. Ignore the return
+					// code!
+					udf_aerospike_delbin(urecord, k);
+
+					if (urecord->dirty != NULL) {
+						xdr_fill_dirty_bins(urecord->dirty);
+					}
+				}
+				else {
+					// Otherwise, it is a set.
+					cf_detail(AS_UDF, "execute update: position %d sets bin %s", i, k);
+					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
+					rc = udf_aerospike_setbin(urecord, i, k, v);
+					if (rc) {
+						if (urecord->updates[i].oldvalue) {
+							as_val_destroy(urecord->updates[i].oldvalue);
+							urecord->updates[i].oldvalue = NULL;
+						}
+						failmax = i;
+						goto Rollback;
+					}
+
+					if (urecord->dirty != NULL) {
+						xdr_add_dirty_bin(ns, urecord->dirty, k, strlen(k));
+					}
+				}
+			}
+
+			is_record_dirty = true;
+		}
+	}
+
+	{
+		// This is _NOT_ for writing to the storage but simply for performing
+		// a sizing calculation. If we knew the upper bound of the rec_props
+		// size, we could avoid this work and check with that much correction...
+		//
+		// See
+		// - udf_rw_post_processing for building rec_props for replication
+		// - udf_record_close for building rec_props for writing it to storage
+		size_t rec_props_data_size = as_storage_record_rec_props_size(rd);
+		uint8_t rec_props_data[rec_props_data_size];
+		if (rec_props_data_size > 0) {
+			as_storage_record_set_rec_props(rd, rec_props_data);
+		}
+
+		if (! as_storage_record_size_and_check(rd)) {
+			cf_warning(AS_UDF, "record failed storage size check, will not be updated");
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+
+		if (rd->ns->clock_skew_stop_writes) {
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+
+		if (cf_atomic32_get(rd->ns->stop_writes) == 1) {
+			cf_warning(AS_UDF, "UDF failed by stop-writes, record will not be updated");
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+
+		if (! as_storage_has_space(rd->ns)) {
+			cf_warning(AS_UDF, "drives full, record will not be updated");
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+
+		if (! is_valid_ttl(rd->ns, urecord->tr->msgp->msg.record_ttl)) {
+			cf_warning(AS_UDF, "invalid ttl %u", urecord->tr->msgp->msg.record_ttl);
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+	}
+
+	if (has_sindex) {
+		SINDEX_GRUNLOCK();
+	}
+
+	// If there were updates, do miscellaneous successful-commit tasks.
+	if (is_record_dirty
+			|| (urecord->flag & UDF_RECORD_FLAG_METADATA_UPDATED)) {
+		urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES; // will write to storage
+	}
+
+	// Clean up the oldvalue cache and reset dirty. All the changes made here
+	// have been applied to the particle buffer. Nothing will now be backed out.
+	for (uint32_t i = 0; i < urecord->nupdates; i++) {
+		udf_record_bin * bin = &urecord->updates[i];
+		if (bin->oldvalue != NULL ) {
+			as_val_destroy(bin->oldvalue);
+			bin->oldvalue = NULL;
+		}
+		bin->dirty = false;
+	}
+	return rc;
+
+Rollback:
+	cf_debug(AS_UDF, "Rollback Called: failmax %d", failmax);
+	for (int i = 0; i < failmax; i++) {
+		if (urecord->updates[i].dirty) {
+			char * k = urecord->updates[i].name;
+			// Pick the oldvalue for rollback.
+			as_val * v = urecord->updates[i].oldvalue;
+			if ( k != NULL ) {
+				if ( v == NULL || v->type == AS_NIL ) {
+					// If the value is NIL, then do a delete.
+					cf_detail(AS_UDF, "execute rollback: position %d deletes bin %s", i, k);
+					rc = udf_aerospike_delbin(urecord, k);
+				}
+				else {
+					// Otherwise, it is a set.
+					cf_detail(AS_UDF, "execute rollback: position %d sets bin %s", i, k);
+					rc = udf_aerospike_setbin(urecord, i, k, v);
+					if (rc) {
+						cf_warning(AS_UDF, "Rollback failed .. not good ... !!");
+					}
+				}
+			}
+			if (v) {
+				as_val_destroy(v);
+				cf_debug(AS_UDF, "ROLLBACK as_val_destroy()");
+			}
+		}
+	}
+
+	if (is_record_dirty && urecord->dirty != NULL) {
+		xdr_clear_dirty_bins(urecord->dirty);
+	}
+
+	if (has_sindex) {
+		SINDEX_GRUNLOCK();
+	}
+
+	// Reset the flat size in case the changes were backed out. It should not
+	// fail in the backout code...
+	if (! as_storage_record_size_and_check(rd)) {
+		cf_warning(AS_UDF, "does not fit even after rollback... it is trouble");
+	}
+
+	// Do not clean up the cache in case of failure.
+	return -1;
+}
+
+/*
+ * Internal function: udf_aerospike_execute_updates
+ *
+ * Parameters:
+ * rec - udf record to be updated
+ *
+ * Return values
+ * 0 on success
+ * -1 on failure
+ *
+ * Description:
+ * Execute the set of udf_record updates. If these updates are successfully
+ * applied atomically, the storage record is closed (committed to the disk)
+ * and reopened. The cache is freed up at the end.
+ *
+ * Callers:
+ * udf_aerospike_rec_create, interface func - aerospike:create(r)
+ * udf_aerospike_rec_update, interface func - aerospike:update(r)
+ * udf_aerospike__execute_updates is the key function which is executed in these
+ * functions. The return value is directly passed on to the lua.
+ */
+int
+udf_aerospike__execute_updates(udf_record * urecord)
+{
+	int rc = 0;
+	as_storage_rd *rd = urecord->rd;
+
+	if ( urecord->nupdates == 0 &&
+			(urecord->flag & UDF_RECORD_FLAG_METADATA_UPDATED) == 0 ) {
+		cf_detail(AS_UDF, "No Update when execute update is called");
+		return 0;
+	}
+
+	// Fail updates in case updates are not allowed. Queries and scans do
+	// not allow updates. Updates will never be true... just being paranoid.
+	if (!(urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES)) {
+		cf_warning(AS_UDF, "Udf: execute updates: allow updates false; FAIL");
+		return -1;
+	}
+
+	// Commit semantics: either all the updates make it, or none of them do.
+	rc = udf_aerospike__apply_update_atomic(urecord);
+
+	// Allocate down if bins are deleted / not in use.
+	if (rd->ns && rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
+		int32_t delta_bins = (int32_t)as_bin_inuse_count(rd) - (int32_t)rd->n_bins;
+		if (delta_bins) {
+			as_bin_allocate_bin_space(rd, delta_bins);
+		}
+	}
+	return rc;
+}
+
+static void
+udf_aerospike_destroy(as_aerospike * as)
+{
+	as_aerospike_destroy(as);
+}
+
+static cf_clock
+udf_aerospike_get_current_time(const as_aerospike * as)
+{
+	(void)as;
+	return cf_clock_getabsolute();
+}
+
+/**
+ * aerospike::create(record)
+ * Function: udf_aerospike_rec_create
+ *
+ * Parameters:
+ * as - as_aerospike
+ * rec - as_rec
+ *
+ * Return Values:
+ * 1 if record is being read or, on a create, it already exists
+ * o/w return value of udf_aerospike__execute_updates
+ *
+ * Description:
+ * Create a new record in local storage.
+ * The record will only be created if it does not exist.
+ * This assumes the record has a digest that is valid for local storage.
+ *
+ * Synchronization : object lock acquired by the transaction thread executing UDF.
+ * Partition reservation takes place just before the transaction starts executing
+ * ( look for as_partition_reserve_udf in thr_tsvc.c )
+ *
+ * Callers:
+ * lua interfacing function, mod_lua_aerospike_rec_create
+ * The return value of udf_aerospike_rec_create is pushed on to the lua stack
+ *
+ * Notes:
+ * The 'read' and 'exists' flag of udf_record are set to true.
+*/ +static int +udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec) +{ + int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__); + if (ret) { + return ret; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + + // make sure record isn't already successfully read + if ((urecord->flag & UDF_RECORD_FLAG_OPEN) != 0) { + if (as_bin_inuse_has(urecord->rd)) { + cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists"); + return 1; + } + // else - binless record ok... + + if ((ret = udf_aerospike__execute_updates(urecord)) != 0) { + cf_warning(AS_UDF, "udf_aerospike_rec_create: failure executing record updates"); + udf_aerospike_rec_remove(as, rec); + } + + return ret; + } + + as_transaction *tr = urecord->tr; + as_index_ref *r_ref = urecord->r_ref; + as_storage_rd *rd = urecord->rd; + as_index_tree *tree = tr->rsv.tree; + + // make sure we got the record as a create + int rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns); + cf_detail_digest(AS_UDF, &tr->keyd, "Creating Record "); + + // rv 0 means record exists, 1 means create, < 0 means fail + // TODO: Verify correct result codes. + if (rv == 1) { + // Record created. + } else if (rv == 0) { + // If it's an expired or truncated record, pretend it's a fresh create. + if (as_record_is_doomed(r_ref->r, tr->rsv.ns)) { + as_record_rescue(r_ref, tr->rsv.ns); + } else { + cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2"); + as_record_done(r_ref, tr->rsv.ns); + // DO NOT change it has special meaning for caller + return 1; + } + } else if (rv < 0) { + cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", rv); + return rv; + } + + // Associates the set name with the storage rec and index + if (tr->msgp) { + // Set the set name to index and close record if the setting the set name + // is not successful + int rv_set = as_transaction_has_set(tr) ? + set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg) : 0; + if (rv_set != 0) { + cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname"); + as_index_delete(tree, &tr->keyd); + as_record_done(r_ref, tr->rsv.ns); + return 4; + } + + // Don't write record if it would be truncated. + if (as_truncate_now_is_truncated(tr->rsv.ns, as_index_get_set_id(r_ref->r))) { + as_index_delete(tree, &tr->keyd); + as_record_done(r_ref, tr->rsv.ns); + return 4; + } + } + + // open up storage + as_storage_record_create(tr->rsv.ns, r_ref->r, rd); + + // If the message has a key, apply it to the record. + if (! get_msg_key(tr, rd)) { + cf_warning(AS_UDF, "udf_aerospike_rec_create: Can't store key"); + as_storage_record_close(rd); + as_index_delete(tree, &tr->keyd); + as_record_done(r_ref, tr->rsv.ns); + return 4; + } + + // if multibin storage, we will use urecord->stack_bins, so set the size appropriately + if (rd->ns->single_bin) { + rd->n_bins = 1; + } + else if (! 
rd->ns->storage_data_in_memory) { + rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin); + } + + // side effect: will set the unused bins to properly unused + as_storage_rd_load_bins(rd, urecord->stack_bins); // TODO - handle error returned + + int rc = udf_aerospike__execute_updates(urecord); + + if (rc != 0) { + // Creating the udf record failed, destroy the as_record + cf_warning(AS_UDF, "udf_aerospike_rec_create: failure executing record updates (%d)", rc); + udf_record_close(urecord); // handles particle data and cache only + as_storage_record_close(rd); + as_index_delete(tree, &tr->keyd); + as_record_done(r_ref, tr->rsv.ns); + return rc; + } + + // Success... + + urecord->flag |= UDF_RECORD_FLAG_OPEN | UDF_RECORD_FLAG_STORAGE_OPEN; + + return 0; +} + +/** + * aerospike::update(record) + * Function: udf_aerospike_rec_update + * + * Parameters: + * + * Return Values: + * -2 if record does not exist + * o/w return value of udf_aerospike__execute_updates + * + * Description: + * Updates an existing record in local storage. + * The record will only be updated if it exists. + * + * Synchronization : object lock acquired by the transaction thread executing UDF. + * Partition reservation takes place just before the transaction starts executing + * ( look for as_partition_reserve_udf in thr_tsvc.c ) + * + * Callers: + * lua interfacing function, mod_lua_aerospike_rec_update + * The return value of udf_aerospike_rec_update is pushed on to the lua stack + * + * Notes: + * If the record does not exist or is not read by anyone yet, we cannot + * carry on with the update. 'exists' and 'set' are set to false on record + * init or record remove. +*/ +static int +udf_aerospike_rec_update(const as_aerospike * as, const as_rec * rec) +{ + int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__); + if (ret) { + return ret; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + + // make sure record exists and is already opened up + if (!urecord || !(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) + || !(urecord->flag & UDF_RECORD_FLAG_OPEN) ) { + cf_warning(AS_UDF, "Record not found to be open while updating urecord flag=%d", urecord ? urecord->flag : -1); + return -2; + } + cf_detail_digest(AS_UDF, &urecord->rd->r->keyd, "Executing Updates"); + ret = udf_aerospike__execute_updates(urecord); + + if (ret < 0) { + cf_warning(AS_UDF, "udf_aerospike_rec_update: failure executing record updates (%d)", ret); + } + + return ret; +} + +/** + * Function udf_aerospike_rec_exists + * + * Parameters: + * + * Return Values: + * 1 if record exists + * 0 o/w + * + * Description: + * Check to see if the record exists + */ +static int +udf_aerospike_rec_exists(const as_aerospike * as, const as_rec * rec) +{ + int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__); + if (ret) { + return ret; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + + return (urecord && (urecord->flag & UDF_RECORD_FLAG_OPEN)) ? true : false; +} + +/* + * Function: udf_aerospike_rec_remove + * + * Parameters: + * + * Return Values: + * 1 if record does not exist + * 0 on success + * + * Description: + * Removes an existing record from local storage. + * The record will only be removed if it exists. 
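+ *
+ * [Editor's note - illustrative, matching the code below: the bins are
+ * destroyed and in-memory bin space is freed immediately, then the urecord
+ * is flagged HAS_UPDATES so the close path commits the removal.]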
+ */
+static int
+udf_aerospike_rec_remove(const as_aerospike * as, const as_rec * rec)
+{
+	int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__);
+	if (ret) {
+		return ret;
+	}
+	udf_record * urecord = (udf_record *) as_rec_source(rec);
+
+	// Make sure the record already exists before removing it.
+	if (!urecord || !(urecord->flag & UDF_RECORD_FLAG_OPEN)) {
+		return 1;
+	}
+
+	as_storage_rd* rd = urecord->rd;
+
+	if (rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
+		delete_adjust_sindex(rd);
+	}
+
+	as_record_destroy_bins(rd);
+
+	if (rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
+		as_record_free_bin_space(rd->r);
+		rd->bins = NULL;
+		rd->n_bins = 0;
+	}
+
+	if (urecord->particle_data) {
+		cf_free(urecord->particle_data);
+		urecord->particle_data = NULL;
+	}
+
+	udf_record_cache_free(urecord);
+	urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES;
+
+	return 0;
+}
+
+/**
+ * Writes a log message
+ */
+static int
+udf_aerospike_log(const as_aerospike * a, const char * file, const int line, const int lvl, const char * msg)
+{
+	(void)a;
+	cf_fault_event(AS_UDF, lvl, file, line, "%s", (char *) msg);
+	return 0;
+}
+
+// Would someone please explain the structure of these hooks? Why are some null?
+const as_aerospike_hooks udf_aerospike_hooks = {
+	.rec_create = udf_aerospike_rec_create,
+	.rec_update = udf_aerospike_rec_update,
+	.rec_remove = udf_aerospike_rec_remove,
+	.rec_exists = udf_aerospike_rec_exists,
+	.log = udf_aerospike_log,
+	.get_current_time = udf_aerospike_get_current_time,
+	.destroy = udf_aerospike_destroy
+};
diff --git a/as/src/base/udf_arglist.c b/as/src/base/udf_arglist.c
new file mode 100644
index 00000000..e3b79b63
--- /dev/null
+++ b/as/src/base/udf_arglist.c
@@ -0,0 +1,81 @@
+/*
+ * udf_arglist.c
+ *
+ * Copyright (C) 2012-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "aerospike/as_list.h"
+#include "aerospike/as_list_iterator.h"
+#include "aerospike/as_msgpack.h"
+
+#include "base/proto.h"
+#include "base/udf_arglist.h"
+
+/******************************************************************************
+ * STATIC FUNCTIONS
+ ******************************************************************************/
+
+static bool udf_arglist_foreach(const as_list *, as_list_foreach_callback, void *);
+static as_val *udf_arglist_get(const as_list *, const uint32_t idx);
+
+/******************************************************************************
+ * VARIABLES
+ ******************************************************************************/
+
+const as_list_hooks udf_arglist_hooks = {
+	.destroy = NULL,
+	.hashcode = NULL,
+	.size = NULL,
+	.append = NULL,
+	.prepend = NULL,
+	.get = udf_arglist_get,
+	.set = NULL,
+	.head = NULL,
+	.tail = NULL,
+	.drop = NULL,
+	.take = NULL,
+	.foreach = udf_arglist_foreach,
+	.iterator_init = NULL,
+	.iterator_new = NULL
+};
+
+/******************************************************************************
+ * FUNCTIONS
+ ******************************************************************************/
+
+static bool udf_arglist_foreach(const as_list * l, as_list_foreach_callback callback, void * context) {
+	if (l) {
+		as_list_iterator list_iter;
+		as_iterator* iter = (as_iterator*) &list_iter;
+		as_list_iterator_init(&list_iter, l);
+
+		while (as_iterator_has_next(iter)) {
+			const as_val* v = as_iterator_next(iter);
+			callback((as_val *) v, context);
+		}
+		as_iterator_destroy(iter);
+	}
+
+	return true;
+}
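+
+// [Editor's note] Only .get and .foreach are non-NULL in the hooks above -
+// the server treats a UDF argument list as read-only, so mutating hooks are
+// left NULL. Illustrative use, with hypothetical args/my_cb/my_udata:
+//
+//   as_val *first = as_list_get(args, 0);    // dispatches to udf_arglist_get
+//   as_list_foreach(args, my_cb, my_udata);  // dispatches to udf_arglist_foreach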
+
+static as_val *udf_arglist_get(const as_list * l, const uint32_t idx) {
+	return as_list_get(l, idx);
+}
+
diff --git a/as/src/base/udf_cask.c b/as/src/base/udf_cask.c
new file mode 100644
index 00000000..137cb9b3
--- /dev/null
+++ b/as/src/base/udf_cask.c
@@ -0,0 +1,745 @@
+/*
+ * udf_cask.c
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +#include "base/udf_cask.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "jansson.h" + +#include "aerospike/as_module.h" +#include "aerospike/mod_lua.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_b64.h" +#include "citrusleaf/cf_crypto.h" + +#include "dynbuf.h" +#include "fault.h" + +#include "base/cfg.h" +#include "base/thr_info.h" +#include "base/system_metadata.h" +#include + +char udf_smd_module_name[] = "UDF"; +char *as_udf_type_name[] = {"LUA", 0}; + +static bool g_udf_smd_loaded = false; + +static int file_read(char *, uint8_t **, size_t *, unsigned char *); +static int file_write(char *, uint8_t *, size_t, unsigned char *); +static int file_remove(char *); +static int file_generation(char *, uint8_t *, size_t, unsigned char *); + +static inline int file_resolve(char * filepath, char * filename, char * ext) { + + char * p = filepath; + char * user_path = g_config.mod_lua.user_path; + size_t user_path_len = strlen(user_path); + int filename_len = strlen(filename); + + memcpy(p, user_path, sizeof(char) * user_path_len); + p += user_path_len; + + memcpy(p, "/", 1); + p += 1; + + memcpy(p, filename, filename_len); + p += filename_len; + + if ( ext ) { + int ext_len = strlen(ext); + memcpy(p, ext, ext_len); + p += ext_len; + } + + p[0] = '\0'; + + return 0; +} + +static int file_read(char * filename, uint8_t ** content, size_t * content_len, unsigned char * hash) { + + char filepath[256] = {0}; + char line[1024] = {0}; + size_t line_len = sizeof(line); + + file_resolve(filepath, filename, NULL); + + cf_dyn_buf_define(buf); + + FILE *file = fopen(filepath, "r"); + + if ( file ) { + + while( fgets(line, line_len, file) != NULL ) { + cf_dyn_buf_append_string(&buf, line); + } + + fclose(file); + file = NULL; + + if ( buf.used_sz > 0 ) { + + char *src = cf_dyn_buf_strdup(&buf); + + file_generation(filepath, (uint8_t *)src, buf.used_sz, hash); + + uint32_t src_len = (uint32_t)buf.used_sz; + uint32_t out_size = cf_b64_encoded_len(src_len); + + *content = (uint8_t *)cf_malloc(out_size); + *content_len = out_size; + + cf_b64_encode((const uint8_t*)src, src_len, (char*)(*content)); + + cf_free(src); + src = NULL; + + return 0; + } + + *content = NULL; + *content_len = 0; + return 2; + } + + *content = NULL; + *content_len = 0; + return 1; +} + +static int file_write(char * filename, uint8_t * content, size_t content_len, unsigned char * hash) { + + char filepath[256] = {0}; + + file_resolve(filepath, filename, NULL); + + FILE *file = fopen(filepath, "w"); + + if (file == NULL) { + cf_warning(AS_UDF, "could not open udf put to %s: %s", filepath, cf_strerror(errno)); + return -1; + } + int r = fwrite(content, sizeof(char), content_len, file); + if (r <= 0) { + cf_warning(AS_UDF, "could not write file %s: %d", filepath, r); + fclose(file); + return -1; + } + + fclose(file); + file = NULL; + + file_generation(filepath, content, content_len, hash); + + return 0; +} + +static int file_remove(char * filename) { + char filepath[256] = {0}; + file_resolve(filepath, filename, NULL); + unlink(filepath); + return 0; +} + +static int file_generation(char * filename, uint8_t * content, size_t content_len, unsigned char * hash) { + unsigned char sha1[128] = {0}; + int len = 20; + SHA1((const unsigned char *) content, (unsigned long) content_len, (unsigned char *) sha1); + cf_b64_encode(sha1, len, (char*)hash); + hash[cf_b64_encoded_len(len)] = 0; + return 0; +} + +// return -1 if not 
found otherwise the index in as_udf_type_name +static int udf_type_getid(char *type) { + int index = 0; + while (as_udf_type_name[index]) { + if (strcmp( type, as_udf_type_name[index]) == 0 ) { + return(index); + } + index++; + } + return(-1); +} + +/* + * Type for user data passed to the get metadata callback. + */ +typedef struct udf_get_data_s { + cf_dyn_buf *db; // DynBuf for output. + pthread_cond_t *cv; // Condition variable for signaling callback completion. + pthread_mutex_t *mt; // Mutex protecting the condition variable. + bool done; // Has the callback finished? +} udf_get_data_t; + +/* + * UDF SMD get metadata items callback. + */ +static int udf_cask_get_metadata_cb(char *module, as_smd_item_list_t *items, void *udata) +{ + udf_get_data_t *p_get_data = (udf_get_data_t *) udata; + cf_dyn_buf *out = p_get_data->db; + + unsigned char hash[SHA_DIGEST_LENGTH]; + // hex string to be returned to the client + unsigned char sha1_hex_buff[CF_SHA_HEX_BUFF_LEN]; + // Currently just return directly for LUA + uint8_t udf_type = AS_UDF_TYPE_LUA; + + for (int index = 0; index < items->num_items; index++) { + as_smd_item_t *item = items->item[index]; + cf_debug(AS_UDF, "UDF metadata item[%d]: module \"%s\" ; key \"%s\" ; value \"%s\" ; generation %u ; timestamp %lu", + index, item->module_name, item->key, item->value, item->generation, item->timestamp); + cf_dyn_buf_append_string(out, "filename="); + cf_dyn_buf_append_buf(out, (uint8_t *)item->key, strlen(item->key)); + cf_dyn_buf_append_string(out, ","); + SHA1((uint8_t *)item->value, strlen(item->value), hash); + + // Convert to a hexadecimal string + cf_convert_sha1_to_hex(hash, sha1_hex_buff); + cf_dyn_buf_append_string(out, "hash="); + cf_dyn_buf_append_buf(out, sha1_hex_buff, CF_SHA_HEX_BUFF_LEN); + cf_dyn_buf_append_string(out, ",type="); + cf_dyn_buf_append_string(out, as_udf_type_name[udf_type]); + cf_dyn_buf_append_string(out, ";"); + } + + pthread_mutex_lock(p_get_data->mt); + + p_get_data->done = true; + int retval = pthread_cond_signal(p_get_data->cv); + if (retval) { + cf_warning(AS_UDF, "pthread_cond_signal failed (rv %d)", retval); + } + + pthread_mutex_unlock(p_get_data->mt); + + return retval; +} + +/* + * Implementation of the "udf-list" Info. Command. + */ +int udf_cask_info_list(char *name, cf_dyn_buf *out) +{ + cf_debug(AS_UDF, "UDF CASK INFO LIST"); + + pthread_mutex_t get_data_mutex = PTHREAD_MUTEX_INITIALIZER; + pthread_cond_t get_data_cond_var = PTHREAD_COND_INITIALIZER; + + udf_get_data_t get_data; + get_data.db = out; + get_data.cv = &get_data_cond_var; + get_data.mt = &get_data_mutex; + get_data.done = false; + + pthread_mutex_lock(&get_data_mutex); + + int retval = as_smd_get_metadata(udf_smd_module_name, "", udf_cask_get_metadata_cb, &get_data); + if (!retval) { + do { // [Note: Loop protects against spurious wakeups.] + if ((retval = pthread_cond_wait(&get_data_cond_var, &get_data_mutex))) { + cf_warning(AS_UDF, "pthread_cond_wait failed (rv %d)", retval); + break; + } + } while (!get_data.done); + } else { + cf_warning(AS_UDF, "failed to get UDF metadata (rv %d)", retval); + } + + pthread_mutex_unlock(&get_data_mutex); + + pthread_mutex_destroy(&get_data_mutex); + pthread_cond_destroy(&get_data_cond_var); + + return retval; +} + +/* + * Reading local directory to get specific module item's contents. + * In future if needed we can change this to reading from smd metadata. 
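+ *
+ * [Editor's note - hypothetical invocation, for illustration; the field
+ * layout matches the cf_dyn_buf appends below:]
+ *
+ *   asinfo -v "udf-get:filename=mymodule.lua"
+ *   -> "gen=<b64 sha1>;type=LUA;content=<b64 source>;"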
+ */
+int udf_cask_info_get(char *name, char * params, cf_dyn_buf * out) {
+
+	int resp = 0;
+	char filename[128] = {0};
+	int filename_len = sizeof(filename);
+	uint8_t * content = NULL;
+	size_t content_len = 0;
+	unsigned char content_gen[256] = {0};
+	uint8_t udf_type = AS_UDF_TYPE_LUA;
+
+	cf_debug(AS_INFO, "UDF CASK INFO GET");
+
+	// get (required) script filename
+	if ( as_info_parameter_get(params, "filename", filename, &filename_len) ) {
+		cf_info(AS_INFO, "invalid or missing filename");
+		cf_dyn_buf_append_string(out, "error=invalid_filename");
+		return 0;
+	}
+
+	mod_lua_rdlock(&mod_lua);
+	// read the script from the filesystem
+	resp = file_read(filename, &content, &content_len, content_gen);
+	mod_lua_unlock(&mod_lua);
+	if ( resp ) {
+		switch ( resp ) {
+			case 1 : {
+				cf_dyn_buf_append_string(out, "error=not_found");
+				break;
+			}
+			case 2 : {
+				cf_dyn_buf_append_string(out, "error=empty");
+				break;
+			}
+			default : {
+				cf_dyn_buf_append_string(out, "error=unknown_error");
+				break; // compiler complains without a break;
+			}
+		}
+	}
+	else {
+		// put back the result
+		cf_dyn_buf_append_string(out, "gen=");
+		cf_dyn_buf_append_string(out, (char *) content_gen);
+		cf_dyn_buf_append_string(out, ";type=");
+		cf_dyn_buf_append_string(out, as_udf_type_name[udf_type]);
+		cf_dyn_buf_append_string(out, ";content=");
+		cf_dyn_buf_append_buf(out, content, content_len);
+		cf_dyn_buf_append_string(out, ";");
+	}
+
+	if ( content ) {
+		cf_free(content);
+		content = NULL;
+	}
+
+	return 0;
+}
+
+// An info put call will call system metadata
+//
+// Data is reflected into json as an object with the following fields,
+// which can be added to later if necessary - for example, instead of using
+// the specific data, it could include the URL to the data
+//
+// key - name of the UDF file
+//
+// content64 - base64 encoded data
+// type - language to execute
+// name - repetition of the name, same as the key
+
+int udf_cask_info_put(char *name, char * params, cf_dyn_buf * out) {
+
+	cf_debug(AS_INFO, "UDF CASK INFO PUT");
+
+	int rc = 0;
+	char filename[128] = {0};
+	int filename_len = sizeof(filename);
+	// Content_len from the client and its expected size
+	char content_len[32] = {0};
+	int clen = sizeof(content_len);
+	// Udf content from the client and its expected length
+	char *udf_content = NULL;
+	int udf_content_len = 0;
+	// Udf type from the client and its expected size
+	char type[8] = {0};
+	int type_len = sizeof(type);
+
+	// get (required) script filename
+	char *tmp_char;
+
+	if ( as_info_parameter_get(params, "filename", filename, &filename_len)
+			|| !(tmp_char = strchr(filename, '.'))	// No extension in filename
+			|| tmp_char == filename			// '.' at the beginning of filename
+			|| strlen (tmp_char) <= 1) {		// '.' in filename, but no extension e.g. "abc."
+ cf_info(AS_INFO, "invalid or missing filename"); + cf_dyn_buf_append_string(out, "error=invalid_filename"); + return 0; + } + + if ( as_info_parameter_get(params, "content-len", content_len, &(clen)) ) { + cf_info(AS_INFO, "invalid or missing content-len"); + cf_dyn_buf_append_string(out, "error=invalid_content_len"); + return 0; + } + + if ( as_info_parameter_get(params, "udf-type", type, &type_len) ) { + // Replace with DEFAULT IS LUA + strcpy(type, as_udf_type_name[0]); + } + + // check type field + if (-1 == udf_type_getid(type)) { + cf_info(AS_INFO, "invalid or missing udf-type : %s not valid", type); + cf_dyn_buf_append_string(out, "error=invalid_udf_type"); + return 0; + } + + // get b64 encoded script + udf_content_len = atoi(content_len) + 1; + udf_content = (char *) cf_malloc(udf_content_len); + + // cf_info(AS_UDF, "content_len = %s", content_len); + // cf_info(AS_UDF, "udf_content_len = %d", udf_content_len); + + + // get (required) script content - base64 encoded here. + if ( as_info_parameter_get(params, "content", udf_content, &(udf_content_len)) ) { + cf_info(AS_UDF, "invalid content"); + cf_dyn_buf_append_string(out, "error=invalid_content"); + cf_free(udf_content); + return 0; + } + + // base 64 decode it + uint32_t encoded_len = strlen(udf_content); + uint32_t decoded_len = cf_b64_decoded_buf_size(encoded_len) + 1; + + // Don't allow UDF file size > 1MB + if ( decoded_len > MAX_UDF_CONTENT_LENGTH) { + cf_info(AS_INFO, "lua file size:%d > 1MB", decoded_len); + cf_dyn_buf_append_string(out, "error=invalid_udf_content_len, lua file size > 1MB"); + cf_free(udf_content); + return 0; + } + + char * decoded_str = cf_malloc(decoded_len); + + if ( ! cf_b64_validate_and_decode(udf_content, encoded_len, (uint8_t*)decoded_str, &decoded_len) ) { + cf_info(AS_UDF, "invalid base64 content %s", filename); + cf_dyn_buf_append_string(out, "error=invalid_base64_content"); + cf_free(decoded_str); + cf_free(udf_content); + return 0; + } + + decoded_str[decoded_len] = '\0'; + + as_module_error err; + rc = as_module_validate(&mod_lua, NULL, filename, decoded_str, decoded_len, &err); + + cf_free(decoded_str); + decoded_str = NULL; + decoded_len = 0; + + if ( rc ) { + cf_warning(AS_UDF, "udf-put: compile error: [%s:%d] %s", err.file, err.line, err.message); + cf_dyn_buf_append_string(out, "error=compile_error"); + cf_dyn_buf_append_string(out, ";file="); + cf_dyn_buf_append_string(out, err.file); + cf_dyn_buf_append_string(out, ";line="); + cf_dyn_buf_append_uint32(out, err.line); + + uint32_t message_len = strlen(err.message); + uint32_t enc_message_len = cf_b64_encoded_len(message_len); + char enc_message[enc_message_len]; + + cf_b64_encode((const uint8_t*)err.message, message_len, enc_message); + + cf_dyn_buf_append_string(out, ";message="); + cf_dyn_buf_append_buf(out, (uint8_t *)enc_message, enc_message_len); + + cf_free(udf_content); + return 0; + } + + // Create an empty JSON object + json_t *udf_obj = 0; + if (!(udf_obj = json_object())) { + cf_warning(AS_UDF, "failed to create JSON array for receiving UDF"); + cf_free(udf_content); + return -1; + } + int e = 0; + e += json_object_set_new(udf_obj, "content64", json_string(udf_content)); + e += json_object_set_new(udf_obj, "type", json_string(type)); + e += json_object_set_new(udf_obj, "name", json_string(filename)); + + cf_free(udf_content); + + if (e) { + cf_warning(AS_UDF, "could not encode UDF object, error %d", e); + json_decref(udf_obj); + return(-1); + } + // make it into a string, yet another buffer copy + char *udf_obj_str = 
json_dumps(udf_obj, 0/*flags*/); + json_decref(udf_obj); + udf_obj = 0; + + cf_debug(AS_UDF, "created json object %s", udf_obj_str); + + // how do I know whether to call create or add? + e = as_smd_set_metadata(udf_smd_module_name, filename, udf_obj_str); + if (e) { + cf_warning(AS_UDF, "could not add UDF metadata, error %d", e); + cf_free(udf_obj_str); + return(-1); + } + + cf_info(AS_UDF, "UDF module '%s' (%s/%s) registered", filename, g_config.mod_lua.user_path, filename); + + // free the metadata + cf_free(udf_obj_str); + udf_obj_str = 0; + + return 0; +} + +int udf_cask_info_remove(char *name, char * params, cf_dyn_buf * out) { + + char filename[128] = {0}; + int filename_len = sizeof(filename); + char file_path[1024] = {0}; + struct stat buf; + + cf_debug(AS_INFO, "UDF CASK INFO REMOVE"); + + // get (required) script filename + if ( as_info_parameter_get(params, "filename", filename, &filename_len) ) { + cf_info(AS_UDF, "invalid or missing filename"); + cf_dyn_buf_append_string(out, "error=invalid_filename"); + } + + // now check if such a file-name exists : + if (!g_config.mod_lua.user_path) + { + return -1; + } + + snprintf(file_path, 1024, "%s/%s", g_config.mod_lua.user_path, filename); + + cf_debug(AS_INFO, " Lua file removal full-path is : %s \n", file_path); + + if (stat(file_path, &buf) != 0) { + cf_info(AS_UDF, "failed to read file from : %s, error : %s", file_path, cf_strerror(errno)); + cf_dyn_buf_append_string(out, "error=file_not_found"); + return -1; + } + + as_smd_delete_metadata(udf_smd_module_name, filename); + + // this is what an error would look like + // cf_dyn_buf_append_string(out, "error="); + // cf_dyn_buf_append_int(out, resp); + + cf_dyn_buf_append_string(out, "ok"); + + cf_info(AS_UDF, "UDF module '%s' (%s) removed", filename, file_path); + + return 0; +} + +/* + * Clear out the Lua cache. + */ +int udf_cask_info_clear_cache(char *name, char * params, cf_dyn_buf * out) +{ + cf_debug(AS_INFO, "UDF CASK INFO CLEAR CACHE"); + + mod_lua_wrlock(&mod_lua); + + as_module_event e = { + .type = AS_MODULE_EVENT_CLEAR_CACHE + }; + as_module_update(&mod_lua, &e); + + mod_lua_unlock(&mod_lua); + + cf_dyn_buf_append_string(out, "ok"); + + return 0; +} + +/** + * (Re-)Configure UDF modules + */ +int udf_cask_info_configure(char *name, char * params, cf_dyn_buf * buf) { + as_module_configure(&mod_lua, &g_config.mod_lua); + return 0; +} + +// +// take a current list and return the new list +// Validates that items are correct? or is that done with the add? +// How do you signal that there are no changes between the current list and the new list? + +int +udf_cask_smd_merge_fn (char *module, as_smd_item_list_t **item_list_out, as_smd_item_list_t **item_lists_in, size_t num_lists, void *udata) +{ + cf_debug(AS_UDF, "UDF CASK merge function"); + + // (For now, just send back an empty metadata item list.) + as_smd_item_list_t *item_list = as_smd_item_list_create(0); + *item_list_out = item_list; + + return(0); +} + +// This function must take the current "view of the world" and +// make the local store the same as that. 
+
+int
+udf_cask_smd_accept_fn(char *module, as_smd_item_list_t *items, void *udata, uint32_t accept_opt)
+{
+	if (accept_opt & AS_SMD_ACCEPT_OPT_CREATE) {
+		cf_debug(AS_UDF, "(doing nothing in UDF accept cb for module creation)");
+		g_udf_smd_loaded = true;
+		return 0;
+	}
+
+	cf_debug(AS_UDF, "UDF CASK accept fn : n items %zu", items->num_items);
+
+	// For each item in the list, see if the new version is different from
+	// the currently stored version, and if the item is new, write it to the
+	// storage directory.
+	for (int i = 0; i < items->num_items ; i++) {
+
+		as_smd_item_t *item = items->item[i];
+
+		if (item->action == AS_SMD_ACTION_SET) {
+
+			json_error_t json_error;
+			json_t *item_obj = json_loads(item->value, 0 /*flags*/, &json_error);
+			if (!item_obj) {
+				cf_warning(AS_UDF, "failed to parse UDF \"%s\" with JSON error: %s ; source: %s ; line: %d ; column: %d ; position: %d",
+						item->key, json_error.text, json_error.source, json_error.line, json_error.column, json_error.position);
+				continue;
+			}
+
+			/*item->key is name */
+			json_t *content64_obj = json_object_get(item_obj, "content64");
+			const char *content64_str = json_string_value(content64_obj);
+
+			// base 64 decode it
+			uint32_t encoded_len = strlen(content64_str);
+			uint32_t decoded_len = cf_b64_decoded_buf_size(encoded_len) + 1;
+			char *content_str = cf_malloc(decoded_len);
+
+			if (! cf_b64_validate_and_decode(content64_str, encoded_len, (uint8_t*)content_str, &decoded_len)) {
+				cf_info(AS_UDF, "invalid script on accept, will not register %s", item->key);
+				cf_free(content_str);
+				json_decref(item_obj);
+				continue;
+			}
+
+			content_str[decoded_len] = 0;
+
+			cf_debug(AS_UDF, "pushing to %s, %d bytes [%s]", item->key, decoded_len, content_str);
+			mod_lua_wrlock(&mod_lua);
+
+			// content_gen is actually a hash. Not sure if it's filled out or what.
+			unsigned char content_gen[256] = {0};
+			int e = file_write(item->key, (uint8_t *) content_str, decoded_len, content_gen);
+			cf_free(content_str);
+			json_decref(item_obj);
+			if ( e ) {
+				mod_lua_unlock(&mod_lua);
+				cf_info(AS_UDF, "invalid script on accept, will not register %s", item->key);
+				continue;
+			}
+			// Update the cache
+			as_module_event ame = {
+				.type = AS_MODULE_EVENT_FILE_ADD,
+				.data.filename = item->key
+			};
+			as_module_update(&mod_lua, &ame);
+			mod_lua_unlock(&mod_lua);
+		}
+		else if (item->action == AS_SMD_ACTION_DELETE) {
+			cf_debug(AS_UDF, "received DELETE SMD action %d key %s", item->action, item->key);
+
+			mod_lua_wrlock(&mod_lua);
+			file_remove(item->key);
+
+			// fixes potential cache issues
+			as_module_event e = {
+				.type = AS_MODULE_EVENT_FILE_REMOVE,
+				.data.filename = item->key
+			};
+			as_module_update(&mod_lua, &e);
+
+			mod_lua_unlock(&mod_lua);
+
+		}
+		else {
+			cf_info(AS_UDF, "received unknown SMD action %d", item->action);
+		}
+	}
+
+	return(0);
+}
+
+
+void
+udf_cask_init()
+{
+	// Have to delete the existing files in the user path on startup.
+	struct dirent * entry = NULL;
+	// opendir(NULL) seg-faults
+	if (!g_config.mod_lua.user_path)
+	{
+		cf_crash(AS_UDF, "cask init: null mod-lua user-path");
+	}
+
+	DIR *dir = opendir(g_config.mod_lua.user_path);
+	if ( dir == 0 ) {
+		cf_crash(AS_UDF, "cask init: could not open udf directory %s: %s", g_config.mod_lua.user_path, cf_strerror(errno));
+	}
+	while ( (entry = readdir(dir))) {
+		// readdir also reads "." and ".." entries.
+ if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) + { + char fn[1024]; + snprintf(fn, sizeof(fn), "%s/%s", g_config.mod_lua.user_path, entry->d_name); + int rem_rv = remove(fn); + if (rem_rv != 0) { + cf_warning(AS_UDF, "Failed to remove the file %s. Error %d", fn, errno); + } + } + } + closedir(dir); + + // as_smd_create_module(udf_smd_module_name, udf_cask_smd_merge_fn, 0, udf_cask_smd_accept_fn, 0); + // take the default merge function + if (as_smd_create_module(udf_smd_module_name, 0, 0, 0, 0, udf_cask_smd_accept_fn, 0, 0, 0)) { + cf_crash(AS_UDF, "failed to create SMD module \"%s\"", udf_smd_module_name); + } + + while (! g_udf_smd_loaded) { + usleep(1000); + } + + // there may be existing data. Read it and populate the local file system. +} diff --git a/as/src/base/udf_memtracker.c b/as/src/base/udf_memtracker.c new file mode 100644 index 00000000..aceaded4 --- /dev/null +++ b/as/src/base/udf_memtracker.c @@ -0,0 +1,105 @@ +/* + * udf_memtracker.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + +#include + +#include "fault.h" + +#include "base/udf_memtracker.h" + + +/***************************************************************************** + * STATIC FUNCTIONS + *****************************************************************************/ + +static pthread_key_t modules_tlskey = 0; +static as_memtracker g_udf_memtracker; +static int +udf_memtracker_generic(mem_tracker *mt, const uint32_t num_bytes, memtracker_op op) +{ + if (!mt || !mt->udata || !mt->cb) { + return false; + } + + mt->cb(mt, num_bytes, op); + if (op == MEM_RESERVE) { + cf_detail(AS_UDF, "%ld: Memory Tracker %p reserved = %d (bytes)", + pthread_self(), mt, num_bytes); + } else if (op == MEM_RELEASE) { + cf_detail(AS_UDF, "%ld: Memory Tracker %p released = %d (bytes)", + pthread_self(), mt, num_bytes); + } else { + cf_detail(AS_UDF, "%ld: Memory Tracker %p reset", + pthread_self(), mt); + } + return 0; +} + +void +udf_memtracker_setup(mem_tracker *mt) +{ + pthread_setspecific(modules_tlskey, mt); + cf_detail(AS_UDF, "%ld: Memory Tracker %p set", pthread_self(), mt); +} + +void +udf_memtracker_cleanup() +{ + pthread_setspecific(modules_tlskey, NULL); + cf_detail(AS_UDF, "%ld: Memory Tracker reset", pthread_self()); +} + +static bool +udf_memtracker_reset(const as_memtracker *as_mt) { + mem_tracker *mt = (mem_tracker *)pthread_getspecific(modules_tlskey); + return udf_memtracker_generic(mt, 0, MEM_RESET); + +} + +static bool +udf_memtracker_reserve(const as_memtracker *as_mt, const uint32_t num_bytes) +{ + mem_tracker *mt = (mem_tracker *)pthread_getspecific(modules_tlskey); + return udf_memtracker_generic(mt, num_bytes, MEM_RESERVE); +} + +static bool +udf_memtracker_release(const as_memtracker *as_mt, const uint32_t 
num_bytes) +{ + mem_tracker *mt = (mem_tracker *)pthread_getspecific(modules_tlskey); + return udf_memtracker_generic(mt, num_bytes, MEM_RELEASE); +} + +static const as_memtracker_hooks udf_memtracker_hooks = { + .destroy = NULL, + .reserve = udf_memtracker_reserve, + .release = udf_memtracker_release, + .reset = udf_memtracker_reset +}; + +as_memtracker * +udf_memtracker_init() +{ + as_memtracker_init(&g_udf_memtracker, NULL, &udf_memtracker_hooks); + return &g_udf_memtracker; +} diff --git a/as/src/base/udf_record.c b/as/src/base/udf_record.c new file mode 100644 index 00000000..2a740e9a --- /dev/null +++ b/as/src/base/udf_record.c @@ -0,0 +1,959 @@ +/* + * udf_record.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "base/udf_record.h" + +#include +#include +#include +#include + +#include "aerospike/as_rec.h" +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_byte_order.h" +#include "citrusleaf/cf_clock.h" + +#include "fault.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/rec_props.h" +#include "base/transaction.h" +#include "storage/storage.h" +#include "transaction/rw_utils.h" +#include "transaction/udf.h" + + +/* + * Function: Open storage record for passed in udf record + * also set up flag like exists / read et al. + * + * Parameters: + * urec : UDF record + * + * Return value : 0 on success + * -1 if the record's bin count exceeds the UDF limit + * + * Callers: + * udf_record_open + * + * Note: There are no checks, so the caller has to make sure that all + * protections are taken and all checks are done. + * + * Side effect: + * Counters will be reset + * flag will be set + * bins will be opened + */ +int +udf_storage_record_open(udf_record *urecord) +{ + cf_debug_digest(AS_UDF, &urecord->tr->keyd, "[ENTER] Opening record key:"); + as_storage_rd *rd = urecord->rd; + as_index *r = urecord->r_ref->r; + as_transaction *tr = urecord->tr; + + as_storage_record_open(tr->rsv.ns, r, rd); + + // Deal with delete durability (enterprise only). + if ((urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES) != 0 && + set_delete_durablility(tr, rd) != 0) { + as_storage_record_close(rd); + return -1; + } + + as_storage_rd_load_n_bins(rd); // TODO - handle error returned + + if (rd->n_bins > UDF_RECORD_BIN_ULIMIT) { + cf_warning(AS_UDF, "record has too many bins (%d) for UDF processing", rd->n_bins); + as_storage_record_close(rd); + return -1; + } + + // if multibin storage, we will use urecord->stack_bins, so set the size appropriately + if ( ! tr->rsv.ns->storage_data_in_memory && ! 
tr->rsv.ns->single_bin ) {
+		rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin);
+	}
+
+	as_storage_rd_load_bins(rd, urecord->stack_bins); // TODO - handle error returned
+	urecord->starting_memory_bytes = as_storage_record_get_n_bytes_memory(rd);
+
+	as_storage_record_get_key(rd);
+
+	urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN;
+
+	cf_detail_digest(AS_UDF, &tr->keyd, "Storage Open: Rec(%p) flag(%x) Digest:", urecord, urecord->flag);
+	return 0;
+}
+
+/*
+ * Function: Close the storage record if it is open, and reset flags.
+ *
+ * Parameters:
+ * 		urec : UDF record
+ *
+ * Return value : 0 in case storage was open
+ * 				  1 in case storage was not open
+ *
+ * Callers:
+ * 		udf_record_close
+ *
+ * Side effect:
+ * 		flag will be reset
+ * 		bins will be closed
+ */
+int
+udf_storage_record_close(udf_record *urecord)
+{
+	if (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) {
+		as_index_ref *r_ref = urecord->r_ref;
+		as_storage_rd *rd = urecord->rd;
+
+		// If allow-update is not set, the record was opened for aggregation -
+		// do not do any rec-props update.
+		// Pick info from the index and put it in the storage record.
+		size_t rec_props_data_size = as_storage_record_rec_props_size(rd);
+		uint8_t rec_props_data[rec_props_data_size];
+		if (urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES) {
+			if (rec_props_data_size > 0) {
+				as_storage_record_set_rec_props(rd, rec_props_data);
+			}
+		}
+
+		bool has_bins = as_bin_inuse_has(rd);
+
+		if (r_ref) {
+			if (urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) {
+				as_storage_record_write(rd);
+				urecord->flag &= ~UDF_RECORD_FLAG_HAS_UPDATES; // TODO - necessary?
+			}
+
+			if (! has_bins) {
+				write_delete_record(r_ref->r, urecord->tr->rsv.tree);
+			}
+
+			as_storage_record_close(rd);
+		} else {
+			// Should never happen.
+			cf_warning(AS_UDF, "Unexpected Internal Error (null r_ref)");
+		}
+
+		urecord->flag &= ~UDF_RECORD_FLAG_STORAGE_OPEN;
+		cf_detail_digest(AS_UDF, &urecord->tr->keyd, "Storage Close:: Rec(%p) Flag(%x) Digest:",
+				urecord, urecord->flag );
+		return 0;
+	} else {
+		return 1;
+	}
+}
+
+/*
+ * Function: Open the storage record for the passed-in UDF record, and
+ * 		also set up flags like exists/read et al.
+ * 		Does as_record_get as well if it is not done yet.
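+ * 		(udf_record_get() below relies on this for lazy opens - the record
+ * 		is read from storage only on first bin access.)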
+ *
+ * Parameters:
+ * 		urec : UDF record
+ *
+ * Return value :
+ * 		 0 in case record is successfully read
+ * 		-1 in case record is not found
+ * 		-2 in case record is found but has expired
+ *
+ * Callers:
+ * 		query_agg_istream_read
+ */
+int
+udf_record_open(udf_record * urecord)
+{
+	cf_debug_digest(AS_UDF, &urecord->tr->keyd, "[ENTER] Opening record key:");
+	if (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) {
+		cf_info(AS_UDF, "Record already open");
+		return 0;
+	}
+	as_transaction *tr = urecord->tr;
+	as_index_ref *r_ref = urecord->r_ref;
+	as_index_tree *tree = tr->rsv.tree;
+
+	int rec_rv = 0;
+	if (!(urecord->flag & UDF_RECORD_FLAG_OPEN)) {
+		cf_detail(AS_UDF, "Opening Record");
+		rec_rv = as_record_get_live(tree, &tr->keyd, r_ref, tr->rsv.ns);
+	}
+
+	if (!rec_rv) {
+		as_index *r = r_ref->r;
+		// check to see this isn't an expired record waiting to die
+		if (as_record_is_doomed(r, tr->rsv.ns)) {
+			as_record_done(r_ref, tr->rsv.ns);
+			cf_detail(AS_UDF, "udf_record_open: Record has expired, cannot read");
+			rec_rv = -2;
+		} else {
+			urecord->flag |= UDF_RECORD_FLAG_OPEN;
+			urecord->flag |= UDF_RECORD_FLAG_PREEXISTS;
+			cf_detail_digest(AS_UDF, &tr->keyd, "Open %p %x Digest:", urecord, urecord->flag);
+			rec_rv = udf_storage_record_open(urecord);
+		}
+	} else {
+		cf_detail_digest(AS_UDF, &urecord->tr->keyd, "udf_record_open: rec_get returned with %d ",
+				rec_rv);
+	}
+	return rec_rv;
+}
+
+/*
+ * Function: Close the storage record for the UDF record. Release
+ * 		all locks and partition reservation / namespace
+ * 		reservation etc. if requested.
+ * 		Also cleans up the entire cache (updated from UDF).
+ *
+ * Parameters:
+ * 		urec : UDF record being operated on
+ *
+ * Return value : Nothing
+ *
+ * Callers:
+ * 		query_agg_istream_read
+ * 		as_query__agg
+ * 		udf_record_destroy
+ */
+void
+udf_record_close(udf_record *urecord)
+{
+	as_transaction *tr = urecord->tr;
+	cf_debug_digest(AS_UDF, &tr->keyd, "[ENTER] Closing record key:");
+
+	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
+		as_index_ref *r_ref = urecord->r_ref;
+		cf_detail(AS_UDF, "Closing Record");
+		udf_storage_record_close(urecord);
+		as_record_done(r_ref, tr->rsv.ns);
+		urecord->flag &= ~UDF_RECORD_FLAG_OPEN;
+		cf_detail_digest(AS_UDF, &urecord->tr->keyd,
+				"Storage Close:: Rec(%p) Flag(%x) Digest:", urecord, urecord->flag );
+	}
+
+	// Replication happens when the main record replicates
+	if (urecord->particle_data) {
+		cf_free(urecord->particle_data);
+		urecord->particle_data = 0;
+	}
+	udf_record_cache_free(urecord);
+}
+
+/*
+ * Function: This function is called to reinitialize the udf_record. It sets
+ * 		the basic values back to defaults. Can be called after the UDF
+ * 		record has been used. Resets the fact that the record pre-exists
+ * 		or was actually read etc.
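+ * 		(Cached bin updates are not freed here - see udf_record_cache_free();
+ * 		this only resets the counters, flags and particle buffer pointers.)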
+ *
+ * Parameters:
+ * 		urec : UDF record being initialized
+ *
+ * Return value : Nothing
+ *
+ * Callers:
+ * 		udf_rw_local (parent record before calling UDF)
+ */
+void
+udf_record_init(udf_record *urecord, bool allow_updates)
+{
+	urecord->tr = NULL;
+	urecord->r_ref = NULL;
+	urecord->rd = NULL;
+	urecord->dirty = NULL;
+	urecord->nupdates = 0;
+	urecord->particle_data = NULL;
+	urecord->cur_particle_data = NULL;
+	urecord->end_particle_data = NULL;
+	urecord->starting_memory_bytes = 0;
+
+	// Init flag
+	urecord->flag = UDF_RECORD_FLAG_ISVALID;
+
+	if (allow_updates) {
+		urecord->flag |= UDF_RECORD_FLAG_ALLOW_UPDATES;
+	}
+
+	urecord->keyd = cf_digest_zero;
+	for (uint32_t i = 0; i < UDF_RECORD_BIN_ULIMIT; i++) {
+		urecord->updates[i].particle_buf = NULL;
+	}
+}
+
+/*
+static int print_buffer(as_buffer * buff) {
+	msgpack_sbuffer sbuf;
+	msgpack_sbuffer_init(&sbuf);
+
+	sbuf.data = buff->data;
+	sbuf.size = buff->size;
+	sbuf.alloc = buff->capacity;
+
+	msgpack_zone mempool;
+	msgpack_zone_init(&mempool, 2048);
+
+	msgpack_object deserialized;
+	msgpack_unpack(sbuf.data, sbuf.size, NULL, &mempool, &deserialized);
+
+	printf("msg_buf:\n");
+	msgpack_object_print(stdout, deserialized);
+	puts("");
+
+	msgpack_zone_destroy(&mempool);
+	return 0;
+}
+*/
+
+/*
+ * Function: Get a bin value from the cached copy. Updates in a
+ * 		commit window are not applied to the record directly,
+ * 		but maintained in an in-memory cache. This function is
+ * 		used to retrieve the cached value.
+ *
+ * 		Similar functions exist for the set and free of the cache.
+ *
+ * Return value :
+ * 		value (as_val) in case of success [for get]
+ * 		NULL in case of failure
+ * 		set and free return Nothing
+ *
+ * Callers:
+ * 		GET and SET
+ * 		udf_record_get
+ * 		udf_record_set
+ * 		udf_record_remove
+ *
+ * 		FREE
+ * 		udf_aerospike__execute_updates (when crossing commit window)
+ * 		udf_record_close (finally closing record)
+ * 		udf_rw_commit (commit the udf record)
+ */
+static as_val *
+udf_record_cache_get(udf_record * urecord, const char * name)
+{
+	cf_debug(AS_UDF, "[ENTER] BinName(%s) ", name );
+	if ( urecord->nupdates > 0 ) {
+		cf_detail(AS_UDF, "udf_record_get: %s find", name);
+		for ( uint32_t i = 0; i < urecord->nupdates; i++ ) {
+			udf_record_bin * bin = &(urecord->updates[i]);
+			if ( strncmp(name, bin->name, AS_ID_BIN_SZ) == 0 ) {
+				cf_detail(AS_UDF, "Bin %s found, type(%d)", name, bin->value->type );
+				return bin->value; // note it's OK if the bin contains a nil
+			}
+		}
+	}
+	return NULL;
+}
+
+void
+udf_record_cache_free(udf_record * urecord)
+{
+	cf_debug(AS_UDF, "[ENTER] NumUpdates(%d) ", urecord->nupdates );
+
+	for (uint32_t i = 0; i < urecord->nupdates; i ++ ) {
+		udf_record_bin * bin = &urecord->updates[i];
+		if ( bin->name[0] != '\0' && bin->value != NULL ) {
+			bin->name[0] = '\0';
+			as_val_destroy(bin->value);
+			bin->value = NULL;
+		}
+		if ( bin->name[0] != '\0' && bin->oldvalue != NULL ) {
+			bin->name[0] = '\0';
+			as_val_destroy(bin->oldvalue);
+			bin->oldvalue = NULL;
+		}
+	}
+
+	for (uint32_t i = 0; i < UDF_RECORD_BIN_ULIMIT; i++) {
+		if (urecord->updates[i].particle_buf) {
+			cf_free(urecord->updates[i].particle_buf);
+			urecord->updates[i].particle_buf = NULL;
+		}
+	}
+	urecord->nupdates = 0;
+	urecord->flag &= ~UDF_RECORD_FLAG_TOO_MANY_BINS;
+}
+
+/**
+ * Set the cache value for a bin, including flags.
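+ * If the bin already has a cached entry, the previous value is destroyed
+ * and replaced; otherwise a new slot is appended, up to
+ * UDF_RECORD_BIN_ULIMIT, after which UDF_RECORD_FLAG_TOO_MANY_BINS is set.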
+ */
+static void
+udf_record_cache_set(udf_record * urecord, const char * name, as_val * value,
+		bool dirty)
+{
+	cf_debug(AS_UDF, "[ENTER] urecord(%p) name(%p)[%s] dirty(%d)",
+			urecord, name, name, dirty);
+
+	bool modified = false;
+
+	for ( uint32_t i = 0; i < urecord->nupdates; i++ ) {
+		udf_record_bin * bin = &(urecord->updates[i]);
+
+		// If the bin exists, release the old value and set the new value.
+		if ( strncmp(name, bin->name, AS_ID_BIN_SZ) == 0 ) {
+			cf_detail(AS_UDF, "udf_record_set: %s found", name);
+
+			// release previously set value
+			as_val_destroy(bin->value);
+
+			// set new value, with dirty flag
+			if( value != NULL ) {
+				bin->value = (as_val *) value;
+			}
+			bin->dirty = dirty;
+			cf_detail(AS_UDF, "udf_record_set: %s set for %p:%p", name,
+					urecord, bin->value);
+
+			modified = true;
+			break;
+		}
+	}
+
+	// If not modified, then we will add the bin to the cache.
+	if ( ! modified ) {
+		if ( urecord->nupdates < UDF_RECORD_BIN_ULIMIT ) {
+			udf_record_bin * bin = &(urecord->updates[urecord->nupdates]);
+			strncpy(bin->name, name, AS_ID_BIN_SZ);
+			bin->value = (as_val *) value;
+			bin->dirty = dirty;
+			urecord->nupdates++;
+			cf_detail(AS_UDF, "udf_record_set: %s not modified, add for %p:%p",
+					name, urecord, bin->value);
+		}
+		else {
+			cf_warning(AS_UDF, "UDF bin limit (%d) exceeded (bin %s)",
+					UDF_RECORD_BIN_ULIMIT, name);
+			urecord->flag |= UDF_RECORD_FLAG_TOO_MANY_BINS;
+		}
+	}
+}
+
+/*
+ * Internal Function: Read the bin from storage, convert it
+ * 		into an as_val and return it.
+ *
+ * Parameters:
+ * 		r    : udf record
+ * 		bname: Bin name of the bin which needs to be read.
+ *
+ * Return value :
+ * 		value (as_val *) in case of success
+ * 		NULL in case of failure
+ *
+ * Description:
+ * 		The expectation is that the record is already open. No checks are
+ * 		performed in this function. The caller needs to make sure the
+ * 		record is good to read, e.g. bin name etc.
+ *
+ * 		NB: The as_val returned is an allocated one. It is the caller's
+ * 		responsibility to free it, unless it is passed on to the Lua world,
+ * 		which then has the responsibility of garbage collecting it. Hence
+ * 		this function call incurs a malloc cost.
+ *
+ * Callers:
+ * 		udf_record_get
+ */
+as_val *
+udf_record_storage_get(const udf_record *urecord, const char *name)
+{
+	if (!name) {
+		cf_detail(AS_UDF, "Passed Null bin name to storage get");
+		return NULL;
+	}
+	as_bin * bb = as_bin_get(urecord->rd, name);
+
+	if ( !bb ) {
+		cf_detail(AS_UDF, "udf_record_get: bin not found (%s)", name);
+		return NULL;
+	}
+
+	return as_bin_particle_to_asval(bb);
+}
+
+/*
+ * Check and validate parameters before performing an operation.
+ *
+ * return:
+ * 		2 : UDF_ERR_INTERNAL_PARAMETER
+ * 		3 : UDF_ERR_RECORD_NOT_VALID
+ * 		4 : UDF_ERR_PARAMETER
+ * 		0 : Success
+ *
+ */
+int
+udf_record_param_check(const as_rec *rec, char *fname, int lineno)
+{
+	if (! rec) {
+		cf_warning(AS_UDF, "Invalid Parameter: null record");
+		return UDF_ERR_INTERNAL_PARAMETER;
+	}
+
+	udf_record *urecord = (udf_record *)as_rec_source(rec);
+	if (!urecord) {
+		return UDF_ERR_INTERNAL_PARAMETER;
+	}
+
+	if (!(urecord->flag & UDF_RECORD_FLAG_ISVALID)) {
+		cf_debug(AS_UDF, "(%s:%d): Trying to Open Invalid Record", fname, lineno);
+		return UDF_ERR_RECORD_NOT_VALID;
+	}
+
+	return 0;
+}
+
+static int
+udf_record_param_check_w_bin(const as_rec *rec, const char *bname, char *fname, int lineno)
+{
+	int rv = udf_record_param_check(rec, fname, lineno);
+
+	if (rv != 0) {
+		return rv;
+	}
+
+	if (!
bname) { + cf_warning(AS_UDF, "Invalid Parameter: null bin name"); + return UDF_ERR_INTERNAL_PARAMETER; + } + + udf_record *urecord = (udf_record *)as_rec_source(rec); + as_namespace *ns = urecord->tr->rsv.ns; + + if (ns->single_bin) { + if (*bname != 0) { + cf_warning(AS_UDF, "Invalid Parameter: non-empty bin name in single-bin namespace"); + return UDF_ERR_INTERNAL_PARAMETER; + } + + return 0; + } + + if (*bname == 0) { + cf_warning(AS_UDF, "Invalid Parameter: empty bin name"); + return UDF_ERR_INTERNAL_PARAMETER; + } + + if (strlen(bname) >= AS_ID_BIN_SZ) { + cf_warning(AS_UDF, "Invalid Parameter: bin name %s too big", bname); + return UDF_ERR_PARAMETER; + } + + if (! as_bin_name_within_quota(ns, bname)) { + cf_warning(AS_UDF, "{%s} exceeded bin name quota", ns->name); + return UDF_ERR_PARAMETER; + } + + return 0; +} + +/********************************************************************* + * INTERFACE FUNCTIONS * + * * + * See the as_aerospike for the API definition * + ********************************************************************/ +static as_val * +udf_record_get(const as_rec * rec, const char * name) +{ + if (udf_record_param_check_w_bin(rec, name, __FILE__, __LINE__)) { + return NULL; + } + udf_record * urecord = (udf_record *) as_rec_source(rec); + as_val * value = NULL; + + cf_debug(AS_UDF, "[ENTER] rec(%p) name(%s)", rec, name ); + + // Get from cache + value = udf_record_cache_get(urecord, name); + + // If value not NULL, then return it. + if ( value != NULL ) { + return value; + } + + // Check in the cache before trying to look up in record + // Note: Record may not have been created yet ... Do not + // change the order unless you fully understand what you + // are doing + if ( !(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) ) { + if (udf_record_open(urecord)) { // lazy read the record from storage + return NULL; + } + } + + // Check if storage is available + if ( !urecord->rd->ns ) { + cf_detail(AS_UDF, "udf_record_get: storage unavailable"); + return NULL; + } + + value = udf_record_storage_get(urecord, name); + + // We have a value, so we will cache it. + // DO NOT remove this. 
We need to cache copy to makes sure ref count + // gets decremented post handing this as_val over to the lua world + if (value) { + udf_record_cache_set(urecord, name, value, false); + } + + cf_detail(AS_UDF, "udf_record_get: end (%s) [%p,%p]", name, urecord, value); + return value; +} + +static int +udf_record_set(const as_rec * rec, const char * name, const as_val * value) +{ + int ret = udf_record_param_check_w_bin(rec, name, __FILE__, __LINE__); + if (ret) { + return ret; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + cf_detail(AS_UDF, "udf_record_set: begin (%s)", name); + if ( urecord && name ) { + udf_record_cache_set(urecord, name, (as_val *) value, true); + } + cf_detail(AS_UDF, "udf_record_set: end (%s)", name); + + return 0; +} + +static int +udf_record_set_ttl(const as_rec * rec, uint32_t ttl) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return ret; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (!(urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES)) { + return -1; + } + + urecord->tr->msgp->msg.record_ttl = ttl; + urecord->flag |= UDF_RECORD_FLAG_METADATA_UPDATED; + + return 0; +} + +static int +udf_record_drop_key(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return ret; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (!(urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES)) { + return -1; + } + + // Flag the key to be dropped. + if (urecord->rd->key) { + urecord->rd->key = NULL; + urecord->rd->key_size = 0; + } + + urecord->flag |= UDF_RECORD_FLAG_METADATA_UPDATED; + + return 0; +} + +static int +udf_record_remove(const as_rec * rec, const char * name) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return ret; + } + udf_record * urecord = (udf_record *) as_rec_source(rec); + + + cf_detail(AS_UDF, "udf_record_remove: begin (%s)", name); + if ( urecord && name ) { + udf_record_cache_set(urecord, name, (as_val *) &as_nil, true); + } + cf_detail(AS_UDF, "udf_record_remove: end (%s)", name); + + return 0; +} + +static uint32_t +udf_record_ttl(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 0; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + + if ((urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + uint32_t now = as_record_void_time_get(); + + return urecord->r_ref->r->void_time > now ? 
+ urecord->r_ref->r->void_time - now : 0; + } + else { + cf_info(AS_UDF, "Error in getting ttl: no record found"); + return 0; // since we can't indicate the record doesn't exist + } + return 0; +} + +static uint64_t +udf_record_last_update_time(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 0; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + return urecord->r_ref->r->last_update_time; + } + else { + cf_warning(AS_UDF, "Error getting last update time: no record found"); + return 0; + } +} + +static uint16_t +udf_record_gen(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 0; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) != 0) { + return plain_generation(urecord->rd->r->generation, urecord->rd->ns); + } + else { + cf_warning(AS_UDF, "Error in getting generation: no record found"); + return 0; + } +} + +// Local utility. +static as_val * +as_val_from_flat_key(uint8_t * flat_key, uint32_t size) +{ + uint8_t type = *flat_key; + uint8_t * key = flat_key + 1; + + switch ( type ) { + case AS_PARTICLE_TYPE_INTEGER: + // TODO - verify size is (1 + 8) ??? + // Flat integer keys are in big-endian order. + return (as_val *) as_integer_new(cf_swap_from_be64(*(int64_t *)key)); + case AS_PARTICLE_TYPE_STRING: + { + // Key length is size - 1, then +1 for null-termination. + char * buf = cf_malloc(size); + uint32_t len = size - 1; + memcpy(buf, key, len); + buf[len] = '\0'; + + return (as_val *) as_string_new(buf, true); + } + case AS_PARTICLE_TYPE_BLOB: + { + uint32_t blob_size = size - 1; + uint8_t *buf = cf_malloc(blob_size); + + memcpy(buf, key, blob_size); + + return (as_val *) as_bytes_new_wrap(buf, blob_size, true); + } + default: + return NULL; + } +} + +static as_val * +udf_record_key(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return NULL; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + if (urecord->rd->key) { + return as_val_from_flat_key(urecord->rd->key, urecord->rd->key_size); + } + // TODO - perhaps look for the key in the message. 
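+		// No key was stored with this record - nothing to return.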
+ return NULL; + } + else { + cf_warning(AS_UDF, "Error in getting key: no record found"); + return NULL; + } +} + +static const char * +udf_record_setname(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return NULL; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + return as_index_get_set_name(urecord->r_ref->r, urecord->rd->ns); + } + else { + cf_warning(AS_UDF, "Error in getting set name: no record found"); + return NULL; + } +} + +bool +udf_record_destroy(as_rec *rec) +{ + if (!rec) { + return false; + } + + udf_record *urecord = (udf_record *) as_rec_source(rec); + udf_record_close(urecord); + as_rec_destroy(rec); + return true; +} + +static as_bytes * +udf_record_digest(const as_rec *rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return NULL; + } + + udf_record *urecord = (udf_record *)as_rec_source(rec); + if (urecord && urecord->flag & UDF_RECORD_FLAG_OPEN) { + cf_digest *keyd = cf_malloc(sizeof(cf_digest)); + memcpy(keyd, &urecord->keyd, CF_DIGEST_KEY_SZ); + as_bytes *b = as_bytes_new_wrap(keyd->digest, CF_DIGEST_KEY_SZ, true); + return b; + } + return NULL; +} + +static int +udf_record_bin_names(const as_rec *rec, as_rec_bin_names_callback callback, void * udata) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 1; + } + + udf_record *urecord = (udf_record *)as_rec_source(rec); + char * bin_names = NULL; + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + uint16_t nbins; + + if (urecord->rd->ns->single_bin) { + nbins = 1; + bin_names = alloca(1); + *bin_names = 0; + } + else { + nbins = urecord->rd->n_bins; + bin_names = alloca(nbins * AS_ID_BIN_SZ); + for (uint16_t i = 0; i < nbins; i++) { + as_bin *b = &urecord->rd->bins[i]; + if (! as_bin_inuse(b)) { + nbins = i; + break; + } + const char * name = as_bin_get_name_from_id(urecord->rd->ns, b->id); + strcpy(bin_names + (i * AS_ID_BIN_SZ), name); + } + } + callback(bin_names, nbins, AS_ID_BIN_SZ, udata); + return 0; + } + else { + cf_warning(AS_UDF, "Error in getting bin names: no record found"); + bin_names = alloca(1); + *bin_names = 0; + callback(bin_names, 1, AS_ID_BIN_SZ, udata); + return -1; + } +} + +static uint16_t +udf_record_numbins(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 0; + } + + udf_record *urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + + if (urecord->rd->ns->single_bin) { + return 1; + } + + uint16_t i; + as_storage_rd *rd = urecord->rd; + for (i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + if (! 
as_bin_inuse(b)) { + break; + } + } + return i; + } + else { + cf_warning(AS_UDF, "Error in getting numbins: no record found"); + return 0; + } +} + +const as_rec_hooks udf_record_hooks = { + .get = udf_record_get, + .set = udf_record_set, + .remove = udf_record_remove, + .ttl = udf_record_ttl, + .last_update_time = udf_record_last_update_time, + .gen = udf_record_gen, + .key = udf_record_key, + .setname = udf_record_setname, + .destroy = NULL, + .digest = udf_record_digest, + .set_ttl = udf_record_set_ttl, + .drop_key = udf_record_drop_key, + .bin_names = udf_record_bin_names, + .numbins = udf_record_numbins +}; diff --git a/as/src/base/udf_timer.c b/as/src/base/udf_timer.c new file mode 100644 index 00000000..78a66fcd --- /dev/null +++ b/as/src/base/udf_timer.c @@ -0,0 +1,96 @@ +/* + * udf_timer.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "base/udf_timer.h" + +#include + +#include "citrusleaf/cf_clock.h" + +#include "fault.h" + + +/***************************************************************************** + * STATIC FUNCTIONS + *****************************************************************************/ + +static pthread_key_t timer_tlskey = 0; +static pthread_once_t key_once = PTHREAD_ONCE_INIT; + +static void +udf_make_key() +{ + pthread_key_create(&timer_tlskey, NULL); +} + +void +udf_timer_setup(time_tracker *tt) +{ + pthread_once(&key_once, udf_make_key); + pthread_setspecific(timer_tlskey, tt); + cf_detail(AS_UDF, "tid=%ld tt=%p", pthread_self(), tt); +} + +void +udf_timer_cleanup() +{ + pthread_setspecific(timer_tlskey, NULL); + cf_detail(AS_UDF, "tid=%ld", pthread_self()); +} + +bool +udf_timer_timedout(const as_timer * timer) +{ + time_tracker *tt = (time_tracker *)pthread_getspecific(timer_tlskey); + cf_detail(AS_UDF, "tid=%ld tt=%p", pthread_self(), tt); + + if (!tt || !tt->end_time) { + return true; + } + uint64_t now = cf_getns(); + bool timedout = (now > tt->end_time(tt)); + if (timedout) { + cf_warning(AS_UDF, "UDF Timed Out [%lu:%lu]", now / 1000000, tt->end_time(tt) / 1000000); + return true; + } + return false; +} + +uint64_t +udf_timer_timeslice(const as_timer * timer) +{ + time_tracker *tt = (time_tracker *)pthread_getspecific(timer_tlskey); + cf_detail(AS_UDF, "tid=%ld tt=%p", pthread_self(), tt); + + if (!tt || !tt->end_time) { + return 0; + } + uint64_t timeslice = (tt->end_time(tt) - cf_getns()) / 1000000; + return (timeslice > 0) ? 
timeslice : 1; +} + + +const as_timer_hooks udf_timer_hooks = { + .destroy = NULL, + .timedout = udf_timer_timedout, + .timeslice = udf_timer_timeslice +}; diff --git a/as/src/base/xdr_config.c b/as/src/base/xdr_config.c new file mode 100644 index 00000000..d23f7658 --- /dev/null +++ b/as/src/base/xdr_config.c @@ -0,0 +1,73 @@ +/* + * xdr_config.c + * + * Copyright (C) 2011-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * Configuration file-related routines shared between the server and XDR. + */ + +#include +#include "base/xdr_config.h" + +void xdr_config_defaults() +{ + xdr_config *c = &g_xcfg; + memset(c, 0, sizeof(xdr_config)); + + c->xdr_section_configured = false; // Indicates if XDR is configured or not + c->xdr_global_enabled = false; // This config option overrides the enable-xdr setting of the namespace(s) + c->xdr_digestlog_path = NULL; // Path where the digest information is written to the disk + c->xdr_info_port = 0; + c->xdr_max_ship_throughput = 0; // XDR TPS limit + c->xdr_max_ship_bandwidth = 0; // XDR bandwidth limit + c->xdr_min_dlog_free_pct = 0; // Namespace writes are stopped below this limit + c->xdr_hotkey_time_ms = 100; // Expiration time for the de-duplication cache + c->xdr_read_threads = 4; // Number of XDR read threads. + c->xdr_write_timeout = 10000; // Timeout for each element that is shipped. 
+ c->xdr_client_threads = 3; // Number of async client threads (event loops) + c->xdr_forward_xdrwrites = false; // If the writes due to xdr should be forwarded + c->xdr_nsup_deletes_enabled = false;// Shall XDR ship deletes of evictions or expiration + c->xdr_internal_shipping_delay = 0; // Default sleep between shipping each batch is 0 seconds + c->xdr_conf_change_flag = false; + c->xdr_shipping_enabled = true; + c->xdr_delete_shipping_enabled = true; + c->xdr_ship_bins = false; + c->xdr_info_request_timeout_ms = 10000; + c->xdr_compression_threshold = 0; // 0 disables compressed shipping, > 0 specifies minimum request size for compression + c->xdr_handle_failednode = true; + c->xdr_handle_linkdown = true; + c->xdr_digestlog_iowait_ms = 500; + + for (uint32_t index = 0; index < DC_MAX_NUM; index++) { + g_dc_xcfg_opt[index].dc_name = NULL; + g_dc_xcfg_opt[index].dc_node_v.vector = NULL; + g_dc_xcfg_opt[index].dc_addr_map_v.vector = NULL; + g_dc_xcfg_opt[index].dc_security_cfg.sec_config_file = NULL; + g_dc_xcfg_opt[index].dc_use_alternate_services = false; + g_dc_xcfg_opt[index].dc_connections = 64; + g_dc_xcfg_opt[index].dc_connections_idle_ms = 55000; + } +} + +xdr_config g_xcfg = { 0 }; +dc_config_opt g_dc_xcfg_opt[DC_MAX_NUM]; +int g_dc_count = 0; + diff --git a/as/src/base/xdr_serverside_stubs.c b/as/src/base/xdr_serverside_stubs.c new file mode 100644 index 00000000..5f29d328 --- /dev/null +++ b/as/src/base/xdr_serverside_stubs.c @@ -0,0 +1,130 @@ +/* + * xdr_serverside_stubs.c + * + * Copyright (C) 2014-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "base/xdr_serverside.h" + +int as_xdr_init() +{ + return -1; +} + +void xdr_config_post_process() +{ +} + +void as_xdr_start() +{ +} + +int as_xdr_shutdown() +{ + return -1; +} + +void xdr_sig_handler(int signum) +{ +} + +void xdr_broadcast_lastshipinfo(uint64_t val[]) +{ +} + +void xdr_clear_dirty_bins(xdr_dirty_bins *dirty) +{ +} + +void xdr_fill_dirty_bins(xdr_dirty_bins *dirty) +{ +} + +void xdr_copy_dirty_bins(xdr_dirty_bins *from, xdr_dirty_bins *to) +{ +} + +void xdr_add_dirty_bin(as_namespace *ns, xdr_dirty_bins *dirty, const char *name, size_t name_len) +{ +} + +void xdr_write(as_namespace *ns, cf_digest *keyd, uint16_t generation, cf_node masternode, xdr_op_type op_type, uint16_t set_id, xdr_dirty_bins *dirty) +{ +} + +void as_xdr_read_txn(as_transaction *txn) +{ +} + +void as_xdr_info_init(void) +{ +} + +void as_xdr_info_port(cf_serv_cfg *serv_cfg) +{ + (void)serv_cfg; +} + +int as_info_command_xdr(char *name, char *params, cf_dyn_buf *db) +{ + return -1; +} + +void as_xdr_get_stats(cf_dyn_buf *db) +{ +} + +void as_xdr_get_config(cf_dyn_buf *db) +{ +} + +bool as_xdr_set_config(char *params) +{ + return false; +} + +bool as_xdr_set_config_ns(char *ns_name, char *params) +{ + return false; +} + +bool is_xdr_delete_shipping_enabled() +{ + return false; +} + +bool is_xdr_digestlog_low(as_namespace *ns) +{ + return false; +} + +bool is_xdr_forwarding_enabled() +{ + return false; +} + +bool is_xdr_nsup_deletes_enabled() +{ + return false; +} + +void xdr_cfg_add_int_ext_mapping(dc_config_opt *dc_cfg, char* orig, char* alt) +{ +} + diff --git a/as/src/fabric/clustering.c b/as/src/fabric/clustering.c new file mode 100644 index 00000000..c07ebc30 --- /dev/null +++ b/as/src/fabric/clustering.c @@ -0,0 +1,8163 @@ +/* + * clustering.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "fabric/clustering.h" + +#include +#include +#include +#include +#include // For MAX() and MIN(). + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_random.h" + +#include "fault.h" +#include "msg.h" +#include "node.h" +#include "shash.h" + +#include "base/cfg.h" +#include "fabric/fabric.h" +#include "fabric/hlc.h" + +/* + * Overview + * ======== + * Clustering v5 implementation based on the design at + * https://aerospike.atlassian.net/wiki/pages/viewpage.action?spaceKey=DEV&title=Central+Wiki%3A++Clustering+V5 + * + * Public and private view of the cluster + * ======================================= + * This clustering algorithm introduces an orphan state, in which this node is + * not part of a cluster, but is looking to form/join a cluster. During this + * transitionary phase, the public view of the cluster the tuple, . 
+ *
+ * This ensures clients continue to function, (maybe with errors), during the
+ * transition from orphan to part-of-a-cluster state. This is in line with the
+ * clustering v4 and prior behaviour.
+ *
+ * TODO: (revise)
+ *
+ * Deviations from paxos
+ * =====================
+ *
+ * Accepted value
+ * ---------------
+ *
+ * The accepted value is not sent along with the accept and accepted messages.
+ * The latest accepted value overwrites the previous value at a node. In paxos,
+ * if a node has already accepted a value, it is sent back to the proposer, who
+ * should use the value with the highest proposal id as the final value. The
+ * proposer generates the final consensus value as the succession list with the
+ * nodes that have both returned promise and accepted replies.
+ *
+ * This is not safe in terms of achieving a single paxos value, however it is
+ * safe in that nodes courted by other principals will get filtered out during
+ * paxos and not require additional paxos rounds.
+ *
+ * It is still possible that the final consensus succession list might have a
+ * few nodes moving out owing to a neighboring principal. However the faulty
+ * node check in the next quantum interval will fix this.
+ *
+ * Quorum
+ * ------
+ * The prepare phase uses a majority quorum for the promise messages, to speed
+ * through the paxos round. However the accept phase uses a complete / full
+ * quorum for accepted messages. This helps with ensuring that when a node
+ * generates a cluster change event, all cluster members have applied the
+ * current cluster membership.
+ *
+ * Design
+ * ======
+ * The clustering sub-system interacts with the rest of Aerospike via input
+ * event notifications (primarily heartbeat events) and output event
+ * notifications (primarily cluster change notifications).
+ *
+ * The subsystem is driven by internal events (that also encapsulate external
+ * input event notifications) like timer, quantum interval start, adjacency
+ * changed, message received, etc.
+ *
+ * The clustering-v5 subsystem is further organized as the following
+ * sub-modules, each of which reacts to the above mentioned events based on an
+ * individual state transition diagram.
+ *
+ * 1. Timer
+ * 2. Quantum interval generator
+ * 3. Paxos proposer
+ * 4. Paxos acceptor
+ * 5. Register
+ * 6. External event publisher
+ * 7. Internal event dispatcher
+ * 8. Clustering main
+ *
+ * The sub-modules also interact with each other via inline internal event
+ * dispatch and handling.
+ *
+ * Timer
+ * -----
+ * Generates timer events that serve as the internal tick/clock for the
+ * clustering-v5 sub-system. Other sub-modules use the timer events to drive
+ * actions to be performed at fixed intervals, e.g. message retransmits.
+ *
+ * Quantum interval generator
+ * --------------------------
+ * Generates quantum interval start events, at which cluster change decisions
+ * are taken.
+ *
+ * Paxos proposer
+ * --------------
+ * The paxos proposer proposes a cluster change. The node may or may not be the
+ * eventual principal for the cluster.
+ *
+ * Paxos acceptor
+ * --------------
+ * Participates in voting for a proposal. A paxos proposer is also necessarily
+ * an acceptor in this design.
+ *
+ * Register
+ * --------
+ * Holds current cluster membership and cluster key. It is responsible for
+ * ensuring all cluster members have their registers in sync before publishing
+ * an external cluster change event.
+ *
+ * External event publisher
+ * ------------------------
+ * Generates and publishes external events for cluster changes. Runs as a
+ * separate thread to prevent interference and potential deadlocks with the
+ * clustering subsystem.
+ *
+ * Internal event dispatcher
+ * -------------------------
+ * Dispatches internal events to the appropriate handler function based on the
+ * event type and current state.
+ *
+ * Clustering main
+ * ---------------
+ * Monitors the cluster and triggers cluster changes.
+ *
+ * State transitions
+ * =================
+ * TODO: diagrams for each sub-module
+ *
+ * Message send rules
+ * ==================
+ * Message sends should preferably be outside the main clustering lock and
+ * should not be followed by any state change in the same function. This is
+ * because fabric relays messages to self inline in the send call itself, which
+ * can lead to corruption if the message handler involves a state change as
+ * well, or can result in the message handler seeing inconsistent, partially
+ * updated state.
+ */
+
+/*
+ * ----------------------------------------------------------------------------
+ * Constants
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * A soft limit for the maximum cluster size. Meant to optimize hash and list
+ * data structures, not to limit the number of nodes.
+ */
+#define AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT 200
+
+/**
+ * Timer event generation interval.
+ */
+#define CLUSTERING_TIMER_TICK_INTERVAL 75
+
+/**
+ * Maximum time a paxos round would take for completion: 3 RTTs of paxos
+ * message exchanges and 1 RTT as a buffer.
+ */
+#define PAXOS_COMPLETION_TIME_MAX (4 * network_rtt_max())
+
+/**
+ * Maximum quantum interval duration. Should be at least two heartbeat
+ * intervals, to ensure there is at least one exchange of clustering
+ * information over heartbeats.
+ */
+#define QUANTUM_INTERVAL_MAX MAX(5000, 2 * as_hb_tx_interval_get())
+
+/**
+ * Block size for allocating node plugin data. Ensure the allocation is in
+ * multiples of 128 bytes, allowing expansion to 16 nodes without reallocating.
+ */
+#define HB_PLUGIN_DATA_BLOCK_SIZE 128
+
+/**
+ * Scratch size for clustering messages.
+ *
+ * TODO: Compute this properly.
+ */
+#define AS_CLUSTERING_MSG_SCRATCH_SIZE 1024
+
+/**
+ * Majority value for the preferred principal to be selected for a move. Use
+ * two thirds as the majority value. (A floating point ratio - plain integer
+ * division (2 / 3) would truncate to zero.)
+ */
+#define AS_CLUSTERING_PREFERRRED_PRINCIPAL_MAJORITY (2.0 / 3.0)
+
+/*
+ * ----------------------------------------------------------------------------
+ * Paxos data structures
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Paxos sequence number. We will use the hybrid logical clock timestamp as
+ * the sequence number, to ensure node restarts do not reset the sequence
+ * number back to zero, and sequence numbers are monotonically increasing. A
+ * sequence number value of zero is invalid.
+ */
+typedef as_hlc_timestamp as_paxos_sequence_number;
+
+/**
+ * Paxos proposal identifier.
+ * Note: The nodeid can be skipped when sending the proposal id over the wire
+ * and can be inferred from the source during paxos message exchanges.
+ */
+typedef struct as_paxos_proposal_id_s
+{
+	/**
+	 * The sequence number.
+	 */
+	as_paxos_sequence_number sequence_number;
+
+	/**
+	 * The proposing node's nodeid, to break ties.
+	 */
+	cf_node src_nodeid;
+} as_paxos_proposal_id;
+
+/**
+ * The proposed cluster membership.
+ */
+typedef struct as_paxos_proposed_value_s
+{
+	/**
+	 * The cluster key.
+	 */
+	as_cluster_key cluster_key;
+
+	/**
+	 * The succession list.
+	 */
+	cf_vector succession_list;
+} as_paxos_proposed_value;
+
+/**
+ * Paxos acceptor state.
+ */
+typedef enum
+{
+	/**
+	 * Acceptor is idle with no active paxos round.
+	 */
+	AS_PAXOS_ACCEPTOR_STATE_IDLE,
+
+	/**
+	 * Acceptor has received and acked a prepare message.
+	 */
+	AS_PAXOS_ACCEPTOR_STATE_PROMISED,
+
+	/**
+	 * Acceptor has received and accepted an accept message from a proposer.
+	 */
+	AS_PAXOS_ACCEPTOR_STATE_ACCEPTED
+} as_paxos_acceptor_state;
+
+/**
+ * Data tracked by the node in the role of a paxos acceptor.
+ * All nodes are paxos acceptors.
+ */
+typedef struct as_paxos_acceptor_s
+{
+	/**
+	 * The paxos acceptor state.
+	 */
+	as_paxos_acceptor_state state;
+
+	/**
+	 * Monotonic timestamp when the first message for the current proposal was
+	 * received from the proposer.
+	 */
+	cf_clock acceptor_round_start;
+
+	/**
+	 * Monotonic timestamp when the promise message was sent.
+	 */
+	cf_clock promise_send_time;
+
+	/**
+	 * Monotonic timestamp when the accepted message was sent.
+	 */
+	cf_clock accepted_send_time;
+
+	/**
+	 * Id of the last proposal promised or accepted by this node.
+	 */
+	as_paxos_proposal_id last_proposal_received_id;
+} as_paxos_acceptor;
+
+/**
+ * State of a paxos proposer.
+ */
+typedef enum as_paxos_proposer_state_e
+{
+	/**
+	 * Paxos proposer is idle. No pending paxos rounds.
+	 */
+	AS_PAXOS_PROPOSER_STATE_IDLE,
+
+	/**
+	 * Paxos proposer sent out a prepare message.
+	 */
+	AS_PAXOS_PROPOSER_STATE_PREPARE_SENT,
+
+	/**
+	 * Paxos proposer has sent out an accept message.
+	 */
+	AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT
+} as_paxos_proposer_state;
+
+/**
+ * Data tracked by the node in the role of a paxos proposer. The proposer node
+ * may or may not be the current or eventual principal.
+ */
+typedef struct as_paxos_proposer_s
+{
+	/**
+	 * The state of the proposer.
+	 */
+	as_paxos_proposer_state state;
+
+	/**
+	 * The sequence number / id for the last proposed paxos value.
+	 */
+	as_paxos_sequence_number sequence_number;
+
+	/**
+	 * The proposed cluster value.
+	 */
+	as_paxos_proposed_value proposed_value;
+
+	/**
+	 * The time the current paxos round was started.
+	 */
+	cf_clock paxos_round_start_time;
+
+	/**
+	 * The time the current proposal's prepare message was sent.
+	 */
+	cf_clock prepare_send_time;
+
+	/**
+	 * The time the current proposal's accept message was sent.
+	 */
+	cf_clock accept_send_time;
+
+	/**
+	 * The time the current proposal's learn message was sent.
+	 */
+	cf_clock learn_send_time;
+
+	/**
+	 * Indicates if the learn message needs retransmit.
+	 */
+	bool learn_retransmit_needed;
+
+	/**
+	 * The set of acceptor nodes, including self.
+	 */
+	cf_vector acceptors;
+
+	/**
+	 * Set of nodeids that sent out a promise response to the current prepare
+	 * message.
+	 */
+	cf_vector promises_received;
+
+	/**
+	 * Set of nodeids that sent out an accepted response to the current accept
+	 * message.
+	 */
+	cf_vector accepted_received;
+} as_paxos_proposer;
+
+/**
+ * Result of a paxos round start call.
+ */
+typedef enum as_paxos_start_result_e
+{
+	/**
+	 * Paxos round started successfully.
+	 */
+	AS_PAXOS_RESULT_STARTED,
+
+	/**
+	 * Cluster size is less than the minimum required cluster size.
+	 */
+	AS_PAXOS_RESULT_CLUSTER_TOO_SMALL,
+
+	/**
+	 * Paxos round already in progress. Paxos not started.
+	 */
+	AS_PAXOS_RESULT_ROUND_RUNNING
+} as_paxos_start_result;
+
+/**
+ * Node clustering status.
+ */ +typedef enum +{ + /** + * Peer node is orphaned. + */ + AS_NODE_ORPHAN, + + /** + * Peer node has a cluster assigned. + */ + AS_NODE_CLUSTER_ASSIGNED, + + /** + * Peer node status is unknown. + */ + AS_NODE_UNKNOWN +} as_clustering_peer_node_state; + +/* + * ---------------------------------------------------------------------------- + * Clustering data structures + * ---------------------------------------------------------------------------- + */ + +/** + * Clustering message types. + */ +typedef enum +{ + /* + * ---- Clustering management messages ---- + */ + AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST, + AS_CLUSTERING_MSG_TYPE_JOIN_REJECT, + AS_CLUSTERING_MSG_TYPE_MERGE_MOVE, + AS_CLUSTERING_MSG_TYPE_CLUSTER_CHANGE_APPLIED, + + /* + * ---- Paxos messages ---- + */ + AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE, + AS_CLUSTERING_MSG_TYPE_PAXOS_PROMISE, + AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE_NACK, + AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT, + AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPTED, + AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT_NACK, + AS_CLUSTERING_MSG_TYPE_PAXOS_LEARN, +} as_clustering_msg_type; + +/** + * The fields in the clustering message. + */ +typedef enum +{ + /** + * Clustering message identifier. + */ + AS_CLUSTERING_MSG_ID, + + /** + * Clustering message type. + */ + AS_CLUSTERING_MSG_TYPE, + + /** + * The source node send timestamp. + */ + AS_CLUSTERING_MSG_HLC_TIMESTAMP, + + /** + * The paxos sequence number. Not all messages will have this. + */ + AS_CLUSTERING_MSG_SEQUENCE_NUMBER, + + /** + * The proposed cluster key. Only part of the paxos accept message. + */ + AS_CLUSTERING_MSG_CLUSTER_KEY, + + /** + * The proposed succession list. Only part of the paxos accept message. + */ + AS_CLUSTERING_MSG_SUCCESSION_LIST, + + /** + * The proposed principal relevant only to cluster move commands, which will + * merge two well formed paxos clusters. + */ + AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, + + /** + * Sentinel value to keep track of the number of message fields. + */ + AS_CLUSTERING_MGS_SENTINEL +} as_clustering_msg_field; + +/** + * Internal clustering event type. + */ +typedef enum +{ + /** + * Timer event. + */ + AS_CLUSTERING_INTERNAL_EVENT_TIMER, + + /** + * Incoming message event. + */ + AS_CLUSTERING_INTERNAL_EVENT_MSG, + + /** + * A join request was accepted. + */ + AS_CLUSTERING_INTERNAL_EVENT_JOIN_REQUEST_ACCEPTED, + + /** + * Indicates the start of a quantum interval. + */ + AS_CLUSTERING_INTERNAL_EVENT_QUANTUM_INTERVAL_START, + + /** + * Indicates that self node's cluster membership changed. + */ + AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_CHANGED, + + /** + * Indicates that self node's cluster membership has been synced across all + * cluster members. + */ + AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED, + + /** + * Indicates that self node has been marked as an orphan. + */ + AS_CLUSTERING_INTERNAL_EVENT_REGISTER_ORPHANED, + + /** + * Indicates an incoming heartbeat event. + */ + AS_CLUSTERING_INTERNAL_EVENT_HB, + + /** + * Indicates that plugin data for a node has changed. + */ + AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED, + + /** + * The paxos round being accepted succeeded and the proposed value should be + * committed. + * This implies that all the proposed cluster members have all agreed on the + * proposed cluster key and the proposed cluster membership. + */ + AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_SUCCESS, + + /** + * The last paxos round being accepted failed. 
+ */
+	AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_FAIL,
+
+	/**
+	 * The paxos round proposed by this node succeeded.
+	 */
+	AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_SUCCESS,
+
+	/**
+	 * The last paxos round proposed failed.
+	 */
+	AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_FAIL,
+} as_clustering_internal_event_type;
+
+/**
+ * An event used internally by the clustering subsystem.
+ */
+typedef struct as_clustering_internal_event_s
+{
+	/**
+	 * The event type.
+	 */
+	as_clustering_internal_event_type type;
+
+	/**
+	 * The event qualifier.
+	 */
+	as_clustering_event_qualifier qualifier;
+
+	/*
+	 * ----- Quantum interval start event related fields.
+	 */
+	/**
+	 * Indicates if this quantum interval start can be skipped by the event
+	 * handler.
+	 */
+	bool quantum_interval_is_skippable;
+
+	/*
+	 * ----- Message event related fields.
+	 */
+	/**
+	 * The source node id.
+	 */
+	cf_node msg_src_nodeid;
+
+	/**
+	 * Incoming message type.
+	 */
+	as_clustering_msg_type msg_type;
+
+	/**
+	 * The hlc timestamp for message receipt.
+	 */
+	as_hlc_msg_timestamp msg_hlc_ts;
+
+	/**
+	 * Local monotonic received timestamp.
+	 */
+	cf_clock msg_recvd_ts;
+
+	/**
+	 * The received message.
+	 */
+	msg* msg;
+
+	/*
+	 * ----- HB event related fields.
+	 */
+	/**
+	 * Number of heartbeat events.
+	 */
+	int hb_n_events;
+
+	/**
+	 * Heartbeat events.
+	 */
+	as_hb_event_node* hb_events;
+
+	/*
+	 * ----- HB plugin data changed event related fields.
+	 */
+	/**
+	 * Node id of the node whose plugin data has changed.
+	 */
+	cf_node plugin_data_changed_nodeid;
+
+	/**
+	 * Node's plugin data.
+	 */
+	as_hb_plugin_node_data* plugin_data;
+
+	/**
+	 * The hlc timestamp for the plugin data change receipt.
+	 */
+	as_hlc_msg_timestamp plugin_data_changed_hlc_ts;
+
+	/**
+	 * Local monotonic received timestamp.
+	 */
+	cf_clock plugin_data_changed_ts;
+
+	/*
+	 * ----- Join request handled related fields.
+	 */
+	cf_node join_request_source_nodeid;
+
+	/*
+	 * ----- Paxos success related fields.
+	 */
+	/**
+	 * New succession list.
+	 */
+	cf_vector *new_succession_list;
+
+	/**
+	 * New cluster key.
+	 */
+	as_cluster_key new_cluster_key;
+
+	/**
+	 * New paxos sequence number.
+	 */
+	as_paxos_sequence_number new_sequence_number;
+} as_clustering_internal_event;
+
+/**
+ * The clustering timer state.
+ */
+typedef struct as_clustering_timer_s
+{
+	/**
+	 * The timer thread id.
+	 */
+	pthread_t timer_tid;
+} as_clustering_timer;
+
+/**
+ * Clustering subsystem state.
+ */
+typedef enum
+{
+	AS_CLUSTERING_SYS_STATE_UNINITIALIZED,
+	AS_CLUSTERING_SYS_STATE_RUNNING,
+	AS_CLUSTERING_SYS_STATE_SHUTTING_DOWN,
+	AS_CLUSTERING_SYS_STATE_STOPPED
+} as_clustering_sys_state;
+
+/**
+ * Type of quantum interval fault. Ensure the vtable in the quantum interval
+ * generator is updated for each type.
+ */
+typedef enum as_clustering_quantum_fault_type_e
+{
+	/**
+	 * A new node arrived.
+	 */
+	QUANTUM_FAULT_NODE_ARRIVED,
+
+	/**
+	 * A node other than our principal departed from the cluster.
+	 */
+	QUANTUM_FAULT_NODE_DEPARTED,
+
+	/**
+	 * We are in a cluster and our principal departed.
+	 */
+	QUANTUM_FAULT_PRINCIPAL_DEPARTED,
+
+	/**
+	 * A member node's adjacency list has changed.
+	 */
+	QUANTUM_FAULT_PEER_ADJACENCY_CHANGED,
+
+	/**
+	 * Join request accepted.
+	 */
+	QUANTUM_FAULT_JOIN_ACCEPTED,
+
+	/**
+	 * We have seen a principal who might send us a merge request.
+	 */
+	QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN,
+
+	/**
+	 * A node in our cluster has been orphaned.
+	 */
+	QUANTUM_FAULT_CLUSTER_MEMBER_ORPHANED,
+
+	/**
+	 * Sentinel value. Should be the last in the enum.
+ */
+	QUANTUM_FAULT_TYPE_SENTINEL
+} as_clustering_quantum_fault_type;
+
+/**
+ * Fault information for the first fault event detected in a quantum interval.
+ */
+typedef struct as_clustering_quantum_fault_s
+{
+	/**
+	 * First time the fault event was detected in the current quantum based on
+	 * the monotonic clock. Should be initialized to zero at quantum start /
+	 * end.
+	 */
+	cf_clock event_ts;
+
+	/**
+	 * Last time the fault event was detected in the current quantum based on
+	 * the monotonic clock. Should be initialized to zero at quantum start /
+	 * end.
+	 */
+	cf_clock last_event_ts;
+} as_clustering_quantum_fault;
+
+/**
+ * Function to determine the minimum wait time after a given fault happens.
+ */
+typedef uint32_t
+(as_clustering_quantum_fault_wait_fn)(as_clustering_quantum_fault* fault);
+
+/**
+ * Vtable for different types of faults.
+ */
+typedef struct as_clustering_quantum_fault_vtable_s
+{
+	/**
+	 * String used to log this fault type.
+	 */
+	char *fault_log_str;
+
+	/**
+	 * Function providing the wait time for this fault type.
+	 */
+	as_clustering_quantum_fault_wait_fn* wait_fn;
+} as_clustering_quantum_fault_vtable;
+
+/**
+ * Generates quantum intervals.
+ */
+typedef struct as_clustering_quantum_interval_generator_s
+{
+	/**
+	 * Quantum interval fault vtable.
+	 */
+	as_clustering_quantum_fault_vtable vtable[QUANTUM_FAULT_TYPE_SENTINEL];
+
+	/**
+	 * Quantum interval faults.
+	 */
+	as_clustering_quantum_fault fault[QUANTUM_FAULT_TYPE_SENTINEL];
+
+	/**
+	 * Time the quantum interval last started.
+	 */
+	cf_clock last_quantum_start_time;
+
+	/**
+	 * For deciding whether a quantum interval start is skippable, remember the
+	 * last quantum interval, since quantum_interval() will be affected by
+	 * changes to hb config.
+	 */
+	uint32_t last_quantum_interval;
+
+	/**
+	 * Indicates if the current quantum interval should be postponed.
+	 */
+	bool is_interval_postponed;
+} as_clustering_quantum_interval_generator;
+
+/**
+ * State of the clustering register.
+ */
+typedef enum
+{
+	/**
+	 * The register contents are in sync with all cluster members.
+	 */
+	AS_CLUSTERING_REGISTER_STATE_SYNCED,
+
+	/**
+	 * The register contents are being synced with other cluster members.
+	 */
+	AS_CLUSTERING_REGISTER_STATE_SYNCING
+} as_clustering_register_state;
+
+/**
+ * Stores current cluster key and succession list and generates external events.
+ */
+typedef struct as_clustering_register_s
+{
+	/**
+	 * The register state.
+	 */
+	as_clustering_register_state state;
+
+	/**
+	 * Current cluster key.
+	 */
+	as_cluster_key cluster_key;
+
+	/**
+	 * Current succession list.
+	 */
+	cf_vector succession_list;
+
+	/**
+	 * Indicates if this node has transitioned to orphan state after being in a
+	 * valid cluster.
+	 */
+	bool has_orphan_transitioned;
+
+	/**
+	 * The sequence number for the current cluster.
+	 */
+	as_paxos_sequence_number sequence_number;
+
+	/**
+	 * Nodes pending sync.
+	 */
+	cf_vector sync_pending;
+
+	/**
+	 * Nodes that sent a change applied message for an unexpected cluster.
+	 * Store them in case this is an imminent cluster change we will see in the
+	 * future. All the nodes in this vector have sent the same cluster key and
+	 * the same succession list.
+	 */
+	cf_vector ooo_change_applied_received;
+
+	/**
+	 * Cluster key sent by nodes in the ooo_change_applied_received vector.
+	 */
+	as_cluster_key ooo_cluster_key;
+
+	/**
+	 * Succession list sent by nodes in the ooo_change_applied_received vector.
+	 */
+	cf_vector ooo_succession_list;
+
+	/**
+	 * Timestamp of the first ooo change applied message.
+ */
+	as_hlc_timestamp ooo_hlc_timestamp;
+
+	/**
+	 * The time the cluster last changed.
+	 */
+	as_hlc_timestamp cluster_modified_hlc_ts;
+
+	/**
+	 * The monotonic clock time the cluster last changed.
+	 */
+	cf_clock cluster_modified_time;
+
+	/**
+	 * The last time the register sync was checked in the syncing state.
+	 */
+	cf_clock last_sync_check_time;
+} as_clustering_register;
+
+/**
+ * Clustering state.
+ */
+typedef enum
+{
+	/**
+	 * Self node is not part of a cluster.
+	 */
+	AS_CLUSTERING_STATE_ORPHAN,
+
+	/**
+	 * Self node is part of a cluster and is the principal.
+	 */
+	AS_CLUSTERING_STATE_PRINCIPAL,
+
+	/**
+	 * Self node is part of a cluster but not the principal.
+	 */
+	AS_CLUSTERING_STATE_NON_PRINCIPAL
+} as_clustering_state;
+
+/**
+ * Clustering state maintained by this node.
+ */
+typedef struct as_clustering_s
+{
+	/**
+	 * Clustering submodule state, indicates if the clustering sub system is
+	 * running, stopped or initialized.
+	 */
+	as_clustering_sys_state sys_state;
+
+	/**
+	 * Simple view of whether or not the cluster is well-formed.
+	 */
+	bool has_integrity;
+
+	/**
+	 * Clustering relevant state, e.g. orphan, principal, non-principal.
+	 */
+	as_clustering_state state;
+
+	/**
+	 * The preferred principal is a node such that removing the current
+	 * principal and making said node the new principal will lead to a larger
+	 * cluster. This is updated in the non-principal state at each quantum
+	 * interval and is sent out with each heartbeat pulse.
+	 */
+	cf_node preferred_principal;
+
+	/**
+	 * Pending join requests.
+	 */
+	cf_vector pending_join_requests;
+
+	/**
+	 * The monotonic clock time when this node entered orphan state.
+	 * Will be set to zero when the node is not an orphan.
+	 */
+	cf_clock orphan_state_start_time;
+
+	/**
+	 * Time when the last move command was sent.
+	 */
+	cf_clock move_cmd_issue_time;
+
+	/**
+	 * Hash from each node a join request was sent to, to the time the join
+	 * request was sent. Used to prevent sending join requests too quickly to
+	 * the same principal again and again.
+	 */
+	cf_shash* join_request_blackout;
+
+	/**
+	 * The principal to which the last join request was sent.
+	 */
+	cf_node last_join_request_principal;
+
+	/**
+	 * The time at which the last join request was sent, to track and timeout
+	 * join requests.
+	 */
+	cf_clock last_join_request_sent_time;
+
+	/**
+	 * The time at which the last join request was retransmitted, to track and
+	 * retransmit join requests.
+	 */
+	cf_clock last_join_request_retransmit_time;
+} as_clustering;
+
+/**
+ * Result of sending out a join request.
+ */
+typedef enum as_clustering_join_request_result_e
+{
+	/**
+	 * Join request was sent out.
+	 */
+	AS_CLUSTERING_JOIN_REQUEST_SENT,
+
+	/**
+	 * Join request was attempted, but sending failed.
+	 */
+	AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED,
+
+	/**
+	 * Join request already pending. A new join request was not sent.
+	 */
+	AS_CLUSTERING_JOIN_REQUEST_PENDING,
+
+	/**
+	 * No neighboring principals present to send the join request.
+	 */
+	AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS
+} as_clustering_join_request_result;
+
+/**
+ * External event publisher state.
+ */
+typedef struct as_clustering_external_event_publisher_s
+{
+	/**
+	 * State of the external event publisher.
+	 */
+	as_clustering_sys_state sys_state;
+
+	/**
+	 * Indicates if there is an event to publish.
+	 */
+	bool event_queued;
+
+	/**
+	 * The pending event to publish.
+	 */
+	as_clustering_event to_publish;
+
+	/**
+	 * The static succession list published with the message.
+ */
+	cf_vector published_succession_list;
+
+	/**
+	 * Conditional variable to signal pending event to publish.
+	 */
+	pthread_cond_t is_pending;
+
+	/**
+	 * Thread id of the publisher thread.
+	 */
+	pthread_t event_publisher_tid;
+
+	/**
+	 * Mutex to protect the conditional variable.
+	 */
+	pthread_mutex_t is_pending_mutex;
+} as_clustering_external_event_publisher;
+
+/*
+ * ----------------------------------------------------------------------------
+ * Forward declarations
+ * ----------------------------------------------------------------------------
+ */
+static void
+internal_event_dispatch(as_clustering_internal_event* timer_event);
+static bool
+clustering_is_our_principal(cf_node nodeid);
+static bool
+clustering_is_principal();
+static bool
+clustering_is_cluster_member(cf_node nodeid);
+
+/*
+ * ----------------------------------------------------------------------------
+ * Non-public hooks to exchange subsystem.
+ * ----------------------------------------------------------------------------
+ */
+extern void
+exchange_clustering_event_listener(as_clustering_event* event);
+
+/*
+ * ----------------------------------------------------------------------------
+ * Timer, timeout values and intervals
+ *
+ * All values should be multiples of timer tick interval.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Timer tick interval, which should be a GCD of all clustering intervals.
+ */
+static uint32_t
+timer_tick_interval()
+{
+	return CLUSTERING_TIMER_TICK_INTERVAL;
+}
+
+/**
+ * Maximum network latency for the cluster.
+ */
+static uint32_t
+network_latency_max()
+{
+	return g_config.fabric_latency_max_ms;
+}
+
+/**
+ * Maximum network rtt for the cluster.
+ */
+static uint32_t
+network_rtt_max()
+{
+	return 2 * network_latency_max();
+}
+
+/**
+ * Quantum interval in milliseconds.
+ */
+static uint32_t
+quantum_interval()
+{
+	uint32_t std_quantum_interval = MIN(QUANTUM_INTERVAL_MAX,
+			as_hb_node_timeout_get()
+					+ 2 * (as_hb_tx_interval_get() + network_latency_max()));
+
+	// Ensure we give paxos enough time to complete.
+	return MAX(PAXOS_COMPLETION_TIME_MAX, std_quantum_interval);
+}
+
+/**
+ * Maximum number of times quantum interval start can be skipped.
+ */
+static uint32_t
+quantum_interval_skip_max()
+{
+	return 2;
+}
+
+/**
+ * Interval at which register sync is checked.
+ */
+static uint32_t
+register_sync_check_interval()
+{
+	return MAX(network_rtt_max(), as_hb_tx_interval_get());
+}
+
+/**
+ * Timeout for a join request, should definitely be larger than a quantum
+ * interval to prevent the requesting node from making new requests before the
+ * current requested principal node can finish the paxos round.
+ */
+static uint32_t
+join_request_timeout()
+{
+	// Allow for
+	// - 1 quantum interval, where our request lands just after the potential
+	// principal's quantum interval start.
+	// - 0.5 quantum intervals to give time for a paxos round to finish
+	// - (quantum_interval_skip_max - 1) intervals if the principal had to skip
+	// quantum intervals.
+	return (uint32_t)(
+			(1 + 0.5 + (quantum_interval_skip_max() - 1)) * quantum_interval());
+}
+
+/**
+ * Timeout for retransmitting a join request.
+ */
+static uint32_t
+join_request_retransmit_timeout()
+{
+	return (uint32_t)(MIN(as_hb_tx_interval_get() / 2, quantum_interval() / 2));
+}
+
+/**
+ * The interval at which a node checks to see if it should join a cluster.
+ */
+static uint32_t
+join_cluster_check_interval()
+{
+	return timer_tick_interval();
+}
+
+/**
+ * Blackout period for join requests to a particular principal to prevent
+ * bombarding it with join requests. Should be less than join_request_timeout().
+ */
+static uint32_t
+join_request_blackout_interval()
+{
+	return MIN(join_request_timeout(),
+			MIN(quantum_interval() / 2, 2 * as_hb_tx_interval_get()));
+}
+
+/**
+ * Blackout period after sending a move command, during which join requests will
+ * be rejected.
+ */
+static uint32_t
+join_request_move_reject_interval()
+{
+	// Wait for one quantum interval before accepting join requests after
+	// sending a move command.
+	return quantum_interval();
+}
+
+/**
+ * Maximum tolerable join request transmission delay in milliseconds. Join
+ * requests delayed by more than this amount will not be accepted.
+ */
+static uint32_t
+join_request_accept_delay_max()
+{
+	// A join request is considered stale / delayed if the (received hlc
+	// timestamp - send hlc timestamp) > this value.
+	return (2 * as_hb_tx_interval_get() + network_latency_max());
+}
+
+/**
+ * Timeout in milliseconds for a paxos proposal. Give a paxos round half a
+ * quantum interval to time out.
+ * A paxos round should definitely timeout before the next quantum interval, so
+ * that it does not delay cluster convergence.
+ */
+static uint32_t
+paxos_proposal_timeout()
+{
+	return MAX(quantum_interval() / 2, network_rtt_max());
+}
+
+/**
+ * Timeout in milliseconds after which a paxos message is retransmitted.
+ */
+static uint32_t
+paxos_msg_timeout()
+{
+	return MAX(MIN(quantum_interval() / 4, 100), network_rtt_max());
+}
+
+/**
+ * Maximum amount of time a node will be in orphan state. After this timeout the
+ * node will try forming a new cluster even if there are other adjacent
+ * clusters/nodes visible.
+ */
+static uint32_t
+clustering_orphan_timeout()
+{
+	return UINT_MAX;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Stack allocation
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Maximum memory size allocated on the call stack.
+ */
+#define STACK_ALLOC_LIMIT() (16 * 1024)
+
+/**
+ * Allocate a buffer on the stack if possible. Larger buffers are heap
+ * allocated to prevent stack overflows.
+ */
+#define BUFFER_ALLOC_OR_DIE(size) \
+(((size) > STACK_ALLOC_LIMIT()) ? cf_malloc(size) : alloca(size))
+
+/**
+ * Free a buffer allocated by BUFFER_ALLOC_OR_DIE.
+ */
+#define BUFFER_FREE(buffer, size) \
+if (((size) > STACK_ALLOC_LIMIT()) && buffer) {cf_free(buffer);}
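+
+/*
+ * Illustrative only (not part of the original patch): a minimal sketch of the
+ * intended BUFFER_ALLOC_OR_DIE / BUFFER_FREE usage, assuming a hypothetical
+ * caller with an element count n_nodes. Buffers at or below the stack limit
+ * come from alloca in the calling frame and must not be freed; larger buffers
+ * are heap allocated, so the same size expression must be passed to
+ * BUFFER_FREE to release them.
+ */
+#if 0
+static void
+buffer_alloc_usage_sketch(uint32_t n_nodes)
+{
+	size_t buffer_size = n_nodes * sizeof(cf_node);
+	cf_node* nodes = BUFFER_ALLOC_OR_DIE(buffer_size);
+
+	// ... populate and use nodes ...
+
+	// Frees only when the buffer was heap allocated.
+	BUFFER_FREE(nodes, buffer_size);
+}
+#endif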
+
+/*
+ * ----------------------------------------------------------------------------
+ * Logging
+ * ----------------------------------------------------------------------------
+ */
+#define LOG_LENGTH_MAX() (800)
+#define CRASH(format, ...) cf_crash(AS_CLUSTERING, format, ##__VA_ARGS__)
+#define WARNING(format, ...) cf_warning(AS_CLUSTERING, format, ##__VA_ARGS__)
+#define INFO(format, ...) cf_info(AS_CLUSTERING, format, ##__VA_ARGS__)
+#define DEBUG(format, ...) cf_debug(AS_CLUSTERING, format, ##__VA_ARGS__)
+#define DETAIL(format, ...) cf_detail(AS_CLUSTERING, format, ##__VA_ARGS__)
+
+#ifdef TRACE_ENABLED
+#define TRACE(format, ...) cf_detail(AS_CLUSTERING, format, ##__VA_ARGS__)
+#else
+#define TRACE(format, ...)
+#endif
+
+#ifdef TRACE_ENABLED
+#define TRACE_LOG(context, format, ...) cf_detail(context, format, ##__VA_ARGS__)
+#else
+#define TRACE_LOG(context, format, ...)
+#endif
+
+#define CF_TRACE CF_FAULT_SEVERITY_UNDEF
+
+#define ASSERT(expression, message, ...) \
+if (!(expression)) {WARNING(message, ##__VA_ARGS__);}
+
+#define log_cf_node_array(message, nodes, node_count, severity) \
+as_clustering_log_cf_node_array(severity, AS_CLUSTERING, message, \
+		nodes, node_count)
+#define log_cf_node_vector(message, nodes, severity) \
+as_clustering_log_cf_node_vector(severity, AS_CLUSTERING, message, \
+		nodes)
+
+/*
+ * ----------------------------------------------------------------------------
+ * Vector functions
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Clear / delete all entries in a vector.
+ */
+static void
+vector_clear(cf_vector* vector)
+{
+	cf_vector_delete_range(vector, 0, cf_vector_size(vector));
+}
+
+/**
+ * Create temporary stack variables.
+ */
+#define TOKEN_PASTE(x, y) x##y
+#define STACK_VAR(x, y) TOKEN_PASTE(x, y)
+
+/**
+ * Initialize a lockless vector, initially sized to store the soft maximum
+ * number of cluster nodes.
+ */
+#define vector_lockless_init(vectorp, value_type) \
+({ \
+	cf_vector_init(vectorp, sizeof(value_type), \
+			AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); \
+})
+
+/**
+ * Create and initialize a lockless stack allocated vector, initially sized to
+ * store the soft maximum number of cluster nodes.
+ */
+#define vector_stack_lockless_create(value_type) \
+({ \
+	cf_vector * STACK_VAR(vector, __LINE__) = (cf_vector*)alloca( \
+			sizeof(cf_vector)); \
+	size_t buffer_size = AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT \
+			* sizeof(value_type); \
+	void* STACK_VAR(buff, __LINE__) = alloca(buffer_size); \
+	cf_vector_init_smalloc( \
+			STACK_VAR(vector, __LINE__), sizeof(value_type), \
+			(uint8_t*)STACK_VAR(buff, __LINE__), buffer_size, \
+			VECTOR_FLAG_INITZERO); \
+	STACK_VAR(vector, __LINE__); \
+})
+
+/**
+ * Check two vectors for equality. Two vectors are equal if they have the same
+ * number of elements and corresponding elements are equal. For now a simple
+ * memory compare is used to compare elements. Assumes the vectors are not
+ * accessed by other threads during this operation.
+ *
+ * @param v1 the first vector to compare.
+ * @param v2 the second vector to compare.
+ * @return true if the vectors are equal, false otherwise.
+ */
+static bool
+vector_equals(cf_vector* v1, cf_vector* v2)
+{
+	int v1_count = cf_vector_size(v1);
+	int v2_count = cf_vector_size(v2);
+	int v1_elem_sz = VECTOR_ELEM_SZ(v1);
+	int v2_elem_sz = VECTOR_ELEM_SZ(v2);
+
+	if (v1_count != v2_count || v1_elem_sz != v2_elem_sz) {
+		return false;
+	}
+
+	for (int i = 0; i < v1_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		void* v1_element = cf_vector_getp(v1, i);
+		void* v2_element = cf_vector_getp(v2, i);
+
+		if (v1_element == v2_element) {
+			// Same reference or both are NULL.
+			continue;
+		}
+
+		if (v1_element == NULL || v2_element == NULL) {
+			// Exactly one reference is NULL.
+			return false;
+		}
+
+		if (memcmp(v1_element, v2_element, v1_elem_sz) != 0) {
+			return false;
+		}
+	}
+
+	return true;
+}
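+
+/*
+ * Illustrative only (not part of the original patch): a minimal sketch of the
+ * stack vector helpers above, using a hypothetical succession list comparison.
+ * vector_stack_lockless_create is a statement expression, so the returned
+ * vector and its backing buffer live in the calling function's stack frame and
+ * need no explicit destroy.
+ */
+#if 0
+static bool
+succession_list_matches_sketch(cf_vector* current, cf_node* nodes,
+		uint32_t n_nodes)
+{
+	cf_vector* copy = vector_stack_lockless_create(cf_node);
+
+	for (uint32_t i = 0; i < n_nodes; i++) {
+		cf_vector_append(copy, &nodes[i]);
+	}
+
+	// memcmp based equality on element count and contents.
+	return vector_equals(current, copy);
+}
+#endif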
+
+/**
+ * Find the index of an element in the vector. Equality is based on mem compare.
+ *
+ * @param vector the source vector.
+ * @param element the element to find.
+ * @return the index if the element is found, -1 otherwise.
+ */
+static int
+vector_find(cf_vector* vector, void* element)
+{
+	int element_count = cf_vector_size(vector);
+	size_t value_len = VECTOR_ELEM_SZ(vector);
+	for (int i = 0; i < element_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		void* src_element = cf_vector_getp(vector, i);
+		if (src_element) {
+			if (memcmp(element, src_element, value_len) == 0) {
+				return i;
+			}
+		}
+	}
+	return -1;
+}
+
+/**
+ * Copy all elements from the source vector to the destination vector. Assumes
+ * the source and destination vector are not being modified while the copy
+ * operation is in progress.
+ *
+ * @param dest the destination vector.
+ * @param src the source vector.
+ * @return the number of elements copied.
+ */
+static int
+vector_copy(cf_vector* dest, cf_vector* src)
+{
+	int element_count = cf_vector_size(src);
+	int copied_count = 0;
+	for (int i = 0; i < element_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		void* src_element = cf_vector_getp(src, i);
+		if (src_element) {
+			cf_vector_append(dest, src_element);
+			copied_count++;
+		}
+	}
+	return copied_count;
+}
+
+/**
+ * Copy all elements from the source vector to the destination vector only if
+ * they do not exist in the destination vector. Assumes the source and
+ * destination vector are not being modified while the copy operation is in
+ * progress.
+ *
+ * @param dest the destination vector.
+ * @param src the source vector.
+ * @return the number of elements copied.
+ */
+static int
+vector_copy_unique(cf_vector* dest, cf_vector* src)
+{
+	int element_count = cf_vector_size(src);
+	int copied_count = 0;
+	for (int i = 0; i < element_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		void* src_element = cf_vector_getp(src, i);
+		if (src_element) {
+			cf_vector_append_unique(dest, src_element);
+			copied_count++;
+		}
+	}
+	return copied_count;
+}
+
+/**
+ * Sorts the elements in the vector in place using the input comparator function
+ * and retains only unique elements. Assumes the source vector is not being
+ * modified while the sort operation is in progress.
+ *
+ * @param src the source vector.
+ * @param comparator the comparator function, which must return an integer less
+ * than, equal to, or greater than zero if the first argument is considered to
+ * be respectively less than, equal to, or greater than the second.
+ */
+static void
+vector_sort_unique(cf_vector* src, int
+(*comparator)(const void*, const void*))
+{
+	int element_count = cf_vector_size(src);
+	size_t value_len = VECTOR_ELEM_SZ(src);
+	size_t array_size = element_count * value_len;
+	void* element_array = BUFFER_ALLOC_OR_DIE(array_size);
+
+	// A lame approach to sorting. Copying the elements to an array and
+	// invoking qsort.
+	uint8_t* next_element_ptr = element_array;
+	int array_element_count = 0;
+	for (int i = 0; i < element_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		void* src_element = cf_vector_getp(src, i);
+		if (src_element) {
+			memcpy(next_element_ptr, src_element, value_len);
+			next_element_ptr += value_len;
+			array_element_count++;
+		}
+	}
+
+	qsort(element_array, array_element_count, value_len, comparator);
+
+	vector_clear(src);
+	next_element_ptr = element_array;
+	for (int i = 0; i < array_element_count; i++) {
+		cf_vector_append_unique(src, next_element_ptr);
+		next_element_ptr += value_len;
+	}
+
+	BUFFER_FREE(element_array, array_size);
+	return;
+}
+
+/**
+ * Remove all elements in the to_remove vector from the target vector.
+ * Equality is based on simple mem compare.
+ *
+ * @param target the target vector being modified.
+ * @param to_remove the vector whose elements must be removed from the target.
+ * @return the number of elements removed.
+ */
+static int
+vector_subtract(cf_vector* target, cf_vector* to_remove)
+{
+	int element_count = cf_vector_size(to_remove);
+	int removed_count = 0;
+	for (int i = 0; i < element_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		void* to_remove_element = cf_vector_getp(to_remove, i);
+		if (to_remove_element) {
+			int found_at = 0;
+			while ((found_at = vector_find(target, to_remove_element)) >= 0) {
+				cf_vector_delete(target, found_at);
+				removed_count++;
+			}
+		}
+	}
+
+	return removed_count;
+}
+
+/**
+ * Convert a vector to an array.
+ * FIXME: return pointer to the internal vector storage.
+ */
+static cf_node*
+vector_to_array(cf_vector* vector)
+{
+	return (cf_node*)vector->vector;
+}
+
+/**
+ * Copy elements in a vector to an array.
+ * @param array the destination array. Should be large enough to hold all
+ * elements in the vector.
+ * @param src the source vector.
+ * @param element_count the number of elements to copy from the source vector.
+ */
+static void
+vector_array_cpy(void* array, cf_vector* src, int element_count)
+{
+	uint8_t* element_ptr = array;
+	int element_size = VECTOR_ELEM_SZ(src);
+	for (int i = 0; i < element_count; i++) {
+		cf_vector_get(src, i, element_ptr);
+		element_ptr += element_size;
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Globals
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * The big fat lock for all clustering state.
+ */
+static pthread_mutex_t g_clustering_lock =
+		PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+
+/**
+ * The fat lock for all clustering event listener changes.
+ */
+static pthread_mutex_t g_clustering_event_publisher_lock =
+		PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+
+/**
+ * Debugging lock acquisition.
+ * #define LOCK_DEBUG_ENABLED 1
+ */
+#ifdef LOCK_DEBUG_ENABLED
+#define LOCK_DEBUG(format, ...) DEBUG(format, ##__VA_ARGS__)
+#else
+#define LOCK_DEBUG(format, ...)
+#endif
+
+/**
+ * Acquire a lock on the clustering module.
+ */
+#define CLUSTERING_LOCK() \
+({ \
+	pthread_mutex_lock (&g_clustering_lock); \
+	LOCK_DEBUG("locked in %s", __FUNCTION__); \
+})
+
+/**
+ * Relinquish the lock on the clustering module.
+ */
+#define CLUSTERING_UNLOCK() \
+({ \
+	pthread_mutex_unlock (&g_clustering_lock); \
+	LOCK_DEBUG("unlocked in %s", __FUNCTION__); \
+})
+
+/**
+ * Acquire a lock on the clustering publisher.
+ */
+#define CLUSTERING_EVENT_PUBLISHER_LOCK() \
+({ \
+	pthread_mutex_lock (&g_clustering_event_publisher_lock); \
+	LOCK_DEBUG("publisher locked in %s", __FUNCTION__); \
+})
+
+/**
+ * Relinquish the lock on the clustering publisher.
+ */
+#define CLUSTERING_EVENT_PUBLISHER_UNLOCK() \
+({ \
+	pthread_mutex_unlock (&g_clustering_event_publisher_lock); \
+	LOCK_DEBUG("publisher unlocked in %s", __FUNCTION__); \
+})
+
+/**
+ * Singleton timer.
+ */
+static as_clustering_timer g_timer;
+
+/**
+ * Singleton external events publisher.
+ */
+static as_clustering_external_event_publisher g_external_event_publisher;
+
+/**
+ * Singleton cluster register to store this node's cluster membership.
+ */
+static as_clustering_register g_register;
+
+/**
+ * Singleton clustering state, all initialized to zero.
+ */
+static as_clustering g_clustering = { 0 };
+
+/**
+ * Singleton paxos proposer.
+ */
+static as_paxos_proposer g_proposer;
+
+/**
+ * Singleton paxos acceptor.
+ */
+static as_paxos_acceptor g_acceptor;
+
+/**
+ * Singleton quantum interval generator.
+ */
+static as_clustering_quantum_interval_generator g_quantum_interval_generator;
+
+/**
+ * Message template for clustering messages.
+ */
+static msg_template g_clustering_msg_template[] = {
+	{ AS_CLUSTERING_MSG_ID, M_FT_UINT32 },
+	{ AS_CLUSTERING_MSG_TYPE, M_FT_UINT32 },
+	{ AS_CLUSTERING_MSG_HLC_TIMESTAMP, M_FT_UINT64 },
+	{ AS_CLUSTERING_MSG_SEQUENCE_NUMBER, M_FT_UINT64 },
+	{ AS_CLUSTERING_MSG_CLUSTER_KEY, M_FT_UINT64 },
+	{ AS_CLUSTERING_MSG_SUCCESSION_LIST, M_FT_BUF },
+	{ AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, M_FT_UINT64 }
+};
+
+/*
+ * ----------------------------------------------------------------------------
+ * Clustering life cycle
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Check if clustering is initialized.
+ */
+static bool
+clustering_is_initialized()
+{
+	CLUSTERING_LOCK();
+	bool initialized = (g_clustering.sys_state
+			!= AS_CLUSTERING_SYS_STATE_UNINITIALIZED);
+	CLUSTERING_UNLOCK();
+	return initialized;
+}
+
+/**
+ * Check if clustering is running.
+ */
+static bool
+clustering_is_running()
+{
+	CLUSTERING_LOCK();
+	bool running = g_clustering.sys_state == AS_CLUSTERING_SYS_STATE_RUNNING;
+	CLUSTERING_UNLOCK();
+	return running;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Config related functions
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * The nodeid for this node.
+ */
+static cf_node
+config_self_nodeid_get()
+{
+	return g_config.self_node;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Compatibility mode functions
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Return the current protocol version identifier.
+ */
+as_cluster_proto_identifier
+clustering_protocol_identifier_get()
+{
+	return 0x707C;
+}
+
+/**
+ * Compare clustering protocol versions for compatibility.
+ */
+bool
+clustering_versions_are_compatible(as_cluster_proto_identifier v1,
+		as_cluster_proto_identifier v2)
+{
+	return v1 == v2;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Timer event generator
+ *
+ * TODO: Can be abstracted out as a single scheduler utility across modules.
+ * ----------------------------------------------------------------------------
+ */
+
+static void
+timer_init()
+{
+	CLUSTERING_LOCK();
+	memset(&g_timer, 0, sizeof(g_timer));
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Clustering timer event generator thread, to help with retries and
+ * retransmits across all states.
+ */
+static void*
+timer_thr(void* arg)
+{
+	as_clustering_internal_event timer_event;
+	memset(&timer_event, 0, sizeof(timer_event));
+	timer_event.type = AS_CLUSTERING_INTERNAL_EVENT_TIMER;
+
+	while (clustering_is_running()) {
+		// Wait for a while and retry.
+		internal_event_dispatch(&timer_event);
+		usleep(timer_tick_interval() * 1000);
+	}
+
+	return NULL;
+}
+
+/**
+ * Start the timer.
+ */
+static void
+timer_start()
+{
+	CLUSTERING_LOCK();
+	if (pthread_create(&g_timer.timer_tid, 0, timer_thr, NULL) != 0) {
+		CRASH("could not create timer thread: %s", cf_strerror(errno));
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Stop the timer.
+ */
+static void
+timer_stop()
+{
+	CLUSTERING_LOCK();
+	pthread_join(g_timer.timer_tid, NULL);
+	CLUSTERING_UNLOCK();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Heartbeat subsystem interfacing
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * The structure of the data the clustering subsystem pushes within hb pulse
+ * messages, and retains as plugin data, is as follows.
+ *
+ * Each row occupies 4 bytes.
+ *
+ * V5 heartbeat wire payload structure.
+ * ===============================
+ *
+ * ------------|-------------|------------|------------|
+ * |         Clustering Protocol identifier            |
+ * |---------------------------------------------------|
+ * |                                                   |
+ * |-------- Cluster Key ------------------------------|
+ * |                                                   |
+ * |---------------------------------------------------|
+ * |                                                   |
+ * |-------- Paxos sequence number --------------------|
+ * |                                                   |
+ * |---------------------------------------------------|
+ * |                                                   |
+ * |-------- Preferred principal ----------------------|
+ * |                                                   |
+ * |---------------------------------------------------|
+ * |         Length of succession list                 |
+ * |---------------------------------------------------|
+ * |                                                   |
+ * |-------- Succ. Node id 0 --------------------------|
+ * |                                                   |
+ * |---------------------------------------------------|
+ * |                                                   |
+ * |-------- Succ. Node id 1 --------------------------|
+ * |                                                   |
+ * |---------------------------------------------------|
+ * |                      .                            |
+ * |                      .                            |
+ *
+ *
+ * The cluster key and succession list help with detecting cluster integrity.
+ * A plain cluster key should be good enough, but matching succession lists
+ * adds another level of safety (may not be required, but being cautious).
+ *
+ * For an orphaned node the cluster key and the length of the succession list
+ * are set to zero.
+ *
+ * The parsed hb plugin data is just the same as the wire payload structure.
+ * The plugin code ensures invalid content will never be parsed as plugin data
+ * into memory. The direct implication is that if plugin data is not NULL,
+ * required fields
+ * - Clustering protocol identifier
+ * - Cluster key
+ * - Succession list length will always be present when read back from the
+ * heartbeat subsystem and the succession list will be consistent with the
+ * succession list length.
+ */
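+
+/*
+ * Illustrative only (not part of the original patch): the wire payload layout
+ * described above, sketched as a packed struct. The real code deliberately
+ * reads the fields with offset arithmetic (see the accessor functions below),
+ * since the succession list length is variable.
+ */
+#if 0
+typedef struct as_clustering_hb_payload_sketch_s
+{
+	// Clustering protocol identifier.
+	uint32_t proto_identifier;
+
+	// Current cluster key, zero for an orphaned node.
+	as_cluster_key cluster_key;
+
+	// Paxos sequence number.
+	as_paxos_sequence_number sequence_number;
+
+	// Preferred principal node id.
+	cf_node preferred_principal;
+
+	// Succession list length, zero for an orphaned node.
+	uint32_t succession_length;
+
+	// succession_length node ids follow.
+	cf_node succession[];
+} __attribute__((__packed__)) as_clustering_hb_payload_sketch;
+#endif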
+
+/**
+ * Read plugin data from the hb layer for a node, using stack allocated space.
+ * Will make at most 3 attempts before crashing.
+ * plugin_data_p->data_size will be zero and plugin_data_p->data will be NULL if
+ * an entry for the node does not exist.
+ */
+#define clustering_hb_plugin_data_get(nodeid, plugin_data_p, \
+		hb_msg_hlc_ts_p, msg_recv_ts_p) \
+({ \
+	(plugin_data_p)->data_capacity = 1024; \
+	int tries_remaining = 3; \
+	bool enoent = false; \
+	int rv = -1; \
+	while (tries_remaining--) { \
+		(plugin_data_p)->data = alloca((plugin_data_p)->data_capacity); \
+		if (as_hb_plugin_data_get(nodeid, AS_HB_PLUGIN_CLUSTERING, \
+				plugin_data_p, hb_msg_hlc_ts_p, msg_recv_ts_p) == 0) { \
+			rv = 0; \
+			break; \
+		} \
+		if (errno == ENOENT) { \
+			enoent = true; \
+			break; \
+		} \
+		if (errno == ENOMEM) { \
+			(plugin_data_p)->data_capacity = (plugin_data_p)->data_size; \
+		} \
+	} \
+	if (rv != 0 && !enoent && tries_remaining < 0) { \
+		CRASH("error allocating space for paxos hb plugin data"); \
+	} \
+	if (enoent) { \
+		(plugin_data_p)->data_size = 0; \
+		(plugin_data_p)->data = NULL; \
+	} \
+	rv; \
+})
+
+/**
+ * Get a pointer to the protocol identifier inside plugin data. Will be NULL if
+ * plugin data is null or there are not enough bytes in the data to hold the
+ * identifier.
+ * @param plugin_data can be NULL.
+ * @param plugin_data_size the size of plugin data.
+ * @return pointer to the protocol identifier on success, NULL on failure.
+ */
+static as_cluster_proto_identifier*
+clustering_hb_plugin_proto_get(void* plugin_data, size_t plugin_data_size)
+{
+	if (plugin_data == NULL
+			|| plugin_data_size < sizeof(as_cluster_proto_identifier)) {
+		// The data is empty or too small to hold the protocol identifier.
+		return NULL;
+	}
+
+	return (as_cluster_proto_identifier*)plugin_data;
+}
+
+/**
+ * Retrieves the cluster key from clustering hb plugin data.
+ * @param plugin_data can be NULL.
+ * @param plugin_data_size the size of plugin data.
+ * @return pointer to the cluster key on success, NULL on failure.
+ */
+static as_cluster_key*
+clustering_hb_plugin_cluster_key_get(void* plugin_data, size_t plugin_data_size)
+{
+	uint8_t* proto = (uint8_t*)clustering_hb_plugin_proto_get(plugin_data,
+			plugin_data_size);
+	if (proto == NULL) {
+		// The data does not hold valid data.
+		return NULL;
+	}
+
+	if ((uint8_t*)plugin_data + plugin_data_size
+			< proto + sizeof(as_cluster_proto_identifier)
+					+ sizeof(as_cluster_key)) {
+		// Not enough bytes for the cluster key.
+		return NULL;
+	}
+
+	return (as_cluster_key*)(proto + sizeof(as_cluster_proto_identifier));
+}
+
+/**
+ * Retrieves the sequence number from clustering hb plugin data.
+ * @param plugin_data can be NULL.
+ * @param plugin_data_size the size of plugin data.
+ * @return pointer to the sequence number on success, NULL on failure.
+ */
+static as_paxos_sequence_number*
+clustering_hb_plugin_sequence_number_get(void* plugin_data,
+		size_t plugin_data_size)
+{
+	uint8_t* cluster_key = (uint8_t*)clustering_hb_plugin_cluster_key_get(
+			plugin_data, plugin_data_size);
+	if (cluster_key == NULL) {
+		// The data does not hold valid data or there is no cluster key.
+		return NULL;
+	}
+
+	if ((uint8_t*)plugin_data + plugin_data_size
+			< cluster_key + sizeof(as_cluster_key)
+					+ sizeof(as_paxos_sequence_number)) {
+		// Not enough bytes for the sequence number.
+		return NULL;
+	}
+
+	return (as_paxos_sequence_number*)(cluster_key + sizeof(as_cluster_key));
+}
+
+/**
+ * Retrieves the preferred principal from clustering hb plugin data.
+ * @param plugin_data can be NULL.
+ * @param plugin_data_size the size of plugin data.
+ * @return pointer to the preferred principal on success, NULL on failure.
+ */
+static cf_node*
+clustering_hb_plugin_preferred_principal_get(void* plugin_data,
+		size_t plugin_data_size)
+{
+	uint8_t* sequence_number_p =
+			(uint8_t*)clustering_hb_plugin_sequence_number_get(plugin_data,
+					plugin_data_size);
+	if (sequence_number_p == NULL) {
+		// The data does not hold valid data or there is no sequence number.
+		return NULL;
+	}
+
+	if ((uint8_t*)plugin_data + plugin_data_size
+			< sequence_number_p + sizeof(as_paxos_sequence_number)
+					+ sizeof(cf_node)) {
+		// Not enough bytes for the preferred principal.
+		return NULL;
+	}
+
+	return (cf_node*)(sequence_number_p
+			+ sizeof(as_paxos_sequence_number));
+}
+
+/**
+ * Retrieves the succession list length pointer from clustering hb plugin data.
+ * @param plugin_data can be NULL.
+ * @param plugin_data_size the size of plugin data.
+ * @return pointer to succession list length on success, NULL on failure.
+ */
+static uint32_t*
+clustering_hb_plugin_succession_length_get(void* plugin_data,
+		size_t plugin_data_size)
+{
+	uint8_t* preferred_principal_p =
+			(uint8_t*)clustering_hb_plugin_preferred_principal_get(plugin_data,
+					plugin_data_size);
+	if (preferred_principal_p == NULL) {
+		// The data does not hold valid data or there is no preferred
+		// principal.
+		return NULL;
+	}
+
+	if ((uint8_t*)plugin_data + plugin_data_size
+			< preferred_principal_p + sizeof(cf_node) + sizeof(uint32_t)) {
+		// Not enough bytes for the succession list length.
+		return NULL;
+	}
+
+	return (uint32_t*)(preferred_principal_p + sizeof(cf_node));
+}
+
+/**
+ * Retrieves the pointer to the first node in the succession list.
+ * @param plugin_data can be NULL.
+ * @param plugin_data_size the size of plugin data.
+ * @return pointer to first node in succession list on success, NULL on failure
+ * or if the succession list is empty.
+ */
+static cf_node*
+clustering_hb_plugin_succession_get(void* plugin_data, size_t plugin_data_size)
+{
+	uint8_t* succession_list_length_p =
+			(uint8_t*)clustering_hb_plugin_succession_length_get(plugin_data,
+					plugin_data_size);
+	if (succession_list_length_p == NULL) {
+		// The data does not hold valid data or the succession list length is
+		// missing.
+		return NULL;
+	}
+
+	if (*(uint32_t*)succession_list_length_p == 0) {
+		// Empty succession list.
+		return NULL;
+	}
+
+	if ((uint8_t*)plugin_data + plugin_data_size
+			< succession_list_length_p + sizeof(uint32_t)
+					+ (sizeof(cf_node) * (*(uint32_t*)succession_list_length_p))) {
+		// Not enough bytes for the succession list.
+		return NULL;
+	}
+
+	return (cf_node*)(succession_list_length_p + sizeof(uint32_t));
+}
+
+/**
+ * Validate the correctness of plugin data, by ensuring all required fields are
+ * present and the succession list matches the provided length.
+ * @param plugin_data can be NULL.
+ * @param plugin_data_size the size of plugin data.
+ * @return true if the plugin data is valid, false otherwise.
+ */
+static bool
+clustering_hb_plugin_data_is_valid(void* plugin_data, size_t plugin_data_size)
+{
+	void* proto_identifier_p = clustering_hb_plugin_proto_get(plugin_data,
+			plugin_data_size);
+	if (proto_identifier_p == NULL) {
+		DEBUG("plugin data missing protocol identifier");
+		return false;
+	}
+
+	as_cluster_proto_identifier current_proto_identifier =
+			clustering_protocol_identifier_get();
+	if (!clustering_versions_are_compatible(current_proto_identifier,
+			*(as_cluster_proto_identifier*)proto_identifier_p)) {
+		DEBUG("protocol versions incompatible - expected %"PRIx32" but was: %"PRIx32,
+				current_proto_identifier,
+				*(as_cluster_proto_identifier*)proto_identifier_p);
+		return false;
+	}
+
+	void* cluster_key_p = clustering_hb_plugin_cluster_key_get(plugin_data,
+			plugin_data_size);
+	if (cluster_key_p == NULL) {
+		DEBUG("plugin data missing cluster key");
+		return false;
+	}
+
+	void* sequence_number_p = clustering_hb_plugin_sequence_number_get(
+			plugin_data, plugin_data_size);
+	if (sequence_number_p == NULL) {
+		DEBUG("plugin data missing sequence number");
+		return false;
+	}
+
+	void* preferred_principal_p = clustering_hb_plugin_preferred_principal_get(
+			plugin_data, plugin_data_size);
+	if (preferred_principal_p == NULL) {
+		DEBUG("plugin data missing preferred principal");
+		return false;
+	}
+
+	uint32_t* succession_list_length_p =
+			(void*)clustering_hb_plugin_succession_length_get(plugin_data,
+					plugin_data_size);
+	if (succession_list_length_p == NULL) {
+		DEBUG("plugin data missing succession list length");
+		return false;
+	}
+
+	void* succession_list_p = clustering_hb_plugin_succession_get(plugin_data,
+			plugin_data_size);
+
+	if (*succession_list_length_p > 0 && succession_list_p == NULL) {
+		DEBUG("succession list length %u, but succession list is empty",
+				*succession_list_length_p);
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * Determines if the plugin data held by the hb subsystem is too old and should
+ * be ignored. ALL access to plugin data should be vetted through this
+ * function. The plugin data is obsolete if it was sent before the current
+ * cluster state or has a version mismatch.
+ *
+ * This is determined by comparing the plugin data hb message hlc timestamp and
+ * monotonic timestamps with the cluster formation hlc and monotonic times.
+ *
+ * @param cluster_modified_hlc_ts the hlc timestamp when current cluster change
+ * happened. Sent to avoid locking in this function.
+ * @param cluster_modified_time the monotonic timestamp when current cluster
+ * change happened. Sent to avoid locking in this function.
+ * @param plugin_data the plugin data.
+ * @param plugin_data_size the size of plugin data.
+ * @param msg_recv_ts the monotonic timestamp for plugin data receive.
+ * @param hb_msg_hlc_ts the hlc timestamp for plugin data receive.
+ * @return true if plugin data is obsolete, false otherwise.
+ */
+static bool
+clustering_hb_plugin_data_is_obsolete(as_hlc_timestamp cluster_modified_hlc_ts,
+		cf_clock cluster_modified_time, void* plugin_data,
+		size_t plugin_data_size, cf_clock msg_recv_ts,
+		as_hlc_msg_timestamp* hb_msg_hlc_ts)
+{
+	if (!clustering_hb_plugin_data_is_valid(plugin_data, plugin_data_size)) {
+		// Plugin data is invalid. Assume it to be obsolete.
+		// Seems like a redundant check but required in case the clustering
+		// protocol was switched to an incompatible version.
+		return true;
+	}
+
+	if (as_hlc_send_timestamp_order(cluster_modified_hlc_ts, hb_msg_hlc_ts)
+			!= AS_HLC_HAPPENS_BEFORE) {
+		// Cluster formation time is after message send, or the order is
+		// unknown; assume cluster formation is after message send. The caller
+		// should ignore this message.
+		return true;
+	}
+
+	// HB data should be at least after cluster formation time + one hb
+	// interval to send out our cluster state + one network delay for our
+	// information to reach the remote node + one hb interval for the other
+	// node to send out its updated state + one network delay for the updated
+	// state to reach us.
+	if (cluster_modified_time + 2 * as_hb_tx_interval_get()
+			+ 2 * g_config.fabric_latency_max_ms > msg_recv_ts) {
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * Determine a peer node's cluster status from its plugin data.
+ */
+static as_clustering_peer_node_state
+clustering_hb_plugin_data_node_status(void* plugin_data,
+		size_t plugin_data_size)
+{
+	if (!clustering_hb_plugin_data_is_valid(plugin_data, plugin_data_size)) {
+		// Either we have no hb channel to this node or it has sent invalid
+		// plugin data. Assume the cluster state is unknown.
+		return AS_NODE_UNKNOWN;
+	}
+
+	as_cluster_key* cluster_key = clustering_hb_plugin_cluster_key_get(
+			plugin_data, plugin_data_size);
+
+	if (*cluster_key == 0) {
+		return AS_NODE_ORPHAN;
+	}
+
+	// Redundant paranoid check.
+	uint32_t* succession_list_length_p =
+			clustering_hb_plugin_succession_length_get(plugin_data,
+					plugin_data_size);
+
+	if (*succession_list_length_p == 0) {
+		return AS_NODE_ORPHAN;
+	}
+
+	return AS_NODE_CLUSTER_ASSIGNED;
+}
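+
+/*
+ * Illustrative only (not part of the original patch): the intended vetting
+ * pattern for reading a peer's plugin data, assuming a hypothetical caller
+ * that holds the clustering lock so g_register can be read safely.
+ */
+#if 0
+static void
+plugin_data_read_sketch(cf_node nodeid)
+{
+	as_hb_plugin_node_data plugin_data;
+	as_hlc_msg_timestamp msg_hlc_ts;
+	cf_clock msg_recv_ts;
+
+	if (clustering_hb_plugin_data_get(nodeid, &plugin_data, &msg_hlc_ts,
+			&msg_recv_ts) != 0 || plugin_data.data_size == 0) {
+		// No plugin data for this node.
+		return;
+	}
+
+	if (clustering_hb_plugin_data_is_obsolete(
+			g_register.cluster_modified_hlc_ts,
+			g_register.cluster_modified_time, plugin_data.data,
+			plugin_data.data_size, msg_recv_ts, &msg_hlc_ts)) {
+		// The data predates the current cluster state.
+		return;
+	}
+
+	// Validity was established by the obsolescence check; the result may still
+	// be NULL for an empty succession list.
+	cf_node* succession = clustering_hb_plugin_succession_get(
+			plugin_data.data, plugin_data.data_size);
+
+	// ... use succession ...
+}
+#endif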
+
+/**
+ * Push the clustering payload into a heartbeat pulse message. The payload
+ * format is as described above.
+ */
+static void
+clustering_hb_plugin_set_fn(msg* msg)
+{
+	if (!clustering_is_initialized()) {
+		// Clustering not initialized. Send no data at all.
+		return;
+	}
+
+	CLUSTERING_LOCK();
+
+	uint32_t cluster_size = cf_vector_size(&g_register.succession_list);
+
+	size_t payload_size =
+	// For the paxos version identifier
+			sizeof(uint32_t)
+			// For cluster key
+			+ sizeof(as_cluster_key)
+			// For sequence number
+			+ sizeof(as_paxos_sequence_number)
+			// For preferred principal
+			+ sizeof(cf_node)
+			// For succession list length.
+			+ sizeof(uint32_t)
+			// For succession list.
+			+ (sizeof(cf_node) * cluster_size);
+
+	uint8_t* payload = alloca(payload_size);
+
+	uint8_t* current_field_p = payload;
+
+	// Set the paxos protocol identifier.
+	uint32_t protocol = clustering_protocol_identifier_get();
+	memcpy(current_field_p, &protocol, sizeof(protocol));
+	current_field_p += sizeof(protocol);
+
+	// Set the cluster key.
+	memcpy(current_field_p, &g_register.cluster_key,
+			sizeof(g_register.cluster_key));
+	current_field_p += sizeof(g_register.cluster_key);
+
+	// Set the sequence number.
+	memcpy(current_field_p, &g_register.sequence_number,
+			sizeof(g_register.sequence_number));
+	current_field_p += sizeof(g_register.sequence_number);
+
+	// Set the preferred principal.
+	memcpy(current_field_p, &g_clustering.preferred_principal,
+			sizeof(g_clustering.preferred_principal));
+	current_field_p += sizeof(g_clustering.preferred_principal);
+
+	// Set the succession list length.
+	memcpy(current_field_p, &cluster_size, sizeof(cluster_size));
+	current_field_p += sizeof(cluster_size);
+
+	// Copy over the succession list.
+	cf_node* succession = (cf_node*)(current_field_p);
+	for (int i = 0; i < cluster_size; i++) {
+		cf_vector_get(&g_register.succession_list, i, &succession[i]);
+	}
+
+	if (msg_set_buf(msg, AS_HB_MSG_PAXOS_DATA, payload, payload_size,
+			MSG_SET_COPY) != 0) {
+		CRASH("error setting succession list on msg");
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Plugin parse function that copies the msg payload verbatim to plugin data.
+ */
+static void
+clustering_hb_plugin_parse_data_fn(msg* msg, cf_node source,
+		as_hb_plugin_node_data* plugin_data)
+{
+	// Lockless check to prevent deadlocks.
+	if (g_clustering.sys_state == AS_CLUSTERING_SYS_STATE_UNINITIALIZED) {
+		// Ignore this heartbeat.
+		plugin_data->data_size = 0;
+		return;
+	}
+
+	void* payload;
+	size_t payload_size;
+
+	if (msg_get_buf(msg, AS_HB_MSG_PAXOS_DATA, (uint8_t**)&payload,
+			&payload_size, MSG_GET_DIRECT) != 0) {
+		cf_ticker_warning(AS_CLUSTERING,
+				"received empty clustering payload in heartbeat pulse from node %"PRIx64,
+				source);
+		plugin_data->data_size = 0;
+		return;
+	}
+
+	// Validate and retain only valid plugin data.
+	if (!clustering_hb_plugin_data_is_valid(payload, payload_size)) {
+		cf_ticker_warning(AS_CLUSTERING,
+				"received invalid clustering payload in heartbeat pulse from node %"PRIx64,
+				source);
+		plugin_data->data_size = 0;
+		return;
+	}
+
+	if (payload_size > plugin_data->data_capacity) {
+		// Round up to the nearest multiple of the block size to prevent very
+		// frequent reallocation.
+		size_t data_capacity = ((payload_size + HB_PLUGIN_DATA_BLOCK_SIZE - 1)
+				/ HB_PLUGIN_DATA_BLOCK_SIZE) * HB_PLUGIN_DATA_BLOCK_SIZE;
+
+		// Reallocate since we have outgrown existing capacity.
+		plugin_data->data = cf_realloc(plugin_data->data, data_capacity);
+		plugin_data->data_capacity = data_capacity;
+	}
+
+	plugin_data->data_size = payload_size;
+	memcpy(plugin_data->data, payload, payload_size);
+}
+
+/**
+ * Check if an input succession list from hb plugin data matches a succession
+ * list vector.
+ * @param succession_list the first succession list.
+ * @param succession_list_length the length of the succession list.
+ * @param succession_list_vector the second succession list as a vector. Should
+ * be protected from multithreaded access while this function is running.
+ * @return true if the succession lists are equal, false otherwise.
+ */
+bool
+clustering_hb_succession_list_matches(cf_node* succession_list,
+		uint32_t succession_list_length, cf_vector* succession_list_vector)
+{
+	if (succession_list_length != cf_vector_size(succession_list_vector)) {
+		return false;
+	}
+
+	for (uint32_t i = 0; i < succession_list_length; i++) {
+		cf_node* vector_element = cf_vector_getp(succession_list_vector, i);
+		if (vector_element == NULL || *vector_element != succession_list[i]) {
+			return false;
+		}
+	}
+	return true;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Quantum interval generator
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Time taken for the effect of a fault to get propagated via HB.
+ */
+static uint32_t
+quantum_interval_hb_fault_comm_delay()
+{
+	return as_hb_tx_interval_get() + network_latency_max();
+}
+
+/**
+ * Quantum wait time after a node arrived event.
+ */
+static uint32_t
+quantum_interval_node_arrived_wait_time(as_clustering_quantum_fault* fault)
+{
+	return MIN(quantum_interval(),
+			(fault->last_event_ts - fault->event_ts) / 2
+					+ 2 * quantum_interval_hb_fault_comm_delay()
+					+ quantum_interval() / 2);
+}
+
+/**
+ * Quantum wait time after a node departs.
+ */
+static uint32_t
+quantum_interval_node_departed_wait_time(as_clustering_quantum_fault* fault)
+{
+	return MIN(quantum_interval(),
+			as_hb_node_timeout_get()
+					+ 2 * quantum_interval_hb_fault_comm_delay()
+					+ quantum_interval() / 4);
+}
+
+/**
+ * Quantum wait time after a peer node's adjacency changed.
+ */
+static uint32_t
+quantum_interval_peer_adjacency_changed_wait_time(
+		as_clustering_quantum_fault* fault)
+{
+	return MIN(quantum_interval(), quantum_interval_hb_fault_comm_delay());
+}
+
+/**
+ * Quantum wait time after accepting a join request.
+ */
+static uint32_t
+quantum_interval_join_accepted_wait_time(as_clustering_quantum_fault* fault)
+{
+	// Ensure we wait for at least one heartbeat interval to receive the latest
+	// heartbeat after the last join request and for other nodes to send their
+	// join requests as well.
+	return MIN(quantum_interval(),
+			(fault->last_event_ts - fault->event_ts)
+					+ join_cluster_check_interval() + network_latency_max()
+					+ as_hb_tx_interval_get());
+}
+
+/**
+ * Quantum wait time after the principal node departs.
+ */
+static uint32_t
+quantum_interval_principal_departed_wait_time(
+		as_clustering_quantum_fault* fault)
+{
+	// Anticipate an incoming join request from other orphaned cluster members.
+	return MIN(quantum_interval(),
+			as_hb_node_timeout_get()
+					+ 2 * quantum_interval_hb_fault_comm_delay()
+					+ MAX(quantum_interval() / 4,
+							quantum_interval_join_accepted_wait_time(fault)));
+}
+
+/**
+ * Quantum wait time after seeing a cluster that might send us a join request.
+ */
+static uint32_t
+quantum_interval_inbound_merge_candidate_wait_time(
+		as_clustering_quantum_fault* fault)
+{
+	return quantum_interval();
+}
+
+/**
+ * Quantum wait time after a cluster member has been orphaned.
+ */
+static uint32_t
+quantum_interval_member_orphaned_wait_time(as_clustering_quantum_fault* fault)
+{
+	return quantum_interval();
+}
+
+/**
+ * Marks the current quantum interval as postponed. A kludge to allow the
+ * quantum interval generator to mark quantum intervals as postponed.
+ */
+static void
+quantum_interval_mark_postponed()
+{
+	CLUSTERING_LOCK();
+	g_quantum_interval_generator.is_interval_postponed = true;
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Update the vtable for a fault.
+ */
+static void
+quantum_interval_vtable_update(as_clustering_quantum_fault_type type,
+		char *fault_log_str, as_clustering_quantum_fault_wait_fn wait_fn)
+{
+	CLUSTERING_LOCK();
+	g_quantum_interval_generator.vtable[type].fault_log_str = fault_log_str;
+	g_quantum_interval_generator.vtable[type].wait_fn = wait_fn;
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Initialize the quantum interval generator.
+ */
+static void
+quantum_interval_generator_init()
+{
+	CLUSTERING_LOCK();
+	memset(&g_quantum_interval_generator, 0,
+			sizeof(g_quantum_interval_generator));
+	g_quantum_interval_generator.last_quantum_start_time = cf_getms();
+	g_quantum_interval_generator.last_quantum_interval = quantum_interval();
+
+	// Initialize the vtable.
+	quantum_interval_vtable_update(QUANTUM_FAULT_NODE_ARRIVED, "node arrived",
+			quantum_interval_node_arrived_wait_time);
+	quantum_interval_vtable_update(QUANTUM_FAULT_NODE_DEPARTED, "node departed",
+			quantum_interval_node_departed_wait_time);
+	quantum_interval_vtable_update(QUANTUM_FAULT_PRINCIPAL_DEPARTED,
+			"principal departed",
+			quantum_interval_principal_departed_wait_time);
+	quantum_interval_vtable_update(QUANTUM_FAULT_PEER_ADJACENCY_CHANGED,
+			"peer adjacency changed",
+			quantum_interval_peer_adjacency_changed_wait_time);
+	quantum_interval_vtable_update(QUANTUM_FAULT_JOIN_ACCEPTED,
+			"join request accepted", quantum_interval_join_accepted_wait_time);
+	quantum_interval_vtable_update(QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN,
+			"merge candidate seen",
+			quantum_interval_inbound_merge_candidate_wait_time);
+	quantum_interval_vtable_update(QUANTUM_FAULT_CLUSTER_MEMBER_ORPHANED,
+			"member orphaned", quantum_interval_member_orphaned_wait_time);
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Get the earliest possible monotonic clock time the next quantum interval can
+ * start.
+ *
+ * Start the quantum interval after the last update to any one of adjacency,
+ * pending_join_requests, neighboring_principals. The heuristic is that these
+ * should be stable to initiate cluster merge / join or cluster formation
+ * requests.
+ */
+static cf_clock
+quantum_interval_earliest_start_time()
+{
+	CLUSTERING_LOCK();
+	cf_clock fault_event_time = 0;
+	for (int i = 0; i < QUANTUM_FAULT_TYPE_SENTINEL; i++) {
+		if (g_quantum_interval_generator.fault[i].event_ts) {
+			fault_event_time = MAX(fault_event_time,
+					g_quantum_interval_generator.fault[i].event_ts
+							+ g_quantum_interval_generator.vtable[i].wait_fn(
+									&g_quantum_interval_generator.fault[i]));
+		}
+
+		TRACE("Fault:%s event_ts:%"PRIu64,
+				g_quantum_interval_generator.vtable[i].fault_log_str,
+				g_quantum_interval_generator.fault[i].event_ts);
+	}
+
+	TRACE("Last quantum start time:%"PRIu64,
+			g_quantum_interval_generator.last_quantum_start_time);
+
+	cf_clock start_time = g_quantum_interval_generator.last_quantum_start_time
+			+ quantum_interval();
+	if (fault_event_time) {
+		// Ensure we have at least 1/2 quantum interval of separation between
+		// quantum intervals to give a chance to multiple fault events that are
+		// reasonably close in time.
+		start_time = MAX(
+				g_quantum_interval_generator.last_quantum_start_time
+						+ quantum_interval() / 2, fault_event_time);
+	}
+	CLUSTERING_UNLOCK();
+
+	return start_time;
+}
+
+/**
+ * Reset a quantum interval fault.
+ * @param fault_type the fault type.
+ */
+static void
+quantum_interval_fault_reset(as_clustering_quantum_fault_type fault_type)
+{
+	CLUSTERING_LOCK();
+	memset(&g_quantum_interval_generator.fault[fault_type], 0,
+			sizeof(g_quantum_interval_generator.fault[fault_type]));
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Update a fault event based on the current fault ts.
+ * @param fault_type the type of fault to update.
+ * @param fault_ts the new fault timestamp.
+ * @param src_nodeid the fault causing nodeid, 0 if the nodeid is not known.
+ */
+static void
+quantum_interval_fault_update(as_clustering_quantum_fault_type fault_type,
+		cf_clock fault_ts, cf_node src_nodeid)
+{
+	CLUSTERING_LOCK();
+	as_clustering_quantum_fault* fault =
+			&g_quantum_interval_generator.fault[fault_type];
+	if (fault->event_ts == 0
+			|| fault_ts - fault->event_ts > quantum_interval() / 2) {
+		// Fault event detected for the first time in this quantum, or we are
+		// seeing the effect of a different event more than half a quantum
+		// apart.
+		fault->event_ts = fault_ts;
+		DETAIL("updated '%s' fault with ts %"PRIu64" for node %"PRIx64,
+				g_quantum_interval_generator.vtable[fault_type].fault_log_str,
+				fault_ts, src_nodeid);
+	}
+
+	fault->last_event_ts = fault_ts;
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Reset the state for the next quantum interval.
+ */
+static void
+quantum_interval_generator_reset(cf_clock last_quantum_start_time)
+{
+	CLUSTERING_LOCK();
+	if (!g_quantum_interval_generator.is_interval_postponed) {
+		// Update the last quantum interval.
+		g_quantum_interval_generator.last_quantum_interval = MAX(0,
+				last_quantum_start_time
+						- g_quantum_interval_generator.last_quantum_start_time);
+
+		g_quantum_interval_generator.last_quantum_start_time =
+				last_quantum_start_time;
+		for (int i = 0; i < QUANTUM_FAULT_TYPE_SENTINEL; i++) {
+			quantum_interval_fault_reset(i);
+		}
+	}
+	g_quantum_interval_generator.is_interval_postponed = false;
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle a timer event and generate a quantum interval start event if
+ * required.
+ */
+static void
+quantum_interval_generator_timer_event_handle(
+		as_clustering_internal_event* timer_event)
+{
+	CLUSTERING_LOCK();
+	cf_clock now = cf_getms();
+
+	cf_clock earliest_quantum_start_time =
+			quantum_interval_earliest_start_time();
+
+	cf_clock expected_quantum_start_time =
+			g_quantum_interval_generator.last_quantum_start_time
+					+ g_quantum_interval_generator.last_quantum_interval;
+
+	// Provide a buffer for the current quantum interval to finish gracefully
+	// as long as it is less than half a quantum interval.
+	cf_clock quantum_wait_buffer = MIN(
+			earliest_quantum_start_time > expected_quantum_start_time ?
+					earliest_quantum_start_time - expected_quantum_start_time :
+					0, g_quantum_interval_generator.last_quantum_interval / 2);
+
+	// Fire a quantum interval start event if it is time, or if we have skipped
+	// quantum interval start for more than the max skip number of intervals.
+	// Add a buffer of wait time to ensure we wait a bit more if we can cover
+	// the waiting time.
+	bool is_skippable = g_quantum_interval_generator.last_quantum_start_time
+			+ (quantum_interval_skip_max() + 1)
+					* g_quantum_interval_generator.last_quantum_interval
+			+ quantum_wait_buffer > now;
+	bool fire_quantum_event = earliest_quantum_start_time <= now
+			|| !is_skippable;
+	CLUSTERING_UNLOCK();
+
+	if (fire_quantum_event) {
+		// Dispatch a fresh quantum interval start event.
+		as_clustering_internal_event quantum_event;
+		memset(&quantum_event, 0, sizeof(quantum_event));
+		quantum_event.type = AS_CLUSTERING_INTERNAL_EVENT_QUANTUM_INTERVAL_START;
+		quantum_event.quantum_interval_is_skippable = is_skippable;
+		internal_event_dispatch(&quantum_event);
+
+		// Reset for the next interval generation.
+		quantum_interval_generator_reset(now);
+	}
+}
+
+/**
+ * Check if the interval generator has seen an adjacency fault in the current
+ * quantum interval.
+ * @return true if the quantum interval generator has seen an adjacency fault,
+ * false otherwise.
+ */
+static bool
+quantum_interval_is_adjacency_fault_seen()
+{
+	CLUSTERING_LOCK();
+	bool is_fault_seen =
+			g_quantum_interval_generator.fault[QUANTUM_FAULT_NODE_ARRIVED].event_ts
+					|| g_quantum_interval_generator.fault[QUANTUM_FAULT_NODE_DEPARTED].event_ts
+					|| g_quantum_interval_generator.fault[QUANTUM_FAULT_PRINCIPAL_DEPARTED].event_ts;
+	CLUSTERING_UNLOCK();
+	return is_fault_seen;
+}
+
+/**
+ * Check if the interval generator has seen a peer node adjacency changed fault
+ * in the current quantum interval.
+/**
+ * Check if the interval generator has seen a peer node adjacency changed fault
+ * in the current quantum interval.
+ * @return true if the quantum interval generator has seen a peer node
+ * adjacency changed fault, false otherwise.
+ */
+static bool
+quantum_interval_is_peer_adjacency_fault_seen()
+{
+	CLUSTERING_LOCK();
+	bool is_fault_seen =
+			g_quantum_interval_generator.fault[QUANTUM_FAULT_PEER_ADJACENCY_CHANGED].event_ts;
+	CLUSTERING_UNLOCK();
+	return is_fault_seen;
+}
+
+/**
+ * Update the fault time for this quantum on self heartbeat adjacency list
+ * change.
+ */
+static void
+quantum_interval_generator_hb_event_handle(
+		as_clustering_internal_event* hb_event)
+{
+	CLUSTERING_LOCK();
+
+	cf_clock min_event_time[AS_HB_NODE_EVENT_SENTINEL];
+	cf_node min_event_node[AS_HB_NODE_EVENT_SENTINEL];
+
+	memset(min_event_time, 0, sizeof(min_event_time));
+	memset(min_event_node, 0, sizeof(min_event_node));
+
+	as_hb_event_node* events = hb_event->hb_events;
+	for (int i = 0; i < hb_event->hb_n_events; i++) {
+		if (min_event_time[events[i].evt] == 0
+				|| min_event_time[events[i].evt] > events[i].event_time) {
+			min_event_time[events[i].evt] = events[i].event_time;
+			min_event_node[events[i].evt] = events[i].nodeid;
+		}
+
+		if (events[i].evt == AS_HB_NODE_DEPART
+				&& clustering_is_our_principal(events[i].nodeid)) {
+			quantum_interval_fault_update(QUANTUM_FAULT_PRINCIPAL_DEPARTED,
+					events[i].event_time, events[i].nodeid);
+		}
+	}
+
+	for (int i = 0; i < AS_HB_NODE_EVENT_SENTINEL; i++) {
+		if (min_event_time[i]) {
+			switch (i) {
+			case AS_HB_NODE_ARRIVE:
+				quantum_interval_fault_update(QUANTUM_FAULT_NODE_ARRIVED,
+						min_event_time[i], min_event_node[i]);
+				break;
+			case AS_HB_NODE_DEPART:
+				quantum_interval_fault_update(QUANTUM_FAULT_NODE_DEPARTED,
+						min_event_time[i], min_event_node[i]);
+				break;
+			case AS_HB_NODE_ADJACENCY_CHANGED:
+				if (clustering_is_cluster_member(min_event_node[i])) {
+					quantum_interval_fault_update(
+							QUANTUM_FAULT_PEER_ADJACENCY_CHANGED,
+							min_event_time[i], min_event_node[i]);
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Update the fault time for this quantum when the clustering information for
+ * an adjacent node changes. Assumes the node's plugin data is not obsolete.
+ */
+static void
+quantum_interval_generator_hb_plugin_data_changed_handle(
+		as_clustering_internal_event* change_event)
+{
+	CLUSTERING_LOCK();
+
+	if (clustering_hb_plugin_data_is_obsolete(
+			g_register.cluster_modified_hlc_ts,
+			g_register.cluster_modified_time, change_event->plugin_data->data,
+			change_event->plugin_data->data_size,
+			change_event->plugin_data_changed_ts,
+			&change_event->plugin_data_changed_hlc_ts)) {
+		// The plugin data is obsolete. Can't take decisions based on it.
+		goto Exit;
+	}
+
+	// Get the changed node's succession list and cluster key. All the fields
+	// should be present since the obsolete check also checked for fields being
+	// valid.
+	cf_node* succession_list_p = clustering_hb_plugin_succession_get(
+			change_event->plugin_data->data,
+			change_event->plugin_data->data_size);
+	uint32_t* succession_list_length_p =
+			clustering_hb_plugin_succession_length_get(
+					change_event->plugin_data->data,
+					change_event->plugin_data->data_size);
+
+	if (*succession_list_length_p > 0
+			&& !clustering_is_our_principal(succession_list_p[0])
+			&& clustering_is_principal()) {
+		if (succession_list_p[0] < config_self_nodeid_get()) {
+			// We are seeing a new principal who could potentially merge with
+			// this cluster.
+			if (g_quantum_interval_generator.fault[QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN].event_ts
+					!= 1) {
+				quantum_interval_fault_update(
+						QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, cf_getms(),
+						change_event->plugin_data_changed_nodeid);
+			}
+		}
+		else {
+			// We see a cluster with a higher nodeid, and most probably we will
+			// not be the principal of the merged cluster. Reset the fault
+			// timestamp, but set it to 1 to differentiate between no fault and
+			// a fault to be ignored in this quantum interval. A value of 1
+			// will, for practical purposes, never push the quantum interval
+			// forward.
+			quantum_interval_fault_update(
+					QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, 1,
+					change_event->plugin_data_changed_nodeid);
+		}
+	}
+	else {
+		if (clustering_is_principal() && *succession_list_length_p == 0
+				&& vector_find(&g_register.succession_list,
+						&change_event->plugin_data_changed_nodeid) >= 0) {
+			// One of our cluster members switched to orphan state. Most likely
+			// a quick restart.
+			quantum_interval_fault_update(QUANTUM_FAULT_CLUSTER_MEMBER_ORPHANED,
+					cf_getms(), change_event->plugin_data_changed_nodeid);
+		}
+		else {
+			// A node becoming an orphan node, or seeing a succession with our
+			// principal, does not mean we have seen a new cluster.
+		}
+	}
+Exit:
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Update the fault time for this quantum when a join request is accepted.
+ */
+static void
+quantum_interval_generator_join_request_accepted_handle(
+		as_clustering_internal_event* join_request_event)
+{
+	quantum_interval_fault_update(QUANTUM_FAULT_JOIN_ACCEPTED, cf_getms(),
+			join_request_event->join_request_source_nodeid);
+}
+
+/**
+ * Dispatch internal clustering events for the quantum interval generator.
+ */
+static void
+quantum_interval_generator_event_dispatch(as_clustering_internal_event* event)
+{
+	switch (event->type) {
+	case AS_CLUSTERING_INTERNAL_EVENT_TIMER:
+		quantum_interval_generator_timer_event_handle(event);
+		break;
+	case AS_CLUSTERING_INTERNAL_EVENT_HB:
+		quantum_interval_generator_hb_event_handle(event);
+		break;
+	case AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED:
+		quantum_interval_generator_hb_plugin_data_changed_handle(event);
+		break;
+	case AS_CLUSTERING_INTERNAL_EVENT_JOIN_REQUEST_ACCEPTED:
+		quantum_interval_generator_join_request_accepted_handle(event);
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * Start quantum interval generator.
+ */
+static void
+quantum_interval_generator_start()
+{
+	CLUSTERING_LOCK();
+	g_quantum_interval_generator.last_quantum_start_time = cf_getms();
+	CLUSTERING_UNLOCK();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Clustering common
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Generate a new random and most likely unique cluster key.
+ * @param current_cluster_key current cluster key, to prevent collision.
+ * @return randomly generated cluster key.
+ */
+static as_cluster_key
+clustering_cluster_key_generate(as_cluster_key current_cluster_key)
+{
+	// Generate a non-zero cluster key that fits in 6 bytes.
+	as_cluster_key cluster_key = 0;
+
+	while ((cluster_key = (cf_get_rand64() >> 16)) == 0
+			|| cluster_key == current_cluster_key) {
+		;
+	}
+
+	return cluster_key;
+}
+
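+/*
+ * Illustrative sketch, not compiled into the server: why the loop in
+ * clustering_cluster_key_generate() above yields a 6-byte key. Assumes
+ * cf_get_rand64() returns a uniformly random uint64_t.
+ */
+#if 0
+uint64_t r = cf_get_rand64();	// 64 random bits
+uint64_t key = r >> 16;		// drop the low 16 bits, leaving 48 bits, so
+				// key < 2^48 always fits in 6 bytes
+
+// The retry loop only rejects key == 0 and key == current_cluster_key, i.e.
+// 2 values out of 2^48, so a second iteration is vanishingly rare.
+#endif
+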
+/**
+ * Indicates if this node is an orphan. A node is deemed an orphan if it is not
+ * a member of any cluster.
+ */
+static bool
+clustering_is_orphan()
+{
+	CLUSTERING_LOCK();
+
+	bool is_orphan = cf_vector_size(&g_register.succession_list) <= 0
+			|| g_register.cluster_key == 0;
+
+	CLUSTERING_UNLOCK();
+
+	return is_orphan;
+}
+
+/**
+ * Return the principal node for the current cluster.
+ * @param principal (output) the current principal for the cluster.
+ * @return 0 if there is a valid principal, -1 if the node is in orphan state
+ * and there is no valid principal.
+ */
+static int
+clustering_principal_get(cf_node* principal)
+{
+	CLUSTERING_LOCK();
+	int rv = -1;
+
+	if (cf_vector_get(&g_register.succession_list, 0, principal) == 0) {
+		rv = 0;
+	}
+
+	CLUSTERING_UNLOCK();
+
+	return rv;
+}
+
+/**
+ * Indicates if this node is the principal for its cluster.
+ */
+static bool
+clustering_is_principal()
+{
+	CLUSTERING_LOCK();
+	cf_node current_principal;
+
+	bool is_principal = clustering_principal_get(&current_principal) == 0
+			&& current_principal == config_self_nodeid_get();
+
+	CLUSTERING_UNLOCK();
+
+	return is_principal;
+}
+
+/**
+ * Indicates if the input node is this node's principal. The input node can be
+ * the self node as well.
+ */
+static bool
+clustering_is_our_principal(cf_node nodeid)
+{
+	CLUSTERING_LOCK();
+	cf_node current_principal;
+
+	bool is_principal = clustering_principal_get(&current_principal) == 0
+			&& current_principal == nodeid;
+
+	CLUSTERING_UNLOCK();
+
+	return is_principal;
+}
+
+/**
+ * Indicates if a node is our cluster member.
+ */
+static bool
+clustering_is_cluster_member(cf_node nodeid)
+{
+	CLUSTERING_LOCK();
+	bool is_member = vector_find(&g_register.succession_list, &nodeid) >= 0;
+	CLUSTERING_UNLOCK();
+	return is_member;
+}
+
+/**
+ * Indicates if the input node is present in a succession list.
+ * @param nodeid the nodeid to search.
+ * @param succession_list the succession list.
+ * @param succession_list_length the length of the succession list.
+ * @return true if the node is present in the succession list, false otherwise.
+ */
+static bool
+clustering_is_node_in_succession(cf_node nodeid, cf_node* succession_list,
+		int succession_list_length)
+{
+	for (int i = 0; i < succession_list_length; i++) {
+		if (succession_list[i] == nodeid) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/**
+ * Indicates if the input node can be accepted as a paxos proposer. We can
+ * accept the new node as our principal if we are in the orphan state or if the
+ * input node is already our principal.
+ *
+ * Note: In case we send a join request to a node with a lower nodeid, the
+ * input node's nodeid can be less than our nodeid. It is still a valid
+ * proposer, and will hand principalship over to us once the paxos round is
+ * over.
+ *
+ * @param nodeid the nodeid of the proposer to check.
+ * @return true if the input node is an acceptable proposer.
+ */
+static bool
+clustering_can_accept_as_proposer(cf_node nodeid)
+{
+	return clustering_is_orphan() || clustering_is_our_principal(nodeid);
+}
+
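+/*
+ * Illustrative truth table, not compiled into the server: the acceptance rule
+ * encoded by clustering_can_accept_as_proposer() above.
+ *
+ *   we are orphan? | proposer is our principal? | accept?
+ *   ---------------+----------------------------+--------
+ *   yes            | (ignored)                  | yes
+ *   no             | yes                        | yes
+ *   no             | no                         | no
+ *
+ * An orphan accepts any proposer; a clustered node only accepts its current
+ * principal, which (per the note above) may have a lower nodeid than ours.
+ */
+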
+/**
+ * Plugin data iterate function that finds and collects neighboring principals,
+ * excluding the current principal, if any.
+ */
+static void
+clustering_neighboring_principals_find(cf_node nodeid, void* plugin_data,
+		size_t plugin_data_size, cf_clock recv_monotonic_ts,
+		as_hlc_msg_timestamp* msg_hlc_ts, void* udata)
+{
+	cf_vector* neighboring_principals = (cf_vector*)udata;
+
+	CLUSTERING_LOCK();
+
+	// For determining neighboring principals it is alright if this data is
+	// within two heartbeat intervals, so the obsolete check passes zero for
+	// the timestamps. This way we will not reject principals that have nothing
+	// to do with our cluster changes.
+	if (recv_monotonic_ts + 2 * as_hb_tx_interval_get() >= cf_getms()
+			&& !clustering_hb_plugin_data_is_obsolete(0, 0, plugin_data,
+					plugin_data_size, recv_monotonic_ts, msg_hlc_ts)) {
+		cf_node* succession_list = clustering_hb_plugin_succession_get(
+				plugin_data, plugin_data_size);
+
+		uint32_t* succession_list_length_p =
+				clustering_hb_plugin_succession_length_get(plugin_data,
+						plugin_data_size);
+
+		if (succession_list != NULL && succession_list_length_p != NULL
+				&& *succession_list_length_p > 0
+				&& succession_list[0] != config_self_nodeid_get()) {
+			cf_vector_append_unique(neighboring_principals,
+					&succession_list[0]);
+		}
+	}
+	else {
+		DETAIL(
+				"neighboring principal check skipped - found obsolete plugin data for node %"PRIx64,
+				nodeid);
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Get a list of adjacent principal nodes ordered by descending nodeids.
+ */
+static void
+clustering_neighboring_principals_get(cf_vector* neighboring_principals)
+{
+	CLUSTERING_LOCK();
+
+	// Use a single iteration over the clustering data received via the
+	// heartbeats, instead of individual calls, to get a consistent view and to
+	// avoid repeated lock and release cycles.
+	as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING,
+			clustering_neighboring_principals_find, neighboring_principals);
+
+	vector_sort_unique(neighboring_principals, cf_node_compare_desc);
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Find dead nodes in the current succession list.
+ */
+static void
+clustering_dead_nodes_find(cf_vector* dead_nodes)
+{
+	CLUSTERING_LOCK();
+
+	cf_vector* succession_list_p = &g_register.succession_list;
+	int succession_list_count = cf_vector_size(succession_list_p);
+	for (int i = 0; i < succession_list_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		cf_node cluster_member_nodeid = *((cf_node*)cf_vector_getp(
+				succession_list_p, i));
+
+		if (!as_hb_is_alive(cluster_member_nodeid)) {
+			cf_vector_append(dead_nodes, &cluster_member_nodeid);
+		}
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Indicates if a node is faulty. A node in the succession list is deemed
+ * faulty
+ * - if the node is alive and reports to be an orphan or part of some other
+ *   cluster, or
+ * - if the node is alive and its clustering protocol identifier does not
+ *   match this node's clustering protocol identifier.
+ */
+static bool
+clustering_node_is_faulty(cf_node nodeid)
+{
+	if (nodeid == config_self_nodeid_get()) {
+		// Self node is never faulty wrt clustering.
+		return false;
+	}
+
+	CLUSTERING_LOCK();
+	bool is_faulty = false;
+	as_hlc_msg_timestamp hb_msg_hlc_ts;
+	cf_clock msg_recv_ts = 0;
+	as_hb_plugin_node_data plugin_data = { 0 };
+
+	if (clustering_hb_plugin_data_get(nodeid, &plugin_data, &hb_msg_hlc_ts,
+			&msg_recv_ts) != 0
+			|| clustering_hb_plugin_data_is_obsolete(
+					g_register.cluster_modified_hlc_ts,
+					g_register.cluster_modified_time, plugin_data.data,
+					plugin_data.data_size, msg_recv_ts, &hb_msg_hlc_ts)) {
+		INFO(
+				"faulty check skipped - found obsolete plugin data for node %"PRIx64,
+				nodeid);
+		is_faulty = false;
+		goto Exit;
+	}
+
+	// We have clustering data from the node after the current cluster change.
+	// Compare protocol identifier, cluster key, and succession.
+	as_cluster_proto_identifier* proto_p = clustering_hb_plugin_proto_get(
+			plugin_data.data, plugin_data.data_size);
+
+	if (proto_p == NULL
+			|| !clustering_versions_are_compatible(*proto_p,
+					clustering_protocol_identifier_get())) {
+		DEBUG("for node %"PRIx64" protocol version mismatch - expected: %"PRIx32" but was: %"PRIx32,
+				nodeid, clustering_protocol_identifier_get(),
+				proto_p != NULL ? *proto_p : 0);
+		is_faulty = true;
+		goto Exit;
+	}
+
+	as_cluster_key* cluster_key_p = clustering_hb_plugin_cluster_key_get(
+			plugin_data.data, plugin_data.data_size);
+	if (cluster_key_p == NULL || *cluster_key_p != g_register.cluster_key) {
+		DEBUG("for node %"PRIx64" cluster key mismatch - expected: %"PRIx64" but was: %"PRIx64,
+				nodeid, g_register.cluster_key,
+				cluster_key_p != NULL ? *cluster_key_p : 0);
+		is_faulty = true;
+		goto Exit;
+	}
+
+	// Check the succession list just to be sure.
+	// We have clustering data from the node after the current cluster change.
+	cf_node* succession_list = clustering_hb_plugin_succession_get(
+			plugin_data.data, plugin_data.data_size);
+
+	uint32_t* succession_list_length_p =
+			clustering_hb_plugin_succession_length_get(plugin_data.data,
+					plugin_data.data_size);
+
+	if (succession_list == NULL || succession_list_length_p == NULL
+			|| !clustering_hb_succession_list_matches(succession_list,
+					*succession_list_length_p, &g_register.succession_list)) {
+		INFO("for node %"PRIx64" succession list mismatch", nodeid);
+
+		log_cf_node_vector("self succession list:", &g_register.succession_list,
+				CF_INFO);
+
+		if (succession_list) {
+			log_cf_node_array("node succession list:", succession_list,
+					succession_list_length_p != NULL ?
+							*succession_list_length_p : 0, CF_INFO);
+		}
+		else {
+			INFO("node succession list: (empty)");
+		}
+
+		is_faulty = true;
+		goto Exit;
+	}
+
+Exit:
+	CLUSTERING_UNLOCK();
+	return is_faulty;
+}
+
+/**
+ * Find "faulty" nodes in the current succession list.
+ */
+static void
+clustering_faulty_nodes_find(cf_vector* faulty_nodes)
+{
+	CLUSTERING_LOCK();
+
+	if (clustering_is_orphan()) {
+		goto Exit;
+	}
+
+	cf_vector* succession_list_p = &g_register.succession_list;
+	int succession_list_count = cf_vector_size(succession_list_p);
+	for (int i = 0; i < succession_list_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		cf_node cluster_member_nodeid = *((cf_node*)cf_vector_getp(
+				succession_list_p, i));
+		if (clustering_node_is_faulty(cluster_member_nodeid)) {
+			cf_vector_append(faulty_nodes, &cluster_member_nodeid);
+		}
+	}
+
+Exit:
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Indicates if a node is in sync with this node's cluster. A node in the
+ * succession list is deemed in sync if the node is alive and it reports to be
+ * in the same cluster via its heartbeats.
+ */
+static bool
+clustering_node_is_sync(cf_node nodeid)
+{
+	if (nodeid == config_self_nodeid_get()) {
+		// Self node is always in sync wrt clustering.
+		return true;
+	}
+
+	CLUSTERING_LOCK();
+	bool is_sync = false;
+	as_hlc_msg_timestamp hb_msg_hlc_ts;
+	cf_clock msg_recv_ts = 0;
+	as_hb_plugin_node_data plugin_data = { 0 };
+	bool data_exists =
+			clustering_hb_plugin_data_get(nodeid, &plugin_data, &hb_msg_hlc_ts,
+					&msg_recv_ts) == 0;
+
+	// The latest valid plugin data is ok as long as the other checks are met.
+	// Hence the timestamps are zero.
+	if (!data_exists || msg_recv_ts + 2 * as_hb_tx_interval_get() < cf_getms()
+			|| clustering_hb_plugin_data_is_obsolete(0, 0, plugin_data.data,
+					plugin_data.data_size, msg_recv_ts, &hb_msg_hlc_ts)) {
+		is_sync = false;
+		goto Exit;
+	}
+
+	// We have clustering data from the node after the current cluster change.
+	// Compare protocol identifier, cluster key, and succession.
+	as_cluster_proto_identifier* proto_p = clustering_hb_plugin_proto_get(
+			plugin_data.data, plugin_data.data_size);
+
+	if (proto_p == NULL
+			|| !clustering_versions_are_compatible(*proto_p,
+					clustering_protocol_identifier_get())) {
+		DEBUG(
+				"for node %"PRIx64" protocol version mismatch - expected: %"PRIx32" but was: %"PRIx32,
+				nodeid, clustering_protocol_identifier_get(),
+				proto_p != NULL ? *proto_p : 0);
+		is_sync = false;
+		goto Exit;
+	}
+
+	as_cluster_key* cluster_key_p = clustering_hb_plugin_cluster_key_get(
+			plugin_data.data, plugin_data.data_size);
+	if (cluster_key_p == NULL || *cluster_key_p != g_register.cluster_key) {
+		DEBUG(
+				"for node %"PRIx64" cluster key mismatch - expected: %"PRIx64" but was: %"PRIx64,
+				nodeid, g_register.cluster_key,
+				cluster_key_p != NULL ? *cluster_key_p : 0);
+		is_sync = false;
+		goto Exit;
+	}
+
+	// Check the succession list just to be sure.
+	// We have clustering data from the node after the current cluster change.
+	cf_node* succession_list = clustering_hb_plugin_succession_get(
+			plugin_data.data, plugin_data.data_size);
+
+	uint32_t* succession_list_length_p =
+			clustering_hb_plugin_succession_length_get(plugin_data.data,
+					plugin_data.data_size);
+
+	if (succession_list == NULL || succession_list_length_p == NULL
+			|| !clustering_hb_succession_list_matches(succession_list,
+					*succession_list_length_p, &g_register.succession_list)) {
+		DEBUG("for node %"PRIx64" succession list mismatch", nodeid);
+
+		log_cf_node_vector("self succession list:", &g_register.succession_list,
+				CF_DEBUG);
+
+		if (succession_list) {
+			log_cf_node_array("node succession list:", succession_list,
+					succession_list_length_p != NULL ?
+							*succession_list_length_p : 0, CF_DEBUG);
+		}
+		else {
+			DEBUG("node succession list: (empty)");
+		}
+
+		is_sync = false;
+		goto Exit;
+	}
+
+	is_sync = true;
+
+Exit:
+	CLUSTERING_UNLOCK();
+	return is_sync;
+}
+
+/**
+ * Find orphan nodes using clustering data for each node in the heartbeat's
+ * adjacency list.
+ */
+static void
+clustering_orphan_nodes_find(cf_node nodeid, void* plugin_data,
+		size_t plugin_data_size, cf_clock recv_monotonic_ts,
+		as_hlc_msg_timestamp* msg_hlc_ts, void* udata)
+{
+	cf_vector* orphans = udata;
+
+	CLUSTERING_LOCK();
+
+	// For determining orphan status it is alright if this data is within two
+	// heartbeat intervals, so the obsolete check passes zero for the
+	// timestamps.
+	if (recv_monotonic_ts + 2 * as_hb_tx_interval_get() >= cf_getms()
+			&& !clustering_hb_plugin_data_is_obsolete(0, 0, plugin_data,
+					plugin_data_size, recv_monotonic_ts, msg_hlc_ts)) {
+		if (clustering_hb_plugin_data_node_status(plugin_data, plugin_data_size)
+				== AS_NODE_ORPHAN) {
+			cf_vector_append(orphans, &nodeid);
+		}
+	}
+	else {
+		DETAIL(
+				"orphan check skipped - found obsolete plugin data for node %"PRIx64,
+				nodeid);
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Get a list of neighboring nodes that are orphans. Does not include the self
+ * node.
+ */
+static void
+clustering_neighboring_orphans_get(cf_vector* neighboring_orphans)
+{
+	CLUSTERING_LOCK();
+
+	// Use a single iteration over the clustering data received via the
+	// heartbeats, instead of individual calls, to get a consistent view and to
+	// avoid repeated lock and release cycles.
+	as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING,
+			clustering_orphan_nodes_find, neighboring_orphans);
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Find neighboring nodes using clustering data for each node in the
+ * heartbeat's adjacency list.
+ */
+static void
+clustering_neighboring_nodes_find(cf_node nodeid, void* plugin_data,
+		size_t plugin_data_size, cf_clock recv_monotonic_ts,
+		as_hlc_msg_timestamp* msg_hlc_ts, void* udata)
+{
+	cf_vector* nodes = udata;
+	cf_vector_append(nodes, &nodeid);
+}
+
+/**
+ * Get a list of all neighboring nodes. Does not include the self node.
+ */
+static void
+clustering_neighboring_nodes_get(cf_vector* neighboring_nodes)
+{
+	CLUSTERING_LOCK();
+
+	// Use a single iteration over the clustering data received via the
+	// heartbeats, instead of individual calls, to get a consistent view and to
+	// avoid repeated lock and release cycles.
+	as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING,
+			clustering_neighboring_nodes_find, neighboring_nodes);
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Evict nodes not forming a clique from the succession list.
+ */
+static uint32_t
+clustering_succession_list_clique_evict(cf_vector* succession_list,
+		char* evict_msg)
+{
+	uint32_t num_evicted = 0;
+	if (g_config.clustering_config.clique_based_eviction_enabled) {
+		// Remove nodes that do not form a clique.
+		cf_vector* evicted_nodes = vector_stack_lockless_create(cf_node);
+		as_hb_maximal_clique_evict(succession_list, evicted_nodes);
+		num_evicted = cf_vector_size(evicted_nodes);
+		log_cf_node_vector(evict_msg, evicted_nodes,
+				num_evicted > 0 ? CF_INFO : CF_DEBUG);
+
+		vector_subtract(succession_list, evicted_nodes);
+		cf_vector_destroy(evicted_nodes);
+	}
+	return num_evicted;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Clustering network message functions
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Fill common source node specific fields for the message.
+ * @param msg the message to fill the source fields into.
+ */
+static void
+msg_src_fields_fill(msg* msg)
+{
+	// Set the hb protocol id / version.
+	if (msg_set_uint32(msg, AS_CLUSTERING_MSG_ID,
+			clustering_protocol_identifier_get()) != 0) {
+		CRASH("error setting clustering protocol on msg");
+	}
+
+	// Set the send timestamp.
+	if (msg_set_uint64(msg, AS_CLUSTERING_MSG_HLC_TIMESTAMP,
+			as_hlc_timestamp_now()) != 0) {
+		CRASH("error setting send timestamp on msg");
+	}
+}
+
+/**
+ * Read the protocol identifier for this clustering message. These functions
+ * can get called multiple times for a single message. Hence they do not
+ * increment error counters.
+ * @param msg the incoming message.
+ * @param id the output id.
+ * @return 0 if the id could be parsed, -1 on failure.
+ */
+static int
+msg_proto_id_get(msg* msg, uint32_t* id)
+{
+	if (msg_get_uint32(msg, AS_CLUSTERING_MSG_ID, id) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
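+/*
+ * Illustrative sketch, not compiled into the server: the set/get pattern used
+ * by the message helpers in this section. The field ids and helper names are
+ * the ones defined in this file; the flow itself is hypothetical.
+ */
+#if 0
+msg* m = as_fabric_msg_get(M_TYPE_CLUSTERING);
+msg_src_fields_fill(m);		// protocol id + HLC send timestamp
+msg_type_set(m, AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE);
+
+// On the receive path, reads are non-destructive and may run multiple times,
+// which is why the getters never increment error counters.
+uint32_t proto_id;
+as_clustering_msg_type type;
+if (msg_proto_id_get(m, &proto_id) == 0 && msg_type_get(m, &type) == 0) {
+	// ... dispatch on type ...
+}
+#endif
+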
+/**
+ * Read the message type. These functions can get called multiple times for a
+ * single message. Hence they do not increment error counters.
+ * @param msg the incoming message.
+ * @param type the output message type.
+ * @return 0 if the type could be parsed, -1 on failure.
+ */
+static int
+msg_type_get(msg* msg, as_clustering_msg_type* type)
+{
+	if (msg_get_uint32(msg, AS_CLUSTERING_MSG_TYPE, type) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Set the type for an outgoing message.
+ * @param msg the outgoing message.
+ * @param msg_type the type to set.
+ */
+static void
+msg_type_set(msg* msg, as_clustering_msg_type msg_type)
+{
+	// Set the message type.
+	if (msg_set_uint32(msg, AS_CLUSTERING_MSG_TYPE, msg_type) != 0) {
+		CRASH("error setting type on msg");
+	}
+}
+
+/**
+ * Read the proposed principal field from the message.
+ * @param msg the incoming message.
+ * @param nodeid the output nodeid.
+ * @return 0 if the nodeid could be parsed, -1 on failure.
+ */
+static int
+msg_proposed_principal_get(msg* msg, cf_node* nodeid)
+{
+	if (msg_get_uint64(msg, AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, nodeid)
+			!= 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Set the proposed principal field in the message.
+ * @param msg the outgoing message.
+ * @param nodeid the proposed principal nodeid.
+ */
+static void
+msg_proposed_principal_set(msg* msg, cf_node nodeid)
+{
+	if (msg_set_uint64(msg, AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, nodeid)
+			!= 0) {
+		CRASH("error setting proposed principal");
+	}
+}
+
+/**
+ * Read the HLC send timestamp for the message. These functions can get called
+ * multiple times for a single message. Hence they do not increment error
+ * counters.
+ * @param msg the incoming message.
+ * @param send_ts the output hlc timestamp.
+ * @return 0 if the timestamp could be parsed, -1 on failure.
+ */
+static int
+msg_send_ts_get(msg* msg, as_hlc_timestamp* send_ts)
+{
+	if (msg_get_uint64(msg, AS_CLUSTERING_MSG_HLC_TIMESTAMP, send_ts) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Set the sequence number for an outgoing message.
+ * @param msg the outgoing message.
+ * @param sequence_number the sequence number to set.
+ */
+static void
+msg_sequence_number_set(msg* msg, as_paxos_sequence_number sequence_number)
+{
+	// Set the sequence number.
+	if (msg_set_uint64(msg, AS_CLUSTERING_MSG_SEQUENCE_NUMBER, sequence_number)
+			!= 0) {
+		CRASH("error setting sequence number on msg");
+	}
+}
+
+/**
+ * Read the sequence number from the message.
+ * @param msg the incoming message.
+ * @param sequence_number the output sequence number.
+ * @return 0 if the sequence number could be parsed, -1 on failure.
+ */
+static int
+msg_sequence_number_get(msg* msg, as_paxos_sequence_number* sequence_number)
+{
+	if (msg_get_uint64(msg, AS_CLUSTERING_MSG_SEQUENCE_NUMBER, sequence_number)
+			!= 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Set the cluster key in an outgoing message field.
+ * @param msg the outgoing message.
+ * @param cluster_key the cluster key to set.
+ * @param field the field to set the cluster key in.
+ */
+static void
+msg_cluster_key_field_set(msg* msg, as_cluster_key cluster_key,
+		as_clustering_msg_field field)
+{
+	// Set the cluster key.
+	if (msg_set_uint64(msg, field, cluster_key) != 0) {
+		CRASH("error setting cluster key on msg");
+	}
+}
+
+/**
+ * Set the cluster key for an outgoing message.
+ * @param msg the outgoing message.
+ * @param cluster_key the cluster key to set.
+ */
+static void
+msg_cluster_key_set(msg* msg, as_cluster_key cluster_key)
+{
+	msg_cluster_key_field_set(msg, cluster_key, AS_CLUSTERING_MSG_CLUSTER_KEY);
+}
+
+/**
+ * Read the cluster key from a message field.
+ * @param msg the incoming message.
+ * @param cluster_key the output cluster key.
+ * @param field the field to read the cluster key from.
+ * @return 0 if the cluster key could be parsed, -1 on failure.
+ */
+static int
+msg_cluster_key_field_get(msg* msg, as_cluster_key* cluster_key,
+		as_clustering_msg_field field)
+{
+	if (msg_get_uint64(msg, field, cluster_key) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Read the cluster key from the message.
+ * @param msg the incoming message.
+ * @param cluster_key the output cluster key.
+ * @return 0 if the cluster key could be parsed, -1 on failure.
+ */
+static int
+msg_cluster_key_get(msg* msg, as_cluster_key* cluster_key)
+{
+	return msg_cluster_key_field_get(msg, cluster_key,
+			AS_CLUSTERING_MSG_CLUSTER_KEY);
+}
+
+/**
+ * Set the succession list for an outgoing message in a particular field.
+ * @param msg the outgoing message.
+ * @param succession_list the succession list to set.
+ * @param field the field to set for the succession list.
+ */
+static void
+msg_succession_list_field_set(msg* msg, cf_vector* succession_list,
+		as_clustering_msg_field field)
+{
+	int num_elements = cf_vector_size(succession_list);
+	size_t buffer_size = num_elements * sizeof(cf_node);
+	cf_node* succession_buffer = (cf_node*)BUFFER_ALLOC_OR_DIE(buffer_size);
+
+	for (int i = 0; i < num_elements; i++) {
+		cf_vector_get(succession_list, i, &succession_buffer[i]);
+	}
+
+	if (msg_set_buf(msg, field, (uint8_t*)succession_buffer, buffer_size,
+			MSG_SET_COPY) != 0) {
+		CRASH("error setting succession list on msg");
+	}
+
+	BUFFER_FREE(succession_buffer, buffer_size);
+}
+
+/**
+ * Set the succession list for an outgoing message.
+ * @param msg the outgoing message.
+ * @param succession_list the succession list to set.
+ */
+static void
+msg_succession_list_set(msg* msg, cf_vector* succession_list)
+{
+	int num_elements = cf_vector_size(succession_list);
+	if (num_elements <= 0) {
+		// An empty succession list is being sent. Something is definitely
+		// amiss, but let it through. The receiver will reject it anyway.
+		WARNING("setting empty succession list");
+		return;
+	}
+
+	msg_succession_list_field_set(msg, succession_list,
+			AS_CLUSTERING_MSG_SUCCESSION_LIST);
+}
+
+/**
+ * Read a succession list from a message field.
+ * @param msg the incoming message.
+ * @param succession_list the output succession list.
+ * @param field the field to read from.
+ * @return 0 if the succession list could be parsed, -1 on failure.
+ */
+static int
+msg_succession_list_field_get(msg* msg, cf_vector* succession_list,
+		as_clustering_msg_field field)
+{
+	vector_clear(succession_list);
+	cf_node* succession_buffer;
+	size_t buffer_size;
+	if (msg_get_buf(msg, field, (uint8_t**)&succession_buffer, &buffer_size,
+			MSG_GET_DIRECT) != 0) {
+		// An empty succession list is not allowed.
+		return -1;
+	}
+
+	// The succession list length is implied by the buffer size.
+	int num_elements = buffer_size / sizeof(cf_node);
+
+	for (int i = 0; i < num_elements; i++) {
+		cf_vector_append(succession_list, &succession_buffer[i]);
+	}
+
+	vector_sort_unique(succession_list, cf_node_compare_desc);
+
+	return 0;
+}
+
+/**
+ * Read the succession list from the message.
+ * @param msg the incoming message.
+ * @param succession_list the output succession list.
+ * @return 0 if the succession list could be parsed, -1 on failure.
+ */
+static int
+msg_succession_list_get(msg* msg, cf_vector* succession_list)
+{
+	return msg_succession_list_field_get(msg, succession_list,
+			AS_CLUSTERING_MSG_SUCCESSION_LIST);
+}
+
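+/*
+ * Illustrative sketch, not compiled into the server: the wire layout implied
+ * by msg_succession_list_field_set() / msg_succession_list_field_get() above.
+ *
+ * A succession list travels as a packed array of cf_node (8 bytes each), with
+ * the count implied by the buffer size. For 3 nodes:
+ *
+ *   buffer_size = 3 * sizeof(cf_node) = 24 bytes
+ *   [ node0 (8B) | node1 (8B) | node2 (8B) ]
+ *
+ * The receiver recovers the count as buffer_size / sizeof(cf_node) and then
+ * sorts the entries in descending nodeid order, so the sender's ordering is
+ * not part of the contract.
+ */
+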
+/**
+ * Get the paxos proposal id for a message event.
+ * @param event the message event.
+ * @param proposal_id the output paxos proposal id.
+ * @return 0 if the proposal id could be parsed, -1 on failure.
+ */
+static int
+msg_event_proposal_id_get(as_clustering_internal_event* event,
+		as_paxos_proposal_id* proposal_id)
+{
+	if (msg_sequence_number_get(event->msg, &proposal_id->sequence_number)
+			!= 0) {
+		return -1;
+	}
+	proposal_id->src_nodeid = event->msg_src_nodeid;
+	return 0;
+}
+
+/**
+ * Get a network message object from the message pool, with all common fields
+ * for clustering, like the protocol identifier and hlc timestamp, filled in.
+ * @param type the type of the message.
+ */
+static msg*
+msg_pool_get(as_clustering_msg_type type)
+{
+	msg* msg = as_fabric_msg_get(M_TYPE_CLUSTERING);
+	msg_src_fields_fill(msg);
+	msg_type_set(msg, type);
+	return msg;
+}
+
+/**
+ * Return a message back to the message pool.
+ */
+static void
+msg_pool_return(msg* msg)
+{
+	as_fabric_msg_put(msg);
+}
+
+/**
+ * Determines if the received message is too old and should be ignored.
+ *
+ * This is determined by comparing the message hlc and monotonic timestamps
+ * with the cluster formation hlc and monotonic times.
+ *
+ * @param cluster_modified_hlc_ts the hlc timestamp when the current cluster
+ * change happened. Passed in to avoid locking in this function.
+ * @param cluster_modified_time the monotonic timestamp when the current
+ * cluster change happened. Passed in to avoid locking in this function.
+ * @param msg_recv_ts the monotonic timestamp for the message receive.
+ * @param msg_hlc_ts the hlc timestamp for the message receive.
+ * @return true if the message is obsolete, false otherwise.
+ */
+bool
+msg_is_obsolete(as_hlc_timestamp cluster_modified_hlc_ts,
+		cf_clock cluster_modified_time, cf_clock msg_recv_ts,
+		as_hlc_msg_timestamp* msg_hlc_ts)
+{
+	if (as_hlc_send_timestamp_order(cluster_modified_hlc_ts, msg_hlc_ts)
+			!= AS_HLC_HAPPENS_BEFORE) {
+		// Cluster formation time is after the message send, or the order is
+		// unknown; assume cluster formation is after the message was received.
+		// The caller should ignore this message.
+		return true;
+	}
+
+	// The message should be received at least one hb interval after the
+	// cluster formation time, to send out our cluster state, plus one network
+	// delay for our information to reach the remote node, plus one hb for the
+	// other node to send out its updated state, plus one network delay for the
+	// updated state to reach us.
+	if (cluster_modified_time + 2 * as_hb_tx_interval_get()
+			+ 2 * g_config.fabric_latency_max_ms > msg_recv_ts) {
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * Send a message to a node. This is best effort; the send could fail. The
+ * message will be returned back to the pool.
+ * @param msg the message to send.
+ * @param node the node to send the message to.
+ * @return 0 on successful queueing of the message (does not imply guaranteed
+ * delivery), -1 if the message could not be queued.
+ */
+static int
+msg_node_send(msg* msg, cf_node node)
+{
+	int rv = as_fabric_send(node, msg, AS_FABRIC_CHANNEL_CTRL);
+	if (rv) {
+		// Fabric did not clean up the message; return it back to the message
+		// pool.
+		msg_pool_return(msg);
+	}
+	return rv;
+}
+
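+/*
+ * Illustrative walk-through, not compiled into the server: the timing check
+ * in msg_is_obsolete() above, with hypothetical numbers.
+ *
+ * Assume as_hb_tx_interval_get() = 150 ms and fabric_latency_max_ms = 5 ms. A
+ * message then only survives the second check if it was received at least
+ *
+ *   2 * 150 + 2 * 5 = 310 ms
+ *
+ * after the cluster change: one heartbeat plus one network delay for our new
+ * state to reach the peer, and the same again for the peer's updated state to
+ * come back to us.
+ */
+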
+/**
+ * Send a message to all input nodes. This is best effort; some sends could
+ * fail. The message will be returned back to the pool.
+ * @param msg the message to send.
+ * @param nodes the nodes to send the message to.
+ * @return the number of nodes the message was queued for. Does not imply
+ * guaranteed receipt by these nodes however.
+ */
+static int
+msg_nodes_send(msg* msg, cf_vector* nodes)
+{
+	int node_count = cf_vector_size(nodes);
+	int sent_count = 0;
+
+	if (node_count <= 0) {
+		return sent_count;
+	}
+
+	int alloc_size = node_count * sizeof(cf_node);
+	cf_node* send_list = (cf_node*)BUFFER_ALLOC_OR_DIE(alloc_size);
+
+	vector_array_cpy(send_list, nodes, node_count);
+
+	if (as_fabric_send_list(send_list, node_count, msg, AS_FABRIC_CHANNEL_CTRL)
+			!= 0) {
+		// Fabric did not clean up the message; return it back to the message
+		// pool.
+		msg_pool_return(msg);
+	}
+	else {
+		// The message was queued for all nodes in the list.
+		sent_count = node_count;
+	}
+
+	BUFFER_FREE(send_list, alloc_size);
+	return sent_count;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Paxos common
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Compare paxos proposal ids. Compares the sequence numbers; ties in sequence
+ * number are broken by nodeids.
+ *
+ * @param id1 the first identifier.
+ * @param id2 the second identifier.
+ *
+ * @return 0 if id1 equals id2, 1 if id1 > id2 and -1 if id1 < id2.
+ */
+static int
+paxos_proposal_id_compare(as_paxos_proposal_id* id1, as_paxos_proposal_id* id2)
+{
+	if (id1->sequence_number != id2->sequence_number) {
+		return id1->sequence_number > id2->sequence_number ? 1 : -1;
+	}
+
+	// Sequence numbers match, compare nodeids.
+	if (id1->src_nodeid != id2->src_nodeid) {
+		return id1->src_nodeid > id2->src_nodeid ? 1 : -1;
+	}
+
+	// Nodeid and sequence numbers match.
+	return 0;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Paxos proposer
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Dump paxos proposer state to logs.
+ */
+static void
+paxos_proposer_dump(bool verbose)
+{
+	CLUSTERING_LOCK();
+
+	// Output paxos proposer state.
+	switch (g_proposer.state) {
+	case AS_PAXOS_PROPOSER_STATE_IDLE:
+		INFO("CL: paxos proposer: idle");
+		break;
+	case AS_PAXOS_PROPOSER_STATE_PREPARE_SENT:
+		INFO("CL: paxos proposer: prepare sent");
+		break;
+	case AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT:
+		INFO("CL: paxos proposer: accept sent");
+		break;
+	}
+
+	if (verbose) {
+		if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_IDLE) {
+			INFO("CL: paxos proposal start time: %"PRIu64" now: %"PRIu64,
+					g_proposer.paxos_round_start_time, cf_getms());
+			INFO("CL: paxos proposed cluster key: %"PRIx64,
+					g_proposer.proposed_value.cluster_key);
+			INFO("CL: paxos proposed sequence: %"PRIu64,
+					g_proposer.sequence_number);
+			log_cf_node_vector("CL: paxos proposed succession:",
+					&g_proposer.proposed_value.succession_list, CF_INFO);
+			log_cf_node_vector("CL: paxos promises received:",
+					&g_proposer.promises_received, CF_INFO);
+			log_cf_node_vector("CL: paxos accepted received:",
+					&g_proposer.accepted_received, CF_INFO);
+		}
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Reset state on failure of a paxos round.
+ */
+static void
+paxos_proposer_reset()
+{
+	CLUSTERING_LOCK();
+
+	// Flipping state to idle to indicate the paxos round is over.
+	g_proposer.state = AS_PAXOS_PROPOSER_STATE_IDLE;
+	memset(&g_proposer.sequence_number, 0, sizeof(g_proposer.sequence_number));
+
+	g_proposer.proposed_value.cluster_key = 0;
+	vector_clear(&g_proposer.proposed_value.succession_list);
+
+	vector_clear(&g_proposer.acceptors);
+
+	DETAIL("paxos round over for proposal id %"PRIx64":%"PRIu64,
+			config_self_nodeid_get(), g_proposer.sequence_number);
+
+	CLUSTERING_UNLOCK();
+}
+
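+/*
+ * Illustrative sketch, not compiled into the server: the total order induced
+ * by paxos_proposal_id_compare() above. Sequence numbers are HLC timestamps,
+ * so ties across nodes are possible and are broken by nodeid. The literal
+ * values below are hypothetical.
+ */
+#if 0
+as_paxos_proposal_id a = { .src_nodeid = 0xA1, .sequence_number = 100 };
+as_paxos_proposal_id b = { .src_nodeid = 0xB2, .sequence_number = 100 };
+as_paxos_proposal_id c = { .src_nodeid = 0xA1, .sequence_number = 101 };
+
+paxos_proposal_id_compare(&a, &b);	// -1: equal sequence, 0xA1 < 0xB2
+paxos_proposal_id_compare(&c, &b);	//  1: 101 > 100, nodeid is irrelevant
+paxos_proposal_id_compare(&a, &a);	//  0: identical ids
+#endif
+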
+/**
+ * Invoked to fail an ongoing paxos proposal.
+ */
+static void
+paxos_proposer_fail()
+{
+	// Clean up state for the paxos round.
+	paxos_proposer_reset();
+
+	as_clustering_internal_event paxos_fail_event;
+	memset(&paxos_fail_event, 0, sizeof(paxos_fail_event));
+	paxos_fail_event.type = AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_FAIL;
+
+	internal_event_dispatch(&paxos_fail_event);
+}
+
+/**
+ * Indicates if a paxos proposal from the self node is active.
+ */
+static bool
+paxos_proposer_proposal_is_active()
+{
+	CLUSTERING_LOCK();
+	bool rv = g_proposer.state != AS_PAXOS_PROPOSER_STATE_IDLE;
+	CLUSTERING_UNLOCK();
+	return rv;
+}
+
+/**
+ * Send a paxos prepare message to the current list of acceptor nodes.
+ */
+static void
+paxos_proposer_prepare_send()
+{
+	msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE);
+
+	CLUSTERING_LOCK();
+
+	// Set the sequence number.
+	msg_sequence_number_set(msg, g_proposer.sequence_number);
+
+	log_cf_node_vector("paxos prepare message sent to:", &g_proposer.acceptors,
+			CF_DEBUG);
+
+	g_proposer.prepare_send_time = cf_getms();
+
+	cf_vector* acceptors = vector_stack_lockless_create(cf_node);
+	vector_copy(acceptors, &g_proposer.acceptors);
+
+	CLUSTERING_UNLOCK();
+
+	// Send the message to the acceptors.
+	msg_nodes_send(msg, acceptors);
+	cf_vector_destroy(acceptors);
+}
+
+/**
+ * Send a paxos accept message to the current list of acceptor nodes.
+ */
+static void
+paxos_proposer_accept_send()
+{
+	msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT);
+
+	CLUSTERING_LOCK();
+
+	// Set the sequence number.
+	msg_sequence_number_set(msg, g_proposer.sequence_number);
+
+	// Skip sending the proposed value with the accept, since we do not use it.
+	// The learn message is the only way a consensus value is sent out.
+	log_cf_node_vector("paxos accept message sent to:", &g_proposer.acceptors,
+			CF_DEBUG);
+
+	g_proposer.accept_send_time = cf_getms();
+
+	cf_vector* acceptors = vector_stack_lockless_create(cf_node);
+	vector_copy(acceptors, &g_proposer.acceptors);
+
+	CLUSTERING_UNLOCK();
+
+	// Send the message to the acceptors.
+	msg_nodes_send(msg, acceptors);
+	cf_vector_destroy(acceptors);
+}
+
+/**
+ * Send a paxos learn message to the current list of acceptor nodes.
+ */
+static void
+paxos_proposer_learn_send()
+{
+	msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_LEARN);
+
+	CLUSTERING_LOCK();
+
+	// Set the sequence number.
+	msg_sequence_number_set(msg, g_proposer.sequence_number);
+
+	// Set the cluster key.
+	msg_cluster_key_set(msg, g_proposer.proposed_value.cluster_key);
+
+	// Set the succession list.
+	msg_succession_list_set(msg, &g_proposer.proposed_value.succession_list);
+
+	log_cf_node_vector("paxos learn message sent to:", &g_proposer.acceptors,
+			CF_DEBUG);
+
+	g_proposer.learn_send_time = cf_getms();
+
+	cf_vector* acceptors = vector_stack_lockless_create(cf_node);
+	vector_copy(acceptors, &g_proposer.acceptors);
+
+	CLUSTERING_UNLOCK();
+
+	// Send the message to the acceptors.
+	msg_nodes_send(msg, acceptors);
+	cf_vector_destroy(acceptors);
+}
+
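+/*
+ * Illustrative walk-through, not compiled into the server: the quorum
+ * thresholds used by the promise and accepted handlers below.
+ *
+ *   acceptors | promises needed (majority) | accepted needed (all)
+ *   ----------+----------------------------+----------------------
+ *       3     | 1 + 3 / 2 = 2              | 3
+ *       4     | 1 + 4 / 2 = 3              | 4
+ *       5     | 1 + 5 / 2 = 3              | 5
+ *
+ * Promises use a majority quorum to advance to the accept phase, while the
+ * accept phase is stricter and requires every acceptor to respond before the
+ * learn message goes out.
+ */
+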
+ DEBUG("ignoring paxos promise from node %"PRIx64" - we are not in prepare phase", + src_nodeid); + goto Exit; + } + + if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { + WARNING("ignoring paxos promise from node %"PRIx64" - it is not in acceptor list", + src_nodeid); + goto Exit; + } + + as_paxos_sequence_number sequence_number = 0; + if (msg_sequence_number_get(msg, &sequence_number) != 0) { + WARNING("ignoring paxos promise from node %"PRIx64" with invalid proposal id", + src_nodeid); + goto Exit; + } + + if (sequence_number != g_proposer.sequence_number) { + // Not a matching promise message. Ignore. + INFO("ignoring paxos promise from node %"PRIx64" because its proposal id %"PRIu64" does not match expected id %"PRIu64, + src_nodeid, sequence_number, + g_proposer.sequence_number); + goto Exit; + } + + cf_vector_append_unique(&g_proposer.promises_received, &src_nodeid); + + int promised_count = cf_vector_size(&g_proposer.promises_received); + int acceptor_count = cf_vector_size(&g_proposer.acceptors); + + // Use majority quorum to move on. + if (promised_count >= 1 + (acceptor_count / 2)) { + // We have quorum number of promises. go ahead to the accept phase. + g_proposer.state = AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT; + paxos_proposer_accept_send(); + } + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming paxos prepare nack message. + */ +static void +paxos_proposer_prepare_nack_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + msg* msg = event->msg; + + DEBUG("received paxos prepare nack from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_PREPARE_SENT) { + // We are not in the prepare phase. Reject this message. + INFO("ignoring paxos prepare nack from node %"PRIx64" - we are not in prepare phase", + src_nodeid); + goto Exit; + } + + if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { + WARNING("ignoring paxos prepare nack from node %"PRIx64" - it is not in acceptor list", + src_nodeid); + goto Exit; + } + + as_paxos_sequence_number sequence_number = 0; + if (msg_sequence_number_get(msg, &sequence_number) != 0) { + WARNING("ignoring paxos prepare nack from node %"PRIx64" with invalid proposal id", + src_nodeid); + goto Exit; + } + + if (sequence_number != g_proposer.sequence_number) { + // Not a matching prepare nack message. Ignore. + INFO("ignoring paxos prepare nack from node %"PRIx64" because its proposal id %"PRIu64" does not match expected id %"PRIu64, + src_nodeid, sequence_number, + g_proposer.sequence_number); + goto Exit; + } + + INFO( + "aborting current paxos proposal because of a prepare nack from node %"PRIx64, + src_nodeid); + paxos_proposer_fail(); + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Invoked when all acceptors have accepted the proposal. + */ +static void +paxos_proposer_success() +{ + CLUSTERING_LOCK(); + + // Set the proposer to back idle state. + g_proposer.state = AS_PAXOS_PROPOSER_STATE_IDLE; + + // Send out learn message and enable retransmits of learn message. + g_proposer.learn_retransmit_needed = true; + paxos_proposer_learn_send(); + + // Retain the sequence_number, cluster key and succession list for + // retransmits of the learn message. + as_clustering_internal_event paxos_success_event; + memset(&paxos_success_event, 0, sizeof(paxos_success_event)); + paxos_success_event.type = + AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_SUCCESS; + + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming paxos accepted message. 
+ */ +static void +paxos_proposer_accepted_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + msg* msg = event->msg; + + DEBUG("received paxos accepted from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + + // We also allow accepted messages in the idle state to deal with a loss of + // the learn message. + if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT + && g_proposer.state != AS_PAXOS_PROPOSER_STATE_IDLE) { + // We are not in the accept phase. Reject this message. + DEBUG("ignoring paxos accepted from node %"PRIx64" - we are not in accept phase. Actual phase %d", + src_nodeid, g_proposer.state); + goto Exit; + } + + if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { + WARNING("ignoring paxos accepted from node %"PRIx64" - it is not in acceptor list", + src_nodeid); + goto Exit; + } + + as_paxos_sequence_number sequence_number = 0; + if (msg_sequence_number_get(msg, &sequence_number) != 0) { + WARNING("ignoring paxos accepted from node %"PRIx64" with invalid proposal id", + src_nodeid); + goto Exit; + } + + if (sequence_number != g_proposer.sequence_number) { + // Not a matching accepted message. Ignore. + INFO("ignoring paxos accepted from node %"PRIx64" because its proposal id %"PRIu64" does not match expected id %"PRIu64, + src_nodeid, sequence_number, + g_proposer.sequence_number); + goto Exit; + } + + cf_vector_append_unique(&g_proposer.accepted_received, &src_nodeid); + + int accepted_count = cf_vector_size(&g_proposer.accepted_received); + int acceptor_count = cf_vector_size(&g_proposer.acceptors); + + // Use a simple quorum, all acceptors should accept for success. + if (accepted_count == acceptor_count) { + // This is the point after which the succession list will not change for + // this paxos round. Ensure that we meet the minimum cluster size + // criterion. + int cluster_size = cf_vector_size( + &g_proposer.proposed_value.succession_list); + if (cluster_size < g_config.clustering_config.cluster_size_min) { + WARNING( + "failing paxos round - the remaining number of nodes %d is less than minimum cluster size %d", + cluster_size, g_config.clustering_config.cluster_size_min); + // Fail paxos. + paxos_proposer_fail(); + goto Exit; + } + + // We have quorum number of accepted nodes. The proposal succeeded. + paxos_proposer_success(); + } + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming paxos accept nack message. + */ +static void +paxos_proposer_accept_nack_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + msg* msg = event->msg; + + DEBUG("received paxos accept nack from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT) { + // We are not in the accept phase. Reject this message. + INFO("ignoring paxos accept nack from node %"PRIx64" - we are not in accept phase", + src_nodeid); + goto Exit; + } + + if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { + WARNING("ignoring paxos accept nack from node %"PRIx64" - it is not in acceptor list", + src_nodeid); + goto Exit; + } + + as_paxos_sequence_number sequence_number = 0; + if (msg_sequence_number_get(msg, &sequence_number) != 0) { + WARNING("ignoring paxos accept nack from node %"PRIx64" with invalid proposal id", + src_nodeid); + goto Exit; + } + + if (sequence_number != g_proposer.sequence_number) { + // Not a matching accept nack message. Ignore. 
+ INFO("ignoring paxos accept nack from node %"PRIx64"because its proposal id %"PRIu64" does not match expected id %"PRIu64, + src_nodeid, sequence_number, + g_proposer.sequence_number); + goto Exit; + } + + INFO( + "aborting current paxos proposal because of an accept nack from node %"PRIx64, + src_nodeid); + paxos_proposer_fail(); + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming message. + */ +static void +paxos_proposer_msg_event_handle(as_clustering_internal_event* msg_event) +{ + switch (msg_event->msg_type) { + case AS_CLUSTERING_MSG_TYPE_PAXOS_PROMISE: + paxos_proposer_promise_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE_NACK: + paxos_proposer_prepare_nack_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPTED: + paxos_proposer_accepted_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT_NACK: + paxos_proposer_accept_nack_handle(msg_event); + break; + default: // Other message types are not of interest. + break; + } +} + +/** + * Handle heartbeat event. + */ +static void +paxos_proposer_hb_event_handle(as_clustering_internal_event* hb_event) +{ + if (!paxos_proposer_proposal_is_active()) { + return; + } + + CLUSTERING_LOCK(); + for (int i = 0; i < hb_event->hb_n_events; i++) { + if (hb_event->hb_events[i].evt == AS_HB_NODE_DEPART) { + cf_node departed_node = hb_event->hb_events[i].nodeid; + if (vector_find(&g_proposer.acceptors, &departed_node)) { + // One of the acceptors has departed. Abort the paxos proposal. + INFO("paxos acceptor %"PRIx64" departed - aborting current paxos proposal", departed_node); + paxos_proposer_fail(); + break; + } + } + } + CLUSTERING_UNLOCK(); +} + +/** + * Check and retransmit prepare message if paxos promise messages have not yet + * being received. + */ +static void +paxos_proposer_prepare_check_retransmit() +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + if (g_proposer.state == AS_PAXOS_PROPOSER_STATE_PREPARE_SENT + && g_proposer.prepare_send_time + paxos_msg_timeout() < now) { + paxos_proposer_prepare_send(); + } + CLUSTERING_UNLOCK(); +} + +/** + * Check and retransmit accept message if paxos accepted has yet being received. + */ +static void +paxos_proposer_accept_check_retransmit() +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + if (g_proposer.state == AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT + && g_proposer.accept_send_time + paxos_msg_timeout() < now) { + paxos_proposer_accept_send(); + } + CLUSTERING_UNLOCK(); +} + +/** + * Check and retransmit learn message if all acceptors have not applied the + * current cluster change. + */ +static void +paxos_proposer_learn_check_retransmit() +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + bool learn_timedout = g_proposer.learn_retransmit_needed + && (g_proposer.state == AS_PAXOS_PROPOSER_STATE_IDLE) + && (g_proposer.proposed_value.cluster_key != 0) + && (g_proposer.learn_send_time + paxos_msg_timeout() < now); + + if (learn_timedout) { + // If the register is not synced, most likely the learn message did not + // make it through, retransmit the learn message to move the paxos + // acceptor forward and start register sync. + INFO("retransmitting paxos learn message"); + paxos_proposer_learn_send(); + } + CLUSTERING_UNLOCK(); +} + +/** + * Handle a timer event and retransmit messages if required. 
+/**
+ * Handle a timer event and retransmit messages if required.
+ */
+static void
+paxos_proposer_timer_event_handle()
+{
+	CLUSTERING_LOCK();
+	switch (g_proposer.state) {
+	case AS_PAXOS_PROPOSER_STATE_IDLE:
+		paxos_proposer_learn_check_retransmit();
+		break;
+	case AS_PAXOS_PROPOSER_STATE_PREPARE_SENT:
+		paxos_proposer_prepare_check_retransmit();
+		break;
+	case AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT:
+		paxos_proposer_accept_check_retransmit();
+		break;
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle the register getting synched.
+ */
+static void
+paxos_proposer_register_synched()
+{
+	CLUSTERING_LOCK();
+	// The register is synched; we no longer need the learn message to be
+	// retransmitted.
+	g_proposer.learn_retransmit_needed = false;
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Initialize paxos proposer state.
+ */
+static void
+paxos_proposer_init()
+{
+	CLUSTERING_LOCK();
+	// Memset to zero, which is the correct initial value for all proposer
+	// state variables other than the contained vectors and the state field.
+	memset(&g_proposer, 0, sizeof(g_proposer));
+
+	// Initialize the proposer state.
+	// No paxos round is running, so the state has to be idle.
+	g_proposer.state = AS_PAXOS_PROPOSER_STATE_IDLE;
+
+	// Set the current acceptor list to be empty.
+	vector_lockless_init(&g_proposer.acceptors, cf_node);
+
+	// Set the current promises received node list to empty.
+	vector_lockless_init(&g_proposer.promises_received, cf_node);
+
+	// Set the current accepted received node list to empty.
+	vector_lockless_init(&g_proposer.accepted_received, cf_node);
+
+	// Initialize the proposed value.
+	vector_lockless_init(&g_proposer.proposed_value.succession_list, cf_node);
+	g_proposer.proposed_value.cluster_key = 0;
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Log paxos results.
+ */
+static void
+paxos_result_log(as_paxos_start_result result, cf_vector* new_succession_list)
+{
+	CLUSTERING_LOCK();
+	switch (result) {
+	case AS_PAXOS_RESULT_STARTED: {
+		// The running check is required because, for a single node cluster,
+		// the paxos round has already finished by this time.
+		if (paxos_proposer_proposal_is_active()) {
+			INFO("paxos round started - cluster key: %"PRIx64,
+					g_proposer.proposed_value.cluster_key);
+			log_cf_node_vector("paxos round started - succession list:",
+					&g_proposer.proposed_value.succession_list, CF_INFO);
+		}
+		break;
+	}
+
+	case AS_PAXOS_RESULT_CLUSTER_TOO_SMALL: {
+		WARNING(
+				"paxos round aborted - new cluster size %d less than min cluster size %d",
+				cf_vector_size(new_succession_list),
+				g_config.clustering_config.cluster_size_min);
+		break;
+	}
+
+	case AS_PAXOS_RESULT_ROUND_RUNNING: {
+		// Should never happen in practice. Let the old round finish or time
+		// out.
+		WARNING(
+				"older paxos round still running - should have finished by now");
+		break;
+	}
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Start a new paxos round.
+ *
+ * @param new_succession_list the new succession list.
+ * @param acceptor_list the list of nodes to use as paxos acceptors.
+ */
+static as_paxos_start_result
+paxos_proposer_proposal_start(cf_vector* new_succession_list,
+		cf_vector* acceptor_list)
+{
+	if (cf_vector_size(new_succession_list)
+			< g_config.clustering_config.cluster_size_min) {
+		// Fail paxos.
+ return AS_PAXOS_RESULT_CLUSTER_TOO_SMALL; + } + + CLUSTERING_LOCK(); + + as_paxos_start_result result; + if (paxos_proposer_proposal_is_active()) { + result = AS_PAXOS_RESULT_ROUND_RUNNING; + goto Exit; + } + + // Update state to prepare. + g_proposer.state = AS_PAXOS_PROPOSER_STATE_PREPARE_SENT; + + g_proposer.sequence_number = as_hlc_timestamp_now(); + + g_proposer.paxos_round_start_time = cf_getms(); + + // Populate the proposed value struct with new succession list and a new + // cluster key. + vector_clear(&g_proposer.proposed_value.succession_list); + vector_copy(&g_proposer.proposed_value.succession_list, + new_succession_list); + g_proposer.proposed_value.cluster_key = clustering_cluster_key_generate( + g_register.cluster_key); + + // Remember the acceptors for this paxos round. + vector_clear(&g_proposer.acceptors); + vector_copy(&g_proposer.acceptors, acceptor_list); + + // Clear the promise received and accepted received vectors for this new + // round. + vector_clear(&g_proposer.promises_received); + vector_clear(&g_proposer.accepted_received); + + paxos_proposer_prepare_send(); + + result = AS_PAXOS_RESULT_STARTED; + +Exit: + CLUSTERING_UNLOCK(); + + return result; +} + +/** + * Paxos proposer monitor to detect and cleanup long running and most likely + * failed paxos rounds. + */ +static void +paxos_proposer_monitor() +{ + CLUSTERING_LOCK(); + if (paxos_proposer_proposal_is_active()) { + if (g_proposer.paxos_round_start_time + paxos_proposal_timeout() + <= cf_getms()) { + // Paxos round is running and has timed out. + // Consider paxos round failed. + INFO("paxos round timed out for proposal id %"PRIx64":%"PRIu64, + config_self_nodeid_get(), + g_proposer.sequence_number); + paxos_proposer_fail(); + } + } + CLUSTERING_UNLOCK(); +} + +/* + * ---------------------------------------------------------------------------- + * Paxos acceptor + * ---------------------------------------------------------------------------- + */ + +/** + * Dump paxos acceptor state to logs. + */ +static void +paxos_acceptor_dump(bool verbose) +{ + CLUSTERING_LOCK(); + + // Output paxos acceptor state. + switch (g_acceptor.state) { + case AS_PAXOS_ACCEPTOR_STATE_IDLE: + INFO("CL: paxos acceptor: idle"); + break; + case AS_PAXOS_ACCEPTOR_STATE_PROMISED: + INFO("CL: paxos acceptor: promised"); + break; + case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: + INFO("CL: paxos acceptor: accepted"); + break; + } + + if (verbose) { + if (g_acceptor.state != AS_PAXOS_ACCEPTOR_STATE_IDLE) { + INFO("CL: paxos acceptor start time: %"PRIu64" now: %"PRIu64, + g_acceptor.acceptor_round_start, cf_getms()); + INFO("CL: paxos acceptor proposal id: (%"PRIx64":%"PRIu64")", + g_acceptor.last_proposal_received_id.src_nodeid, + g_acceptor.last_proposal_received_id.sequence_number); + INFO("CL: paxos acceptor promised time: %"PRIu64" now: %"PRIu64, + g_acceptor.promise_send_time, cf_getms()); + INFO("CL: paxos acceptor accepted time: %"PRIu64" now: %"PRIu64, + g_acceptor.accepted_send_time, cf_getms()); + } + } + + CLUSTERING_UNLOCK(); +} + +/** + * Reset the acceptor for the next round. + */ +static void +paxos_acceptor_reset() +{ + CLUSTERING_LOCK(); + g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_IDLE; + g_acceptor.acceptor_round_start = 0; + g_acceptor.promise_send_time = 0; + g_acceptor.accepted_send_time = 0; + CLUSTERING_UNLOCK(); +} + +/** + * Invoked to fail an ongoing paxos proposal. + */ +static void +paxos_acceptor_fail() +{ + // Cleanup state for the paxos round. 
+ paxos_acceptor_reset(); + + as_clustering_internal_event paxos_fail_event; + memset(&paxos_fail_event, 0, sizeof(paxos_fail_event)); + paxos_fail_event.type = AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_FAIL; + + internal_event_dispatch(&paxos_fail_event); +} + +/** + * Invoked on success of an ongoing paxos proposal. + */ +static void +paxos_acceptor_success(as_cluster_key cluster_key, cf_vector* succession_list, + as_paxos_sequence_number sequence_number) +{ + // Cleanup state for the paxos round. + paxos_acceptor_reset(); + + as_clustering_internal_event paxos_success_event; + memset(&paxos_success_event, 0, sizeof(paxos_success_event)); + paxos_success_event.type = + AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_SUCCESS; + paxos_success_event.new_succession_list = succession_list; + paxos_success_event.new_cluster_key = cluster_key; + paxos_success_event.new_sequence_number = sequence_number; + + internal_event_dispatch(&paxos_success_event); +} + +/** + * Send paxos promise message to the proposer node. + * @param dest the destination node. + * @param sequence_number the sequence number from the incoming message. + */ +static void +paxos_acceptor_promise_send(cf_node dest, + as_paxos_sequence_number sequence_number) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PROMISE); + + msg_sequence_number_set(msg, sequence_number); + + DEBUG("paxos promise message sent to node %"PRIx64" with proposal id (%"PRIx64":%"PRIu64")", dest, dest, sequence_number); + + CLUSTERING_LOCK(); + g_acceptor.promise_send_time = cf_getms(); + CLUSTERING_UNLOCK(); + + // Send the message to the proposer. + msg_node_send(msg, dest); +} + +/** + * Send paxos prepare nack message to the proposer. + * @param dest the destination node. + * @param sequence_number the sequence number from the incoming message. + */ +static void +paxos_acceptor_prepare_nack_send(cf_node dest, + as_paxos_sequence_number sequence_number) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE_NACK); + + msg_sequence_number_set(msg, sequence_number); + + DEBUG("paxos prepare nack message sent to node %"PRIx64" with proposal id (%"PRIx64":%"PRIu64")", dest, dest, sequence_number); + + // Send the message to the proposer. + msg_node_send(msg, dest); +} + +/** + * Send paxos accepted message to the proposer node. + * @param dest the destination node. + * @param sequence_number the sequence number from the incoming message. + */ +static void +paxos_acceptor_accepted_send(cf_node dest, + as_paxos_sequence_number sequence_number) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPTED); + + msg_sequence_number_set(msg, sequence_number); + + DEBUG("paxos accepted message sent to node %"PRIx64" with proposal id (%"PRIx64":%"PRIu64")", dest, dest, sequence_number); + + CLUSTERING_LOCK(); + g_acceptor.accepted_send_time = cf_getms(); + CLUSTERING_UNLOCK(); + + // Send the message to the proposer. + msg_node_send(msg, dest); +} + +/** + * Send paxos accept nack message to the proposer. + * @param dest the destination node. + * @param sequence_number the sequence number from the incoming message. + */ +static void +paxos_acceptor_accept_nack_send(cf_node dest, + as_paxos_sequence_number sequence_number) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT_NACK); + + msg_sequence_number_set(msg, sequence_number); + + DEBUG("paxos accept nack message sent to node %"PRIx64" with proposal id (%"PRIx64":%"PRIu64")", dest, dest, sequence_number); + + // Send the message to the proposer. 
+ msg_node_send(msg, dest); +} + +/** + * Check if the incoming prepare can be promised. + */ +static bool +paxos_acceptor_prepare_can_promise(cf_node src_nodeid, + as_paxos_proposal_id* proposal_id) +{ + if (!clustering_can_accept_as_proposer(src_nodeid)) { + INFO("ignoring paxos prepare from node %"PRIx64" because it cannot be a principal", + src_nodeid); + return false; + } + + bool can_promise = false; + CLUSTERING_LOCK(); + int comparison = paxos_proposal_id_compare(proposal_id, + &g_acceptor.last_proposal_received_id); + + switch (g_acceptor.state) { + case AS_PAXOS_ACCEPTOR_STATE_IDLE: + case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: { + // Allow only higher valued proposal to prevent replays and also to + // ensure convergence in the face of competing proposals. + can_promise = comparison > 0; + } + break; + case AS_PAXOS_ACCEPTOR_STATE_PROMISED: { + // We allow for replays of the prepare message as well so that the + // proposer can receive a promise for this node's lost promise message. + can_promise = comparison >= 0; + } + break; + } + + CLUSTERING_UNLOCK(); + + return can_promise; +} + +/** + * Handle an incoming paxos prepare message. + */ +static void +paxos_acceptor_prepare_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + DEBUG("received paxos prepare from node %"PRIx64, src_nodeid); + + as_paxos_proposal_id proposal_id = { 0 }; + if (msg_event_proposal_id_get(event, &proposal_id) != 0) { + INFO("ignoring paxos prepare from node %"PRIx64" with invalid proposal id", + src_nodeid); + return; + } + + if (!paxos_acceptor_prepare_can_promise(src_nodeid, &proposal_id)) { + INFO("ignoring paxos prepare from node %"PRIx64" with obsolete proposal id (%"PRIx64":%"PRIu64")", proposal_id.src_nodeid, proposal_id.src_nodeid, proposal_id.sequence_number); + paxos_acceptor_prepare_nack_send(src_nodeid, + proposal_id.sequence_number); + return; + } + + CLUSTERING_LOCK(); + + bool is_new_proposal = paxos_proposal_id_compare(&proposal_id, + &g_acceptor.last_proposal_received_id) != 0; + + if (is_new_proposal) { + // Remember this to be the last proposal id we received. + memcpy(&g_acceptor.last_proposal_received_id, &proposal_id, + sizeof(proposal_id)); + + // Update the round start time. + g_acceptor.acceptor_round_start = cf_getms(); + + // Switch to promised state. + g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_PROMISED; + } + else { + // This is a retransmit or delayed message in which case we do not + // update the state. + // If we have already accepted this proposal, we would want to remain in + // accepted state. + } + + // The proposal is promised. Send back a paxos promise. + paxos_acceptor_promise_send(src_nodeid, proposal_id.sequence_number); + + CLUSTERING_UNLOCK(); +} + +/** + * Check if the incoming accept can be accepted. 
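+ *
+ * Unlike a prepare, an accept with a proposal id equal to the last one
+ * received is allowed in all states, so that a proposer whose accepted
+ * message was lost can make progress by replaying the accept.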
+ */
+static bool
+paxos_acceptor_accept_can_accept(cf_node src_nodeid,
+		as_paxos_proposal_id* proposal_id)
+{
+	if (!clustering_can_accept_as_proposer(src_nodeid)) {
+		INFO("ignoring paxos accept from node %"PRIx64" because it cannot be a principal",
+				src_nodeid);
+		return false;
+	}
+
+	bool can_accept = false;
+	CLUSTERING_LOCK();
+	int comparison = paxos_proposal_id_compare(proposal_id,
+			&g_acceptor.last_proposal_received_id);
+
+	switch (g_acceptor.state) {
+	case AS_PAXOS_ACCEPTOR_STATE_IDLE:
+	case AS_PAXOS_ACCEPTOR_STATE_PROMISED:
+	case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: {
+		// We allow for replays of the accept message as well, so that the
+		// proposer can receive an accepted for this node's lost accepted
+		// message.
+		can_accept = comparison >= 0;
+	}
+		break;
+	}
+
+	CLUSTERING_UNLOCK();
+
+	return can_accept;
+}
+
+/**
+ * Handle an incoming paxos accept message.
+ */
+static void
+paxos_acceptor_accept_handle(as_clustering_internal_event* event)
+{
+	cf_node src_nodeid = event->msg_src_nodeid;
+
+	DEBUG("received paxos accept from node %"PRIx64, src_nodeid);
+
+	// It's ok to proceed even if paxos is running, because this could be a
+	// competing proposal and the winner will be decided by paxos sequence
+	// number.
+	as_paxos_proposal_id proposal_id = { 0 };
+	if (msg_event_proposal_id_get(event, &proposal_id) != 0) {
+		INFO("ignoring paxos accept from node %"PRIx64" with invalid proposal id",
+				src_nodeid);
+		return;
+	}
+
+	if (!paxos_acceptor_accept_can_accept(src_nodeid, &proposal_id)) {
+		INFO("ignoring paxos accept from node %"PRIx64" with obsolete proposal id (%"PRIx64":%"PRIu64")", proposal_id.src_nodeid, proposal_id.src_nodeid, proposal_id.sequence_number);
+		paxos_acceptor_accept_nack_send(src_nodeid,
+				proposal_id.sequence_number);
+		return;
+	}
+
+	CLUSTERING_LOCK();
+
+	bool is_new_proposal = paxos_proposal_id_compare(&proposal_id,
+			&g_acceptor.last_proposal_received_id) != 0;
+
+	if (is_new_proposal) {
+		// This node has missed the prepare message, but received the accept
+		// message. This is alright.
+
+		// Remember this to be the last proposal id we received.
+		memcpy(&g_acceptor.last_proposal_received_id, &proposal_id,
+				sizeof(proposal_id));
+
+		// Mark this as the start of the acceptor paxos round.
+		g_acceptor.acceptor_round_start = cf_getms();
+	}
+
+	g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_ACCEPTED;
+	// The proposal is accepted. Send back a paxos accepted.
+	paxos_acceptor_accepted_send(src_nodeid, proposal_id.sequence_number);
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle an incoming paxos learn message.
+ */
+static void
+paxos_acceptor_learn_handle(as_clustering_internal_event* event)
+{
+	cf_node src_nodeid = event->msg_src_nodeid;
+	msg* msg = event->msg;
+
+	DEBUG("received paxos learn from node %"PRIx64, src_nodeid);
+
+	if (!clustering_can_accept_as_proposer(src_nodeid)) {
+		INFO("ignoring learn message from a non-principal node %"PRIx64" because we are already in a cluster",
+				src_nodeid);
+		return;
+	}
+
+	// It's ok to proceed even if paxos is running, because this could be a
+	// competing proposal and the winner was decided by paxos sequence number.
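+	// The learn message carries the chosen value - the new cluster key and
+	// succession list - which is validated below before being applied.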
+	as_paxos_proposal_id proposal_id = { 0 };
+	if (msg_event_proposal_id_get(event, &proposal_id) != 0) {
+		INFO("ignoring paxos learn from node %"PRIx64" with invalid proposal id",
+				src_nodeid);
+		return;
+	}
+
+	CLUSTERING_LOCK();
+
+	if (g_acceptor.state != AS_PAXOS_ACCEPTOR_STATE_ACCEPTED) {
+		INFO(
+				"ignoring paxos learn from node %"PRIx64" - proposal id (%"PRIx64":%"PRIu64") we are already in a cluster",
+				src_nodeid, proposal_id.src_nodeid,
+				proposal_id.sequence_number);
+		goto Exit;
+	}
+
+	if (paxos_proposal_id_compare(&proposal_id,
+			&g_acceptor.last_proposal_received_id) != 0) {
+		// We have neither promised nor accepted this proposal -
+		// ignore the learn message.
+		INFO(
+				"ignoring paxos learn from node %"PRIx64" - proposal id (%"PRIx64":%"PRIu64") mismatches current proposal id (%"PRIx64":%"PRIu64")",
+				src_nodeid, proposal_id.src_nodeid,
+				proposal_id.sequence_number,
+				g_acceptor.last_proposal_received_id.src_nodeid,
+				g_acceptor.last_proposal_received_id.sequence_number);
+		goto Exit;
+	}
+
+	as_cluster_key new_cluster_key = 0;
+	cf_vector* new_succession_list = vector_stack_lockless_create(cf_node);
+
+	if (msg_cluster_key_get(msg, &new_cluster_key) != 0) {
+		INFO("ignoring paxos learn from node %"PRIx64" without cluster key",
+				src_nodeid);
+		goto Exit_destroy_succession;
+	}
+
+	if (msg_succession_list_get(msg, new_succession_list) != 0) {
+		INFO("ignoring paxos learn from node %"PRIx64" without succession list",
+				src_nodeid);
+		goto Exit_destroy_succession;
+	}
+
+	if (new_cluster_key == g_register.cluster_key) {
+		if (!vector_equals(new_succession_list, &g_register.succession_list)) {
+			// We have the same cluster key repeated for a new round. Should
+			// never happen.
+			CRASH("duplicate cluster key %"PRIx64" generated for different paxos rounds - disastrous", new_cluster_key);
+		}
+
+		INFO("ignoring duplicate paxos learn from node %"PRIx64, src_nodeid);
+		goto Exit_destroy_succession;
+	}
+
+	// Paxos round converged - apply the new cluster configuration.
+	paxos_acceptor_success(new_cluster_key, new_succession_list,
+			proposal_id.sequence_number);
+
+Exit_destroy_succession:
+	cf_vector_destroy(new_succession_list);
+
+Exit:
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle an incoming message.
+ */
+static void
+paxos_acceptor_msg_event_handle(as_clustering_internal_event *msg_event)
+{
+	switch (msg_event->msg_type) {
+	case AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE:
+		paxos_acceptor_prepare_handle(msg_event);
+		break;
+	case AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT:
+		paxos_acceptor_accept_handle(msg_event);
+		break;
+	case AS_CLUSTERING_MSG_TYPE_PAXOS_LEARN:
+		paxos_acceptor_learn_handle(msg_event);
+		break;
+	default:	// Other message types are not of interest.
+		break;
+	}
+}
+
+/**
+ * Check and retransmit the promise message if the paxos proposer has not
+ * moved ahead and sent back an accept message.
+ */
+static void
+paxos_acceptor_promise_check_retransmit()
+{
+	CLUSTERING_LOCK();
+	cf_clock now = cf_getms();
+	if (g_acceptor.state == AS_PAXOS_ACCEPTOR_STATE_PROMISED
+			&& g_acceptor.promise_send_time + paxos_msg_timeout() < now) {
+		paxos_acceptor_promise_send(
+				g_acceptor.last_proposal_received_id.src_nodeid,
+				g_acceptor.last_proposal_received_id.sequence_number);
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Check and retransmit the accepted message if the paxos proposer has not
+ * sent back a learn message.
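+ *
+ * Retransmits are paced by paxos_msg_timeout(), so a slow proposer is not
+ * flooded with duplicate accepted messages.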
+ */
+static void
+paxos_acceptor_accepted_check_retransmit()
+{
+	CLUSTERING_LOCK();
+	cf_clock now = cf_getms();
+	if (g_acceptor.state == AS_PAXOS_ACCEPTOR_STATE_ACCEPTED
+			&& g_acceptor.accepted_send_time + paxos_msg_timeout() < now) {
+		paxos_acceptor_accepted_send(
+				g_acceptor.last_proposal_received_id.src_nodeid,
+				g_acceptor.last_proposal_received_id.sequence_number);
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle a timer event and retransmit messages if required.
+ */
+static void
+paxos_acceptor_timer_event_handle()
+{
+	CLUSTERING_LOCK();
+	switch (g_acceptor.state) {
+	case AS_PAXOS_ACCEPTOR_STATE_IDLE: {
+		// No retransmits required.
+		break;
+	}
+	case AS_PAXOS_ACCEPTOR_STATE_PROMISED:
+		paxos_acceptor_promise_check_retransmit();
+		break;
+	case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED:
+		paxos_acceptor_accepted_check_retransmit();
+		break;
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Initialize paxos acceptor state.
+ */
+static void
+paxos_acceptor_init()
+{
+	CLUSTERING_LOCK();
+	// Memset to zero, which ensures that all acceptor state variables start
+	// at zero - the correct initial value for everything other than the
+	// contained vectors and the state.
+	memset(&g_acceptor, 0, sizeof(g_acceptor));
+	g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_IDLE;
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Paxos acceptor monitor to detect and clean up long-running and most likely
+ * failed paxos rounds.
+ */
+static void
+paxos_acceptor_monitor()
+{
+	CLUSTERING_LOCK();
+	if (g_acceptor.state != AS_PAXOS_ACCEPTOR_STATE_IDLE
+			&& g_acceptor.acceptor_round_start + paxos_proposal_timeout()
+					<= cf_getms()) {
+		// Paxos round is running and has timed out.
+		// Consider paxos round failed. Log the acceptor's last received
+		// proposal id, not the proposer's.
+		INFO("paxos round timed out for proposal id %"PRIx64":%"PRIu64,
+				g_acceptor.last_proposal_received_id.src_nodeid,
+				g_acceptor.last_proposal_received_id.sequence_number);
+		paxos_acceptor_fail();
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Paxos lifecycle and common event handling
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Paxos monitor to detect and clean up long-running and most likely failed
+ * paxos rounds.
+ */
+static void
+paxos_monitor()
+{
+	paxos_proposer_monitor();
+	paxos_acceptor_monitor();
+}
+
+/**
+ * Handle an incoming timer event.
+ */
+static void
+paxos_timer_event_handle()
+{
+	// Acceptor retransmits handled here.
+	paxos_acceptor_timer_event_handle();
+
+	// Proposer retransmits handled here.
+	paxos_proposer_timer_event_handle();
+
+	// Invoke paxos monitor to time out long-running paxos rounds.
+	paxos_monitor();
+}
+
+/**
+ * Handle incoming messages.
+ */
+static void
+paxos_msg_event_handle(as_clustering_internal_event* msg_event)
+{
+	paxos_acceptor_msg_event_handle(msg_event);
+	paxos_proposer_msg_event_handle(msg_event);
+}
+
+/**
+ * Handle heartbeat event.
+ */
+static void
+paxos_hb_event_handle(as_clustering_internal_event* hb_event)
+{
+	paxos_proposer_hb_event_handle(hb_event);
+}
+
+/**
+ * Dispatch clustering events.
+ */
+static void
+paxos_event_dispatch(as_clustering_internal_event* event)
+{
+	switch (event->type) {
+	case AS_CLUSTERING_INTERNAL_EVENT_TIMER:
+		paxos_timer_event_handle();
+		break;
+	case AS_CLUSTERING_INTERNAL_EVENT_MSG:
+		paxos_msg_event_handle(event);
+		break;
+	case AS_CLUSTERING_INTERNAL_EVENT_HB:
+		paxos_hb_event_handle(event);
+		break;
+	case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED:
+		paxos_proposer_register_synched();
+		break;
+	default:	// Not of interest for paxos.
+		break;
+	}
+}
+
+/**
+ * Initialize paxos proposer and acceptor data structures.
+ */
+static void
+paxos_init()
+{
+	paxos_proposer_init();
+	paxos_acceptor_init();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Clustering external event publisher
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Check if the event publisher is running.
+ */
+static bool
+external_event_publisher_is_running()
+{
+	CLUSTERING_EVENT_PUBLISHER_LOCK();
+	bool running = g_external_event_publisher.sys_state
+			== AS_CLUSTERING_SYS_STATE_RUNNING;
+	CLUSTERING_EVENT_PUBLISHER_UNLOCK();
+	return running;
+}
+
+/**
+ * Initialize the event publisher.
+ */
+static void
+external_event_publisher_init()
+{
+	CLUSTERING_EVENT_PUBLISHER_LOCK();
+	memset(&g_external_event_publisher, 0, sizeof(g_external_event_publisher));
+	vector_lockless_init(&g_external_event_publisher.published_succession_list,
+			cf_node);
+
+	pthread_mutex_init(&g_external_event_publisher.is_pending_mutex, NULL);
+	pthread_cond_init(&g_external_event_publisher.is_pending, NULL);
+	CLUSTERING_EVENT_PUBLISHER_UNLOCK();
+}
+
+/**
+ * Wake up the publisher thread.
+ */
+static void
+external_event_publisher_thr_wakeup()
+{
+	pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex);
+	pthread_cond_signal(&g_external_event_publisher.is_pending);
+	pthread_mutex_unlock(&g_external_event_publisher.is_pending_mutex);
+}
+
+/**
+ * Queue up an external event to publish.
+ */
+static void
+external_event_queue(as_clustering_event* event)
+{
+	CLUSTERING_EVENT_PUBLISHER_LOCK();
+	memcpy(&g_external_event_publisher.to_publish, event,
+			sizeof(g_external_event_publisher.to_publish));
+
+	vector_clear(&g_external_event_publisher.published_succession_list);
+	if (event->succession_list) {
+		// Use the static list for the published event, so that the input event
+		// object can be destroyed irrespective of when it is published.
+		vector_copy(&g_external_event_publisher.published_succession_list,
+				event->succession_list);
+		g_external_event_publisher.to_publish.succession_list =
+				&g_external_event_publisher.published_succession_list;
+	}
+
+	g_external_event_publisher.event_queued = true;
+
+	CLUSTERING_EVENT_PUBLISHER_UNLOCK();
+
+	// Wake up the publisher thread.
+	external_event_publisher_thr_wakeup();
+}
+
+/**
+ * Publish external events if any are pending.
+ */
+static void
+external_events_publish()
+{
+	CLUSTERING_EVENT_PUBLISHER_LOCK();
+
+	if (g_external_event_publisher.event_queued) {
+		g_external_event_publisher.event_queued = false;
+		exchange_clustering_event_listener(
+				&g_external_event_publisher.to_publish);
+	}
+	CLUSTERING_EVENT_PUBLISHER_UNLOCK();
+}
+
+/**
+ * External event publisher thread.
+ */
+static void*
+external_event_publisher_thr(void* arg)
+{
+	pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex);
+
+	while (true) {
+		pthread_cond_wait(&g_external_event_publisher.is_pending,
+				&g_external_event_publisher.is_pending_mutex);
+		if (external_event_publisher_is_running()) {
+			external_events_publish();
+		}
+		else {
+			// Publisher stopped - exit the thread.
+			break;
+		}
+	}
+
+	pthread_mutex_unlock(&g_external_event_publisher.is_pending_mutex);
+	return NULL;
+}
+
+/**
+ * Start the event publisher.
+ */
+static void
+external_event_publisher_start()
+{
+	CLUSTERING_EVENT_PUBLISHER_LOCK();
+	g_external_event_publisher.sys_state = AS_CLUSTERING_SYS_STATE_RUNNING;
+
+	// Start the event publishing thread.
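+	// The publisher thread blocks on the is_pending condition variable and
+	// exits once sys_state leaves AS_CLUSTERING_SYS_STATE_RUNNING - see
+	// external_event_publisher_thr() above.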
+	int create_result = pthread_create(
+			&g_external_event_publisher.event_publisher_tid, 0,
+			external_event_publisher_thr, NULL);
+	if (create_result != 0) {
+		// pthread_create returns the error number instead of setting errno.
+		CRASH("could not create event publishing thread: %s",
+				cf_strerror(create_result));
+	}
+	CLUSTERING_EVENT_PUBLISHER_UNLOCK();
+}
+
+/**
+ * Stop the event publisher.
+ */
+static void
+external_event_publisher_stop()
+{
+	CLUSTERING_EVENT_PUBLISHER_LOCK();
+	g_external_event_publisher.sys_state =
+			AS_CLUSTERING_SYS_STATE_SHUTTING_DOWN;
+	CLUSTERING_EVENT_PUBLISHER_UNLOCK();
+
+	external_event_publisher_thr_wakeup();
+	pthread_join(g_external_event_publisher.event_publisher_tid, NULL);
+
+	CLUSTERING_EVENT_PUBLISHER_LOCK();
+	g_external_event_publisher.sys_state = AS_CLUSTERING_SYS_STATE_STOPPED;
+	g_external_event_publisher.event_queued = false;
+	CLUSTERING_EVENT_PUBLISHER_UNLOCK();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Clustering register
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Dump register state to logs.
+ */
+static void
+register_dump(bool verbose)
+{
+	CLUSTERING_LOCK();
+
+	// Output register state.
+	switch (g_register.state) {
+	case AS_CLUSTERING_REGISTER_STATE_SYNCED:
+		INFO("CL: register: synced");
+		break;
+	case AS_CLUSTERING_REGISTER_STATE_SYNCING:
+		INFO("CL: register: syncing");
+		break;
+	}
+
+	// Cluster state details.
+	INFO("CL: cluster changed at: %"PRIu64" now: %"PRIu64,
+			g_register.cluster_modified_time, cf_getms());
+
+	INFO("CL: cluster key: %"PRIx64, g_register.cluster_key);
+	INFO("CL: cluster sequence: %"PRIu64, g_register.sequence_number);
+	INFO("CL: cluster size: %d", cf_vector_size(&g_register.succession_list));
+
+	if (verbose) {
+		log_cf_node_vector("CL: succession:", &g_register.succession_list,
+				CF_INFO);
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Initialize the register.
+ */
+static void
+register_init()
+{
+	CLUSTERING_LOCK();
+	memset(&g_register, 0, sizeof(g_register));
+	vector_lockless_init(&g_register.succession_list, cf_node);
+	vector_lockless_init(&g_register.sync_pending, cf_node);
+	vector_lockless_init(&g_register.ooo_change_applied_received, cf_node);
+	vector_lockless_init(&g_register.ooo_succession_list, cf_node);
+
+	// We start in the orphan state, which is treated as a synced state.
+	g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCED;
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Returns true if register sync is pending.
+ */
+static bool
+register_is_sync_pending()
+{
+	CLUSTERING_LOCK();
+	bool sync_pending = cf_vector_size(&g_register.sync_pending) > 0;
+	log_cf_node_vector("pending register sync:", &g_register.sync_pending,
+			CF_TRACE);
+	CLUSTERING_UNLOCK();
+	return sync_pending;
+}
+
+/**
+ * Check if the register is synced across the cluster and move to the synced
+ * state if it is.
+ */
+static void
+register_check_and_switch_synced()
+{
+	CLUSTERING_LOCK();
+	if (!register_is_sync_pending()
+			&& g_register.state != AS_CLUSTERING_REGISTER_STATE_SYNCED) {
+		g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCED;
+		// Generate internal cluster changed synced event.
+		as_clustering_internal_event cluster_synced;
+		memset(&cluster_synced, 0, sizeof(cluster_synced));
+		cluster_synced.type =
+				AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED;
+		internal_event_dispatch(&cluster_synced);
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Update register to become an orphan node.
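+ *
+ * Clears the cluster key and succession list, stamps the cluster modified
+ * times and dispatches an internal orphaned event carrying the given
+ * qualifier.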
+ */
+static void
+register_become_orphan(as_clustering_event_qualifier qualifier)
+{
+	CLUSTERING_LOCK();
+	g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCED;
+	g_register.cluster_key = 0;
+	g_register.sequence_number = 0;
+	g_register.has_orphan_transitioned = true;
+	g_clustering.has_integrity = false;
+	vector_clear(&g_register.succession_list);
+	vector_clear(&g_register.sync_pending);
+
+	g_register.cluster_modified_time = cf_getms();
+	g_register.cluster_modified_hlc_ts = as_hlc_timestamp_now();
+
+	// Queue internal orphaned event.
+	as_clustering_internal_event orphaned_event;
+	memset(&orphaned_event, 0, sizeof(orphaned_event));
+	orphaned_event.type = AS_CLUSTERING_INTERNAL_EVENT_REGISTER_ORPHANED;
+	orphaned_event.qualifier = qualifier;
+	internal_event_dispatch(&orphaned_event);
+
+	CLUSTERING_UNLOCK();
+
+	INFO("moved self node to orphan state");
+}
+
+/**
+ * Handle timer event in the syncing state.
+ */
+static void
+register_syncing_timer_event_handle()
+{
+	CLUSTERING_LOCK();
+	cf_clock now = cf_getms();
+	if (g_register.last_sync_check_time + register_sync_check_interval()
+			> now) {
+		// Give more time before checking for sync.
+		goto Exit;
+	}
+
+	if (register_is_sync_pending()) {
+		// Update pending nodes based on heartbeat status.
+		int num_pending = cf_vector_size(&g_register.sync_pending);
+		for (int i = 0; i < num_pending; i++) {
+			cf_node pending;
+			cf_vector_get(&g_register.sync_pending, i, &pending);
+			if (clustering_node_is_sync(pending)) {
+				cf_vector_delete(&g_register.sync_pending, i);
+
+				// Compensate the index for the delete.
+				i--;
+
+				// Adjust vector size.
+				num_pending--;
+			}
+		}
+	}
+
+	register_check_and_switch_synced();
+
+Exit:
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Send cluster change applied message to all cluster members.
+ */
+static void
+register_cluster_change_applied_msg_send()
+{
+	msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_CLUSTER_CHANGE_APPLIED);
+
+	CLUSTERING_LOCK();
+
+	// Set the cluster key.
+	msg_cluster_key_set(msg, g_register.cluster_key);
+
+	// Set the succession list.
+	msg_succession_list_set(msg, &g_register.succession_list);
+
+	log_cf_node_vector("cluster change applied message sent to:",
+			&g_register.succession_list, CF_DEBUG);
+
+	cf_vector* members = vector_stack_lockless_create(cf_node);
+	vector_copy(members, &g_register.succession_list);
+
+	CLUSTERING_UNLOCK();
+
+	// Send the message to the cluster members.
+	msg_nodes_send(msg, members);
+	cf_vector_destroy(members);
+}
+
+/**
+ * Validate cluster state. For now, ensure the cluster size is at least the
+ * minimum cluster size.
+ */
+static void
+register_validate_cluster()
+{
+	CLUSTERING_LOCK();
+	int cluster_size = cf_vector_size(&g_register.succession_list);
+	if (!clustering_is_orphan()
+			&& cluster_size < g_config.clustering_config.cluster_size_min) {
+		WARNING(
+				"cluster size %d less than required minimum size %d - switching to orphan state",
+				cluster_size, g_config.clustering_config.cluster_size_min);
+		register_become_orphan(AS_CLUSTERING_MEMBERSHIP_LOST);
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle a timer event for the register.
+ */
+static void
+register_timer_event_handle()
+{
+	CLUSTERING_LOCK();
+	switch (g_register.state) {
+	case AS_CLUSTERING_REGISTER_STATE_SYNCED:
+		register_validate_cluster();
+		break;
+	case AS_CLUSTERING_REGISTER_STATE_SYNCING:
+		register_syncing_timer_event_handle();
+		break;
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle paxos round succeeding.
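+ *
+ * Applies the new cluster key and succession list, switches the register to
+ * the syncing state and tracks which cluster members still need to
+ * acknowledge the change via cluster change applied messages.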
+ */
+static void
+register_paxos_acceptor_success_handle(
+		as_clustering_internal_event* paxos_success_event)
+{
+	CLUSTERING_LOCK();
+
+	g_register.has_orphan_transitioned = false;
+
+	g_register.cluster_key = paxos_success_event->new_cluster_key;
+	g_register.sequence_number = paxos_success_event->new_sequence_number;
+
+	vector_clear(&g_register.succession_list);
+	vector_copy(&g_register.succession_list,
+			paxos_success_event->new_succession_list);
+
+	// Update the timestamps as the register has changed its contents.
+	g_register.cluster_modified_time = cf_getms();
+	g_register.cluster_modified_hlc_ts = as_hlc_timestamp_now();
+
+	// Initialize pending list with all cluster members.
+	g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCING;
+	vector_clear(&g_register.sync_pending);
+	vector_copy(&g_register.sync_pending, &g_register.succession_list);
+	register_cluster_change_applied_msg_send();
+
+	if (g_register.cluster_key == g_register.ooo_cluster_key
+			&& vector_equals(&g_register.succession_list,
+					&g_register.ooo_succession_list)) {
+		// We have already received change applied messages from these nodes;
+		// account for them.
+		vector_subtract(&g_register.sync_pending,
+				&g_register.ooo_change_applied_received);
+	}
+	vector_clear(&g_register.ooo_change_applied_received);
+	vector_clear(&g_register.ooo_succession_list);
+	g_register.ooo_cluster_key = 0;
+	g_register.ooo_hlc_timestamp = 0;
+
+	INFO("applied new cluster key %"PRIx64,
+			paxos_success_event->new_cluster_key);
+	log_cf_node_vector("applied new succession list",
+			&g_register.succession_list, CF_INFO);
+	INFO("applied cluster size %d",
+			cf_vector_size(&g_register.succession_list));
+
+	as_clustering_internal_event cluster_changed;
+	memset(&cluster_changed, 0, sizeof(cluster_changed));
+	cluster_changed.type =
+			AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_CHANGED;
+	internal_event_dispatch(&cluster_changed);
+
+	// Send change applied message. It's alright even if it goes out of
+	// order.
+	register_cluster_change_applied_msg_send();
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle incoming cluster change applied message.
+ */
+static void
+register_cluster_change_applied_msg_handle(
+		as_clustering_internal_event* msg_event)
+{
+	CLUSTERING_LOCK();
+	as_cluster_key msg_cluster_key = 0;
+	msg_cluster_key_get(msg_event->msg, &msg_cluster_key);
+	cf_vector *msg_succession_list = vector_stack_lockless_create(cf_node);
+	msg_succession_list_get(msg_event->msg, msg_succession_list);
+	as_hlc_timestamp msg_hlc_timestamp = 0;
+	msg_send_ts_get(msg_event->msg, &msg_hlc_timestamp);
+
+	DEBUG("received cluster change applied message from node %"PRIx64,
+			msg_event->msg_src_nodeid);
+	if (g_register.cluster_key == msg_cluster_key
+			&& vector_equals(&g_register.succession_list,
+					msg_succession_list)) {
+		// This is a matching change applied message.
+		int found_at = 0;
+		if ((found_at = vector_find(&g_register.sync_pending,
+				&msg_event->msg_src_nodeid)) >= 0) {
+			// Remove from the pending list.
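+			// Once sync_pending drains, register_check_and_switch_synced(),
+			// invoked at the end of this handler, flips the register to the
+			// synced state.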
+			cf_vector_delete(&g_register.sync_pending, found_at);
+		}
+	}
+	else if (g_register.ooo_cluster_key == msg_cluster_key
+			&& vector_equals(&g_register.ooo_succession_list,
+					msg_succession_list)) {
+		DEBUG("received ooo cluster change applied message from node %"PRIx64" with cluster key %"PRIx64, msg_event->msg_src_nodeid, msg_cluster_key);
+		cf_vector_append_unique(&g_register.ooo_change_applied_received,
+				&msg_event->msg_src_nodeid);
+	}
+	else if (g_register.ooo_hlc_timestamp < msg_hlc_timestamp) {
+		// Prefer a later version of the OOO message.
+		g_register.ooo_cluster_key = msg_cluster_key;
+		g_register.ooo_hlc_timestamp = msg_hlc_timestamp;
+		vector_clear(&g_register.ooo_succession_list);
+		vector_copy(&g_register.ooo_succession_list, msg_succession_list);
+		vector_clear(&g_register.ooo_change_applied_received);
+		cf_vector_append_unique(&g_register.ooo_change_applied_received,
+				&msg_event->msg_src_nodeid);
+		DEBUG("received ooo cluster change applied message from node %"PRIx64" with cluster key %"PRIx64, msg_event->msg_src_nodeid, msg_cluster_key);
+	}
+	else {
+		INFO(
+				"ignoring mismatching cluster change applied message from node %"PRIx64,
+				msg_event->msg_src_nodeid);
+	}
+	cf_vector_destroy(msg_succession_list);
+	register_check_and_switch_synced();
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle incoming message.
+ */
+static void
+register_msg_event_handle(as_clustering_internal_event* msg_event)
+{
+	CLUSTERING_LOCK();
+	as_clustering_msg_type type;
+	msg_type_get(msg_event->msg, &type);
+
+	if (type == AS_CLUSTERING_MSG_TYPE_CLUSTER_CHANGE_APPLIED) {
+		register_cluster_change_applied_msg_handle(msg_event);
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Dispatch internal events to the register.
+ */
+static void
+register_event_dispatch(as_clustering_internal_event* event)
+{
+	switch (event->type) {
+	case AS_CLUSTERING_INTERNAL_EVENT_TIMER:
+		register_timer_event_handle();
+		break;
+	case AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_SUCCESS:
+		register_paxos_acceptor_success_handle(event);
+		break;
+	case AS_CLUSTERING_INTERNAL_EVENT_MSG:
+		register_msg_event_handle(event);
+		break;
+	default:	// Not of interest for the register.
+		break;
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Clustering core (triggers cluster changes)
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Send a join reject message to the destination node.
+ */
+static void
+clustering_join_reject_send(cf_node dest)
+{
+	msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REJECT);
+
+	DETAIL("sent join reject to node %"PRIx64, dest);
+
+	// Send the message to the destination node.
+	msg_node_send(msg, dest);
+}
+
+/**
+ * Send cluster join reject message to all nodes in the vector.
+ */
+static void
+clustering_join_requests_reject(cf_vector* rejected_nodes)
+{
+	int rejected_node_count = cf_vector_size(rejected_nodes);
+	for (int i = 0; i < rejected_node_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		cf_node requesting_nodeid = *((cf_node*)cf_vector_getp(rejected_nodes,
+				i));
+
+		// Send the reject message.
+		clustering_join_reject_send(requesting_nodeid);
+	}
+}
+
+/**
+ * Send join reject message for all pending join requests.
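+ *
+ * The pending list is snapshotted and cleared under the lock; the reject
+ * messages themselves are sent after the lock is released.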
+ */
+static void
+clustering_join_requests_reject_all()
+{
+	CLUSTERING_LOCK();
+
+	cf_vector* rejected_nodes = vector_stack_lockless_create(cf_node);
+	vector_copy_unique(rejected_nodes, &g_clustering.pending_join_requests);
+
+	vector_clear(&g_clustering.pending_join_requests);
+
+	CLUSTERING_UNLOCK();
+
+	clustering_join_requests_reject(rejected_nodes);
+
+	cf_vector_destroy(rejected_nodes);
+}
+
+/**
+ * Send a join request to a principal.
+ * @param new_principal the destination principal node.
+ * @return 0 on successful message queue, -1 on failure.
+ */
+static int
+clustering_join_request_send(cf_node new_principal)
+{
+	int rv = -1;
+	CLUSTERING_LOCK();
+
+	msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST);
+
+	DETAIL("sending cluster join request to node %"PRIx64, new_principal);
+
+	if (msg_node_send(msg, new_principal) == 0) {
+		cf_clock now = cf_getms();
+		cf_shash_put(g_clustering.join_request_blackout, &new_principal, &now);
+
+		g_clustering.last_join_request_principal = new_principal;
+		g_clustering.last_join_request_sent_time =
+				g_clustering.last_join_request_retransmit_time = cf_getms();
+
+		INFO("sent cluster join request to %"PRIx64, new_principal);
+		rv = 0;
+	}
+
+	// Send early reject to all nodes that have sent us a join request in the
+	// orphan state, because self node is not going to become a principal node.
+	// This allows the requesting nodes to send requests to other
+	// (potential) principals.
+	clustering_join_requests_reject_all();
+
+	CLUSTERING_UNLOCK();
+	return rv;
+}
+
+/**
+ * Retransmit a join request to a previously attempted principal.
+ * @param last_join_request_principal the principal to retransmit to.
+ */
+static void
+clustering_join_request_retransmit(cf_node last_join_request_principal)
+{
+	CLUSTERING_LOCK();
+	cf_node new_principal = g_clustering.last_join_request_principal;
+	g_clustering.last_join_request_retransmit_time = cf_getms();
+	CLUSTERING_UNLOCK();
+
+	if (new_principal != last_join_request_principal) {
+		// The last attempted principal has changed. Don't retransmit.
+		return;
+	}
+
+	msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST);
+	DETAIL("re-sending cluster join request to node %"PRIx64, new_principal);
+	if (msg_node_send(msg, new_principal) == 0) {
+		DEBUG("re-sent cluster join request to %"PRIx64, new_principal);
+	}
+}
+
+/**
+ * Remove nodes for which join requests are blocked.
+ *
+ * @param requestees the nodes considered for join requests.
+ * @param target the result with requestees that are not blocked.
+ */
+static void
+clustering_join_request_filter_blocked(cf_vector* requestees, cf_vector* target)
+{
+	CLUSTERING_LOCK();
+	cf_clock last_sent;
+	int requestee_count = cf_vector_size(requestees);
+	for (int i = 0; i < requestee_count; i++) {
+		cf_node requestee;
+		cf_vector_get(requestees, i, &requestee);
+		if (cf_shash_get(g_clustering.join_request_blackout, &requestee,
+				&last_sent) != CF_SHASH_OK) {
+			// The requestee is not marked for blackout.
+			cf_vector_append(target, &requestee);
+		}
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Send a cluster join request to a neighboring principal. If
+ * preferred_principal is set and it is an eligible neighboring principal, a
+ * request is sent to that principal, else this function cycles among eligible
+ * neighboring principals at each call.
+ *
+ * A request will not be sent if there is no neighboring principal.
+ *
+ * @param preferred_principal the preferred principal to join. Use zero if
+ * there is no preference.
+ * @return 0 if the join request was sent or there is one in progress; -1 if
+ * there are no principals to try and send the join request to.
+ */
+static as_clustering_join_request_result
+clustering_principal_join_request_attempt(cf_node preferred_principal)
+{
+	CLUSTERING_LOCK();
+
+	as_clustering_join_request_result rv = AS_CLUSTERING_JOIN_REQUEST_SENT;
+	cf_vector* neighboring_principals = vector_stack_lockless_create(cf_node);
+	cf_vector* eligible_principals = vector_stack_lockless_create(cf_node);
+
+	// Get list of neighboring principals.
+	clustering_neighboring_principals_get(neighboring_principals);
+	if (cf_vector_size(neighboring_principals) == 0) {
+		DEBUG("no neighboring principal found - not sending join request");
+		rv = AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS;
+		goto Exit;
+	}
+
+	clustering_join_request_filter_blocked(neighboring_principals,
+			eligible_principals);
+
+	if (cf_vector_size(eligible_principals) == 0) {
+		DETAIL("no eligible principals found to make a join request");
+		// All neighboring principals are still in the blackout list. Do not
+		// send a request.
+		rv = AS_CLUSTERING_JOIN_REQUEST_PENDING;
+		goto Exit;
+	}
+
+	int next_join_request_principal_index = -1;
+
+	// We have some well-formed neighboring clusters - try and join them.
+	if (preferred_principal != 0) {
+		int preferred_principal_index = vector_find(eligible_principals,
+				&preferred_principal);
+		if (preferred_principal_index >= 0) {
+			DETAIL("sending join request to preferred principal %"PRIx64,
+					preferred_principal);
+
+			// Update the index of the principal to try.
+			next_join_request_principal_index = preferred_principal_index;
+		}
+	}
+
+	if (next_join_request_principal_index == -1) {
+		// Choose the first entry, since we have no valid preferred principal.
+		next_join_request_principal_index = 0;
+		if (g_clustering.last_join_request_principal != 0) {
+			// Choose the node after the current principal. If the current
+			// principal is not found we start at index 0, else the next index.
+			next_join_request_principal_index = vector_find(eligible_principals,
+					&g_clustering.last_join_request_principal) + 1;
+		}
+	}
+
+	// Forget the fact that a join request is pending for a principal.
+	g_clustering.last_join_request_principal = 0;
+
+	cf_node* principal_to_try = cf_vector_getp(eligible_principals,
+			next_join_request_principal_index
+					% cf_vector_size(eligible_principals));
+
+	if (principal_to_try) {
+		rv = clustering_join_request_send(*principal_to_try) == 0 ?
+				AS_CLUSTERING_JOIN_REQUEST_SENT :
+				AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED;
+	}
+	else {
+		DEBUG("no neighboring principal found - not sending join request");
+		rv = AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS;
+	}
+
+Exit:
+	if (rv != AS_CLUSTERING_JOIN_REQUEST_SENT) {
+		// Forget the last principal we sent the join request to.
+		g_clustering.last_join_request_principal = 0;
+		g_clustering.last_join_request_sent_time = 0;
+	}
+
+	CLUSTERING_UNLOCK();
+
+	cf_vector_destroy(neighboring_principals);
+	cf_vector_destroy(eligible_principals);
+
+	return rv;
+}
+
+/**
+ * Send a cluster join request to a neighboring orphan who this node thinks
+ * will be best suited to form a new cluster.
+ */
+static as_clustering_join_request_result
+clustering_orphan_join_request_attempt()
+{
+	CLUSTERING_LOCK();
+
+	// Get list of neighboring orphans.
+	cf_vector* orphans = vector_stack_lockless_create(cf_node);
+	clustering_neighboring_orphans_get(orphans);
+
+	// Get filtered list of orphans.
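+	// The prospective cluster is the neighboring orphans plus self, clique
+	// evicted and sorted by descending nodeid; a join request is sent only
+	// if the would-be principal (the highest nodeid) is not self.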
+	cf_vector* new_succession_list = vector_stack_lockless_create(cf_node);
+	clustering_join_request_filter_blocked(orphans, new_succession_list);
+
+	log_cf_node_vector("neighboring orphans for join request:",
+			new_succession_list, CF_DEBUG);
+
+	// Add self node.
+	cf_node self_nodeid = config_self_nodeid_get();
+	cf_vector_append_unique(new_succession_list, &self_nodeid);
+
+	clustering_succession_list_clique_evict(new_succession_list,
+			"clique based evicted nodes for potential cluster:");
+
+	// Sort the new succession list.
+	vector_sort_unique(new_succession_list, cf_node_compare_desc);
+
+	as_clustering_join_request_result rv =
+			AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS;
+
+	if (cf_vector_size(new_succession_list) > 0) {
+		cf_node new_principal = *((cf_node*)cf_vector_getp(new_succession_list,
+				0));
+		if (new_principal == config_self_nodeid_get()) {
+			// No need to send self a join request.
+			goto Exit;
+		}
+		else {
+			rv = clustering_join_request_send(new_principal) == 0 ?
+					AS_CLUSTERING_JOIN_REQUEST_SENT :
+					AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED;
+		}
+	}
+
+Exit:
+	cf_vector_destroy(new_succession_list);
+	cf_vector_destroy(orphans);
+
+	CLUSTERING_UNLOCK();
+	return rv;
+}
+
+/**
+ * Remove nodes from the blackout hash once they have been in the list for
+ * longer than the blackout period.
+ */
+int
+clustering_join_request_blackout_tend_reduce(const void* key, void* data,
+		void* udata)
+{
+	cf_clock* join_request_send_time = (cf_clock*)data;
+	if (*join_request_send_time + join_request_blackout_interval()
+			< cf_getms()) {
+		return CF_SHASH_REDUCE_DELETE;
+	}
+	return CF_SHASH_OK;
+}
+
+/**
+ * Tend the join request blackout data structure to remove blacked out
+ * principals.
+ */
+static void
+clustering_join_request_blackout_tend()
+{
+	CLUSTERING_LOCK();
+	cf_shash_reduce(g_clustering.join_request_blackout,
+			clustering_join_request_blackout_tend_reduce, NULL);
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Send a cluster join request to a neighboring principal if one exists, else
+ * if there are no neighboring principals, send a join request to a neighboring
+ * orphan node if this node thinks it will win paxos and become the new
+ * principal.
+ */
+static as_clustering_join_request_result
+clustering_join_request_attempt()
+{
+	clustering_join_request_blackout_tend();
+
+	CLUSTERING_LOCK();
+	cf_node last_join_request_principal =
+			g_clustering.last_join_request_principal;
+	cf_clock last_join_request_sent_time =
+			g_clustering.last_join_request_sent_time;
+	cf_clock last_join_request_retransmit_time =
+			g_clustering.last_join_request_retransmit_time;
+	CLUSTERING_UNLOCK();
+
+	// Check if the outgoing join request has timed out.
+	if (last_join_request_principal
+			&& as_hb_is_alive(last_join_request_principal)) {
+		if (last_join_request_sent_time + join_request_timeout() > cf_getms()) {
+			if (last_join_request_retransmit_time
+					+ join_request_retransmit_timeout() < cf_getms()) {
+				// Re-transmit join request to the same principal, to cover the
+				// case where the previous join request was lost.
+				clustering_join_request_retransmit(last_join_request_principal);
+			}
+			// Wait for the principal to respond. Do nothing.
+			DETAIL(
+					"join request to principal %"PRIx64" pending - not attempting new join request",
+					last_join_request_principal);
+
+			return AS_CLUSTERING_JOIN_REQUEST_PENDING;
+		}
+		// Timeout joining a principal. Choose a different principal.
+ INFO("join request timed out for principal %"PRIx64, + last_join_request_principal); + + } + + // Try sending a join request to a neighboring principal. + as_clustering_join_request_result rv = + clustering_principal_join_request_attempt(0); + + if (rv != AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS) { + // There are valid principals around. Don't send a request to + // neighboring orphan nodes. + return rv; + } + + // Send a join request to an orphan node, best suited to be the new + // principal. + return clustering_orphan_join_request_attempt(); +} + +/** + * Try to become a principal and start a new cluster. + */ +static void +clustering_cluster_form() +{ + ASSERT(clustering_is_orphan(), + "should not attempt forming new cluster when not an orphan node"); + + CLUSTERING_LOCK(); + bool paxos_proposal_started = false; + cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); + cf_vector* expected_succession_list = vector_stack_lockless_create(cf_node); + cf_vector* orphans = vector_stack_lockless_create(cf_node); + + clustering_neighboring_orphans_get(orphans); + vector_copy(new_succession_list, orphans); + + log_cf_node_vector("neighboring orphans for cluster formation:", + new_succession_list, + cf_vector_size(new_succession_list) > 0 ? CF_INFO : CF_DEBUG); + log_cf_node_vector("pending join requests:", + &g_clustering.pending_join_requests, + cf_vector_size(&g_clustering.pending_join_requests) > 0 ? + CF_INFO : CF_DEBUG); + + // Add self node. + cf_node self_nodeid = config_self_nodeid_get(); + cf_vector_append_unique(new_succession_list, &self_nodeid); + + clustering_succession_list_clique_evict(new_succession_list, + "clique based evicted nodes at cluster formation:"); + + // Sort the new succession list. + vector_sort_unique(new_succession_list, cf_node_compare_desc); + + cf_vector_append(expected_succession_list, &self_nodeid); + vector_copy_unique(expected_succession_list, + &g_clustering.pending_join_requests); + // Sort the expected succession list. + vector_sort_unique(expected_succession_list, cf_node_compare_desc); + // The result should match the pending join requests exactly to consider the + // new succession list. + if (!vector_equals(expected_succession_list, new_succession_list)) { + log_cf_node_vector( + "skipping forming cluster - cannot form new cluster from pending join requests", + &g_clustering.pending_join_requests, CF_INFO); + goto Exit; + } + + if (cf_vector_size(orphans) > 0 + && cf_vector_size(new_succession_list) == 1) { + log_cf_node_vector( + "skipping forming cluster - there are neighboring orphans that cannot be clustered with", + orphans, CF_INFO); + goto Exit; + } + + if (cf_vector_size(new_succession_list) > 0) { + cf_node new_principal = *((cf_node*)cf_vector_getp(new_succession_list, + 0)); + if (new_principal == config_self_nodeid_get()) { + log_cf_node_vector( + "principal node - forming new cluster with succession list:", + new_succession_list, CF_INFO); + + as_paxos_start_result result = paxos_proposer_proposal_start( + new_succession_list, new_succession_list); + + // Log paxos result. + paxos_result_log(result, new_succession_list); + + paxos_proposal_started = (result == AS_PAXOS_RESULT_STARTED); + } + else { + INFO("skipping cluster formation - a new potential principal %"PRIx64" exists", + new_principal); + } + } + +Exit: + // Compute list of rejected nodes. + if (paxos_proposal_started) { + // Nodes in set (pending_join - new succession list) could not be + // accomodated and should receive a join reject. 
+		vector_subtract(&g_clustering.pending_join_requests,
+				new_succession_list);
+	}
+	else {
+		// Reject all pending join requests. Will happen below.
+	}
+
+	cf_vector* rejected_nodes = vector_stack_lockless_create(cf_node);
+	vector_copy_unique(rejected_nodes, &g_clustering.pending_join_requests);
+
+	// Clear the pending join requests.
+	vector_clear(&g_clustering.pending_join_requests);
+
+	// Send reject messages to rejected nodes.
+	clustering_join_requests_reject(rejected_nodes);
+
+	cf_vector_destroy(rejected_nodes);
+
+	cf_vector_destroy(orphans);
+	cf_vector_destroy(expected_succession_list);
+	cf_vector_destroy(new_succession_list);
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Try to join a cluster if there is a neighboring one,
+ * else try to form one.
+ */
+static void
+clustering_join_or_form_cluster()
+{
+	ASSERT(clustering_is_orphan(),
+			"should not attempt forming new cluster when not an orphan node");
+
+	if (paxos_proposer_proposal_is_active()) {
+		// There is an active paxos round with this node as the proposed
+		// principal.
+		// Skip join cluster attempt and give current paxos round a chance to
+		// form the cluster.
+		return;
+	}
+
+	CLUSTERING_LOCK();
+
+	// TODO (Discuss this): after some timeout and exhausting all neighboring
+	// principals, become a single node cluster / try our own cluster. This
+	// might not be required. Nonetheless discuss and figure this out. Current
+	// behaviour is to form a new cluster after a timeout.
+
+	// A node has been orphaned for too long if it has attempted a join
+	// request which timed out and it has been in the orphan state for a
+	// while.
+	bool orphan_for_too_long = (clustering_orphan_timeout()
+			+ g_clustering.orphan_state_start_time) < cf_getms()
+			&& g_clustering.last_join_request_principal
+			&& g_clustering.last_join_request_sent_time + join_request_timeout()
+					< cf_getms();
+
+	if (orphan_for_too_long
+			|| clustering_join_request_attempt()
+					== AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS) {
+		// No neighboring principal found or we have been orphan for too long,
+		// try and form a new cluster.
+		clustering_cluster_form();
+	}
+	else {
+		// A join request was sent successfully or is pending. Wait for the
+		// new principal to respond.
+
+		// We are not going to be a principal node in this quantum, reject all
+		// pending join requests.
+		clustering_join_requests_reject_all();
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Get a list of nodes that need to be added to the current succession list
+ * from pending join requests. Basically filters out nodes that are not
+ * orphans.
+ */
+static void
+clustering_nodes_to_add_get(cf_vector* nodes_to_add)
+{
+	CLUSTERING_LOCK();
+
+	// Use a single iteration over the clustering data received via the
+	// heartbeats instead of individual calls to get a consistent view and
+	// avoid small lock and release.
+	as_hb_plugin_data_iterate(&g_clustering.pending_join_requests,
+			AS_HB_PLUGIN_CLUSTERING, clustering_orphan_nodes_find,
+			nodes_to_add);
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle quantum interval start in the orphan state. Try and join / form a
+ * cluster.
+ */
+static void
+clustering_orphan_quantum_interval_start_handle()
+{
+	if (!as_hb_self_is_duplicate()) {
+		// Try to join a cluster or form a new one.
+		clustering_join_or_form_cluster();
+	}
+}
+
+/**
+ * Send a cluster move command to all nodes in the input list.
+ *
+ * @param candidate_principal the principal to which the other nodes should try
+ * and join after receiving the move command.
+ * @param cluster_key current cluster key for receiver validation.
+ * @param nodeids the nodes to send the move command to.
+ */
+static void
+clustering_cluster_move_send(cf_node candidate_principal,
+		as_cluster_key cluster_key, cf_vector* nodeids)
+{
+	msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_MERGE_MOVE);
+
+	// Set the proposed principal.
+	msg_proposed_principal_set(msg, candidate_principal);
+
+	// Set cluster key for message validation.
+	msg_cluster_key_set(msg, cluster_key);
+
+	log_cf_node_vector("cluster merge move command sent to:", nodeids,
+			CF_DEBUG);
+
+	// Send the message to the nodes.
+	msg_nodes_send(msg, nodeids);
+}
+
+/**
+ * Update preferred principal votes using hb plugin data.
+ */
+static void
+clustering_principal_preferred_principal_votes_count(cf_node nodeid,
+		void* plugin_data, size_t plugin_data_size, cf_clock recv_monotonic_ts,
+		as_hlc_msg_timestamp* msg_hlc_ts, void* udata)
+{
+	// A hash from each unique preferred principal to the number of votes it
+	// has received.
+	cf_shash* preferred_principal_votes = (cf_shash*)udata;
+
+	CLUSTERING_LOCK();
+	if (!clustering_hb_plugin_data_is_obsolete(
+			g_register.cluster_modified_hlc_ts,
+			g_register.cluster_modified_time, plugin_data, plugin_data_size,
+			recv_monotonic_ts, msg_hlc_ts)) {
+		cf_node* preferred_principal_p =
+				clustering_hb_plugin_preferred_principal_get(plugin_data,
+						plugin_data_size);
+
+		int current_votes = 0;
+		if (cf_shash_get(preferred_principal_votes, preferred_principal_p,
+				&current_votes) == CF_SHASH_OK) {
+			current_votes++;
+		}
+		else {
+			// We are seeing this preferred principal for the first time -
+			// this node contributes the first vote.
+			current_votes = 1;
+		}
+
+		cf_shash_put(preferred_principal_votes, preferred_principal_p,
+				&current_votes);
+	}
+	else {
+		DETAIL(
+				"preferred principal voting skipped - found obsolete plugin data for node %"PRIx64,
+				nodeid);
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Get the preferred majority principal.
+ */
+static int
+clustering_principal_preferred_principal_majority_find(const void* key,
+		void* data, void* udata)
+{
+	const cf_node* current_preferred_principal = (const cf_node*)key;
+	int current_preferred_principal_votes = *(int*)data;
+	cf_node* majority_preferred_principal = (cf_node*)udata;
+
+	CLUSTERING_LOCK();
+	int preferred_principal_majority = (int)ceil(
+			cf_vector_size(&g_register.succession_list)
+					* AS_CLUSTERING_PREFERRRED_PRINCIPAL_MAJORITY);
+	bool is_majority = current_preferred_principal_votes
+			>= preferred_principal_majority;
+	CLUSTERING_UNLOCK();
+
+	if (is_majority) {
+		*majority_preferred_principal = *current_preferred_principal;
+		// Majority found, halt reduce.
+		return CF_SHASH_ERR_FOUND;
+	}
+
+	return CF_SHASH_OK;
+}
+
+/**
+ * Get preferred principal based on a majority of the non-principals' preferred
+ * principals.
+ * @return the preferred principal nodeid if there is a majority, else zero.
+ */
+static cf_node
+clustering_principal_majority_preferred_principal_get()
+{
+	// A hash from each unique preferred principal to the number of votes it
+	// has received.
+	cf_shash* preferred_principal_votes = cf_shash_create(cf_nodeid_shash_fn,
+			sizeof(cf_node), sizeof(int), AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT,
+			0);
+
+	CLUSTERING_LOCK();
+
+	// Use a single iteration over the clustering data received via the
+	// heartbeats instead of individual calls to get a consistent view and
+	// avoid small lock and release.
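+	// Each cluster member's plugin data contributes one vote for its
+	// preferred principal; the reduce below then looks for a candidate with
+	// a majority of the votes.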
+	as_hb_plugin_data_iterate(&g_register.succession_list,
+			AS_HB_PLUGIN_CLUSTERING,
+			clustering_principal_preferred_principal_votes_count,
+			preferred_principal_votes);
+
+	// Find the majority preferred principal.
+	cf_node preferred_principal = 0;
+	cf_shash_reduce(preferred_principal_votes,
+			clustering_principal_preferred_principal_majority_find,
+			&preferred_principal);
+
+	CLUSTERING_UNLOCK();
+
+	cf_shash_destroy(preferred_principal_votes);
+
+	DETAIL("preferred principal is %"PRIx64, preferred_principal);
+
+	return preferred_principal;
+}
+
+/**
+ * Indicates if the given node is a principal whose cluster can be merged with
+ * this principal node's cluster.
+ *
+ * @param nodeid the candidate nodeid.
+ * @param node_succession_list the candidate node's succession list.
+ * @param node_succession_list_length the length of the node's succession list.
+ * @return true if the candidate's cluster can be merged with this node's
+ * cluster.
+ */
+bool
+clustering_is_merge_candidate(cf_node nodeid, cf_node* node_succession_list,
+		int node_succession_list_length)
+{
+	if (node_succession_list_length <= 0 || node_succession_list[0] != nodeid) {
+		// Not a principal node. Ignore.
+		return false;
+	}
+
+	if (nodeid < config_self_nodeid_get()) {
+		// Has a smaller nodeid. Ignore. This node will merge with our cluster.
+		return false;
+	}
+
+	cf_vector* new_succession_list = vector_stack_lockless_create(cf_node);
+
+	CLUSTERING_LOCK();
+	vector_copy_unique(new_succession_list, &g_register.succession_list);
+	CLUSTERING_UNLOCK();
+
+	bool is_candidate = false;
+
+	// Node is the principal of its cluster. Create the new succession list.
+	for (int i = 0; i < node_succession_list_length; i++) {
+		cf_vector_append_unique(new_succession_list, &node_succession_list[i]);
+	}
+
+	int expected_cluster_size = cf_vector_size(new_succession_list);
+
+	// Find and evict the nodes that are not well connected.
+	clustering_succession_list_clique_evict(new_succession_list,
+			"clique based evicted nodes at cluster merge:");
+	int new_cluster_size = cf_vector_size(new_succession_list);
+
+	// If no nodes need to be evicted then the merge is fine.
+	is_candidate = (expected_cluster_size == new_cluster_size);
+
+	cf_vector_destroy(new_succession_list);
+
+	return is_candidate;
+}
+
+/**
+ * HB plugin iterate function to find principals that this node's cluster can
+ * be merged with.
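+ *
+ * Tracks the highest eligible candidate nodeid seen so far by writing it to
+ * the udata output parameter.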
+ */
+static void
+clustering_merge_candidate_find(cf_node nodeid, void* plugin_data,
+		size_t plugin_data_size, cf_clock recv_monotonic_ts,
+		as_hlc_msg_timestamp* msg_hlc_ts, void* udata)
+{
+	cf_node* candidate_principal = (cf_node*)udata;
+
+	CLUSTERING_LOCK();
+
+	if (!clustering_hb_plugin_data_is_obsolete(
+			g_register.cluster_modified_hlc_ts,
+			g_register.cluster_modified_time, plugin_data, plugin_data_size,
+			recv_monotonic_ts, msg_hlc_ts)) {
+		uint32_t* other_succession_list_length =
+				clustering_hb_plugin_succession_length_get(plugin_data,
+						plugin_data_size);
+
+		cf_node* other_succession_list = clustering_hb_plugin_succession_get(
+				plugin_data, plugin_data_size);
+
+		if (other_succession_list != NULL
+				&& clustering_is_merge_candidate(nodeid, other_succession_list,
+						*other_succession_list_length)
+				&& *candidate_principal < nodeid) {
+			DETAIL("principal node %"PRIx64" potential candidate for cluster merge", nodeid);
+			*candidate_principal = nodeid;
+		}
+	}
+	else {
+		DETAIL(
+				"merge check skipped - found obsolete plugin data for node %"PRIx64,
+				nodeid);
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Attempt to move to the majority preferred principal.
+ *
+ * @return 0 if the move to the preferred principal was attempted, -1
+ * otherwise.
+ */
+static int
+clustering_preferred_principal_move()
+{
+	cf_node preferred_principal =
+			clustering_principal_majority_preferred_principal_get();
+
+	if (preferred_principal == 0
+			|| preferred_principal == config_self_nodeid_get()) {
+		return -1;
+	}
+
+	cf_vector* succession_list = vector_stack_lockless_create(cf_node);
+	as_cluster_key cluster_key = 0;
+	CLUSTERING_LOCK();
+	vector_copy(succession_list, &g_register.succession_list);
+	cluster_key = g_register.cluster_key;
+	// Update the time the move command was sent.
+	g_clustering.move_cmd_issue_time = cf_getms();
+	CLUSTERING_UNLOCK();
+
+	INFO("majority nodes find %"PRIx64" to be a better principal - sending move command to all cluster members",
+			preferred_principal);
+	clustering_cluster_move_send(preferred_principal, cluster_key,
+			succession_list);
+	cf_vector_destroy(succession_list);
+
+	return 0;
+}
+
+/**
+ * Attempt to merge with a larger adjacent cluster if the resulting cluster
+ * will form a clique.
+ *
+ * @return 0 if a merge is attempted, -1 otherwise.
+ */
+static int
+clustering_merge_attempt()
+{
+	int rv = -1;
+	CLUSTERING_LOCK();
+	cf_vector* succession_list = vector_stack_lockless_create(cf_node);
+	vector_copy(succession_list, &g_register.succession_list);
+	as_cluster_key cluster_key = g_register.cluster_key;
+	cf_node candidate_principal = 0;
+
+	// Use a single iteration over the clustering data received via the
+	// heartbeats instead of individual calls to get a consistent view and
+	// avoid small lock and release.
+	as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING,
+			clustering_merge_candidate_find, &candidate_principal);
+
+	CLUSTERING_UNLOCK();
+
+	if (candidate_principal == 0) {
+		DEBUG("no cluster merge candidates found");
+		rv = -1;
+		goto Exit;
+	}
+
+	// Send a move command to all nodes in the succession list. Need not
+	// switch to orphan state immediately, this node will receive the move
+	// command too and will handle the move accordingly.
+ INFO("this cluster can merge with cluster with principal %"PRIx64" - sending move command to all cluster members", + candidate_principal); + clustering_cluster_move_send(candidate_principal, cluster_key, + succession_list); + rv = 0; +Exit: + cf_vector_destroy(succession_list); + return rv; +} + +/** + * Handle quantum interval start when self node is the principal of its cluster. + */ +static void +clustering_principal_quantum_interval_start_handle( + as_clustering_internal_event* event) +{ + DETAIL("principal node quantum wakeup"); + + if (as_hb_self_is_duplicate()) { + // Cluster is in a bad shape and self node has a duplicate node-id. + register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); + return; + } + + CLUSTERING_LOCK(); + bool paxos_proposal_started = false; + + cf_vector* dead_nodes = vector_stack_lockless_create(cf_node); + clustering_dead_nodes_find(dead_nodes); + + log_cf_node_vector("dead nodes at quantum start:", dead_nodes, + cf_vector_size(dead_nodes) > 0 ? CF_INFO : CF_DEBUG); + + cf_vector* faulty_nodes = vector_stack_lockless_create(cf_node); + clustering_faulty_nodes_find(faulty_nodes); + + log_cf_node_vector("faulty nodes at quantum start:", faulty_nodes, + cf_vector_size(faulty_nodes) > 0 ? CF_INFO : CF_DEBUG); + + // Having dead node or faulty nodes is a sign of cluster integrity breach. + // New nodes should not count as integrity breach. + g_clustering.has_integrity = cf_vector_size(faulty_nodes) == 0 + && cf_vector_size(dead_nodes) == 0; + + cf_vector* new_nodes = vector_stack_lockless_create(cf_node); + clustering_nodes_to_add_get(new_nodes); + log_cf_node_vector("join requests at quantum start:", new_nodes, + cf_vector_size(new_nodes) > 0 ? CF_INFO : CF_DEBUG); + + cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); + vector_copy_unique(new_succession_list, &g_register.succession_list); + vector_subtract(new_succession_list, dead_nodes); + vector_subtract(new_succession_list, faulty_nodes); + vector_copy_unique(new_succession_list, new_nodes); + + // Add self node. We should not miss self in the succession list, but be + // doubly sure. + cf_node self_nodeid = config_self_nodeid_get(); + cf_vector_append_unique(new_succession_list, &self_nodeid); + + vector_sort_unique(new_succession_list, cf_node_compare_desc); + uint32_t num_evicted = clustering_succession_list_clique_evict( + new_succession_list, + "clique based evicted nodes at quantum start:"); + + if (event->quantum_interval_is_skippable && cf_vector_size(dead_nodes) != 0 + && !quantum_interval_is_adjacency_fault_seen()) { + // There is an imminent adjacency fault that has not been seen by the + // quantum interval generator, lets not take any action. + DEBUG("adjacency fault imminent - skipping quantum interval handling"); + quantum_interval_mark_postponed(); + goto Exit; + } + + if (event->quantum_interval_is_skippable && num_evicted != 0 + && !quantum_interval_is_peer_adjacency_fault_seen()) { + // There is an imminent adjacency fault that has not been seen by the + // quantum interval generator, lets not take any action. + DEBUG( + "peer adjacency fault imminent - skipping quantum interval handling"); + quantum_interval_mark_postponed(); + goto Exit; + } + + if (cf_vector_size(faulty_nodes) == 0 && cf_vector_size(dead_nodes) == 0) { + // We might have only pending join requests. Attempt a move to a + // preferred principal or a merge before trying to add new nodes. 
+		if (clustering_preferred_principal_move() == 0
+				|| clustering_merge_attempt() == 0) {
+			goto Exit;
+		}
+	}
+
+	if (vector_equals(new_succession_list, &g_register.succession_list)
+			&& cf_vector_size(faulty_nodes) == 0) {
+		// There is no change in the succession list and there are no faulty
+		// nodes. (If there were faulty nodes, they probably restarted quickly,
+		// in which case a new cluster transition with the same succession list
+		// would be required.)
+		goto Exit;
+	}
+
+	if (cf_vector_size(faulty_nodes) != 0
+			&& cf_vector_size(new_succession_list) == 1) {
+		// This node most likely lost time (slept/paused) and the rest of the
+		// cluster reformed. It's best to go to the orphan state and start from
+		// there instead of moving to a single node cluster and then eventually
+		// forming a larger cluster again.
+		WARNING(
+				"all cluster members are part of a different cluster - changing state to orphan");
+		register_become_orphan(AS_CLUSTERING_MEMBERSHIP_LOST);
+		goto Exit;
+	}
+
+	// Start a new paxos round.
+	log_cf_node_vector("current succession list", &g_register.succession_list,
+			CF_DEBUG);
+
+	log_cf_node_vector("proposed succession list", new_succession_list,
+			CF_DEBUG);
+	DEBUG("proposed cluster size %d", cf_vector_size(new_succession_list));
+
+	as_paxos_start_result result = paxos_proposer_proposal_start(
+			new_succession_list, new_succession_list);
+
+	// Log the paxos result.
+	paxos_result_log(result, new_succession_list);
+
+	// TODO: Should we move to the orphan state if there are not enough nodes
+	// in the cluster? Tentatively yes....
+	if (result == AS_PAXOS_RESULT_CLUSTER_TOO_SMALL) {
+		register_become_orphan(AS_CLUSTERING_MEMBERSHIP_LOST);
+	}
+
+	paxos_proposal_started = (result == AS_PAXOS_RESULT_STARTED);
+Exit:
+	// Although these are stack vectors the contents can be heap allocated on
+	// resize. A destroy call is prudent.
+	cf_vector_destroy(dead_nodes);
+	cf_vector_destroy(faulty_nodes);
+	cf_vector_destroy(new_nodes);
+
+	// Compute the list of rejected nodes.
+	if (paxos_proposal_started) {
+		// Nodes in the set (pending_join - new succession list) could not be
+		// accommodated and should receive a join reject.
+		vector_subtract(&g_clustering.pending_join_requests,
+				new_succession_list);
+	}
+	else {
+		// Nodes in the set (pending_join - current succession list) could not
+		// be accommodated and should receive a join reject.
+		vector_subtract(&g_clustering.pending_join_requests,
+				&g_register.succession_list);
+	}
+
+	// Destroy new_succession_list only now - it is still read by the rejected
+	// node computation above.
+	cf_vector_destroy(new_succession_list);
+
+	cf_vector* rejected_nodes = vector_stack_lockless_create(cf_node);
+	vector_copy_unique(rejected_nodes, &g_clustering.pending_join_requests);
+
+	// Clear the pending join requests.
+	vector_clear(&g_clustering.pending_join_requests);
+
+	// Send reject messages to the rejected nodes.
+	clustering_join_requests_reject(rejected_nodes);
+
+	cf_vector_destroy(rejected_nodes);
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Check for and handle eviction by the self node's principal.
+ *
+ * @param principal_nodeid the nodeid of the self node's principal.
+ * @param principal_plugin_data the plugin data for the principal.
+ * @param plugin_data_hlc_ts the HLC timestamp when the plugin data was
+ * received.
+ * @param plugin_data_ts the monotonic clock timestamp when the plugin data was
+ * received.
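+ *
+ * Summary of the checks below: this node treats itself as evicted when its
+ * principal is no longer alive, or when fresh (non-obsolete) plugin data from
+ * the principal no longer lists this node in its succession list.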
+ */
+static void
+clustering_non_principal_evicted_check(cf_node principal_nodeid,
+		as_hb_plugin_node_data* principal_plugin_data,
+		as_hlc_msg_timestamp* plugin_data_hlc_ts, cf_clock plugin_data_ts)
+{
+	CLUSTERING_LOCK();
+	bool is_evicted = false;
+
+	if (!as_hb_is_alive(principal_nodeid)) {
+		is_evicted = true;
+		goto Exit;
+	}
+
+	if (!clustering_is_our_principal(principal_nodeid)
+			|| clustering_hb_plugin_data_is_obsolete(
+					g_register.cluster_modified_hlc_ts,
+					g_register.cluster_modified_time,
+					principal_plugin_data->data,
+					principal_plugin_data->data_size, plugin_data_ts,
+					plugin_data_hlc_ts)) {
+		// The plugin data is obsolete. Can't make decisions based on it.
+		goto Exit;
+	}
+
+	// Get the changed node's succession list and cluster key. All the fields
+	// should be present since the obsolete check also verified that the fields
+	// are valid.
+	cf_node* succession_list_p = clustering_hb_plugin_succession_get(
+			principal_plugin_data->data, principal_plugin_data->data_size);
+	uint32_t* succession_list_length_p =
+			clustering_hb_plugin_succession_length_get(
+					principal_plugin_data->data,
+					principal_plugin_data->data_size);
+
+	// Check if we have been evicted.
+	if (!clustering_is_node_in_succession(config_self_nodeid_get(),
+			succession_list_p, *succession_list_length_p)) {
+		is_evicted = true;
+	}
+
+Exit:
+	if (is_evicted) {
+		// This node has been evicted from the cluster.
+		WARNING("evicted from cluster by principal node %"PRIx64" - changing state to orphan",
+				principal_nodeid);
+		register_become_orphan(AS_CLUSTERING_MEMBERSHIP_LOST);
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Monitor plugin data change events for evictions.
+ */
+static void
+clustering_non_principal_hb_plugin_data_changed_handle(
+		as_clustering_internal_event* change_event)
+{
+	clustering_non_principal_evicted_check(
+			change_event->plugin_data_changed_nodeid, change_event->plugin_data,
+			&change_event->plugin_data_changed_hlc_ts,
+			change_event->plugin_data_changed_ts);
+}
+
+/**
+ * Update the preferred principal in the non-principal mode.
+ */
+static void
+clustering_non_principal_preferred_principal_update()
+{
+	cf_node current_principal = 0;
+	if (clustering_principal_get(&current_principal) != 0
+			|| current_principal == 0) {
+		// We are an orphan.
+		return;
+	}
+
+	cf_vector* new_succession_list = vector_stack_lockless_create(cf_node);
+
+	clustering_neighboring_nodes_get(new_succession_list);
+	cf_node self_nodeid = config_self_nodeid_get();
+	cf_vector_append(new_succession_list, &self_nodeid);
+
+	clustering_succession_list_clique_evict(new_succession_list,
+			"clique based evicted nodes while updating preferred principal:");
+
+	// Sort the new succession list.
+	vector_sort_unique(new_succession_list, cf_node_compare_desc);
+
+	cf_node preferred_principal = 0;
+	int new_cluster_size = cf_vector_size(new_succession_list);
+	if (new_cluster_size > 0) {
+		if (vector_find(new_succession_list, &current_principal) < 0) {
+			cf_vector_get(new_succession_list, 0, &preferred_principal);
+		}
+	}
+
+	CLUSTERING_LOCK();
+	if (preferred_principal != 0
+			&& g_clustering.preferred_principal != preferred_principal) {
+		// Log the new preferred principal, not the value it replaces.
+		INFO("preferred principal updated to %"PRIx64, preferred_principal);
+	}
+	g_clustering.preferred_principal = preferred_principal;
+
+	cf_vector_destroy(new_succession_list);
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle quantum interval start in the non-principal state.
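+ *
+ * A sketch of the steps implemented below: reject accumulated join requests,
+ * bail out to the orphan state on a duplicate node-id, refresh the preferred
+ * principal, and finally check whether this node's principal has evicted it.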
+ */
+static void
+clustering_non_principal_quantum_interval_start_handle()
+{
+	// Reject all accumulated join requests since we are no longer a principal.
+	clustering_join_requests_reject_all();
+
+	if (as_hb_self_is_duplicate()) {
+		// Cluster is in a bad shape and the self node has a duplicate node-id.
+		register_become_orphan(AS_CLUSTERING_MEMBERSHIP_LOST);
+		return;
+	}
+
+	// Update the preferred principal.
+	clustering_non_principal_preferred_principal_update();
+
+	// Check if we have been evicted.
+	cf_node principal = 0;
+
+	if (clustering_principal_get(&principal) != 0) {
+		WARNING("could not get principal for self node");
+		return;
+	}
+
+	as_hlc_msg_timestamp plugin_data_hlc_ts;
+	cf_clock plugin_data_ts = 0;
+	as_hb_plugin_node_data plugin_data = { 0 };
+
+	if (clustering_hb_plugin_data_get(principal, &plugin_data,
+			&plugin_data_hlc_ts, &plugin_data_ts) != 0) {
+		plugin_data_ts = 0;
+		memset(&plugin_data, 0, sizeof(plugin_data));
+	}
+
+	clustering_non_principal_evicted_check(principal, &plugin_data,
+			&plugin_data_hlc_ts, plugin_data_ts);
+}
+
+/**
+ * Handle quantum interval start.
+ */
+static void
+clustering_quantum_interval_start_handle(as_clustering_internal_event* event)
+{
+	CLUSTERING_LOCK();
+
+	// Dispatch based on state.
+	switch (g_clustering.state) {
+	case AS_CLUSTERING_STATE_ORPHAN:
+		clustering_orphan_quantum_interval_start_handle();
+		break;
+	case AS_CLUSTERING_STATE_PRINCIPAL:
+		clustering_principal_quantum_interval_start_handle(event);
+		break;
+	case AS_CLUSTERING_STATE_NON_PRINCIPAL:
+		clustering_non_principal_quantum_interval_start_handle();
+		break;
+	default:
+		break;
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle a timer event in the orphan state.
+ */
+static void
+clustering_orphan_timer_event_handle()
+{
+	// Attempt a join request.
+	DETAIL("attempting join request from orphan state");
+	clustering_join_request_attempt();
+}
+
+/**
+ * Handle a timer event for the clustering module.
+ */
+static void
+clustering_timer_event_handle()
+{
+	CLUSTERING_LOCK();
+
+	// Dispatch based on state.
+	switch (g_clustering.state) {
+	case AS_CLUSTERING_STATE_ORPHAN:
+		clustering_orphan_timer_event_handle();
+		break;
+	default:
+		break;
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Check if the incoming message is sane enough to be processed further.
+ */
+static bool
+clustering_message_sanity_check(cf_node src_nodeid, msg* msg)
+{
+	as_cluster_proto_identifier proto;
+	if (msg_proto_id_get(msg, &proto) != 0) {
+		WARNING(
+				"received message with no clustering protocol identifier from node %"PRIx64,
+				src_nodeid);
+		return false;
+	}
+
+	return clustering_versions_are_compatible(proto,
+			clustering_protocol_identifier_get());
+}
+
+/**
+ * Handle an incoming join request. We do not bother with older replays of
+ * join requests because the pending requests are cleaned up during new cluster
+ * formation.
+ */
+static void
+clustering_join_request_handle(as_clustering_internal_event* msg_event)
+{
+	cf_node src_nodeid = msg_event->msg_src_nodeid;
+	DEBUG("received cluster join request from node %"PRIx64, src_nodeid);
+	bool fire_quantum_event = false;
+
+	CLUSTERING_LOCK();
+
+	cf_clock now = cf_getms();
+
+	if (g_clustering.move_cmd_issue_time + join_request_move_reject_interval()
+			> now) {
+		// We have just sent out a move command. Reject this join request.
+ INFO("ignoring join request from node %"PRIx64" since we have just issued a move command", + src_nodeid); + clustering_join_reject_send(src_nodeid); + goto Exit; + } + + if ((!clustering_is_principal() && !clustering_is_orphan()) + || g_clustering.last_join_request_sent_time + join_request_timeout() + >= cf_getms()) { + // Can't handle a join request this node is not the principal right now + // or this node is trying to join another cluster. + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REJECT); + + DETAIL("sent join reject to node %"PRIx64, msg_event->msg_src_nodeid); + + // Sent the message to the acceptors. + msg_node_send(msg, msg_event->msg_src_nodeid); + + goto Exit; + } + + if (vector_find(&g_clustering.pending_join_requests, &src_nodeid) >= 0) { + DEBUG("ignoring join request from node %"PRIx64" since a request is already pending", + src_nodeid); + goto Exit; + } + + // Check if we are receiving a stale or very delayed join request. + int64_t message_delay_estimate = as_hlc_timestamp_diff_ms( + as_hlc_timestamp_now(), msg_event->msg_hlc_ts.send_ts); + if (message_delay_estimate < 0 + || message_delay_estimate > join_request_accept_delay_max()) { + INFO("ignoring stale join request from node %"PRIx64" - delay estimate %lu(ms) ", + src_nodeid, message_delay_estimate); + goto Exit; + } + + // Add this request to the pending queue. + cf_vector_append_unique(&g_clustering.pending_join_requests, &src_nodeid); + + // Generate a join request accepted event for the quantum interval + // generator. + as_clustering_internal_event join_request_event; + memset(&join_request_event, 0, sizeof(join_request_event)); + join_request_event.type = + AS_CLUSTERING_INTERNAL_EVENT_JOIN_REQUEST_ACCEPTED; + join_request_event.join_request_source_nodeid = src_nodeid; + internal_event_dispatch(&join_request_event); + fire_quantum_event = true; + + INFO("accepted join request from node %"PRIx64, src_nodeid); + +Exit: + CLUSTERING_UNLOCK(); + + if (fire_quantum_event) { + internal_event_dispatch(&join_request_event); + } +} + +/** + * Handle an incoming join reject. + */ +static void +clustering_join_reject_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + + DEBUG("received cluster join reject from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + + if (!clustering_is_orphan()) { + // Already part of a cluster. Ignore the reject. + INFO( + "already part of a cluster - ignoring join reject from node %"PRIx64, + src_nodeid); + goto Exit; + } + + if (paxos_proposer_proposal_is_active()) { + // This node is attempting to form a new cluster. + INFO( + "already trying to form a cluster - ignoring join reject from node %"PRIx64, + src_nodeid); + goto Exit; + } + + if (g_clustering.last_join_request_principal == src_nodeid) { + // This node had requested the source principal for cluster membership + // which was rejected. Try and join a different cluster. + + // This join request should not be considered as pending, so reset the + // join request sent time. + g_clustering.last_join_request_sent_time = 0; + g_clustering.last_join_request_principal = 0; + clustering_join_request_attempt(); + } + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming merge move command. Basically this node switched to orphan + * state and sends a join request to the principal listed in the merge move. 
+ */
+static void
+clustering_merge_move_handle(as_clustering_internal_event* event)
+{
+	cf_node src_nodeid = event->msg_src_nodeid;
+
+	DEBUG("received cluster merge move from node %"PRIx64, src_nodeid);
+
+	CLUSTERING_LOCK();
+
+	as_cluster_key msg_cluster_key = 0;
+	msg_cluster_key_get(event->msg, &msg_cluster_key);
+
+	if (clustering_is_orphan()) {
+		// Already an orphan node. Ignore the move command.
+		INFO(
+				"already orphan node - ignoring merge move command from node %"PRIx64,
+				src_nodeid);
+		goto Exit;
+	}
+
+	if (msg_is_obsolete(g_register.cluster_modified_hlc_ts,
+			g_register.cluster_modified_time, event->msg_recvd_ts,
+			&event->msg_hlc_ts) || !clustering_is_our_principal(src_nodeid)
+			|| paxos_proposer_proposal_is_active()
+			|| msg_cluster_key != g_register.cluster_key) {
+		INFO("ignoring cluster merge move from node %"PRIx64, src_nodeid);
+		goto Exit;
+	}
+
+	// The Madril simulation blacklists the current principal so that we do not
+	// end up joining it again immediately. However, the obsolete data check
+	// above should make that Madril check redundant.
+	cf_node new_principal = 0;
+
+	if (msg_proposed_principal_get(event->msg, &new_principal) != 0) {
+		// The move command does not have a proposed principal.
+		WARNING(
+				"received merge move command without a proposed principal - will join the first available principal");
+		new_principal = 0;
+	}
+
+	// Switch to the orphan cluster state so that we move to the new principal.
+	register_become_orphan(AS_CLUSTERING_ATTEMPTING_MERGE);
+
+	// Send a join request to the new principal.
+	clustering_principal_join_request_attempt(new_principal);
+Exit:
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle an incoming message.
+ */
+static void
+clustering_msg_event_handle(as_clustering_internal_event* msg_event)
+{
+	// Delegate handling based on message type.
+	switch (msg_event->msg_type) {
+	case AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST:
+		clustering_join_request_handle(msg_event);
+		break;
+	case AS_CLUSTERING_MSG_TYPE_JOIN_REJECT:
+		clustering_join_reject_handle(msg_event);
+		break;
+	case AS_CLUSTERING_MSG_TYPE_MERGE_MOVE:
+		clustering_merge_move_handle(msg_event);
+		break;
+	default: // Non cluster management messages.
+		break;
+	}
+}
+
+/**
+ * Fabric msg listener that generates an internal message event and dispatches
+ * it to the subsystem.
+ */
+static int
+clustering_fabric_msg_listener(cf_node msg_src_nodeid, msg* msg, void* udata)
+{
+	if (!clustering_is_running()) {
+		// Ignore fabric messages when clustering is not running.
+		WARNING("clustering stopped - ignoring message from node %"PRIx64,
+				msg_src_nodeid);
+		goto Exit;
+	}
+
+	// Sanity check.
+	if (!clustering_message_sanity_check(msg_src_nodeid, msg)) {
+		WARNING("invalid message received from node %"PRIx64, msg_src_nodeid);
+		goto Exit;
+	}
+
+	as_clustering_internal_event msg_event;
+	memset(&msg_event, 0, sizeof(msg_event));
+	msg_event.type = AS_CLUSTERING_INTERNAL_EVENT_MSG;
+
+	msg_event.msg_src_nodeid = msg_src_nodeid;
+
+	// Update the HLC and store the message timestamp for the event.
+	as_hlc_timestamp send_ts = 0;
+	msg_send_ts_get(msg, &send_ts);
+	as_hlc_timestamp_update(msg_event.msg_src_nodeid, send_ts,
+			&msg_event.msg_hlc_ts);
+
+	msg_event.msg = msg;
+	msg_event.msg_recvd_ts = cf_getms();
+	msg_type_get(msg, &msg_event.msg_type);
+
+	internal_event_dispatch(&msg_event);
+
+Exit:
+	as_fabric_msg_put(msg);
+	return 0;
+}
+
+/**
+ * Handle register cluster changed.
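+ *
+ * Depending on whether this node now heads the succession list, the state
+ * machine moves to the principal or the non-principal state, and
+ * principal-specific bookkeeping (preferred principal, last join request
+ * principal, move command issue time) is reset.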
+ */
+static void
+clustering_register_cluster_changed_handle()
+{
+	CLUSTERING_LOCK();
+
+	if (paxos_proposer_proposal_is_active()) {
+		paxos_proposer_fail();
+	}
+
+	if (clustering_is_principal()) {
+		g_clustering.state = AS_CLUSTERING_STATE_PRINCIPAL;
+	}
+	else {
+		g_clustering.state = AS_CLUSTERING_STATE_NON_PRINCIPAL;
+		// We are a non-principal. Reject all pending join requests.
+		clustering_join_requests_reject_all();
+	}
+
+	g_clustering.preferred_principal = 0;
+	g_clustering.last_join_request_principal = 0;
+	g_clustering.move_cmd_issue_time = 0;
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle register synced events. Basically this means it is safe to publish the
+ * cluster changed event to external subsystems.
+ */
+static void
+clustering_register_cluster_synced_handle(as_clustering_internal_event* event)
+{
+	CLUSTERING_LOCK();
+
+	// Queue the cluster change event for publishing.
+	as_clustering_event cluster_change_event;
+	cluster_change_event.type = AS_CLUSTERING_CLUSTER_CHANGED;
+	cluster_change_event.qualifier = event->qualifier;
+	cluster_change_event.cluster_key = g_register.cluster_key;
+	cluster_change_event.succession_list = &g_register.succession_list;
+	external_event_queue(&cluster_change_event);
+
+	g_clustering.has_integrity = true;
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle the register going to the orphaned state.
+ */
+static void
+clustering_register_orphaned_handle(as_clustering_internal_event* event)
+{
+	CLUSTERING_LOCK();
+	g_clustering.state = AS_CLUSTERING_STATE_ORPHAN;
+	g_clustering.orphan_state_start_time = cf_getms();
+	g_clustering.preferred_principal = 0;
+
+	// Queue the cluster change event for publishing.
+	as_clustering_event orphaned_event;
+	orphaned_event.type = AS_CLUSTERING_ORPHANED;
+	orphaned_event.qualifier = event->qualifier;
+	orphaned_event.cluster_key = 0;
+	orphaned_event.succession_list = NULL;
+	external_event_queue(&orphaned_event);
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle an HB plugin data change by dispatching it based on the clustering
+ * state.
+ */
+static void
+clustering_hb_plugin_data_changed_event_handle(
+		as_clustering_internal_event* change_event)
+{
+	CLUSTERING_LOCK();
+	switch (g_clustering.state) {
+	case AS_CLUSTERING_STATE_NON_PRINCIPAL:
+		clustering_non_principal_hb_plugin_data_changed_handle(change_event);
+		break;
+	default:
+		break;
+	}
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Handle a heartbeat event.
+ */
+static void
+clustering_hb_event_handle(as_clustering_internal_event* hb_event)
+{
+	for (int i = 0; i < hb_event->hb_n_events; i++) {
+		if (hb_event->hb_events[i].evt == AS_HB_NODE_DEPART
+				&& clustering_is_our_principal(hb_event->hb_events[i].nodeid)) {
+			// Our principal is no longer visible.
+			INFO("principal node %"PRIx64" departed - switching to orphan state",
+					hb_event->hb_events[i].nodeid);
+			register_become_orphan(AS_CLUSTERING_MEMBERSHIP_LOST);
+		}
+	}
+}
+
+/**
+ * Handle the failure of a paxos proposal started by the self node.
+ */
+static void
+clustering_paxos_proposer_fail_handle()
+{
+	// Send a reject to all pending join requesters.
+	clustering_join_requests_reject_all();
+}
+
+/**
+ * Clustering module event handler.
+ */
+static void
+clustering_event_handle(as_clustering_internal_event* event)
+{
+	// Lock to ensure the entire event handling is atomic and parallel events
+	// (hb/fabric) do not interfere.
+ CLUSTERING_LOCK(); + + switch (event->type) { + case AS_CLUSTERING_INTERNAL_EVENT_TIMER: + clustering_timer_event_handle(); + break; + case AS_CLUSTERING_INTERNAL_EVENT_QUANTUM_INTERVAL_START: + clustering_quantum_interval_start_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_HB: + clustering_hb_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED: + clustering_hb_plugin_data_changed_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_MSG: + clustering_msg_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_ORPHANED: + clustering_register_orphaned_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_CHANGED: + clustering_register_cluster_changed_handle(); + break; + case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED: + clustering_register_cluster_synced_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_FAIL: // Send reject message to all + clustering_paxos_proposer_fail_handle(); + break; + default: // Not of interest for main clustering module. + break; + } + + CLUSTERING_UNLOCK(); +} + +/** + * Initialize the template to be used for clustering messages. + */ +static void +clustering_msg_init() +{ + // Register fabric clustering msg type with no processing function: + // This permits getting / putting clustering msgs to be moderated via an + // idle msg queue. + as_fabric_register_msg_fn(M_TYPE_CLUSTERING, g_clustering_msg_template, + sizeof(g_clustering_msg_template), AS_CLUSTERING_MSG_SCRATCH_SIZE, + clustering_fabric_msg_listener, NULL); +} + +/** + * Change listener that updates the first time in current quantum. + */ +static void +clustering_hb_plugin_data_change_listener(cf_node changed_node_id) +{ + if (!clustering_is_running()) { + return; + } + + DETAIL("cluster information change detected for node %"PRIx64, + changed_node_id); + + as_hb_plugin_node_data plugin_data; + as_clustering_internal_event change_event; + memset(&change_event, 0, sizeof(change_event)); + change_event.type = AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED; + change_event.plugin_data_changed_nodeid = changed_node_id; + change_event.plugin_data = &plugin_data; + + if (clustering_hb_plugin_data_get(changed_node_id, &plugin_data, + &change_event.plugin_data_changed_hlc_ts, + &change_event.plugin_data_changed_ts) != 0) { + // Not possible. We should be able to read the plugin data that changed. + return; + } + internal_event_dispatch(&change_event); +} + +/** + * Listen to external heartbeat event and dispatch an internal heartbeat event. + */ +static void +clustering_hb_event_listener(int n_events, as_hb_event_node* hb_node_events, + void* udata) +{ + if (!clustering_is_running()) { + return; + } + + // Wrap the events in an internal event and dispatch. + as_clustering_internal_event hb_event; + memset(&hb_event, 0, sizeof(hb_event)); + hb_event.type = AS_CLUSTERING_INTERNAL_EVENT_HB; + hb_event.hb_n_events = n_events; + hb_event.hb_events = hb_node_events; + + internal_event_dispatch(&hb_event); +} + +/** + * Reform the cluster with the same succession list.This would trigger the + * generation of new partition info and the cluster would get a new cluster key. + * + * @return 0 if new clustering round started, 1 if not principal, -1 otherwise. 
+ */ +static int +clustering_cluster_reform() +{ + int rv = -1; + CLUSTERING_LOCK(); + + cf_vector* dead_nodes = vector_stack_lockless_create(cf_node); + clustering_dead_nodes_find(dead_nodes); + + log_cf_node_vector("recluster: dead nodes - ", dead_nodes, + cf_vector_size(dead_nodes) > 0 ? CF_INFO : CF_DEBUG); + + cf_vector* faulty_nodes = vector_stack_lockless_create(cf_node); + clustering_faulty_nodes_find(faulty_nodes); + + log_cf_node_vector("recluster: faulty nodes - ", faulty_nodes, + cf_vector_size(faulty_nodes) > 0 ? CF_INFO : CF_DEBUG); + + cf_vector* new_nodes = vector_stack_lockless_create(cf_node); + clustering_nodes_to_add_get(new_nodes); + log_cf_node_vector("recluster: pending join requests - ", new_nodes, + cf_vector_size(new_nodes) > 0 ? CF_INFO : CF_DEBUG); + + if (!clustering_is_running() || !clustering_is_principal() + || cf_vector_size(dead_nodes) > 0 + || cf_vector_size(faulty_nodes) > 0 + || cf_vector_size(new_nodes) > 0) { + INFO( + "recluster: skipped - principal %s dead_nodes %d faulty_nodes %d new_nodes %d", + clustering_is_principal() ? "true" : "false", + cf_vector_size(dead_nodes), cf_vector_size(faulty_nodes), + cf_vector_size(new_nodes)); + + if (!clustering_is_principal()) { + // Common case - command will likely be sent to all nodes. + rv = 1; + } + + goto Exit; + } + + cf_vector* succession_list = vector_stack_lockless_create(cf_node); + vector_copy(succession_list, &g_register.succession_list); + + log_cf_node_vector( + "recluster: principal node - reforming new cluster with succession list:", + succession_list, CF_INFO); + + as_paxos_start_result result = paxos_proposer_proposal_start( + succession_list, succession_list); + + // Log paxos result. + paxos_result_log(result, succession_list); + + rv = (result == AS_PAXOS_RESULT_STARTED) ? 0 : -1; + + if (rv == -1) { + INFO("recluster: skipped"); + } + else { + INFO("recluster: triggered..."); + } + + cf_vector_destroy(succession_list); + +Exit: + cf_vector_destroy(dead_nodes); + cf_vector_destroy(faulty_nodes); + cf_vector_destroy(new_nodes); + CLUSTERING_UNLOCK(); + return rv; +} + +/** + * Initialize clustering subsystem. + */ +static void +clustering_init() +{ + if (clustering_is_initialized()) { + return; + } + + CLUSTERING_LOCK(); + memset(&g_clustering, 0, sizeof(g_clustering)); + + // Start out as an orphan cluster. + g_clustering.state = AS_CLUSTERING_STATE_ORPHAN; + g_clustering.orphan_state_start_time = cf_getms(); + + g_clustering.join_request_blackout = cf_shash_create(cf_nodeid_shash_fn, + sizeof(cf_node), sizeof(cf_clock), + AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT, 0); + + vector_lockless_init(&g_clustering.pending_join_requests, cf_node); + + // Register as a plugin with the heartbeat subsystem. + as_hb_plugin clustering_plugin; + memset(&clustering_plugin, 0, sizeof(clustering_plugin)); + + clustering_plugin.id = AS_HB_PLUGIN_CLUSTERING; + // Includes the size for the protocol version, the cluster key, the paxos + // sequence number for current cluster and the preferred principal. + clustering_plugin.wire_size_fixed = sizeof(uint32_t) + + sizeof(as_cluster_key) + sizeof(as_paxos_sequence_number) + + sizeof(cf_node); + // Size of the node in succession list. 
+	clustering_plugin.wire_size_per_node = sizeof(cf_node);
+	clustering_plugin.set_fn = clustering_hb_plugin_set_fn;
+	clustering_plugin.parse_fn = clustering_hb_plugin_parse_data_fn;
+	clustering_plugin.change_listener =
+			clustering_hb_plugin_data_change_listener;
+
+	as_hb_plugin_register(&clustering_plugin);
+
+	// Register as an hb event listener.
+	as_hb_register_listener(clustering_hb_event_listener, NULL);
+
+	// Initialize the fabric message pool.
+	clustering_msg_init();
+
+	// Initialize the external event publisher.
+	external_event_publisher_init();
+
+	// Initialize the register.
+	register_init();
+
+	// Initialize the timer.
+	timer_init();
+
+	// Initialize the quantum interval generator.
+	quantum_interval_generator_init();
+
+	// Initialize paxos.
+	paxos_init();
+
+	g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_STOPPED;
+
+	DETAIL("clustering module initialized");
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Start the clustering sub-system.
+ */
+static void
+clustering_start()
+{
+	if (clustering_is_running()) {
+		return;
+	}
+
+	CLUSTERING_LOCK();
+	g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_RUNNING;
+	CLUSTERING_UNLOCK();
+
+	// Start the quantum interval generator.
+	quantum_interval_generator_start();
+
+	// Start the timer.
+	timer_start();
+
+	// Start the external event publisher.
+	external_event_publisher_start();
+}
+
+/**
+ * Stop the clustering sub-system.
+ */
+static void
+clustering_stop()
+{
+	if (!clustering_is_running()) {
+		return;
+	}
+
+	CLUSTERING_LOCK();
+	g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_SHUTTING_DOWN;
+	CLUSTERING_UNLOCK();
+
+	// Stop the timer.
+	timer_stop();
+
+	// Stop the external event publisher.
+	external_event_publisher_stop();
+
+	CLUSTERING_LOCK();
+	g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_STOPPED;
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Dump clustering state to the logs.
+ */
+static void
+clustering_dump(bool verbose)
+{
+	if (!clustering_is_running()) {
+		INFO("CL: stopped");
+		return;
+	}
+
+	paxos_proposer_dump(verbose);
+	paxos_acceptor_dump(verbose);
+	register_dump(verbose);
+
+	CLUSTERING_LOCK();
+
+	switch (g_clustering.state) {
+	case AS_CLUSTERING_STATE_ORPHAN:
+		INFO("CL: state: orphan");
+		break;
+	case AS_CLUSTERING_STATE_PRINCIPAL:
+		INFO("CL: state: principal");
+		break;
+	case AS_CLUSTERING_STATE_NON_PRINCIPAL:
+		INFO("CL: state: non-principal");
+		break;
+	}
+
+	INFO("CL: %s",
+			g_clustering.has_integrity ? "has integrity" : "integrity fault");
+	cf_node current_principal = 0;
+	if (clustering_principal_get(&current_principal) == 0) {
+		// Report the preferred principal only when it differs from the actual
+		// principal.
+		if (g_clustering.preferred_principal != current_principal) {
+			INFO("CL: preferred principal %"PRIx64,
+					g_clustering.preferred_principal);
+		}
+	}
+
+	if (g_clustering.state == AS_CLUSTERING_STATE_ORPHAN) {
+		INFO("CL: join request sent to principal %"PRIx64,
+				g_clustering.last_join_request_principal);
+		INFO("CL: join request sent time: %"PRIu64" now: %"PRIu64,
+				g_clustering.last_join_request_sent_time, cf_getms());
+	}
+
+	if (verbose) {
+		log_cf_node_vector("CL: pending join requests:",
+				&g_clustering.pending_join_requests, CF_INFO);
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Internal event dispatcher
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Simple dispatcher for events. The order of dispatch is from lower (less
+ * dependent) to higher (more dependent) sub-modules.
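+ *
+ * For example, a quantum interval start event reaches the quantum interval
+ * generator, paxos and the register before the main clustering state machine
+ * handles it, so the state machine always observes updated sub-module state.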
+ */
+static void
+internal_event_dispatch(as_clustering_internal_event* event)
+{
+	// Sub-module dispatch.
+	quantum_interval_generator_event_dispatch(event);
+	paxos_event_dispatch(event);
+	register_event_dispatch(event);
+
+	// Dispatch to the main clustering module.
+	clustering_event_handle(event);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Public API.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Initialize the clustering subsystem.
+ */
+void
+as_clustering_init()
+{
+	clustering_init();
+}
+
+/**
+ * Start the clustering subsystem.
+ */
+void
+as_clustering_start()
+{
+	clustering_start();
+}
+
+/**
+ * Stop the clustering subsystem.
+ */
+void
+as_clustering_stop()
+{
+	clustering_stop();
+}
+
+/**
+ * Reform the cluster with the same succession list. This triggers the
+ * generation of new partition info, and the cluster gets a new cluster key.
+ *
+ * @return 0 if a new clustering round started, 1 if this node is not the
+ * principal, -1 otherwise.
+ */
+int
+as_clustering_cluster_reform()
+{
+	return clustering_cluster_reform();
+}
+
+/**
+ * Return the quantum interval, i.e., the interval at which cluster change
+ * decisions are taken. The unit is milliseconds.
+ */
+uint64_t
+as_clustering_quantum_interval()
+{
+	return quantum_interval();
+}
+
+/**
+ * TEMPORARY - used by paxos only.
+ */
+void
+as_clustering_set_integrity(bool has_integrity)
+{
+	g_clustering.has_integrity = has_integrity;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Clustering info command functions.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * A false return means that either this node is orphaned, or it is undergoing
+ * a cluster change.
+ */
+bool
+as_clustering_has_integrity()
+{
+	return g_clustering.has_integrity;
+}
+
+/**
+ * Indicates if the self node is orphaned.
+ */
+bool
+as_clustering_is_orphan()
+{
+	return clustering_is_orphan();
+}
+
+/**
+ * Dump clustering state to the log.
+ */
+void
+as_clustering_dump(bool verbose)
+{
+	clustering_dump(verbose);
+}
+
+/**
+ * Set the min cluster size.
+ */
+int
+as_clustering_cluster_size_min_set(uint32_t new_cluster_size_min)
+{
+	CLUSTERING_LOCK();
+	int rv = 0;
+	uint32_t cluster_size = cf_vector_size(&g_register.succession_list);
+	if (clustering_is_orphan() || cluster_size >= new_cluster_size_min) {
+		INFO("changing value of min-cluster-size from %u to %u",
+				g_config.clustering_config.cluster_size_min,
+				new_cluster_size_min);
+		g_config.clustering_config.cluster_size_min = new_cluster_size_min;
+	}
+	else {
+		WARNING(
+				"min-cluster-size %u should be <= current cluster size %u - ignoring",
+				new_cluster_size_min, cluster_size);
+		rv = -1;
+	}
+	CLUSTERING_UNLOCK();
+	return rv;
+}
+
+/**
+ * Log a vector of node-ids at the input severity, splitting long vectors over
+ * multiple lines. The call might not work correctly if the vector is not
+ * protected against multi-threaded access.
+ *
+ * @param severity the log severity.
+ * @param context the logging context.
+ * @param file_name the source file name for the log line.
+ * @param line the source file line number for the log line.
+ * @param message the message prefix for each log line. Message and node list
+ * will be separated with a space. Can be NULL for no prefix.
+ * @param nodes the vector of nodes.
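+ *
+ * Illustrative call (the fault context value is an assumption):
+ *
+ *   as_clustering_cf_node_vector_event(CF_INFO, AS_CLUSTERING, __FILE__,
+ *       __LINE__, "succession list:", &g_register.succession_list);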
+ */
+void
+as_clustering_cf_node_vector_event(cf_fault_severity severity,
+		cf_fault_context context, char* file_name, int line, char* message,
+		cf_vector* nodes)
+{
+	as_clustering_cf_node_array_event(severity, context, file_name, line,
+			message, vector_to_array(nodes), cf_vector_size(nodes));
+}
+
+/**
+ * Log an array of node-ids at the input severity, splitting long arrays over
+ * multiple lines. The call might not work correctly if the array is not
+ * protected against multi-threaded access.
+ *
+ * @param severity the log severity.
+ * @param context the logging context.
+ * @param file_name the source file name for the log line.
+ * @param line the source file line number for the log line.
+ * @param message the message prefix for each log line. Message and node list
+ * will be separated with a space. Can be NULL for no prefix.
+ * @param nodes the array of nodes.
+ * @param node_count the count of nodes in the array.
+ */
+void
+as_clustering_cf_node_array_event(cf_fault_severity severity,
+		cf_fault_context context, char* file_name, int line, char* message,
+		cf_node* nodes, int node_count)
+{
+	if (!cf_context_at_severity(context, severity) && severity != CF_TRACE) {
+		return;
+	}
+
+	// Also account for the space following each nodeid.
+	int node_str_len = 2 * (sizeof(cf_node)) + 1;
+
+	int message_length = 0;
+	char copied_message[LOG_LENGTH_MAX()];
+
+	if (message) {
+		// Limit the message length to allow at least one node to fit in the
+		// log line. Accounting for the separator between message and node
+		// list.
+		message_length = MIN(strnlen(message, LOG_LENGTH_MAX() - 1),
+				LOG_LENGTH_MAX() - 1 - node_str_len) + 1;
+
+		// Truncate the message, ensuring NULL termination even when the copy
+		// is clipped.
+		strncpy(copied_message, message, message_length);
+		copied_message[message_length - 1] = 0;
+		message = copied_message;
+	}
+
+	// Allow for the NULL terminator.
+	int nodes_per_line = (LOG_LENGTH_MAX() - message_length - 1) / node_str_len;
+	nodes_per_line = MAX(1, nodes_per_line);
+
+	// Have a buffer large enough to accommodate the message and the nodes per
+	// line, plus the NULL terminator.
+	char log_buffer[message_length + (nodes_per_line * node_str_len) + 1];
+	int output_node_count = 0;
+
+	// Marks the start of the nodeid list in the log line buffer.
+	char* node_buffer_start = log_buffer;
+	if (message) {
+		node_buffer_start += sprintf(log_buffer, "%s ", message);
+	}
+
+	for (int i = 0; i < node_count;) {
+		char* buffer = node_buffer_start;
+
+		for (int j = 0; j < nodes_per_line && i < node_count; j++) {
+			buffer += sprintf(buffer, "%"PRIx64" ", nodes[i]);
+			output_node_count++;
+			i++;
+		}
+
+		// Overwrite the space from the last node on the log line, but only if
+		// at least one node was output.
+		if (buffer != node_buffer_start) {
+			*(buffer - 1) = 0;
+			cf_fault_event(context, severity, file_name, line, "%s",
+					log_buffer);
+		}
+	}
+
+	// Handle the empty vector case.
+	if (output_node_count == 0) {
+		sprintf(node_buffer_start, "(empty)");
+		cf_fault_event(context, severity, file_name, line, "%s", log_buffer);
+	}
+}
diff --git a/as/src/fabric/endpoint.c b/as/src/fabric/endpoint.c
new file mode 100644
index 00000000..d0538b70
--- /dev/null
+++ b/as/src/fabric/endpoint.c
@@ -0,0 +1,880 @@
+/*
+ * endpoint.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "fabric/endpoint.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+
+#include "fault.h"
+#include "socket.h"
+
+#include "base/cfg.h"
+
+/*----------------------------------------------------------------------------
+ * Private internal data structures.
+ *----------------------------------------------------------------------------*/
+typedef struct as_endpoint_collect_udata_s
+{
+	/**
+	 * Collected endpoint pointers.
+	 */
+	const as_endpoint** endpoints;
+
+	/**
+	 * Collected endpoint count.
+	 */
+	uint32_t collected_count;
+} as_endpoint_collect_udata;
+
+typedef struct as_endpoint_to_string_udata_s
+{
+	/**
+	 * Current write pointer.
+	 */
+	char* write_ptr;
+
+	/**
+	 * Buffer remaining capacity.
+	 */
+	size_t buffer_remaining;
+
+	/**
+	 * Number of endpoints converted.
+	 */
+	uint32_t endpoints_converted;
+
+	/**
+	 * Capabilities of the endpoint.
+	 */
+	uint8_t capabilities;
+
+	/**
+	 * Capability mask. Set to 0 to match all the endpoints.
+	 */
+	uint8_t capability_mask;
+} as_endpoint_to_string_udata;
+
+typedef struct as_endpoint_list_overlap_udata_s
+{
+	/**
+	 * Indicates if there was an overlap.
+	 */
+	bool overlapped;
+
+	/**
+	 * Indicates if endpoint capabilities should be ignored.
+	 */
+	bool ignore_capabilities;
+
+	/**
+	 * The other list to compare.
+	 */
+	const as_endpoint_list* other;
+} as_endpoint_list_overlap_udata;
+
+typedef struct as_endpoint_list_endpoint_find_udata_s
+{
+	/**
+	 * Indicates if a match was found.
+	 */
+	bool match_found;
+
+	/**
+	 * Indicates if endpoint capabilities should be ignored.
+	 */
+	bool ignore_capabilities;
+
+	/**
+	 * The endpoint to find.
+	 */
+	const as_endpoint* to_find;
+} as_endpoint_list_endpoint_find_udata;
+
+/*----------------------------------------------------------------------------
+ * Private internal function forward declarations.
+ *----------------------------------------------------------------------------*/
+static bool endpoint_addr_type_is_valid(uint8_t type);
+static size_t endpoint_addr_binary_size(uint8_t type);
+static size_t endpoint_sizeof_by_addr_type(uint8_t addr_type);
+static as_endpoint* endpoint_allocate(uint8_t addr_type);
+static void endpoint_collect_iterate_fn(const as_endpoint* endpoint, void* udata);
+static void endpoint_to_string_iterate(const as_endpoint* endpoint, void* udata);
+static uint8_t endpoint_addr_type_from_cf_ip_addr(const cf_ip_addr* addr);
+static void endpoint_from_sock_cfg(const cf_sock_cfg* src, as_endpoint* endpoint);
+static void endpoint_list_overlap_iterate(const as_endpoint* endpoint, void* udata);
+static void endpoint_list_find_iterate(const as_endpoint* endpoint, void* udata);
+
+static bool endpoints_are_equal(const as_endpoint* endpoint1, const as_endpoint* endpoint2, const bool ignore_capabilities);
+static void endpoints_preference_sort(const as_endpoint* endpoints[], size_t n_endpoints);
+
+/*----------------------------------------------------------------------------
+ * Public API.
+ *----------------------------------------------------------------------------*/
+
+/**
+ * Get the size of an endpoint. Accounts for the variable size of the address
+ * field.
+ * @return the size of the endpoint. Zero if the endpoint's address type is
+ * invalid.
+ */
+size_t
+as_endpoint_sizeof(const as_endpoint* endpoint)
+{
+	return endpoint_sizeof_by_addr_type(endpoint->addr_type);
+}
+
+/**
+ * Enable a capability on an endpoint given its mask.
+ * @param endpoint the endpoint.
+ * @param capability_mask the capability mask.
+ */
+void
+as_endpoint_capability_enable(as_endpoint* endpoint, uint8_t capability_mask)
+{
+	endpoint->capabilities |= capability_mask;
+}
+
+/**
+ * Disable a capability on an endpoint given its mask.
+ * @param endpoint the endpoint.
+ * @param capability_mask the capability mask.
+ */
+void
+as_endpoint_capability_disable(as_endpoint* endpoint, uint8_t capability_mask)
+{
+	endpoint->capabilities &= ~capability_mask;
+}
+
+/**
+ * Connect to an endpoint.
+ *
+ * @param endpoint the peer endpoint to connect to.
+ * @param timeout the overall connect timeout.
+ * @param sock (output) will be populated if the connection is successful.
+ * @return 0 on success, -1 on failure.
+ */
+int
+as_endpoint_connect(const as_endpoint* endpoint, int32_t timeout, cf_socket* sock)
+{
+	if (!endpoint_addr_type_is_valid(endpoint->addr_type)) {
+		return -1;
+	}
+
+	cf_sock_cfg cfg;
+	cf_sock_cfg_init(&cfg, CF_SOCK_OWNER_INVALID);
+	cfg.port = endpoint->port;
+	if (cf_ip_addr_from_binary(endpoint->addr, endpoint_addr_binary_size(endpoint->addr_type),
+			&cfg.addr) <= 0) {
+		return -1;
+	}
+
+	int rv = cf_socket_init_client(&cfg, timeout, sock);
+
+	// Reset the client sock config, because the config is a stack pointer.
+	sock->cfg = NULL;
+	return rv;
+}
+
+/**
+ * Connect to the best matching endpoint in the endpoint list.
+ *
+ * @param endpoint_list the list of endpoints.
+ * @param filter_fn filter function to discard incompatible endpoints. Can be
+ * NULL.
+ * @param filter_udata udata passed on as is to the filter function.
+ * @param timeout the overall connect timeout.
+ * @param sock (output) will be populated if the connection is successful.
+ * @return the connected endpoint on success, NULL if no endpoint could be
+ * connected.
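+ *
+ * Illustrative usage (the timeout value and list variable are assumptions):
+ *
+ *   cf_socket sock;
+ *   const as_endpoint* connected =
+ *       as_endpoint_connect_any(endpoint_list, NULL, NULL, 100, &sock);
+ *   if (connected != NULL) {
+ *       // sock is connected to the most preferred reachable endpoint.
+ *   }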
+ */
+const as_endpoint*
+as_endpoint_connect_any(const as_endpoint_list* endpoint_list,
+		as_endpoint_filter_fn filter_fn, void* filter_udata, int32_t timeout, cf_socket* sock)
+{
+	if (endpoint_list->n_endpoints == 0) {
+		return NULL;
+	}
+
+	const as_endpoint* ordered_endpoints[endpoint_list->n_endpoints];
+	const as_endpoint* rv = NULL;
+
+	as_endpoint_collect_udata collect_udata;
+	collect_udata.endpoints = ordered_endpoints;
+	collect_udata.collected_count = 0;
+
+	// Collect all endpoints in a pointer array.
+	as_endpoint_list_iterate(endpoint_list, endpoint_collect_iterate_fn, &collect_udata);
+
+	// Sort by descending preference.
+	endpoints_preference_sort(ordered_endpoints, endpoint_list->n_endpoints);
+
+	// TODO: Timeout individual connect or have the caller adjust based on
+	// number of endpoints.
+	for (uint8_t i = 0; i < endpoint_list->n_endpoints; i++) {
+		if (filter_fn && !(filter_fn)(ordered_endpoints[i], filter_udata)) {
+			continue;
+		}
+
+		// Try this potential candidate.
+		if (as_endpoint_connect(ordered_endpoints[i], timeout, sock) == 0) {
+			// Connect succeeded.
+			rv = ordered_endpoints[i];
+			break;
+		}
+	}
+
+	return rv;
+}
+
+/**
+ * Convert a socket configuration to an endpoint, filling the caller-supplied
+ * endpoint in place. The destination must be large enough for the source
+ * address type.
+ */
+void
+as_endpoint_from_sock_cfg_fill(const cf_sock_cfg* src, as_endpoint* endpoint)
+{
+	endpoint_from_sock_cfg(src, endpoint);
+}
+
+/**
+ * Convert a socket configuration to an endpoint.
+ * @return a heap allocated, converted endpoint. Should be freed using cf_free
+ * once the endpoint is no longer needed.
+ */
+as_endpoint*
+as_endpoint_from_sock_cfg(const cf_sock_cfg* src)
+{
+	uint8_t addr_type = endpoint_addr_type_from_cf_ip_addr(&src->addr);
+	as_endpoint* endpoint = endpoint_allocate(addr_type);
+	endpoint_from_sock_cfg(src, endpoint);
+	return endpoint;
+}
+
+/**
+ * Convert an endpoint to a cf_sock_addr.
+ * @param endpoint the source endpoint.
+ * @param sock_addr the target socket address.
+ * @return 0 on success, -1 on failure.
+ */
+int
+as_endpoint_to_sock_addr(const as_endpoint* endpoint, cf_sock_addr* sock_addr)
+{
+	sock_addr->port = endpoint->port;
+	return
+			cf_ip_addr_from_binary(endpoint->addr, endpoint_addr_binary_size(endpoint->addr_type),
+					&sock_addr->addr) > 0 ? 0 : -1;
+}
+
+/**
+ * Indicates if an endpoint supports the listed capabilities.
+ * @return true if the endpoint supports the input capabilities.
+ */
+bool
+as_endpoint_capability_is_supported(const as_endpoint* endpoint, uint8_t capability_mask)
+{
+	return (endpoint->capabilities & capability_mask) > 0;
+}
+
+/**
+ * Return the in-memory size in bytes of the endpoint list.
+ * @param endpoint_list the endpoint list.
+ * @param size (output) the size of the list on success.
+ * @return 0 on successful size calculation, -1 otherwise.
+ */
+int
+as_endpoint_list_sizeof(const as_endpoint_list* endpoint_list, size_t* size)
+{
+	return as_endpoint_list_nsizeof(endpoint_list, size, SIZE_MAX);
+}
+
+/**
+ * Return the in-memory size in bytes of the endpoint list, but abort if the
+ * size of the read exceeds the input size.
+ * @param endpoint_list the endpoint list.
+ * @param size (output) the size of the list on success.
+ * @param size_max the maximum size until which parsing will be attempted.
+ * @return 0 on successful size calculation, -1 otherwise.
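+ *
+ * For example, when sizing a list received over the wire, passing the number
+ * of bytes actually read as size_max prevents a truncated or corrupt list
+ * from causing reads past the end of the buffer.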
+ */
+int
+as_endpoint_list_nsizeof(const as_endpoint_list* endpoint_list, size_t* size, size_t size_max)
+{
+	if (!endpoint_list) {
+		*size = 0;
+		return 0;
+	}
+
+	*size = sizeof(as_endpoint_list);
+
+	uint8_t* endpoint_ptr = (uint8_t*) endpoint_list->endpoints;
+	for (int i = 0; i < endpoint_list->n_endpoints; i++) {
+		size_t endpoint_size = as_endpoint_sizeof((as_endpoint*)endpoint_ptr);
+		if (endpoint_size == 0) {
+			// Invalid endpoint. Signal an error.
+			*size = 0;
+			return -1;
+		}
+
+		if (*size + endpoint_size > size_max) {
+			*size = 0;
+			return -1;
+		}
+
+		*size += endpoint_size;
+		endpoint_ptr += endpoint_size;
+	}
+
+	return 0;
+}
+
+/**
+ * Iterate over the endpoints in an endpoint list and invoke the iterate
+ * function for each endpoint.
+ * @param iterate_fn the iterate function invoked for each endpoint in the list.
+ * @param udata passed as is to the iterate function. Useful for getting results
+ * out of the iteration.
+ */
+void
+as_endpoint_list_iterate(const as_endpoint_list* endpoint_list,
+		const as_endpoint_iterate_fn iterate_fn, void* udata)
+{
+	if (!endpoint_list) {
+		return;
+	}
+
+	uint8_t* endpoint_ptr = (uint8_t*) endpoint_list->endpoints;
+
+	for (int i = 0; i < endpoint_list->n_endpoints; i++) {
+		if (iterate_fn) {
+			(iterate_fn)((as_endpoint*) endpoint_ptr, udata);
+		}
+		endpoint_ptr += as_endpoint_sizeof((as_endpoint*) endpoint_ptr);
+	}
+}
+
+/**
+ * Convert a server configuration to an endpoint list, filling the destination
+ * endpoint list in place.
+ * @param serv_cfg the source server configuration.
+ * @param endpoint_list the destination endpoint list.
+ */
+void
+as_endpoint_list_from_serv_cfg_fill(const cf_serv_cfg* serv_cfg, as_endpoint_list* endpoint_list)
+{
+	endpoint_list->n_endpoints = serv_cfg->n_cfgs;
+
+	uint8_t* endpoint_ptr = (uint8_t*) &endpoint_list->endpoints[0];
+	for (int i = 0; i < serv_cfg->n_cfgs; i++) {
+		as_endpoint* endpoint = (as_endpoint*) endpoint_ptr;
+		endpoint_from_sock_cfg(&serv_cfg->cfgs[i], endpoint);
+		endpoint_ptr += as_endpoint_sizeof(endpoint);
+	}
+}
+
+/**
+ * Convert a server configuration to an endpoint list.
+ * @param serv_cfg the server configuration.
+ * @return a heap allocated endpoint list. Should be freed using cf_free
+ * once the endpoint list is no longer needed.
+ */
+as_endpoint_list*
+as_endpoint_list_from_serv_cfg(const cf_serv_cfg* serv_cfg)
+{
+	size_t result_size = sizeof(as_endpoint_list);
+	for (int i = 0; i < serv_cfg->n_cfgs; i++) {
+		result_size += endpoint_sizeof_by_addr_type(
+				endpoint_addr_type_from_cf_ip_addr(&serv_cfg->cfgs[i].addr));
+	}
+
+	as_endpoint_list* endpoint_list = (as_endpoint_list*) cf_malloc(result_size);
+
+	as_endpoint_list_from_serv_cfg_fill(serv_cfg, endpoint_list);
+
+	return endpoint_list;
+}
+
+/**
+ * Compare two endpoint lists for equality.
+ * @param list1 the first list. NULL allowed.
+ * @param list2 the second list. NULL allowed.
+ * @return true iff the lists are equal, false otherwise.
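+ *
+ * Note: a single memcmp of size1 bytes suffices because an endpoint list is
+ * stored as one contiguous block: the list header immediately followed by its
+ * variable-length endpoints.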
+ */
+bool
+as_endpoint_lists_are_equal(const as_endpoint_list* list1, const as_endpoint_list* list2)
+{
+	if (list1 == list2) {
+		return true;
+	}
+
+	if (!list1 || !list2) {
+		return false;
+	}
+
+	size_t size1;
+	if (as_endpoint_list_sizeof(list1, &size1) != 0) {
+		return false;
+	}
+
+	size_t size2;
+	if (as_endpoint_list_sizeof(list2, &size2) != 0) {
+		return false;
+	}
+
+	if (size1 != size2) {
+		return false;
+	}
+
+	return memcmp(list1, list2, size1) == 0;
+}
+
+/**
+ * Check if two lists overlap in at least one endpoint.
+ * @param list1 the first list. NULL allowed.
+ * @param list2 the second list. NULL allowed.
+ * @param ignore_capabilities set to true if the overlap match should ignore
+ * node capabilities, false if capabilities should also be matched.
+ * @return true iff the lists overlap, false otherwise.
+ */
+bool
+as_endpoint_lists_are_overlapping(const as_endpoint_list* list1, const as_endpoint_list* list2,
+		bool ignore_capabilities)
+{
+	if (list1 == list2) {
+		return true;
+	}
+
+	if (!list1 || !list2) {
+		return false;
+	}
+
+	as_endpoint_list_overlap_udata udata;
+	udata.overlapped = false;
+	udata.other = list2;
+	udata.ignore_capabilities = ignore_capabilities;
+
+	as_endpoint_list_iterate(list1, endpoint_list_overlap_iterate, &udata);
+
+	return udata.overlapped;
+}
+
+/**
+ * Convert an endpoint list to a string.
+ * @param endpoint_list the input list. NULL allowed.
+ * @param buffer the output buffer.
+ * @param buffer_capacity the capacity of the output buffer.
+ * @return the number of characters printed (excluding the null byte used to
+ * end output to strings)
+ */
+int
+as_endpoint_list_to_string(const as_endpoint_list* endpoint_list, char* buffer,
+		size_t buffer_capacity)
+{
+	return as_endpoint_list_to_string_match_capabilities(endpoint_list, buffer,
+			buffer_capacity, 0, 0);
+}
+
+/**
+ * Convert an endpoint list to a string, matching capabilities.
+ * @param endpoint_list the input list. NULL allowed.
+ * @param buffer the output buffer.
+ * @param buffer_capacity the capacity of the output buffer.
+ * @param capability_mask specifies which bits to match.
+ * @param capabilities specifies the capabilities to match for.
+ * @return the number of characters printed (excluding the null byte used to
+ * end output to strings)
+ */
+int
+as_endpoint_list_to_string_match_capabilities(
+		const as_endpoint_list* endpoint_list, char* buffer,
+		size_t buffer_capacity, uint8_t capability_mask, uint8_t capabilities)
+{
+	if (!endpoint_list) {
+		buffer[0] = 0;
+		return 0;
+	}
+
+	as_endpoint_to_string_udata udata = { 0 };
+	udata.write_ptr = buffer;
+	udata.buffer_remaining = buffer_capacity;
+	udata.capabilities = capabilities;
+	udata.capability_mask = capability_mask;
+	as_endpoint_list_iterate(endpoint_list, endpoint_to_string_iterate, &udata);
+
+	if (udata.endpoints_converted) {
+		if (udata.endpoints_converted != endpoint_list->n_endpoints) {
+			// Truncation has happened. Add an ellipsis, advancing the write
+			// pointer so the terminator below does not overwrite it.
+			if (udata.buffer_remaining > 4) {
+				int added = sprintf(udata.write_ptr, "...");
+				udata.write_ptr += added;
+				udata.buffer_remaining -= added;
+			}
+		}
+		else {
+			// Remove the dangling comma from the last endpoint.
+			udata.write_ptr--;
+			udata.buffer_remaining++;
+		}
+	}
+
+	// Ensure NULL termination.
+	*udata.write_ptr = 0;
+
+	return buffer_capacity - udata.buffer_remaining;
+}
+
+/**
+ * Populate a dyn buf with endpoints info.
+ * @param endpoint_list the input list. NULL allowed.
+ * @param db the dynamic buffer.
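+ *
+ * Illustrative output appended to db (addresses are made up):
+ *
+ *   endpoint=192.168.1.1:3002:endpoint-tls=192.168.1.1:4011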
+ */
+void
+as_endpoint_list_info(const as_endpoint_list* endpoint_list, cf_dyn_buf* db)
+{
+	size_t endpoint_list_size = 0;
+	as_endpoint_list_sizeof(endpoint_list, &endpoint_list_size);
+	// 4 chars for delimiters, 50 chars for an ipv6 ip and port, rounded to 64.
+	// The + 1 keeps the buffer valid even for a NULL or empty list.
+	size_t endpoint_list_str_size = 64 * endpoint_list_size + 1;
+
+	char endpoint_list_str[endpoint_list_str_size];
+	as_endpoint_list_to_string_match_capabilities(endpoint_list,
+			endpoint_list_str, sizeof(endpoint_list_str), AS_ENDPOINT_TLS_MASK,
+			0);
+
+	cf_dyn_buf_append_string(db, "endpoint=");
+	if (endpoint_list_str[0] != '\0') {
+		cf_dyn_buf_append_string(db, endpoint_list_str);
+	}
+	cf_dyn_buf_append_string(db, ":");
+
+	as_endpoint_list_to_string_match_capabilities(endpoint_list,
+			endpoint_list_str, sizeof(endpoint_list_str), AS_ENDPOINT_TLS_MASK,
+			AS_ENDPOINT_TLS_MASK);
+
+	cf_dyn_buf_append_string(db, "endpoint-tls=");
+	if (endpoint_list_str[0] != '\0') {
+		cf_dyn_buf_append_string(db, endpoint_list_str);
+	}
+}
+
+/*----------------------------------------------------------------------------
+ * Private internal functions.
+ *----------------------------------------------------------------------------*/
+/**
+ * Indicates if the input address type is valid.
+ */
+static bool
+endpoint_addr_type_is_valid(uint8_t type)
+{
+	return type > AS_ENDPOINT_ADDR_TYPE_UNDEF && type < AS_ENDPOINT_ADDR_TYPE_SENTINEL;
+}
+
+/**
+ * Get the size of the binary for the input address type.
+ * TODO: Move to socket API. Not if we support DNS names.
+ */
+static size_t
+endpoint_addr_binary_size(uint8_t type)
+{
+	return (type == AS_ENDPOINT_ADDR_TYPE_IPv4) ? 4 : 16;
+}
+
+/**
+ * Return the sizeof an endpoint given its address type.
+ */
+static size_t
+endpoint_sizeof_by_addr_type(uint8_t addr_type)
+{
+	if (!endpoint_addr_type_is_valid(addr_type)) {
+		// Callers detect invalid endpoints by a zero size.
+		return 0;
+	}
+
+	return sizeof(as_endpoint) + endpoint_addr_binary_size(addr_type);
+}
+
+/**
+ * Convert a cf_ip address to an endpoint address type.
+ */
+static uint8_t
+endpoint_addr_type_from_cf_ip_addr(const cf_ip_addr* addr)
+{
+	return cf_ip_addr_is_legacy(addr) ? AS_ENDPOINT_ADDR_TYPE_IPv4 : AS_ENDPOINT_ADDR_TYPE_IPv6;
+}
+
+/**
+ * Heap allocate an endpoint.
+ */
+static as_endpoint*
+endpoint_allocate(uint8_t addr_type)
+{
+	return cf_malloc(endpoint_sizeof_by_addr_type(addr_type));
+}
+
+/**
+ * Convert a socket configuration to an endpoint.
+ */
+static void
+endpoint_from_sock_cfg(const cf_sock_cfg* src, as_endpoint* endpoint)
+{
+	endpoint->addr_type =
+			cf_ip_addr_is_legacy(&src->addr) ? AS_ENDPOINT_ADDR_TYPE_IPv4 : AS_ENDPOINT_ADDR_TYPE_IPv6;
+	endpoint->port = src->port;
+
+	// We will have allocated the correct binary size.
+	CF_IGNORE_ERROR(
+			cf_ip_addr_to_binary(&src->addr, endpoint->addr,
+					endpoint_addr_binary_size(endpoint->addr_type)));
+
+	endpoint->capabilities = (src->owner == CF_SOCK_OWNER_HEARTBEAT_TLS ||
+			src->owner == CF_SOCK_OWNER_FABRIC_TLS) ? AS_ENDPOINT_TLS_MASK : 0;
+}
+
+/**
+ * Generate a hash for an endpoint, salted with a random tie breaker to
+ * generate random looking shuffles for "equal" endpoints. This is the Jenkins
+ * one-at-a-time hash of the tie breaker concatenated with the endpoint.
+ */
+static uint32_t
+endpoint_sort_hash(const as_endpoint* endpoint, int tie_breaker)
+{
+	uint32_t hash = 0;
+
+	// Hash the tie breaker.
+	uint8_t* key = (uint8_t*)&tie_breaker;
+	for (int i = 0; i < sizeof(tie_breaker); ++i) {
+		hash += *key;
+		hash += (hash << 10);
+		hash ^= (hash >> 6);
+		key++;
+	}
+
+	// Hash the endpoint value.
+ size_t endpoint_size = as_endpoint_sizeof(endpoint); + key = (uint8_t*)endpoint; + for (int i = 0; i < endpoint_size; ++i) { + hash += *key; + hash += (hash << 10); + hash ^= (hash >> 6); + key++; + } + + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + return hash; +} + +/** + * Comparator to sort endpoints in descending order of preference. + */ +static int +endpoint_preference_compare(const void* e1, const void* e2, void* arg) +{ + const as_endpoint* endpoint1 = *(as_endpoint**)e1; + const as_endpoint* endpoint2 = *(as_endpoint**)e2; + int tie_breaker = *((int*)arg); + + // Prefer TLS over clear text. + bool endpoint1_is_tls = as_endpoint_capability_is_supported(endpoint1, AS_ENDPOINT_TLS_MASK); + + bool endpoint2_is_tls = as_endpoint_capability_is_supported(endpoint2, AS_ENDPOINT_TLS_MASK); + + if (endpoint1_is_tls != endpoint2_is_tls) { + return endpoint1_is_tls ? -1 : 1; + } + + // If TLS capabilities match prefer IPv6. + bool endpoint1_is_ipv6 = endpoint1->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6; + bool endpoint2_is_ipv6 = endpoint2->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6; + + if (endpoint1_is_ipv6 != endpoint2_is_ipv6) { + return endpoint1_is_ipv6 ? -1 : 1; + } + + // Used tie breaker parameter to salt the hashes for load balancing. + return endpoint_sort_hash(endpoint1, tie_breaker) - + endpoint_sort_hash(endpoint2, tie_breaker); +} + +/** + * Sort endpoints in place in descending order of preference. + * @param endpoints array of endpoint pointers. + */ +static void +endpoints_preference_sort(const as_endpoint* endpoints[], size_t n_endpoints) +{ + // Random tie breaker to load balance between two equivalent endpoints. + int tie_breaker = rand(); + + qsort_r(endpoints, n_endpoints, sizeof(as_endpoint*), + endpoint_preference_compare, &tie_breaker); +} + +/** + * Iterate and collect all endpoint addresses in passed in udata. + */ +static void +endpoint_collect_iterate_fn(const as_endpoint* endpoint, void* udata) +{ + as_endpoint_collect_udata* endpoints_data = (as_endpoint_collect_udata*) udata; + endpoints_data->endpoints[endpoints_data->collected_count++] = endpoint; +} + +/** + * Iterate over endpoints and convert them to strings. + */ +static void +endpoint_to_string_iterate(const as_endpoint* endpoint, void* udata) +{ + as_endpoint_to_string_udata* to_string_data = + (as_endpoint_to_string_udata*)udata; + + if ((endpoint->capabilities & to_string_data->capability_mask) + != (to_string_data->capabilities & to_string_data->capability_mask)) { + // skip as the capabilities do not match + to_string_data->endpoints_converted++; + return; + } + + char address_buffer[1024]; + int capacity = sizeof(address_buffer); + char* endpoint_str_ptr = address_buffer; + + cf_sock_addr temp_addr; + if (cf_ip_addr_from_binary(endpoint->addr, + endpoint_addr_binary_size(endpoint->addr_type), &temp_addr.addr) + <= 0) { + return; + } + + int rv = 0; + if (endpoint->port) { + temp_addr.port = endpoint->port; + rv = cf_sock_addr_to_string(&temp_addr, endpoint_str_ptr, capacity); + if (rv <= 0) { + return; + } + + capacity -= rv; + endpoint_str_ptr += rv; + rv = snprintf(endpoint_str_ptr, capacity, ","); + } + else { + // Skip port and tls capabilities. + rv = cf_ip_addr_to_string(&temp_addr.addr, endpoint_str_ptr, capacity); + if (rv <= 0) { + return; + } + + capacity -= rv; + endpoint_str_ptr += rv; + rv = snprintf(endpoint_str_ptr, capacity, ","); + } + + if (rv == capacity) { + // Output truncated. Abort. 
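+		// (Aborting without bumping endpoints_converted lets the caller,
+		// as_endpoint_list_to_string_match_capabilities, detect the partial
+		// conversion and append an ellipsis instead.)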
+ return; + } + + int to_write = strnlen(address_buffer, sizeof(address_buffer)); + + // Ensure we leave space for the NULL terminator. + if (to_write + 1 <= to_string_data->buffer_remaining) { + sprintf(to_string_data->write_ptr, "%s", address_buffer); + to_string_data->buffer_remaining -= to_write; + to_string_data->write_ptr += to_write; + to_string_data->endpoints_converted++; + } +} + +/** + * Compare two endpoints for equality. + * @param endpoint1 the first. NULL allowed. + * @param endpoint2 the second endpoint. NULL allowed. + * @param ignore_capabilities indicates if endpoint capabilities should be + * ignored. + * @return true iff the endpoints are equals, false otherwise. + */ +static bool +endpoints_are_equal(const as_endpoint* endpoint1, const as_endpoint* endpoint2, + bool ignore_capabilities) +{ + if (endpoint1 == endpoint2) { + return true; + } + + if (!endpoint1 || !endpoint2) { + return false; + } + + size_t size1 = as_endpoint_sizeof(endpoint1); + if (!size1) { + return false; + } + + size_t size2 = as_endpoint_sizeof(endpoint2); + if (!size2) { + return false; + } + + if (size1 != size2) { + return false; + } + + return (ignore_capabilities || endpoint1->capabilities == endpoint2->capabilities) + && endpoint1->port == endpoint2->port && endpoint1->addr_type == endpoint2->addr_type + && memcmp(endpoint1->addr, endpoint2->addr, endpoint_addr_binary_size(endpoint1->addr_type)) == 0; +} + +/** + * Iterate function to find an overlap. + */ +static void +endpoint_list_overlap_iterate(const as_endpoint* endpoint, void* udata) +{ + as_endpoint_list_overlap_udata* overlap_udata = (as_endpoint_list_overlap_udata*) udata; + as_endpoint_list_endpoint_find_udata find_udata; + find_udata.match_found = false; + find_udata.ignore_capabilities = overlap_udata->ignore_capabilities; + find_udata.to_find = endpoint; + + as_endpoint_list_iterate(overlap_udata->other, endpoint_list_find_iterate, &find_udata); + + overlap_udata->overlapped |= find_udata.match_found; +} + +/** + * Iterate function to search for an endpoint. + */ +static void +endpoint_list_find_iterate(const as_endpoint* endpoint, void* udata) +{ + as_endpoint_list_endpoint_find_udata* find_udata = (as_endpoint_list_endpoint_find_udata*) udata; + + const as_endpoint* to_find = find_udata->to_find; + if (!to_find) { + return; + } + + find_udata->match_found |= endpoints_are_equal(endpoint, to_find, + find_udata->ignore_capabilities); +} diff --git a/as/src/fabric/exchange.c b/as/src/fabric/exchange.c new file mode 100644 index 00000000..fe9a94d1 --- /dev/null +++ b/as/src/fabric/exchange.c @@ -0,0 +1,3457 @@ +/* + * exchange.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/
+ */
+
+#include "fabric/exchange.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/param.h> // For MAX() and MIN().
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+#include "shash.h"
+#include "socket.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/stats.h"
+#include "fabric/fabric.h"
+#include "fabric/hb.h"
+#include "fabric/partition_balance.h"
+#include "storage/storage.h"
+
+/*
+ * Overview
+ * ========
+ * Cluster data exchange state machine. For now, exchanges per-namespace
+ * partition versions after every cluster change.
+ *
+ * State transition diagram
+ * ========================
+ * The exchange state transition diagram responds to three events
+ * 1. Incoming message
+ * 2. Timer event
+ * 3. Clustering module's cluster change event.
+ *
+ * There are four states
+ * 1. Rest - the exchange is complete with all exchanged data committed.
+ * 2. Exchanging - the cluster has changed since the last commit and new data
+ * exchange is in progress.
+ * 3. Ready to commit - this node has sent its exchange data to all cluster
+ * members, received corresponding acks and also exchange data from all cluster
+ * members.
+ * 4. Orphaned - this node is an orphan. After a timeout blocks client
+ * transactions.
+ *
+ * Exchange starts by being in the orphaned state.
+ *
+ * Code organization
+ * =================
+ *
+ * There are different sections for each state. Each state has a dispatcher
+ * which delegates the event handling to a state specific function. All state
+ * is protected under a single lock.
+ */
+
+/*
+ * ----------------------------------------------------------------------------
+ * Constants
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Exchange protocol version information.
+ */
+#define AS_EXCHANGE_PROTOCOL_IDENTIFIER 1
+
+/**
+ * A soft limit for the maximum cluster size. Meant to optimize hash and list
+ * data structures, not to limit the number of nodes.
+ */
+#define AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT 200
+
+/**
+ * A soft limit for the maximum number of unique vinfo's in a namespace. Meant
+ * to optimize hash and list data structures, not to limit the number of
+ * vinfos processed.
+ */
+#define AS_EXCHANGE_UNIQUE_VINFO_MAX_SIZE_SOFT 200
+
+/**
+ * Average number of partitions for a version information. Used as initial
+ * allocation size for every unique vinfo, hence a smaller value.
+ */
+#define AS_EXCHANGE_VINFO_NUM_PIDS_AVG 1024
+
+/**
+ * Maximum event listeners.
+ */
+#define AS_EXTERNAL_EVENT_LISTENER_MAX 7
+
+/*
+ * ----------------------------------------------------------------------------
+ * Exchange data format for namespaces payload
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Partition data exchanged for each unique vinfo for a namespace.
+ */
+typedef struct as_exchange_vinfo_payload_s
+{
+	/**
+	 * The partition vinfo.
+	 */
+	as_partition_version vinfo;
+
+	/**
+	 * Count of partitions having this vinfo.
+	 */
+	uint32_t num_pids;
+
+	/**
+	 * Partitions having this vinfo.
+	 */
+	uint16_t pids[];
+}__attribute__((__packed__)) as_exchange_vinfo_payload;
+
+/**
+ * Information exchanged for a single namespace.
+ */
+typedef struct as_exchange_ns_vinfos_payload_s
+{
+	/**
+	 * Count of version infos.
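+	 *
+	 * As an illustrative sketch, a payload carrying two vinfos across three
+	 * partitions packs as:
+	 * [num_vinfos=2][vinfo A][num_pids=2][pid][pid][vinfo B][num_pids=1][pid]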
+ */ + uint32_t num_vinfos; + + /** + * Parition version information for each unique version. + */ + as_exchange_vinfo_payload vinfos[]; +}__attribute__((__packed__)) as_exchange_ns_vinfos_payload; + +/** + * Received data stored per node, per namespace, before actual commit. + */ +typedef struct as_exchange_node_namespace_data_s +{ + /** + * Mapped local namespace. + */ + as_namespace* local_namespace; + + /** + * Partition versions for this namespace. This field is reused across + * exchange rounds and may not be null even if the local namespace is null. + */ + as_exchange_ns_vinfos_payload* partition_versions; + + /** + * Sending node's rack id for this namespace. + */ + uint32_t rack_id; + + /** + * Sending node's roster generation for this namespace. + */ + uint32_t roster_generation; + + /** + * Sending node's roster count for this namespace. + */ + uint32_t roster_count; + + /** + * Sending node's roster for this namespace. + */ + cf_node* roster; + + /** + * Sending node's roster rack-ids for this namespace. + */ + cf_node* roster_rack_ids; + + /** + * Sender's eventual regime for this namespace. + */ + uint32_t eventual_regime; + + /** + * Sender's rebalance regime for this namespace. + */ + uint32_t rebalance_regime; +} as_exchange_node_namespace_data; + +/** + * Exchanged data for a single node. + */ +typedef struct as_exchange_node_data_s +{ + /** + * Number of sender's namespaces that have a matching local namespace. + */ + uint32_t num_namespaces; + + /** + * Data for sender's namespaces having a matching local namespace. + */ + as_exchange_node_namespace_data namespace_data[AS_NAMESPACE_SZ]; +} as_exchange_node_data; + +/* + * ---------------------------------------------------------------------------- + * Exchange internal data structures + * ---------------------------------------------------------------------------- + */ + +/** + * Exchange subsystem status. + */ +typedef enum +{ + AS_EXCHANGE_SYS_STATE_UNINITIALIZED, + AS_EXCHANGE_SYS_STATE_RUNNING, + AS_EXCHANGE_SYS_STATE_SHUTTING_DOWN, + AS_EXCHANGE_SYS_STATE_STOPPED +} as_exchange_sys_state; + +/** + * Exchange message types. + */ +typedef enum +{ + /** + * Exchange data for one node. + */ + AS_EXCHANGE_MSG_TYPE_DATA, + + /** + * Ack on receipt of exchanged data. + */ + AS_EXCHANGE_MSG_TYPE_DATA_ACK, + + /** + * Not used. + */ + AS_EXCHANGE_MSG_TYPE_DATA_NACK, + + /** + * The source is ready to commit exchanged information. + */ + AS_EXCHANGE_MSG_TYPE_READY_TO_COMMIT, + + /** + * Message from the principal asking all nodes to commit the exchanged + * information. + */ + AS_EXCHANGE_MSG_TYPE_COMMIT, + + /** + * Sentinel value for exchange message types. + */ + AS_EXCHANGE_MSG_TYPE_SENTINEL +} as_exchange_msg_type; + +/** + * Internal exchange event type. + */ +typedef enum +{ + /** + * Cluster change event. + */ + AS_EXCHANGE_EVENT_CLUSTER_CHANGE, + + /** + * Timer event. + */ + AS_EXCHANGE_EVENT_TIMER, + + /** + * Incoming message event. + */ + AS_EXCHANGE_EVENT_MSG, +} as_exchange_event_type; + +/** + * Internal exchange event. + */ +typedef struct as_exchange_event_s +{ + /** + * The type of the event. + */ + as_exchange_event_type type; + + /** + * Message for incoming message events. + */ + msg* msg; + + /** + * Source for incoming message events. + */ + cf_node msg_source; + + /** + * Clustering event instance for clustering events. + */ + as_clustering_event* clustering_event; +} as_exchange_event; + +/** + * Exchange subsystem state in the state transition diagram. 
+ */
+typedef enum as_exchange_state_s
+{
+	/**
+	 * Exchange subsystem is at rest, with all exchanged data synchronized and
+	 * committed.
+	 */
+	AS_EXCHANGE_STATE_REST,
+
+	/**
+	 * Data exchange is in progress.
+	 */
+	AS_EXCHANGE_STATE_EXCHANGING,
+
+	/**
+	 * Data exchange is complete and this node is ready to commit data.
+	 */
+	AS_EXCHANGE_STATE_READY_TO_COMMIT,
+
+	/**
+	 * Self node is orphaned.
+	 */
+	AS_EXCHANGE_STATE_ORPHANED
+} as_exchange_state;
+
+/**
+ * State for a single node in the succession list.
+ */
+typedef struct as_exchange_node_state_s
+{
+	/**
+	 * Indicates if the peer node has acknowledged the send from self.
+	 */
+	bool send_acked;
+
+	/**
+	 * Indicates if self node has received data from this peer.
+	 */
+	bool received;
+
+	/**
+	 * Indicates if this peer node is ready to commit. Only relevant and used
+	 * by the current principal.
+	 */
+	bool is_ready_to_commit;
+
+	/**
+	 * Exchange data received from this peer node. Member variables may be heap
+	 * allocated and hence should be freed carefully while discarding this
+	 * structure instance.
+	 */
+	as_exchange_node_data* data;
+} as_exchange_node_state;
+
+/**
+ * State maintained by the exchange subsystem.
+ */
+typedef struct as_exchange_s
+{
+	/**
+	 * Exchange subsystem status.
+	 */
+	as_exchange_sys_state sys_state;
+
+	/**
+	 * Exchange state in the state transition diagram.
+	 */
+	as_exchange_state state;
+
+	/**
+	 * Time when this node's exchange data was sent out.
+	 */
+	cf_clock send_ts;
+
+	/**
+	 * Time when this node's ready to commit was sent out.
+	 */
+	cf_clock ready_to_commit_send_ts;
+
+	/**
+	 * Thread id of the timer event generator.
+	 */
+	pthread_t timer_tid;
+
+	/**
+	 * Nodes that are not yet ready to commit.
+	 */
+	cf_vector ready_to_commit_pending_nodes;
+
+	/**
+	 * Current cluster key.
+	 */
+	as_cluster_key cluster_key;
+
+	/**
+	 * Cluster size - size of the succession list.
+	 */
+	uint32_t cluster_size;
+
+	/**
+	 * Exchange's copy of the succession list.
+	 */
+	cf_vector succession_list;
+
+	/**
+	 * The principal node in current succession list. Always the first node.
+	 */
+	cf_node principal;
+
+	/**
+	 * Last committed cluster key.
+	 */
+	as_cluster_key committed_cluster_key;
+
+	/**
+	 * Last committed cluster size - size of the succession list.
+	 */
+	uint32_t committed_cluster_size;
+
+	/**
+	 * Last committed exchange's succession list.
+	 */
+	cf_vector committed_succession_list;
+
+	/**
+	 * The principal node in the committed succession list. Always the first
+	 * node.
+	 */
+	cf_node committed_principal;
+
+	/**
+	 * The time this node entered orphan state.
+	 */
+	cf_clock orphan_state_start_time;
+
+	/**
+	 * Indicates if transactions have already been blocked in the orphan state.
+	 */
+	bool orphan_state_are_transactions_blocked;
+
+	/**
+	 * Will have an as_exchange_node_state entry for every node in the
+	 * succession list.
+	 */
+	cf_shash* nodeid_to_node_state;
+
+	/**
+	 * This node's data payload for current round.
+	 */
+	cf_dyn_buf self_data_dyn_buf[AS_NAMESPACE_SZ];
+} as_exchange;
+
+/**
+ * Internal storage for external event listeners.
+ */
+typedef struct as_exchange_event_listener_s
+{
+	/**
+	 * The listener's callback function.
+	 */
+	as_exchange_cluster_changed_cb event_callback;
+
+	/**
+	 * The listener's user data object, passed back as-is to the callback
+	 * function.
+	 */
+	void* udata;
+} as_exchange_event_listener;
+
+/**
+ * External event publisher state.
+ */
+typedef struct as_exchange_external_event_publisher_s
+{
+	/**
+	 * State of the external event publisher.
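+	 *
+	 * (Events are handed off to a dedicated publisher thread, presumably so
+	 * listener callbacks may block without stalling the exchange state
+	 * machine.)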
+	 */
+	as_exchange_sys_state sys_state;
+
+	/**
+	 * Indicates if there is an event to publish.
+	 */
+	bool event_queued;
+
+	/**
+	 * The pending event to publish.
+	 */
+	as_exchange_cluster_changed_event to_publish;
+
+	/**
+	 * The static succession list published with the message.
+	 */
+	cf_vector published_succession_list;
+
+	/**
+	 * Conditional variable to signal a pending event.
+	 */
+	pthread_cond_t is_pending;
+
+	/**
+	 * Thread id of the publisher thread.
+	 */
+	pthread_t event_publisher_tid;
+
+	/**
+	 * Mutex to protect the conditional variable.
+	 */
+	pthread_mutex_t is_pending_mutex;
+
+	/**
+	 * External event listeners.
+	 */
+	as_exchange_event_listener event_listeners[AS_EXTERNAL_EVENT_LISTENER_MAX];
+
+	/**
+	 * Event listener count.
+	 */
+	uint32_t event_listener_count;
+} as_exchange_external_event_publisher;
+
+
+/*
+ * ----------------------------------------------------------------------------
+ * Externs
+ * ----------------------------------------------------------------------------
+ */
+void
+as_skew_monitor_update();
+
+/*
+ * ----------------------------------------------------------------------------
+ * Globals
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Singleton exchange state all initialized to zero.
+ */
+static as_exchange g_exchange = { 0 };
+
+/**
+ * The fields in the exchange message. The order of fields should never change,
+ * nor should new fields be inserted in between.
+ */
+typedef enum
+{
+	AS_EXCHANGE_MSG_ID,
+	AS_EXCHANGE_MSG_TYPE,
+	AS_EXCHANGE_MSG_CLUSTER_KEY,
+	AS_EXCHANGE_MSG_NAMESPACES,
+	AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS,
+	AS_EXCHANGE_MSG_NS_RACK_IDS,
+	AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS,
+	AS_EXCHANGE_MSG_NS_ROSTERS,
+	AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS,
+	AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES,
+	AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES,
+
+	NUM_EXCHANGE_MSG_FIELDS
+} as_exchange_msg_fields;
+
+/**
+ * Exchange message template.
+ */
+static const msg_template exchange_msg_template[] = {
+		{ AS_EXCHANGE_MSG_ID, M_FT_UINT32 },
+		{ AS_EXCHANGE_MSG_TYPE, M_FT_UINT32 },
+		{ AS_EXCHANGE_MSG_CLUSTER_KEY, M_FT_UINT64 },
+		{ AS_EXCHANGE_MSG_NAMESPACES, M_FT_MSGPACK },
+		{ AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS, M_FT_MSGPACK },
+		{ AS_EXCHANGE_MSG_NS_RACK_IDS, M_FT_MSGPACK },
+		{ AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS, M_FT_MSGPACK },
+		{ AS_EXCHANGE_MSG_NS_ROSTERS, M_FT_MSGPACK },
+		{ AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS, M_FT_MSGPACK },
+		{ AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES, M_FT_MSGPACK },
+		{ AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES, M_FT_MSGPACK }
+};
+
+COMPILER_ASSERT(sizeof(exchange_msg_template) / sizeof(msg_template) ==
+		NUM_EXCHANGE_MSG_FIELDS);
+
+/**
+ * Global lock to set or get exchanged info from other threads.
+ */
+pthread_mutex_t g_exchanged_info_lock = PTHREAD_MUTEX_INITIALIZER;
+
+/**
+ * Global lock to serialize all reads and writes to the exchange state.
+ */
+pthread_mutex_t g_exchange_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+
+/**
+ * Singleton external events publisher.
+ */
+static as_exchange_external_event_publisher g_external_event_publisher;
+
+/**
+ * The fat lock for all clustering event listener changes.
+ */
+static pthread_mutex_t g_external_event_publisher_lock =
+		PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+
+/**
+ * Acquire a lock on the event publisher.
+ */ +#define EXTERNAL_EVENT_PUBLISHER_LOCK() \ +({ \ + pthread_mutex_lock (&g_external_event_publisher_lock); \ + LOCK_DEBUG("publisher locked in %s", __FUNCTION__); \ +}) + +/** + * Relinquish the lock on the external event publisher. + */ +#define EXTERNAL_EVENT_PUBLISHER_UNLOCK() \ +({ \ + pthread_mutex_unlock (&g_external_event_publisher_lock); \ + LOCK_DEBUG("publisher unLocked in %s", __FUNCTION__); \ +}) + +/* + * ---------------------------------------------------------------------------- + * Logging macros. + * ---------------------------------------------------------------------------- + */ + +/** + * Used to limit potentially long log lines. Includes space for NULL terminator. + */ +#define LOG_LENGTH_MAX() (800) +#define CRASH(format, ...) cf_crash(AS_EXCHANGE, format, ##__VA_ARGS__) +#define WARNING(format, ...) cf_warning(AS_EXCHANGE, format, ##__VA_ARGS__) +#define INFO(format, ...) cf_info(AS_EXCHANGE, format, ##__VA_ARGS__) +#define DEBUG(format, ...) cf_debug(AS_EXCHANGE, format, ##__VA_ARGS__) +#define DETAIL(format, ...) cf_detail(AS_EXCHANGE, format, ##__VA_ARGS__) +#define LOG(severity, format, ...) \ +({ \ + switch (severity) { \ + case CF_CRITICAL: \ + CRASH(format, ##__VA_ARGS__); \ + break; \ + case CF_WARNING: \ + WARNING(format, ##__VA_ARGS__); \ + break; \ + case CF_INFO: \ + INFO(format, ##__VA_ARGS__); \ + break; \ + case CF_DEBUG: \ + DEBUG(format, ##__VA_ARGS__); \ + break; \ + case CF_DETAIL: \ + DETAIL(format, ##__VA_ARGS__); \ + break; \ + default: \ + break; \ + } \ +}) + +/** + * Size of the (per-namespace) self payload dynamic buffer. + */ +#define AS_EXCHANGE_SELF_DYN_BUF_SIZE() (AS_EXCHANGE_UNIQUE_VINFO_MAX_SIZE_SOFT \ + * ((AS_EXCHANGE_VINFO_NUM_PIDS_AVG * sizeof(uint16_t)) \ + + sizeof(as_partition_version))) + +/** + * Scratch size for exchange messages. + * TODO: Compute this properly. + */ +#define AS_EXCHANGE_MSG_SCRATCH_SIZE 2048 + +#ifdef LOCK_DEBUG_ENABLED +#define LOCK_DEBUG(format, ...) DEBUG(format, ##__VA_ARGS__) +#else +#define LOCK_DEBUG(format, ...) +#endif + +/** + * Acquire a lock on the exchange subsystem. + */ +#define EXCHANGE_LOCK() \ +({ \ + pthread_mutex_lock (&g_exchange_lock); \ + LOCK_DEBUG("locked in %s", __FUNCTION__); \ +}) + +/** + * Relinquish the lock on the exchange subsystem. + */ +#define EXCHANGE_UNLOCK() \ +({ \ + pthread_mutex_unlock (&g_exchange_lock); \ + LOCK_DEBUG("unLocked in %s", __FUNCTION__); \ +}) + +/** + * Timer event generation interval. + */ +#define EXCHANGE_TIMER_TICK_INTERVAL() (75) + +/** + * Minimum timeout interval for sent exchange data. + */ +#define EXCHANGE_SEND_MIN_TIMEOUT() (MAX(75, as_hb_tx_interval_get() / 2)) + +/** + * Maximum timeout interval for sent exchange data. + */ +#define EXCHANGE_SEND_MAX_TIMEOUT() (30000) + +/** + * Timeout for receiving commit message after transitioning to ready to commit. + */ +#define EXCHANGE_READY_TO_COMMIT_TIMEOUT() (EXCHANGE_SEND_MIN_TIMEOUT()) + +/** + * Send timeout is a step function with this value as the interval for each + * step. + */ +#define EXCHANGE_SEND_STEP_INTERVAL() \ +(MAX(EXCHANGE_SEND_MIN_TIMEOUT(), as_hb_tx_interval_get())) + +/** + * Check if exchange is initialized. + */ +#define EXCHANGE_IS_INITIALIZED() \ +({ \ + EXCHANGE_LOCK(); \ + bool initialized = (g_exchange.sys_state \ + != AS_EXCHANGE_SYS_STATE_UNINITIALIZED); \ + EXCHANGE_UNLOCK(); \ + initialized; \ +}) + +/** + * * Check if exchange is running. 
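+ *
+ * (Note that these macros rely on the GCC statement-expression extension,
+ * ({ ... }), so each one expands to an expression yielding the value of its
+ * last statement.)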
+ */
+#define EXCHANGE_IS_RUNNING() \
+({ \
+	EXCHANGE_LOCK(); \
+	bool running = (EXCHANGE_IS_INITIALIZED() \
+			&& g_exchange.sys_state == AS_EXCHANGE_SYS_STATE_RUNNING); \
+	EXCHANGE_UNLOCK(); \
+	running; \
+})
+
+/**
+ * Create temporary stack variables.
+ */
+#define TOKEN_PASTE(x, y) x##y
+#define STACK_VAR(x, y) TOKEN_PASTE(x, y)
+
+/**
+ * Convert a vector to a stack allocated array.
+ */
+#define cf_vector_to_stack_array(vector_p, nodes_array_p, num_nodes_p) \
+({ \
+	*num_nodes_p = cf_vector_size(vector_p); \
+	if (*num_nodes_p > 0) { \
+		*nodes_array_p = alloca(sizeof(cf_node) * (*num_nodes_p)); \
+		for (int i = 0; i < *num_nodes_p; i++) { \
+			cf_vector_get(vector_p, i, &(*nodes_array_p)[i]); \
+		} \
+	} \
+	else { \
+		*nodes_array_p = NULL; \
+	} \
+})
+
+/**
+ * Create and initialize a lockless stack-allocated vector, initially sized to
+ * hold a cluster's worth of nodes.
+ */
+#define cf_vector_stack_create(value_type) \
+({ \
+	cf_vector * STACK_VAR(vector, __LINE__) = (cf_vector*)alloca( \
+			sizeof(cf_vector)); \
+	size_t buffer_size = AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT \
+			* sizeof(value_type); \
+	void* STACK_VAR(buff, __LINE__) = alloca(buffer_size); \
+	cf_vector_init_smalloc( \
+			STACK_VAR(vector, __LINE__), sizeof(value_type), \
+			(uint8_t*)STACK_VAR(buff, __LINE__), buffer_size, \
+			VECTOR_FLAG_INITZERO); \
+	STACK_VAR(vector, __LINE__); \
+})
+
+/*
+ * ----------------------------------------------------------------------------
+ * Vector functions to be moved to cf_vector
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Convert a vector to an array.
+ * FIXME: return pointer to the internal vector storage.
+ */
+static cf_node*
+vector_to_array(cf_vector* vector)
+{
+	return (cf_node*)vector->vector;
+}
+
+/**
+ * Clear / delete all entries in a vector.
+ */
+static void
+vector_clear(cf_vector* vector)
+{
+	cf_vector_delete_range(vector, 0, cf_vector_size(vector));
+}
+
+/**
+ * Find the index of an element in the vector. Equality is based on memory
+ * comparison.
+ *
+ * @param vector the source vector.
+ * @param element the element to find.
+ * @return the index if the element is found, -1 otherwise.
+ */
+static int
+vector_find(cf_vector* vector, const void* element)
+{
+	int element_count = cf_vector_size(vector);
+	size_t value_len = VECTOR_ELEM_SZ(vector);
+	for (int i = 0; i < element_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		void* src_element = cf_vector_getp(vector, i);
+		if (src_element) {
+			if (memcmp(element, src_element, value_len) == 0) {
+				return i;
+			}
+		}
+	}
+	return -1;
+}
+
+/**
+ * Copy all elements from the source vector to the destination vector. Assumes
+ * the source and destination vectors are not being modified while the copy
+ * operation is in progress.
+ *
+ * @param dest the destination vector.
+ * @param src the source vector.
+ * @return the number of elements copied.
+ */
+static int
+vector_copy(cf_vector* dest, cf_vector* src)
+{
+	int element_count = cf_vector_size(src);
+	int copied_count = 0;
+	for (int i = 0; i < element_count; i++) {
+		// No null check required since we are iterating under a lock and
+		// within vector bounds.
+		void* src_element = cf_vector_getp(src, i);
+		if (src_element) {
+			cf_vector_append(dest, src_element);
+			copied_count++;
+		}
+	}
+	return copied_count;
+}
+
+/**
+ * Generate a hash code for a blob using the Jenkins one-at-a-time hash
+ * function.
+ */
+static uint32_t
+exchange_blob_hash(const uint8_t* value, size_t value_size)
+{
+	uint32_t hash = 0;
+	for (int i = 0; i < value_size; ++i) {
+		hash += value[i];
+		hash += (hash << 10);
+		hash ^= (hash >> 6);
+	}
+	hash += (hash << 3);
+	hash ^= (hash >> 11);
+	hash += (hash << 15);
+
+	return hash;
+}
+
+/**
+ * Generate a hash code for a vinfo (partition version) key.
+ */
+static uint32_t
+exchange_vinfo_shash(const void* value)
+{
+	return exchange_blob_hash((const uint8_t*)value,
+			sizeof(as_partition_version));
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Clustering external event publisher
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Check if event publisher is running.
+ */
+static bool
+exchange_external_event_publisher_is_running()
+{
+	EXTERNAL_EVENT_PUBLISHER_LOCK();
+	bool running = g_external_event_publisher.sys_state
+			== AS_EXCHANGE_SYS_STATE_RUNNING;
+	EXTERNAL_EVENT_PUBLISHER_UNLOCK();
+	return running;
+}
+
+/**
+ * Initialize the event publisher.
+ */
+static void
+exchange_external_event_publisher_init()
+{
+	EXTERNAL_EVENT_PUBLISHER_LOCK();
+	memset(&g_external_event_publisher, 0, sizeof(g_external_event_publisher));
+	cf_vector_init(&g_external_event_publisher.published_succession_list,
+			sizeof(cf_node),
+			AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO);
+
+	pthread_mutex_init(&g_external_event_publisher.is_pending_mutex, NULL);
+	pthread_cond_init(&g_external_event_publisher.is_pending, NULL);
+	EXTERNAL_EVENT_PUBLISHER_UNLOCK();
+}
+
+/**
+ * Register a clustering event listener.
+ */
+static void
+exchange_external_event_listener_register(
+		as_exchange_cluster_changed_cb event_callback, void* udata)
+{
+	EXTERNAL_EVENT_PUBLISHER_LOCK();
+
+	if (g_external_event_publisher.event_listener_count
+			>= AS_EXTERNAL_EVENT_LISTENER_MAX) {
+		CRASH("cannot register more than %d event listeners",
+				AS_EXTERNAL_EVENT_LISTENER_MAX);
+	}
+
+	g_external_event_publisher.event_listeners[g_external_event_publisher.event_listener_count].event_callback =
+			event_callback;
+	g_external_event_publisher.event_listeners[g_external_event_publisher.event_listener_count].udata =
+			udata;
+	g_external_event_publisher.event_listener_count++;
+
+	EXTERNAL_EVENT_PUBLISHER_UNLOCK();
+}
+
+/**
+ * Wake up the publisher thread.
+ */
+static void
+exchange_external_event_publisher_thr_wakeup()
+{
+	pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex);
+	pthread_cond_signal(&g_external_event_publisher.is_pending);
+	pthread_mutex_unlock(&g_external_event_publisher.is_pending_mutex);
+}
+
+/**
+ * Queue up an external event to publish.
+ */
+static void
+exchange_external_event_queue(as_exchange_cluster_changed_event* event)
+{
+	EXTERNAL_EVENT_PUBLISHER_LOCK();
+	memcpy(&g_external_event_publisher.to_publish, event,
+			sizeof(g_external_event_publisher.to_publish));
+
+	vector_clear(&g_external_event_publisher.published_succession_list);
+	if (event->succession) {
+		// Use the static list for the published event, so that the input event
+		// object can be destroyed irrespective of when it is published.
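+		// (In other words, to_publish.succession only ever points at the
+		// publisher's own stable vector storage, never at caller memory.)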
+		for (int i = 0; i < event->cluster_size; i++) {
+			cf_vector_append(
+					&g_external_event_publisher.published_succession_list,
+					&event->succession[i]);
+		}
+		g_external_event_publisher.to_publish.succession = vector_to_array(
+				&g_external_event_publisher.published_succession_list);
+	}
+	else {
+		g_external_event_publisher.to_publish.succession = NULL;
+	}
+
+	g_external_event_publisher.event_queued = true;
+
+	EXTERNAL_EVENT_PUBLISHER_UNLOCK();
+
+	// Wake up the publisher thread.
+	exchange_external_event_publisher_thr_wakeup();
+}
+
+/**
+ * Publish external events if any are pending.
+ */
+static void
+exchange_external_events_publish()
+{
+	EXTERNAL_EVENT_PUBLISHER_LOCK();
+
+	if (g_external_event_publisher.event_queued) {
+		g_external_event_publisher.event_queued = false;
+		for (uint32_t i = 0;
+				i < g_external_event_publisher.event_listener_count; i++) {
+			(g_external_event_publisher.event_listeners[i].event_callback)(
+					&g_external_event_publisher.to_publish,
+					g_external_event_publisher.event_listeners[i].udata);
+		}
+	}
+	EXTERNAL_EVENT_PUBLISHER_UNLOCK();
+}
+
+/**
+ * External event publisher thread.
+ */
+static void*
+exchange_external_event_publisher_thr(void* arg)
+{
+	pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex);
+
+	while (true) {
+		pthread_cond_wait(&g_external_event_publisher.is_pending,
+				&g_external_event_publisher.is_pending_mutex);
+		if (exchange_external_event_publisher_is_running()) {
+			exchange_external_events_publish();
+		}
+		else {
+			// Publisher stopped, exit the thread.
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * Start the event publisher.
+ */
+static void
+exchange_external_event_publisher_start()
+{
+	EXTERNAL_EVENT_PUBLISHER_LOCK();
+	g_external_event_publisher.sys_state = AS_EXCHANGE_SYS_STATE_RUNNING;
+
+	// Start the event publishing thread.
+	if (pthread_create(&g_external_event_publisher.event_publisher_tid, 0,
+			exchange_external_event_publisher_thr, NULL) != 0) {
+		CRASH("could not create event publishing thread: %s",
+				cf_strerror(errno));
+	}
+	EXTERNAL_EVENT_PUBLISHER_UNLOCK();
+}
+
+/**
+ * Stop the event publisher.
+ */
+static void
+external_event_publisher_stop()
+{
+	EXTERNAL_EVENT_PUBLISHER_LOCK();
+	g_external_event_publisher.sys_state = AS_EXCHANGE_SYS_STATE_SHUTTING_DOWN;
+	EXTERNAL_EVENT_PUBLISHER_UNLOCK();
+
+	exchange_external_event_publisher_thr_wakeup();
+	pthread_join(g_external_event_publisher.event_publisher_tid, NULL);
+
+	EXTERNAL_EVENT_PUBLISHER_LOCK();
+	g_external_event_publisher.sys_state = AS_EXCHANGE_SYS_STATE_STOPPED;
+	g_external_event_publisher.event_queued = false;
+	EXTERNAL_EVENT_PUBLISHER_UNLOCK();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Node state related
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Initialize node state.
+ */
+static void
+exchange_node_state_init(as_exchange_node_state* node_state)
+{
+	memset(node_state, 0, sizeof(*node_state));
+
+	node_state->data = cf_calloc(1, sizeof(as_exchange_node_data));
+}
+
+/**
+ * Reset node state.
+ */
+static void
+exchange_node_state_reset(as_exchange_node_state* node_state)
+{
+	node_state->send_acked = false;
+	node_state->received = false;
+	node_state->is_ready_to_commit = false;
+
+	node_state->data->num_namespaces = 0;
+	for (int i = 0; i < AS_NAMESPACE_SZ; i++) {
+		node_state->data->namespace_data[i].local_namespace = NULL;
+	}
+}
+
+/**
+ * Destroy node state.
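+ *
+ * (Frees the heap-allocated members called out in as_exchange_node_state:
+ * partition_versions, roster and roster_rack_ids, then the data struct
+ * itself.)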
+ */ +static void +exchange_node_state_destroy(as_exchange_node_state* node_state) +{ + for (int i = 0; i < AS_NAMESPACE_SZ; i++) { + if (node_state->data->namespace_data[i].partition_versions) { + cf_free(node_state->data->namespace_data[i].partition_versions); + } + + if (node_state->data->namespace_data[i].roster) { + cf_free(node_state->data->namespace_data[i].roster); + } + + if (node_state->data->namespace_data[i].roster_rack_ids) { + cf_free(node_state->data->namespace_data[i].roster_rack_ids); + } + } + + cf_free(node_state->data); +} + +/** + * Reduce function to match node -> node state hash to the succession list. + * Should always be invoked under a lock over the main hash. + */ +static int +exchange_node_states_reset_reduce(const void* key, void* data, void* udata) +{ + const cf_node* node = (const cf_node*)key; + as_exchange_node_state* node_state = (as_exchange_node_state*)data; + + int node_index = vector_find(&g_exchange.succession_list, node); + if (node_index < 0) { + // Node not in succession list + exchange_node_state_destroy(node_state); + return CF_SHASH_REDUCE_DELETE; + } + + exchange_node_state_reset(node_state); + return CF_SHASH_OK; +} + +/** + * Adjust the nodeid_to_node_state hash to have an entry for every node in the + * succession list with state reset for a new round of exchange. Removes entries + * not in the succession list. + */ +static void +exchange_node_states_reset() +{ + EXCHANGE_LOCK(); + + // Fix existing entries by reseting entries in succession and removing + // entries not in succession list. + cf_shash_reduce(g_exchange.nodeid_to_node_state, + exchange_node_states_reset_reduce, NULL); + + // Add missing entries. + int succession_length = cf_vector_size(&g_exchange.succession_list); + + as_exchange_node_state temp_state; + for (int i = 0; i < succession_length; i++) { + cf_node nodeid; + + cf_vector_get(&g_exchange.succession_list, i, &nodeid); + if (cf_shash_get(g_exchange.nodeid_to_node_state, &nodeid, &temp_state) + == CF_SHASH_ERR_NOT_FOUND) { + exchange_node_state_init(&temp_state); + + cf_shash_put(g_exchange.nodeid_to_node_state, &nodeid, &temp_state); + } + } + + EXCHANGE_UNLOCK(); +} + +/** + * Reduce function to find nodes that had not acked self node's exchange data. + */ +static int +exchange_nodes_find_send_unacked_reduce(const void* key, void* data, + void* udata) +{ + const cf_node* node = (const cf_node*)key; + as_exchange_node_state* node_state = (as_exchange_node_state*)data; + cf_vector* unacked = (cf_vector*)udata; + + if (!node_state->send_acked) { + cf_vector_append(unacked, node); + } + return CF_SHASH_OK; +} + +/** + * Find nodes that have not acked self node's exchange data. + */ +static void +exchange_nodes_find_send_unacked(cf_vector* unacked) +{ + cf_shash_reduce(g_exchange.nodeid_to_node_state, + exchange_nodes_find_send_unacked_reduce, unacked); +} + +/** + * Reduce function to find peer nodes from whom self node has not received + * exchange data. + */ +static int +exchange_nodes_find_not_received_reduce(const void* key, void* data, + void* udata) +{ + const cf_node* node = (const cf_node*)key; + as_exchange_node_state* node_state = (as_exchange_node_state*)data; + cf_vector* not_received = (cf_vector*)udata; + + if (!node_state->received) { + cf_vector_append(not_received, node); + } + return CF_SHASH_OK; +} + +/** + * Find peer nodes from whom self node has not received exchange data. 
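+ *
+ * A minimal usage sketch (the pattern used by exchange_dump below):
+ *
+ *   cf_vector* pending = cf_vector_stack_create(cf_node);
+ *   exchange_nodes_find_not_received(pending);
+ *   // ... inspect or log 'pending' ...
+ *   cf_vector_destroy(pending);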
+ */ +static void +exchange_nodes_find_not_received(cf_vector* not_received) +{ + cf_shash_reduce(g_exchange.nodeid_to_node_state, + exchange_nodes_find_not_received_reduce, not_received); +} + +/** + * Reduce function to find peer nodes that are not ready to commit. + */ +static int +exchange_nodes_find_not_ready_to_commit_reduce(const void* key, void* data, + void* udata) +{ + const cf_node* node = (const cf_node*)key; + as_exchange_node_state* node_state = (as_exchange_node_state*)data; + cf_vector* not_ready_to_commit = (cf_vector*)udata; + + if (!node_state->is_ready_to_commit) { + cf_vector_append(not_ready_to_commit, node); + } + return CF_SHASH_OK; +} + +/** + * Find peer nodes that are not ready to commit. + */ +static void +exchange_nodes_find_not_ready_to_commit(cf_vector* not_ready_to_commit) +{ + cf_shash_reduce(g_exchange.nodeid_to_node_state, + exchange_nodes_find_not_ready_to_commit_reduce, + not_ready_to_commit); +} + +/** + * Update the node state for a node. + */ +static void +exchange_node_state_update(cf_node nodeid, as_exchange_node_state* node_state) +{ + cf_shash_put(g_exchange.nodeid_to_node_state, &nodeid, node_state); +} + +/** + * Get state of a node from the hash. If not found crash because this entry + * should be present in the hash. + */ +static void +exchange_node_state_get_safe(cf_node nodeid, as_exchange_node_state* node_state) +{ + if (cf_shash_get(g_exchange.nodeid_to_node_state, &nodeid, node_state) + == CF_SHASH_ERR_NOT_FOUND) { + CRASH( + "node entry for node %"PRIx64" missing from node state hash", nodeid); + } +} + +/* + * ---------------------------------------------------------------------------- + * Message related + * ---------------------------------------------------------------------------- + */ + +/** + * Fill compulsary fields in a message common to all message types. + */ +static void +exchange_msg_src_fill(msg* msg, as_exchange_msg_type type) +{ + EXCHANGE_LOCK(); + msg_set_uint32(msg, AS_EXCHANGE_MSG_ID, AS_EXCHANGE_PROTOCOL_IDENTIFIER); + msg_set_uint64(msg, AS_EXCHANGE_MSG_CLUSTER_KEY, g_exchange.cluster_key); + msg_set_uint32(msg, AS_EXCHANGE_MSG_TYPE, type); + EXCHANGE_UNLOCK(); +} + +/** + * Get the msg buffer from a pool and fill in all compulsory fields. + * @return the msg buff with compulsory fields filled in. + */ +static msg* +exchange_msg_get(as_exchange_msg_type type) +{ + msg* msg = as_fabric_msg_get(M_TYPE_EXCHANGE); + exchange_msg_src_fill(msg, type); + return msg; +} + +/** + * Return the message buffer back to the pool. + */ +static void +exchange_msg_return(msg* msg) +{ + as_fabric_msg_put(msg); +} + +/** + * Get message id. + */ +static int +exchange_msg_id_get(msg* msg, uint32_t* msg_id) +{ + if (msg_get_uint32(msg, AS_EXCHANGE_MSG_ID, msg_id) != 0) { + return -1; + } + return 0; +} + +/** + * Get message type. + */ +static int +exchange_msg_type_get(msg* msg, as_exchange_msg_type* msg_type) +{ + if (msg_get_uint32(msg, AS_EXCHANGE_MSG_TYPE, msg_type) != 0) { + return -1; + } + return 0; +} + +/** + * Get message cluster key. + */ +static int +exchange_msg_cluster_key_get(msg* msg, as_cluster_key* cluster_key) +{ + if (msg_get_uint64(msg, AS_EXCHANGE_MSG_CLUSTER_KEY, cluster_key) != 0) { + return -1; + } + return 0; +} + +/** + * Set data payload for a message. 
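+ *
+ * Always sets the namespace names, per-namespace partition-version payloads
+ * and rack-ids; roster fields are set only when some namespace has a non-zero
+ * roster generation, and regime fields only when some namespace has a
+ * non-zero regime.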
+ */
+static void
+exchange_msg_data_payload_set(msg* msg)
+{
+	uint32_t ns_count = g_config.n_namespaces;
+
+	cf_vector_define(namespace_list, sizeof(msg_buf_ele), ns_count, 0);
+	cf_vector_define(partition_versions, sizeof(msg_buf_ele), ns_count, 0);
+	uint32_t rack_ids[ns_count];
+
+	bool have_roster = false;
+	bool have_roster_rack_ids = false;
+	uint32_t roster_generations[ns_count];
+	cf_vector_define(rosters, sizeof(msg_buf_ele), ns_count, 0);
+	cf_vector_define(rosters_rack_ids, sizeof(msg_buf_ele), ns_count, 0);
+
+	bool have_regimes = false;
+	uint32_t eventual_regimes[ns_count];
+	uint32_t rebalance_regimes[ns_count];
+
+	pthread_mutex_lock(&g_exchanged_info_lock);
+
+	for (uint32_t ns_ix = 0; ns_ix < ns_count; ns_ix++) {
+		as_namespace* ns = g_config.namespaces[ns_ix];
+
+		msg_buf_ele ns_ele = {
+				.sz = (uint32_t)strlen(ns->name),
+				.ptr = (uint8_t*)ns->name
+		};
+
+		msg_buf_ele pv_ele = {
+				.sz = (uint32_t)g_exchange.self_data_dyn_buf[ns_ix].used_sz,
+				.ptr = g_exchange.self_data_dyn_buf[ns_ix].buf
+		};
+
+		msg_buf_ele rn_ele = {
+				.sz = (uint32_t)(ns->smd_roster_count * sizeof(cf_node)),
+				.ptr = (uint8_t*)ns->smd_roster
+		};
+
+		msg_buf_ele rri_ele = {
+				.sz = (uint32_t)(ns->smd_roster_count * sizeof(uint32_t)),
+				.ptr = (uint8_t*)ns->smd_roster_rack_ids
+		};
+
+		cf_vector_append(&namespace_list, &ns_ele);
+		cf_vector_append(&partition_versions, &pv_ele);
+		rack_ids[ns_ix] = ns->rack_id;
+
+		if (ns->smd_roster_generation != 0) {
+			have_roster = true;
+
+			if (! have_roster_rack_ids) {
+				for (uint32_t n = 0; n < ns->smd_roster_count; n++) {
+					if (ns->smd_roster_rack_ids[n] != 0) {
+						have_roster_rack_ids = true;
+						break;
+					}
+				}
+			}
+		}
+
+		roster_generations[ns_ix] = ns->smd_roster_generation;
+		cf_vector_append(&rosters, &rn_ele);
+		cf_vector_append(&rosters_rack_ids, &rri_ele);
+
+		eventual_regimes[ns_ix] = ns->eventual_regime;
+		rebalance_regimes[ns_ix] = ns->rebalance_regime;
+
+		if (eventual_regimes[ns_ix] != 0 || rebalance_regimes[ns_ix] != 0) {
+			have_regimes = true;
+		}
+	}
+
+	msg_msgpack_list_set_buf(msg, AS_EXCHANGE_MSG_NAMESPACES, &namespace_list);
+	msg_msgpack_list_set_buf(msg, AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS,
+			&partition_versions);
+	msg_msgpack_list_set_uint32(msg, AS_EXCHANGE_MSG_NS_RACK_IDS, rack_ids,
+			ns_count);
+
+	if (have_roster) {
+		msg_msgpack_list_set_uint32(msg, AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS,
+				roster_generations, ns_count);
+		msg_msgpack_list_set_buf(msg, AS_EXCHANGE_MSG_NS_ROSTERS, &rosters);
+
+		if (have_roster_rack_ids) {
+			msg_msgpack_list_set_buf(msg, AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS,
+					&rosters_rack_ids);
+		}
+	}
+
+	if (have_regimes) {
+		msg_msgpack_list_set_uint32(msg, AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES,
+				eventual_regimes, ns_count);
+		msg_msgpack_list_set_uint32(msg, AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES,
+				rebalance_regimes, ns_count);
+	}
+
+	pthread_mutex_unlock(&g_exchanged_info_lock);
+}
+
+/**
+ * Check sanity of an incoming message. If this check passes, the message is
+ * guaranteed to have a valid protocol identifier, a valid type and a matching
+ * cluster key, with the source node being a part of the cluster.
+ * @return true if the message is valid, false if the message is invalid and
+ * should be ignored.
+static bool
+exchange_msg_is_sane(cf_node source, msg* msg)
+{
+	uint32_t id = 0;
+	if (exchange_msg_id_get(msg, &id) != 0 ||
+			id != AS_EXCHANGE_PROTOCOL_IDENTIFIER) {
+		DEBUG(
+				"received exchange message with mismatching identifier - expected %u but was %u",
+				AS_EXCHANGE_PROTOCOL_IDENTIFIER, id);
+		return false;
+	}
+
+	as_exchange_msg_type msg_type = 0;
+
+	if (exchange_msg_type_get(msg, &msg_type) != 0
+			|| msg_type >= AS_EXCHANGE_MSG_TYPE_SENTINEL) {
+		WARNING("received exchange message with invalid message type %u",
+				msg_type);
+		return false;
+	}
+
+	EXCHANGE_LOCK();
+	as_cluster_key current_cluster_key = g_exchange.cluster_key;
+	bool is_in_cluster = vector_find(&g_exchange.succession_list, &source) >= 0;
+	EXCHANGE_UNLOCK();
+
+	if (!is_in_cluster) {
+		DEBUG("received exchange message from node %"PRIx64" not in cluster",
+				source);
+		return false;
+	}
+
+	as_cluster_key incoming_cluster_key = 0;
+	if (exchange_msg_cluster_key_get(msg, &incoming_cluster_key) != 0
+			|| (current_cluster_key != incoming_cluster_key)
+			|| current_cluster_key == 0) {
+		DEBUG("received exchange message with mismatching cluster key - expected %"PRIx64" but was %"PRIx64,
+				current_cluster_key, incoming_cluster_key);
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * Send a message over fabric.
+ *
+ * @param msg the message to send.
+ * @param dest the destination node.
+ * @param error_msg the error message.
+ */
+static void
+exchange_msg_send(msg* msg, cf_node dest, char* error_msg)
+{
+	if (as_fabric_send(dest, msg, AS_FABRIC_CHANNEL_CTRL)) {
+		// Fabric will not return the message to the pool. Do it ourselves.
+		exchange_msg_return(msg);
+		WARNING("%s (dest:%"PRIx64")", error_msg, dest);
+	}
+}
+
+/**
+ * Send a message to a list of destination nodes.
+ *
+ * @param msg the message to send.
+ * @param dests the node list to send the message to.
+ * @param num_dests the number of destination nodes.
+ * @param error_msg the error message.
+ */
+static void
+exchange_msg_send_list(msg* msg, cf_node* dests, int num_dests, char* error_msg)
+{
+	if (as_fabric_send_list(dests, num_dests, msg, AS_FABRIC_CHANNEL_CTRL)
+			!= 0) {
+		// Fabric will not return the message to the pool. Do it ourselves.
+		exchange_msg_return(msg);
+		as_clustering_log_cf_node_array(CF_WARNING, AS_EXCHANGE, error_msg,
+				dests, num_dests);
+	}
+}
+
+/**
+ * Send a commit message to a destination node.
+ * @param dest the destination node.
+ */
+static void
+exchange_commit_msg_send(cf_node dest)
+{
+	msg* commit_msg = exchange_msg_get(AS_EXCHANGE_MSG_TYPE_COMMIT);
+	DEBUG("sending commit message to node %"PRIx64, dest);
+	exchange_msg_send(commit_msg, dest, "error sending commit message");
+}
+
+/**
+ * Send a commit message to a list of destination nodes.
+ * @param dests the destination nodes.
+ * @param num_dests the number of destination nodes.
+ */
+static void
+exchange_commit_msg_send_all(cf_node* dests, int num_dests)
+{
+	msg* commit_msg = exchange_msg_get(AS_EXCHANGE_MSG_TYPE_COMMIT);
+	as_clustering_log_cf_node_array(CF_DEBUG, AS_EXCHANGE,
+			"sending commit message to nodes:", dests, num_dests);
+	exchange_msg_send_list(commit_msg, dests, num_dests,
+			"error sending commit message");
+}
+
+/**
+ * Send ready to commit message to the principal.
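+ *
+ * (For orientation, the overall round is: every node sends DATA to all peers
+ * and acks received DATA with DATA_ACK; once a node has all acks and all peer
+ * data, it sends READY_TO_COMMIT to the principal, which responds with COMMIT
+ * to everyone once the whole cluster is ready.)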
+ */
+static void
+exchange_ready_to_commit_msg_send()
+{
+	EXCHANGE_LOCK();
+	g_exchange.ready_to_commit_send_ts = cf_getms();
+	cf_node principal = g_exchange.principal;
+	EXCHANGE_UNLOCK();
+
+	msg* ready_to_commit_msg = exchange_msg_get(
+			AS_EXCHANGE_MSG_TYPE_READY_TO_COMMIT);
+	DEBUG("sending ready to commit message to node %"PRIx64, principal);
+	exchange_msg_send(ready_to_commit_msg, principal,
+			"error sending ready to commit message");
+}
+
+/**
+ * Send exchange data to all nodes that have not acked the send.
+ */
+static void
+exchange_data_msg_send_pending_ack()
+{
+	EXCHANGE_LOCK();
+	g_exchange.send_ts = cf_getms();
+
+	cf_node* unacked_nodes;
+	int num_unacked_nodes;
+	cf_vector* unacked_nodes_vector = cf_vector_stack_create(cf_node);
+
+	exchange_nodes_find_send_unacked(unacked_nodes_vector);
+	cf_vector_to_stack_array(unacked_nodes_vector, &unacked_nodes,
+			&num_unacked_nodes);
+
+	cf_vector_destroy(unacked_nodes_vector);
+
+	if (!num_unacked_nodes) {
+		goto Exit;
+	}
+
+	msg* data_msg = exchange_msg_get(AS_EXCHANGE_MSG_TYPE_DATA);
+	exchange_msg_data_payload_set(data_msg);
+
+	as_clustering_log_cf_node_array(CF_DEBUG, AS_EXCHANGE,
+			"sending exchange data to nodes:", unacked_nodes,
+			num_unacked_nodes);
+
+	exchange_msg_send_list(data_msg, unacked_nodes, num_unacked_nodes,
+			"error sending exchange data");
+Exit:
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Send a data ack message to a destination node.
+ * @param dest the destination node.
+ */
+static void
+exchange_data_ack_msg_send(cf_node dest)
+{
+	msg* ack_msg = exchange_msg_get(AS_EXCHANGE_MSG_TYPE_DATA_ACK);
+	DEBUG("sending data ack message to node %"PRIx64, dest);
+	exchange_msg_send(ack_msg, dest, "error sending data ack message");
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Data payload related
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Add a pid to the namespace hash for the input vinfo.
+ */
+static void
+exchange_namespace_hash_pid_add(cf_shash* ns_hash, as_partition_version* vinfo,
+		uint16_t pid)
+{
+	if (as_partition_version_is_null(vinfo)) {
+		// Ignore NULL vinfos.
+		return;
+	}
+
+	cf_vector* pid_vector;
+
+	// Get the pid vector for this vinfo, creating it on first sight.
+	if (cf_shash_get(ns_hash, vinfo, &pid_vector) != CF_SHASH_OK) {
+		// We are seeing this vinfo for the first time.
+		pid_vector = cf_vector_create(sizeof(uint16_t),
+				AS_EXCHANGE_VINFO_NUM_PIDS_AVG, 0);
+		cf_shash_put(ns_hash, vinfo, &pid_vector);
+	}
+
+	cf_vector_append(pid_vector, &pid);
+}
+
+/**
+ * Destroy the pid vector for each vinfo.
+ */
+static int
+exchange_namespace_hash_destroy_reduce(const void* key, void* data, void* udata)
+{
+	cf_vector* pid_vector = *(cf_vector**)data;
+	cf_vector_destroy(pid_vector);
+	return CF_SHASH_REDUCE_DELETE;
+}
+
+/**
+ * Serialize each vinfo and accumulated pids to the input buffer.
+ */
+static int
+exchange_namespace_hash_serialize_reduce(const void* key, void* data,
+		void* udata)
+{
+	const as_partition_version* vinfo = (const as_partition_version*)key;
+	cf_vector* pid_vector = *(cf_vector**)data;
+	cf_dyn_buf* dyn_buf = (cf_dyn_buf*)udata;
+
+	// Append the vinfo.
+	cf_dyn_buf_append_buf(dyn_buf, (uint8_t*)vinfo, sizeof(*vinfo));
+
+	// Append the count of pids.
+	uint32_t num_pids = cf_vector_size(pid_vector);
+	cf_dyn_buf_append_buf(dyn_buf, (uint8_t*)&num_pids, sizeof(num_pids));
+
+	// Append each pid.
+ for (int i = 0; i < num_pids; i++) { + uint16_t* pid = cf_vector_getp(pid_vector, i); + cf_dyn_buf_append_buf(dyn_buf, (uint8_t*)pid, sizeof(*pid)); + } + + return CF_SHASH_OK; +} + +/** + * Append namespace payload, in as_exchange_namespace_payload format, for a + * namespace to the dynamic buffer. + * + * @param ns the namespace. + * @param dyn_buf the dynamic buffer. + */ +static void +exchange_data_namespace_payload_add(as_namespace* ns, cf_dyn_buf* dyn_buf) +{ + // A hash from each unique non null vinfo to a vector of partition ids + // having the vinfo. + cf_shash* ns_hash = cf_shash_create(exchange_vinfo_shash, + sizeof(as_partition_version), sizeof(cf_vector*), + AS_EXCHANGE_UNIQUE_VINFO_MAX_SIZE_SOFT, 0); + + as_partition* partitions = ns->partitions; + + // Populate the hash with one entry for each vinfo + for (int i = 0; i < AS_PARTITIONS; i++) { + as_partition_version* current_vinfo = &partitions[i].version; + exchange_namespace_hash_pid_add(ns_hash, current_vinfo, i); + } + + // We are ready to populate the dyn buffer with this ns's data. + DEBUG("namespace %s has %d unique vinfos", ns->name, + cf_shash_get_size(ns_hash)); + + // Append the vinfo count. + uint32_t num_vinfos = cf_shash_get_size(ns_hash); + cf_dyn_buf_append_buf(dyn_buf, (uint8_t*)&num_vinfos, sizeof(num_vinfos)); + + // Append vinfos and partitions. + cf_shash_reduce(ns_hash, exchange_namespace_hash_serialize_reduce, dyn_buf); + + // Destroy the intermediate hash and the pid vectors. + cf_shash_reduce(ns_hash, exchange_namespace_hash_destroy_reduce, NULL); + + cf_shash_destroy(ns_hash); +} + +/** + * Prepare the exchanged data payloads. + */ +static void +exchange_data_payloads_prepare() +{ + EXCHANGE_LOCK(); + + // Block / abort migrations and freeze the partition version infos. + as_partition_balance_disallow_migrations(); + as_partition_balance_synchronize_migrations(); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + // Append payload for each namespace. + + // TODO - add API to reset dynbuf? + g_exchange.self_data_dyn_buf[ns_ix].used_sz = 0; + + exchange_data_namespace_payload_add(g_config.namespaces[ns_ix], + &g_exchange.self_data_dyn_buf[ns_ix]); + } + + EXCHANGE_UNLOCK(); +} + +/** + * Indicates if the per-namespace fields in an incoming data message are valid. + * + * @return number of namespaces. 
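+ *
+ * (Each per-namespace field present in the message must carry exactly as many
+ * elements as there are namespace names; any mismatch invalidates the whole
+ * message and a count of 0 is returned.)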
+ */ +static uint32_t +exchange_data_msg_get_num_namespaces(as_exchange_event* msg_event) +{ + uint32_t num_namespaces_sent = 0; + uint32_t num_namespace_elements_sent = 0; + + if (!msg_msgpack_container_get_count(msg_event->msg, + AS_EXCHANGE_MSG_NAMESPACES, &num_namespaces_sent) + || num_namespaces_sent > AS_NAMESPACE_SZ) { + WARNING("received invalid namespaces from node %"PRIx64, + msg_event->msg_source); + return 0; + } + + if (!msg_msgpack_container_get_count(msg_event->msg, + AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS, &num_namespace_elements_sent) + || num_namespaces_sent != num_namespace_elements_sent) { + WARNING("received invalid partition versions from node %"PRIx64, + msg_event->msg_source); + return 0; + } + + if (!msg_msgpack_container_get_count(msg_event->msg, + AS_EXCHANGE_MSG_NS_RACK_IDS, &num_namespace_elements_sent) + || num_namespaces_sent != num_namespace_elements_sent) { + WARNING("received invalid cluster groups from node %"PRIx64, + msg_event->msg_source); + return 0; + } + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS) + && (!msg_msgpack_container_get_count(msg_event->msg, + AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS, + &num_namespace_elements_sent) + || num_namespaces_sent != num_namespace_elements_sent)) { + WARNING("received invalid roster generations from node %"PRIx64, + msg_event->msg_source); + return 0; + } + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTERS) + && (!msg_msgpack_container_get_count(msg_event->msg, + AS_EXCHANGE_MSG_NS_ROSTERS, + &num_namespace_elements_sent) + || num_namespaces_sent != num_namespace_elements_sent)) { + WARNING("received invalid rosters from node %"PRIx64, + msg_event->msg_source); + return 0; + } + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS) + && (!msg_msgpack_container_get_count(msg_event->msg, + AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS, + &num_namespace_elements_sent) + || num_namespaces_sent != num_namespace_elements_sent)) { + WARNING("received invalid rosters-rack-ids from node %"PRIx64, + msg_event->msg_source); + return 0; + } + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES) + && (!msg_msgpack_container_get_count(msg_event->msg, + AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES, + &num_namespace_elements_sent) + || num_namespaces_sent != num_namespace_elements_sent)) { + WARNING("received invalid eventual regimes from node %"PRIx64, + msg_event->msg_source); + return 0; + } + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES) + && (!msg_msgpack_container_get_count(msg_event->msg, + AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES, + &num_namespace_elements_sent) + || num_namespaces_sent != num_namespace_elements_sent)) { + WARNING("received invalid rebalance regimes from node %"PRIx64, + msg_event->msg_source); + return 0; + } + + return num_namespaces_sent; +} + +/** + * Basic validation for incoming namespace payload. + * Validates that + * 1. Number of vinfos < AS_PARTITIONS. + * 2. Each partition is between 0 and AS_PARTITIONS. + * 3. Namespaces payload does not exceed payload_end_ptr. + * + * @param ns_payload pointer to start of the namespace payload. + * @param ns_payload_size the size of the input namespace payload. + * @return true if this is a valid payload. + */ +static bool +exchange_namespace_payload_is_valid(as_exchange_ns_vinfos_payload* ns_payload, + uint32_t ns_payload_size) +{ + // Pointer past the last byte in the payload. 
+	uint8_t* payload_end_ptr = (uint8_t*)ns_payload + ns_payload_size;
+
+	if ((uint8_t*)ns_payload->vinfos > payload_end_ptr) {
+		return false;
+	}
+
+	if (ns_payload->num_vinfos > AS_PARTITIONS) {
+		return false;
+	}
+
+	uint8_t* read_ptr = (uint8_t*)ns_payload->vinfos;
+
+	for (uint32_t i = 0; i < ns_payload->num_vinfos; i++) {
+		if (read_ptr >= payload_end_ptr) {
+			return false;
+		}
+
+		as_exchange_vinfo_payload* vinfo_payload =
+				(as_exchange_vinfo_payload*)read_ptr;
+
+		if ((uint8_t*)vinfo_payload->pids > payload_end_ptr) {
+			return false;
+		}
+
+		if (vinfo_payload->num_pids > AS_PARTITIONS) {
+			return false;
+		}
+
+		size_t pids_size = vinfo_payload->num_pids * sizeof(uint16_t);
+
+		if ((uint8_t*)vinfo_payload->pids + pids_size > payload_end_ptr) {
+			return false;
+		}
+
+		for (uint32_t j = 0; j < vinfo_payload->num_pids; j++) {
+			if (vinfo_payload->pids[j] >= AS_PARTITIONS) {
+				return false;
+			}
+		}
+
+		read_ptr += sizeof(as_exchange_vinfo_payload) + pids_size;
+	}
+
+	if (read_ptr != payload_end_ptr) {
+		// There are unaccounted-for extra bytes in the payload.
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Common across all states
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Indicates if self node is the cluster principal.
+ */
+static bool
+exchange_self_is_principal()
+{
+	EXCHANGE_LOCK();
+	bool is_principal = (g_config.self_node == g_exchange.principal);
+	EXCHANGE_UNLOCK();
+	return is_principal;
+}
+
+/**
+ * Dump exchange state.
+ */
+static void
+exchange_dump(cf_fault_severity severity, bool verbose)
+{
+	EXCHANGE_LOCK();
+	cf_vector* node_vector = cf_vector_stack_create(cf_node);
+
+	char* state_str = "";
+	switch (g_exchange.state) {
+	case AS_EXCHANGE_STATE_REST:
+		state_str = "rest";
+		break;
+	case AS_EXCHANGE_STATE_EXCHANGING:
+		state_str = "exchanging";
+		break;
+	case AS_EXCHANGE_STATE_READY_TO_COMMIT:
+		state_str = "ready to commit";
+		break;
+	case AS_EXCHANGE_STATE_ORPHANED:
+		state_str = "orphaned";
+		break;
+	}
+
+	LOG(severity, "EXG: state: %s", state_str);
+
+	if (g_exchange.state == AS_EXCHANGE_STATE_ORPHANED) {
+		LOG(severity, "EXG: client transactions blocked: %s",
+				g_exchange.orphan_state_are_transactions_blocked ?
+						"true" : "false");
+		LOG(severity, "EXG: orphan since: %"PRIu64"(millis)",
+				cf_getms() - g_exchange.orphan_state_start_time);
+	}
+	else {
+		LOG(severity, "EXG: cluster key: %"PRIx64, g_exchange.cluster_key);
+		as_clustering_log_cf_node_vector(severity, AS_EXCHANGE,
+				"EXG: succession:", &g_exchange.succession_list);
+
+		if (verbose) {
+			vector_clear(node_vector);
+			exchange_nodes_find_send_unacked(node_vector);
+			as_clustering_log_cf_node_vector(severity, AS_EXCHANGE,
+					"EXG: send pending:", node_vector);
+
+			vector_clear(node_vector);
+			exchange_nodes_find_not_received(node_vector);
+			as_clustering_log_cf_node_vector(severity, AS_EXCHANGE,
+					"EXG: receive pending:", node_vector);
+
+			if (exchange_self_is_principal()) {
+				vector_clear(node_vector);
+				exchange_nodes_find_not_ready_to_commit(node_vector);
+				as_clustering_log_cf_node_vector(severity, AS_EXCHANGE,
+						"EXG: ready to commit pending:", node_vector);
+			}
+		}
+	}
+
+	cf_vector_destroy(node_vector);
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Reset state for a new round of exchange, reusing as much heap-allocated
+ * space for exchanged data as possible.
+ * @param new_succession_list new succession list. Can be NULL for orphaned
+ * state.
+ * @param new_cluster_key 0 for orphaned state. + */ +static void +exchange_reset_for_new_round(cf_vector* new_succession_list, + as_cluster_key new_cluster_key) +{ + EXCHANGE_LOCK(); + vector_clear(&g_exchange.succession_list); + g_exchange.principal = 0; + + if (new_succession_list && cf_vector_size(new_succession_list) > 0) { + vector_copy(&g_exchange.succession_list, new_succession_list); + // Set the principal node. + cf_vector_get(&g_exchange.succession_list, 0, &g_exchange.principal); + g_exchange.cluster_size = cf_vector_size(new_succession_list); + } + else { + g_exchange.cluster_size = 0; + } + + // Reset accumulated node states. + exchange_node_states_reset(); + + g_exchange.cluster_key = new_cluster_key; + EXCHANGE_UNLOCK(); +} + +/** + * Commit exchange state to reflect self node being an orphan. + */ +static void +exchange_orphan_commit() +{ + EXCHANGE_LOCK(); + g_exchange.committed_cluster_key = 0; + g_exchange.committed_cluster_size = 0; + g_exchange.committed_principal = 0; + vector_clear(&g_exchange.committed_succession_list); + WARNING("blocking client transactions in orphan state!"); + as_partition_balance_revert_to_orphan(); + g_exchange.orphan_state_are_transactions_blocked = true; + EXCHANGE_UNLOCK(); +} + +/** + * Receive an orphaned event and abort current round. + */ +static void +exchange_orphaned_handle(as_clustering_event* orphaned_event) +{ + DEBUG("got orphaned event"); + + EXCHANGE_LOCK(); + + if (g_exchange.state != AS_EXCHANGE_STATE_REST + && g_exchange.state != AS_EXCHANGE_STATE_ORPHANED) { + INFO("aborting partition exchange with cluster key %"PRIx64, + g_exchange.cluster_key); + } + + g_exchange.state = AS_EXCHANGE_STATE_ORPHANED; + exchange_reset_for_new_round(NULL, 0); + + // Stop ongoing migrations if any. + as_partition_balance_disallow_migrations(); + as_partition_balance_synchronize_migrations(); + + // Update the time this node got into orphan state. + g_exchange.orphan_state_start_time = cf_getms(); + + // Potentially temporary orphan state. We will timeout and commit orphan + // state if this persists for long. + g_exchange.orphan_state_are_transactions_blocked = false; + + EXCHANGE_UNLOCK(); +} + +/** + * Receive a cluster change event and start a new data exchange round. + */ +static void +exchange_cluster_change_handle(as_clustering_event* clustering_event) +{ + EXCHANGE_LOCK(); + + DEBUG("got cluster change event"); + + if (g_exchange.state != AS_EXCHANGE_STATE_REST + && g_exchange.state != AS_EXCHANGE_STATE_ORPHANED) { + INFO("aborting partition exchange with cluster key %"PRIx64, + g_exchange.cluster_key); + } + + exchange_reset_for_new_round(clustering_event->succession_list, + clustering_event->cluster_key); + + g_exchange.state = AS_EXCHANGE_STATE_EXCHANGING; + + INFO("data exchange started with cluster key %"PRIx64, + g_exchange.cluster_key); + + // Prepare the data payloads. + exchange_data_payloads_prepare(); + + EXCHANGE_UNLOCK(); + + exchange_data_msg_send_pending_ack(); +} + +/** + * Handle a cluster change event. + * @param cluster_change_event the cluster change event. 
+ */
+static void
+exchange_clustering_event_handle(as_exchange_event* exchange_clustering_event)
+{
+	as_clustering_event* clustering_event =
+			exchange_clustering_event->clustering_event;
+
+	switch (clustering_event->type) {
+	case AS_CLUSTERING_ORPHANED:
+		exchange_orphaned_handle(clustering_event);
+		break;
+	case AS_CLUSTERING_CLUSTER_CHANGED:
+		exchange_cluster_change_handle(clustering_event);
+		break;
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Orphan state event handling
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * The wait time in orphan state after which client transactions and
+ * transaction-related interactions (e.g. valid partition map publishing)
+ * should be blocked.
+ */
+static uint32_t
+exchange_orphan_transaction_block_timeout()
+{
+	return (uint32_t)as_clustering_quantum_interval()
+			* AS_EXCHANGE_REVERT_ORPHAN_INTERVALS;
+}
+
+/**
+ * Handle the timer event and, if we have been an orphan for too long, block
+ * client transactions.
+ */
+static void
+exchange_orphan_timer_event_handle()
+{
+	uint32_t timeout = exchange_orphan_transaction_block_timeout();
+	EXCHANGE_LOCK();
+	if (!g_exchange.orphan_state_are_transactions_blocked
+			&& g_exchange.orphan_state_start_time + timeout < cf_getms()) {
+		exchange_orphan_commit();
+	}
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Event processing in the orphan state.
+ */
+static void
+exchange_orphan_event_handle(as_exchange_event* event)
+{
+	switch (event->type) {
+	case AS_EXCHANGE_EVENT_CLUSTER_CHANGE:
+		exchange_clustering_event_handle(event);
+		break;
+	case AS_EXCHANGE_EVENT_TIMER:
+		exchange_orphan_timer_event_handle();
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Rest state event handling
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Process a message event when in rest state.
+ */
+static void
+exchange_rest_msg_event_handle(as_exchange_event* msg_event)
+{
+	EXCHANGE_LOCK();
+
+	if (!exchange_msg_is_sane(msg_event->msg_source, msg_event->msg)) {
+		goto Exit;
+	}
+
+	as_exchange_msg_type msg_type;
+	exchange_msg_type_get(msg_event->msg, &msg_type);
+
+	if (exchange_self_is_principal()
+			&& msg_type == AS_EXCHANGE_MSG_TYPE_READY_TO_COMMIT) {
+		// The commit message did not make it to the source node, hence it
+		// sent us the ready-to-commit message. Resend the commit message.
+		DEBUG("received a ready to commit message from %"PRIx64,
+				msg_event->msg_source);
+		exchange_commit_msg_send(msg_event->msg_source);
+	}
+	else {
+		DEBUG(
+				"rest state received unexpected message of type %d from node %"PRIx64,
+				msg_type, msg_event->msg_source);
+	}
+
+Exit:
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Event processing in the rest state.
+ */
+static void
+exchange_rest_event_handle(as_exchange_event* event)
+{
+	switch (event->type) {
+	case AS_EXCHANGE_EVENT_CLUSTER_CHANGE:
+		exchange_clustering_event_handle(event);
+		break;
+	case AS_EXCHANGE_EVENT_MSG:
+		exchange_rest_msg_event_handle(event);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Exchanging state event handling
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Commit namespace payload for a node.
+ * Assumes the namespace vinfo and succession list have been zeroed beforehand.
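+ *
+ * For orientation, a sketch of the vinfos payload layout this function walks
+ * (field names as used below - the authoritative struct definitions live with
+ * the exchange message declarations):
+ *
+ *   as_exchange_ns_vinfos_payload:
+ *     num_vinfos
+ *     vinfos[]                  // num_vinfos variable-size entries
+ *
+ *   as_exchange_vinfo_payload:
+ *     vinfo                     // one partition version
+ *     num_pids
+ *     pids[num_pids]            // uint16_t partition ids
+ *
+ * Each entry therefore occupies
+ *   sizeof(as_exchange_vinfo_payload) + num_pids * sizeof(uint16_t)
+ * bytes - exactly the stride by which read_ptr advances.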
+ */
+static void
+exchange_namespace_payload_pre_commit_for_node(cf_node node,
+		as_exchange_node_namespace_data* namespace_data)
+{
+	as_namespace* ns = namespace_data->local_namespace;
+
+	uint32_t sl_ix = ns->cluster_size++;
+
+	ns->succession[sl_ix] = node;
+
+	as_exchange_ns_vinfos_payload* ns_payload =
+			namespace_data->partition_versions;
+	uint8_t* read_ptr = (uint8_t*)ns_payload->vinfos;
+
+	for (uint32_t i = 0; i < ns_payload->num_vinfos; i++) {
+		as_exchange_vinfo_payload* vinfo_payload =
+				(as_exchange_vinfo_payload*)read_ptr;
+
+		for (uint32_t j = 0; j < vinfo_payload->num_pids; j++) {
+			memcpy(&ns->cluster_versions[sl_ix][vinfo_payload->pids[j]],
+					&vinfo_payload->vinfo, sizeof(vinfo_payload->vinfo));
+		}
+
+		read_ptr += sizeof(as_exchange_vinfo_payload)
+				+ vinfo_payload->num_pids * sizeof(uint16_t);
+	}
+
+	ns->rack_ids[sl_ix] = namespace_data->rack_id;
+
+	if (namespace_data->roster_generation > ns->roster_generation) {
+		ns->roster_generation = namespace_data->roster_generation;
+		ns->roster_count = namespace_data->roster_count;
+
+		memcpy(ns->roster, namespace_data->roster,
+				ns->roster_count * sizeof(cf_node));
+
+		if (namespace_data->roster_rack_ids) {
+			memcpy(ns->roster_rack_ids, namespace_data->roster_rack_ids,
+					ns->roster_count * sizeof(uint32_t));
+		}
+		else {
+			memset(ns->roster_rack_ids, 0, ns->roster_count * sizeof(uint32_t));
+		}
+	}
+
+	if (namespace_data->eventual_regime > ns->eventual_regime) {
+		ns->eventual_regime = namespace_data->eventual_regime;
+	}
+
+	ns->rebalance_regimes[sl_ix] = namespace_data->rebalance_regime;
+}
+
+/**
+ * Commit exchange data for a given node.
+ */
+static void
+exchange_data_pre_commit_for_node(cf_node node)
+{
+	EXCHANGE_LOCK();
+	as_exchange_node_state node_state;
+	exchange_node_state_get_safe(node, &node_state);
+
+	for (uint32_t i = 0; i < node_state.data->num_namespaces; i++) {
+		exchange_namespace_payload_pre_commit_for_node(node,
+				&node_state.data->namespace_data[i]);
+	}
+
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Check that there's not a mixture of AP and SC nodes in any namespace.
+ */
+static bool
+exchange_data_pre_commit_ap_cp_check()
+{
+	for (uint32_t i = 0; i < g_config.n_namespaces; i++) {
+		as_namespace* ns = g_config.namespaces[i];
+
+		cf_node ap_node = (cf_node)0;
+		cf_node cp_node = (cf_node)0;
+
+		for (uint32_t n = 0; n < ns->cluster_size; n++) {
+			if (ns->rebalance_regimes[n] == 0) {
+				ap_node = ns->succession[n];
+			}
+			else {
+				cp_node = ns->succession[n];
+			}
+		}
+
+		if (ap_node != (cf_node)0 && cp_node != (cf_node)0) {
+			WARNING("{%s} has mixture of AP and SC nodes - for example %lx is AP and %lx is SC",
+					ns->name, ap_node, cp_node);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/**
+ * Pre-commit namespace data, anticipating a successful commit from the
+ * principal. This pre-commit ensures the regime advances in CP mode, covering
+ * the case where the principal commits exchange data but the commit to a
+ * non-principal is lost.
+ */
+static bool
+exchange_exchanging_pre_commit()
+{
+	EXCHANGE_LOCK();
+	pthread_mutex_lock(&g_exchanged_info_lock);
+
+	// Reset exchange data for all namespaces.
+	for (int i = 0; i < g_config.n_namespaces; i++) {
+		as_namespace* ns = g_config.namespaces[i];
+		memset(ns->succession, 0, sizeof(ns->succession));
+
+		// Assuming zero to represent "null" partition.
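+		// (A zeroed as_partition_version is treated as "no version" - the
+		// per-node fill above only overwrites the slots each node actually
+		// reported, so anything still zero means that node has no version
+		// for that partition.)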
+ memset(ns->cluster_versions, 0, sizeof(ns->cluster_versions)); + + memset(ns->rack_ids, 0, sizeof(ns->rack_ids)); + + ns->roster_generation = 0; + ns->roster_count = 0; + memset(ns->roster, 0, sizeof(ns->roster)); + memset(ns->roster_rack_ids, 0, sizeof(ns->roster_rack_ids)); + + ns->eventual_regime = 0; + // Note - not clearing ns->rebalance_regime - it's not set here. + memset(ns->rebalance_regimes, 0, sizeof(ns->rebalance_regimes)); + + // Reset ns cluster size to zero. + ns->cluster_size = 0; + } + + // Fill the namespace partition version info in succession list order. + int num_nodes = cf_vector_size(&g_exchange.succession_list); + for (int i = 0; i < num_nodes; i++) { + cf_node node; + cf_vector_get(&g_exchange.succession_list, i, &node); + exchange_data_pre_commit_for_node(node); + } + + // Collected all exchanged data - do final configuration consistency checks. + if (! exchange_data_pre_commit_ap_cp_check()) { + WARNING("abandoned exchange - fix configuration conflict"); + pthread_mutex_unlock(&g_exchanged_info_lock); + EXCHANGE_UNLOCK(); + return false; + } + + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + if (ns->eventual_regime != 0) { + ns->eventual_regime += 2; + + // TODO - until future storage format change, we'll use partition 0 + // to save and restore ns->eventual_regime. + + // Ok to not take partition lock. + as_partition* p = &ns->partitions[0]; + + as_storage_info_set(ns, p, true); + + INFO("{%s} eventual-regime %u ready", ns->name, + ns->eventual_regime); + } + } + + pthread_mutex_unlock(&g_exchanged_info_lock); + EXCHANGE_UNLOCK(); + + return true; +} + +/** + * Check to see if all exchange data is sent and received. If so switch to + * ready_to_commit state. + */ +static void +exchange_exchanging_check_switch_ready_to_commit() +{ + EXCHANGE_LOCK(); + + cf_vector* node_vector = cf_vector_stack_create(cf_node); + bool ready_to_commit = false; + + if (g_exchange.state == AS_EXCHANGE_STATE_REST + || g_exchange.cluster_key == 0) { + goto Exit; + } + + exchange_nodes_find_send_unacked(node_vector); + if (cf_vector_size(node_vector) > 0) { + // We still have unacked exchange send messages. + goto Exit; + } + + vector_clear(node_vector); + exchange_nodes_find_not_received(node_vector); + if (cf_vector_size(node_vector) > 0) { + // We still haven't received exchange messages from all nodes in the + // succession list. + goto Exit; + } + + g_exchange.state = AS_EXCHANGE_STATE_READY_TO_COMMIT; + + ready_to_commit = true; + + DEBUG("ready to commit exchange data for cluster key %"PRIx64, + g_exchange.cluster_key); + +Exit: + cf_vector_destroy(node_vector); + + if (ready_to_commit && exchange_exchanging_pre_commit()) { + exchange_ready_to_commit_msg_send(); + } + + EXCHANGE_UNLOCK(); +} + +/** + * Handle incoming data message. + * + * Assumes the message has been checked for sanity. 
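+ *
+ * The data message carries parallel per-namespace arrays - element i of each
+ * field describes the same namespace:
+ *
+ *   AS_EXCHANGE_MSG_NAMESPACES[i]            -> namespace name
+ *   AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS[i] -> vinfos payload
+ *   AS_EXCHANGE_MSG_NS_RACK_IDS[i]           -> rack id
+ *
+ * with roster generations, rosters, roster rack-ids and regimes lining up the
+ * same way when present.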
+ */ +static void +exchange_exchanging_data_msg_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + + DEBUG("received exchange data from node %"PRIx64, msg_event->msg_source); + + as_exchange_node_state node_state; + exchange_node_state_get_safe(msg_event->msg_source, &node_state); + + if (!node_state.received) { + uint32_t num_namespaces_sent = exchange_data_msg_get_num_namespaces( + msg_event); + + if (num_namespaces_sent == 0) { + WARNING("ignoring invalid exchange data from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + cf_vector_define(namespace_list, sizeof(msg_buf_ele), + num_namespaces_sent, 0); + cf_vector_define(partition_versions, sizeof(msg_buf_ele), + num_namespaces_sent, 0); + uint32_t rack_ids[num_namespaces_sent]; + + uint32_t roster_generations[num_namespaces_sent]; + cf_vector_define(rosters, sizeof(msg_buf_ele), num_namespaces_sent, 0); + cf_vector_define(rosters_rack_ids, sizeof(msg_buf_ele), + num_namespaces_sent, 0); + + memset(roster_generations, 0, sizeof(roster_generations)); + + uint32_t eventual_regimes[num_namespaces_sent]; + uint32_t rebalance_regimes[num_namespaces_sent]; + + memset(eventual_regimes, 0, sizeof(eventual_regimes)); + memset(rebalance_regimes, 0, sizeof(rebalance_regimes)); + + if (!msg_msgpack_list_get_buf_array_presized(msg_event->msg, + AS_EXCHANGE_MSG_NAMESPACES, &namespace_list)) { + WARNING("received invalid namespaces from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + if (!msg_msgpack_list_get_buf_array_presized(msg_event->msg, + AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS, &partition_versions)) { + WARNING("received invalid partition versions from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + uint32_t num_rack_ids = num_namespaces_sent; + + if (!msg_msgpack_list_get_uint32_array(msg_event->msg, + AS_EXCHANGE_MSG_NS_RACK_IDS, rack_ids, &num_rack_ids)) { + WARNING("received invalid cluster groups from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + uint32_t num_roster_generations = num_namespaces_sent; + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS) + && !msg_msgpack_list_get_uint32_array(msg_event->msg, + AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS, + roster_generations, &num_roster_generations)) { + WARNING("received invalid roster generations from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTERS) + && !msg_msgpack_list_get_buf_array_presized(msg_event->msg, + AS_EXCHANGE_MSG_NS_ROSTERS, &rosters)) { + WARNING("received invalid rosters from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS) + && !msg_msgpack_list_get_buf_array_presized(msg_event->msg, + AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS, + &rosters_rack_ids)) { + WARNING("received invalid rosters-rack-ids from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + uint32_t num_eventual_regimes = num_namespaces_sent; + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES) + && !msg_msgpack_list_get_uint32_array(msg_event->msg, + AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES, eventual_regimes, + &num_eventual_regimes)) { + WARNING("received invalid eventual regimes from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + uint32_t num_rebalance_regimes = num_namespaces_sent; + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES) + && !msg_msgpack_list_get_uint32_array(msg_event->msg, + 
AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES, rebalance_regimes, + &num_rebalance_regimes)) { + WARNING("received invalid rebalance regimes from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + node_state.data->num_namespaces = 0; + + for (uint32_t i = 0; i < num_namespaces_sent; i++) { + msg_buf_ele* namespace_name_element = cf_vector_getp( + &namespace_list, i); + + // Find a match for the namespace. + as_namespace* matching_namespace = as_namespace_get_bybuf( + namespace_name_element->ptr, namespace_name_element->sz); + + if (!matching_namespace) { + continue; + } + + as_exchange_node_namespace_data* namespace_data = + &node_state.data->namespace_data[node_state.data->num_namespaces]; + node_state.data->num_namespaces++; + + namespace_data->local_namespace = matching_namespace; + namespace_data->rack_id = rack_ids[i]; + namespace_data->roster_generation = roster_generations[i]; + namespace_data->eventual_regime = eventual_regimes[i]; + namespace_data->rebalance_regime = rebalance_regimes[i]; + + // Copy partition versions. + msg_buf_ele* partition_versions_element = cf_vector_getp( + &partition_versions, i); + + if (!exchange_namespace_payload_is_valid( + (as_exchange_ns_vinfos_payload*)partition_versions_element->ptr, + partition_versions_element->sz)) { + WARNING( + "received invalid partition versions for namespace %s from node %"PRIx64, + matching_namespace->name, msg_event->msg_source); + goto Exit; + } + + namespace_data->partition_versions = cf_realloc( + namespace_data->partition_versions, + partition_versions_element->sz); + + memcpy(namespace_data->partition_versions, + partition_versions_element->ptr, + partition_versions_element->sz); + + // Copy rosters. + // TODO - make this piece a utility function? + if (namespace_data->roster_generation == 0) { + namespace_data->roster_count = 0; + } + else { + msg_buf_ele* roster_ele = cf_vector_getp(&rosters, i); + + namespace_data->roster_count = roster_ele->sz / sizeof(cf_node); + + if (namespace_data->roster_count == 0 + || namespace_data->roster_count > AS_CLUSTER_SZ + || roster_ele->sz % sizeof(cf_node) != 0) { + WARNING( + "received invalid roster for namespace %s from node %"PRIx64, + matching_namespace->name, msg_event->msg_source); + goto Exit; + } + + namespace_data->roster = + cf_realloc(namespace_data->roster, roster_ele->sz); + + memcpy(namespace_data->roster, roster_ele->ptr, roster_ele->sz); + + uint32_t rri_ele_sz = 0; + + if (cf_vector_size(&rosters_rack_ids) != 0) { + msg_buf_ele* rri_ele = cf_vector_getp(&rosters_rack_ids, i); + + if (rri_ele->sz != 0) { + rri_ele_sz = rri_ele->sz; + + if (rri_ele_sz != + namespace_data->roster_count * sizeof(uint32_t)) { + WARNING( + "received invalid roster-rack-ids for namespace %s from node %"PRIx64, + matching_namespace->name, msg_event->msg_source); + goto Exit; + } + + namespace_data->roster_rack_ids = + cf_realloc(namespace_data->roster_rack_ids, + rri_ele_sz); + + memcpy(namespace_data->roster_rack_ids, rri_ele->ptr, + rri_ele_sz); + } + } + + if (rri_ele_sz == 0 && namespace_data->roster_rack_ids) { + cf_free(namespace_data->roster_rack_ids); + namespace_data->roster_rack_ids = NULL; + } + } + } + + // Mark exchange data received from the source. + node_state.received = true; + exchange_node_state_update(msg_event->msg_source, &node_state); + } + else { + // Duplicate pinfo received. Ignore. + INFO("received duplicate exchange data from node %"PRIx64, + msg_event->msg_source); + } + + // Send an acknowledgement. 
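+	// (Sent even when the data was a duplicate - a resend from the source
+	// suggests our earlier ack was lost, and the ack handler on the source
+	// side ignores duplicates.)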
+	exchange_data_ack_msg_send(msg_event->msg_source);
+
+	// Check if we can switch to ready to commit state.
+	exchange_exchanging_check_switch_ready_to_commit();
+
+Exit:
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Handle incoming data ack message.
+ *
+ * Assumes the message has been checked for sanity.
+ */
+static void
+exchange_exchanging_data_ack_msg_handle(as_exchange_event* msg_event)
+{
+	EXCHANGE_LOCK();
+
+	DEBUG("received exchange data ack from node %"PRIx64,
+			msg_event->msg_source);
+
+	as_exchange_node_state node_state;
+	exchange_node_state_get_safe(msg_event->msg_source, &node_state);
+
+	if (!node_state.send_acked) {
+		// Mark send as acked in the node state.
+		node_state.send_acked = true;
+		exchange_node_state_update(msg_event->msg_source, &node_state);
+	}
+	else {
+		// Duplicate ack. Ignore.
+		DEBUG("received duplicate data ack from node %"PRIx64,
+				msg_event->msg_source);
+	}
+
+	// We might have sent and received all partition info. Check for
+	// completion.
+	exchange_exchanging_check_switch_ready_to_commit();
+
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Process a message event when in exchanging state.
+ */
+static void
+exchange_exchanging_msg_event_handle(as_exchange_event* msg_event)
+{
+	EXCHANGE_LOCK();
+
+	if (!exchange_msg_is_sane(msg_event->msg_source, msg_event->msg)) {
+		goto Exit;
+	}
+
+	as_exchange_msg_type msg_type;
+	exchange_msg_type_get(msg_event->msg, &msg_type);
+
+	switch (msg_type) {
+	case AS_EXCHANGE_MSG_TYPE_DATA:
+		exchange_exchanging_data_msg_handle(msg_event);
+		break;
+	case AS_EXCHANGE_MSG_TYPE_DATA_ACK:
+		exchange_exchanging_data_ack_msg_handle(msg_event);
+		break;
+	default:
+		DEBUG(
+				"exchanging state received unexpected message of type %d from node %"PRIx64,
+				msg_type, msg_event->msg_source);
+	}
+Exit:
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Process a timer event when in exchanging state.
+ */
+static void
+exchange_exchanging_timer_event_handle(as_exchange_event* msg_event)
+{
+	EXCHANGE_LOCK();
+	bool send_data = false;
+
+	cf_clock now = cf_getms();
+
+	// The timeout is a "linear" step function, where the timeout is constant
+	// for the step interval.
+	cf_clock min_timeout = EXCHANGE_SEND_MIN_TIMEOUT();
+	cf_clock max_timeout = EXCHANGE_SEND_MAX_TIMEOUT();
+	uint32_t step_interval = EXCHANGE_SEND_STEP_INTERVAL();
+	cf_clock timeout = MAX(min_timeout,
+			MIN(max_timeout,
+					min_timeout
+							* ((now - g_exchange.send_ts) / step_interval)));
+
+	if (g_exchange.send_ts + timeout < now) {
+		send_data = true;
+	}
+
+	EXCHANGE_UNLOCK();
+
+	if (send_data) {
+		exchange_data_msg_send_pending_ack();
+	}
+}
+
+/**
+ * Event processing in the exchanging state.
+ */
+static void
+exchange_exchanging_event_handle(as_exchange_event* event)
+{
+	switch (event->type) {
+	case AS_EXCHANGE_EVENT_CLUSTER_CHANGE:
+		exchange_clustering_event_handle(event);
+		break;
+	case AS_EXCHANGE_EVENT_MSG:
+		exchange_exchanging_msg_event_handle(event);
+		break;
+	case AS_EXCHANGE_EVENT_TIMER:
+		exchange_exchanging_timer_event_handle(event);
+		break;
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Ready_To_Commit state event handling
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Handle incoming ready to commit message.
+ *
+ * Assumes the message has been checked for sanity.
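+ *
+ * A sketch of the commit handshake this handler participates in (message
+ * types as used in this file):
+ *
+ *   non-principal                      principal
+ *        |---- READY_TO_COMMIT ------------>|  (this handler)
+ *        |                                  |  all nodes ready?
+ *        |<--- COMMIT ----------------------|  exchange_commit_msg_send_all()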
+ */
+static void
+exchange_ready_to_commit_rtc_msg_handle(as_exchange_event* msg_event)
+{
+	if (!exchange_self_is_principal()) {
+		WARNING(
+				"non-principal self received ready to commit message from %"PRIx64" - ignoring",
+				msg_event->msg_source);
+		return;
+	}
+
+	EXCHANGE_LOCK();
+
+	DEBUG("received ready to commit from node %"PRIx64, msg_event->msg_source);
+
+	as_exchange_node_state node_state;
+	exchange_node_state_get_safe(msg_event->msg_source, &node_state);
+
+	if (!node_state.is_ready_to_commit) {
+		// Mark as ready to commit in the node state.
+		node_state.is_ready_to_commit = true;
+		exchange_node_state_update(msg_event->msg_source, &node_state);
+	}
+	else {
+		// Duplicate ready to commit received. Ignore.
+		INFO("received duplicate ready to commit message from node %"PRIx64,
+				msg_event->msg_source);
+	}
+
+	cf_vector* node_vector = cf_vector_stack_create(cf_node);
+	exchange_nodes_find_not_ready_to_commit(node_vector);
+
+	if (cf_vector_size(node_vector) <= 0) {
+		// Send a commit message to all nodes in the succession list.
+		cf_node* node_list = NULL;
+		int num_node_list = 0;
+		cf_vector_to_stack_array(&g_exchange.succession_list, &node_list,
+				&num_node_list);
+		exchange_commit_msg_send_all(node_list, num_node_list);
+	}
+
+	cf_vector_destroy(node_vector);
+
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Commit accumulated exchange data.
+ */
+static void
+exchange_data_commit()
+{
+	EXCHANGE_LOCK();
+
+	INFO("data exchange completed with cluster key %"PRIx64,
+			g_exchange.cluster_key);
+
+	// Exchange is done - use the current cluster details as the committed
+	// cluster details.
+	g_exchange.committed_cluster_key = g_exchange.cluster_key;
+	g_exchange.committed_cluster_size = g_exchange.cluster_size;
+	g_exchange.committed_principal = g_exchange.principal;
+	vector_clear(&g_exchange.committed_succession_list);
+	vector_copy(&g_exchange.committed_succession_list,
+			&g_exchange.succession_list);
+
+	// Force an update of the skew, to ensure new nodes, if any, have been
+	// checked for skew.
+	as_skew_monitor_update();
+
+	// Must cover partition balance since it may manipulate ns->cluster_size.
+	pthread_mutex_lock(&g_exchanged_info_lock);
+	as_partition_balance();
+	pthread_mutex_unlock(&g_exchanged_info_lock);
+
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Handle incoming commit message.
+ *
+ * Assumes the message has been checked for sanity.
+ */
+static void
+exchange_ready_to_commit_commit_msg_handle(as_exchange_event* msg_event)
+{
+	EXCHANGE_LOCK();
+
+	if (msg_event->msg_source != g_exchange.principal) {
+		WARNING(
+				"ignoring commit message from node %"PRIx64" - expected message from %"PRIx64,
+				msg_event->msg_source, g_exchange.principal);
+		goto Exit;
+	}
+
+	INFO("received commit command from principal node %"PRIx64,
+			msg_event->msg_source);
+
+	// Commit exchanged data.
+	exchange_data_commit();
+
+	// Move to the rest state.
+	g_exchange.state = AS_EXCHANGE_STATE_REST;
+
+	// Queue up a cluster change event for downstream subsystems.
+	as_exchange_cluster_changed_event cluster_change_event;
+	cluster_change_event.cluster_key = g_exchange.committed_cluster_key;
+	cluster_change_event.succession = vector_to_array(
+			&g_exchange.committed_succession_list);
+	cluster_change_event.cluster_size = g_exchange.committed_cluster_size;
+
+	exchange_external_event_queue(&cluster_change_event);
+
+Exit:
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Handle incoming data message in the ready-to-commit state.
+ *
+ * Assumes the message has been checked for sanity.
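+ *
+ * (Re-sending the ack is safe - on the source,
+ * exchange_exchanging_data_ack_msg_handle() simply ignores duplicate acks.)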
+ */
+static void
+exchange_ready_to_commit_data_msg_handle(as_exchange_event* msg_event)
+{
+	EXCHANGE_LOCK();
+
+	DEBUG("received exchange data from node %"PRIx64, msg_event->msg_source);
+
+	// The source must have missed self node's data ack. Send an
+	// acknowledgement.
+	exchange_data_ack_msg_send(msg_event->msg_source);
+
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Process a message event when in ready_to_commit state.
+ */
+static void
+exchange_ready_to_commit_msg_event_handle(as_exchange_event* msg_event)
+{
+	EXCHANGE_LOCK();
+
+	if (!exchange_msg_is_sane(msg_event->msg_source, msg_event->msg)) {
+		goto Exit;
+	}
+
+	as_exchange_msg_type msg_type;
+	exchange_msg_type_get(msg_event->msg, &msg_type);
+
+	switch (msg_type) {
+	case AS_EXCHANGE_MSG_TYPE_READY_TO_COMMIT:
+		exchange_ready_to_commit_rtc_msg_handle(msg_event);
+		break;
+	case AS_EXCHANGE_MSG_TYPE_COMMIT:
+		exchange_ready_to_commit_commit_msg_handle(msg_event);
+		break;
+	case AS_EXCHANGE_MSG_TYPE_DATA:
+		exchange_ready_to_commit_data_msg_handle(msg_event);
+		break;
+	default:
+		DEBUG(
+				"ready to commit state received unexpected message of type %d from node %"PRIx64,
+				msg_type, msg_event->msg_source);
+	}
+Exit:
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Process a timer event when in ready_to_commit state.
+ */
+static void
+exchange_ready_to_commit_timer_event_handle(as_exchange_event* msg_event)
+{
+	EXCHANGE_LOCK();
+
+	if (g_exchange.ready_to_commit_send_ts + EXCHANGE_READY_TO_COMMIT_TIMEOUT()
+			< cf_getms()) {
+		// It's been a while since ready-to-commit was sent to the principal.
+		// Retransmit it so that the principal gets it this time and supplies
+		// a commit message.
+		exchange_ready_to_commit_msg_send();
+	}
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Event processing in the ready_to_commit state.
+ */
+static void
+exchange_ready_to_commit_event_handle(as_exchange_event* event)
+{
+	switch (event->type) {
+	case AS_EXCHANGE_EVENT_CLUSTER_CHANGE:
+		exchange_clustering_event_handle(event);
+		break;
+	case AS_EXCHANGE_EVENT_MSG:
+		exchange_ready_to_commit_msg_event_handle(event);
+		break;
+	case AS_EXCHANGE_EVENT_TIMER:
+		exchange_ready_to_commit_timer_event_handle(event);
+		break;
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Exchange core subsystem
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Dispatch an exchange event inline to the relevant state handler.
+ */
+static void
+exchange_event_handle(as_exchange_event* event)
+{
+	EXCHANGE_LOCK();
+
+	switch (g_exchange.state) {
+	case AS_EXCHANGE_STATE_REST:
+		exchange_rest_event_handle(event);
+		break;
+	case AS_EXCHANGE_STATE_EXCHANGING:
+		exchange_exchanging_event_handle(event);
+		break;
+	case AS_EXCHANGE_STATE_READY_TO_COMMIT:
+		exchange_ready_to_commit_event_handle(event);
+		break;
+	case AS_EXCHANGE_STATE_ORPHANED:
+		exchange_orphan_event_handle(event);
+		break;
+	}
+
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Exchange timer event generator thread, to help with retries and retransmits
+ * across all states.
+ */
+static void*
+exchange_timer_thr(void* arg)
+{
+	as_exchange_event timer_event;
+	memset(&timer_event, 0, sizeof(timer_event));
+	timer_event.type = AS_EXCHANGE_EVENT_TIMER;
+
+	while (EXCHANGE_IS_RUNNING()) {
+		// Sleep for a tick, then dispatch a timer event.
+		usleep(EXCHANGE_TIMER_TICK_INTERVAL() * 1000);
+		exchange_event_handle(&timer_event);
+	}
+	return NULL;
+}
+
+/**
+ * Handle incoming messages from fabric.
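+ *
+ * Note the ownership convention visible below: fabric hands this listener a
+ * reference on the msg, and the listener releases it via as_fabric_msg_put()
+ * on every path - including when exchange is stopped and the message is
+ * ignored.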
+ */
+static int
+exchange_fabric_msg_listener(cf_node source, msg* msg, void* udata)
+{
+	if (!EXCHANGE_IS_RUNNING()) {
+		// Ignore this message.
+		DEBUG("exchange stopped - ignoring message from %"PRIx64, source);
+		goto Exit;
+	}
+
+	as_exchange_event msg_event;
+	memset(&msg_event, 0, sizeof(msg_event));
+	msg_event.type = AS_EXCHANGE_EVENT_MSG;
+	msg_event.msg = msg;
+	msg_event.msg_source = source;
+
+	exchange_event_handle(&msg_event);
+Exit:
+	as_fabric_msg_put(msg);
+	return 0;
+}
+
+/**
+ * Listener for cluster change events from clustering layer.
+ */
+void
+exchange_clustering_event_listener(as_clustering_event* event)
+{
+	if (!EXCHANGE_IS_RUNNING()) {
+		// Ignore this event.
+		DEBUG("exchange stopped - ignoring cluster change event");
+		return;
+	}
+
+	as_exchange_event clustering_event;
+	memset(&clustering_event, 0, sizeof(clustering_event));
+	clustering_event.type = AS_EXCHANGE_EVENT_CLUSTER_CHANGE;
+	clustering_event.clustering_event = event;
+
+	// Dispatch the event.
+	exchange_event_handle(&clustering_event);
+}
+
+/**
+ * Initialize the template to be used for exchange messages.
+ */
+static void
+exchange_msg_init()
+{
+	// Register the exchange msg type and the fabric message listener.
+	as_fabric_register_msg_fn(M_TYPE_EXCHANGE, exchange_msg_template,
+			sizeof(exchange_msg_template), AS_EXCHANGE_MSG_SCRATCH_SIZE,
+			exchange_fabric_msg_listener, NULL);
+}
+
+/**
+ * Initialize exchange subsystem.
+ */
+static void
+exchange_init()
+{
+	if (EXCHANGE_IS_INITIALIZED()) {
+		return;
+	}
+
+	EXCHANGE_LOCK();
+
+	memset(&g_exchange, 0, sizeof(g_exchange));
+
+	// Start in the orphaned state.
+	g_exchange.state = AS_EXCHANGE_STATE_ORPHANED;
+	g_exchange.orphan_state_start_time = cf_getms();
+	g_exchange.orphan_state_are_transactions_blocked = true;
+
+	// Initialize the node-state hash.
+	g_exchange.nodeid_to_node_state = cf_shash_create(cf_nodeid_shash_fn,
+			sizeof(cf_node), sizeof(as_exchange_node_state),
+			AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT, 0);
+
+	cf_vector_init(&g_exchange.succession_list, sizeof(cf_node),
+			AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO);
+	cf_vector_init(&g_exchange.committed_succession_list, sizeof(cf_node),
+			AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO);
+
+	// Initialize exchange fabric messaging.
+	exchange_msg_init();
+
+	// Initialize self exchange data dynamic buffers.
+	for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) {
+		cf_dyn_buf_init_heap(&g_exchange.self_data_dyn_buf[ns_ix],
+				AS_EXCHANGE_SELF_DYN_BUF_SIZE());
+	}
+
+	// Initialize external event publishing.
+	exchange_external_event_publisher_init();
+
+	// Get partition versions from storage.
+	as_partition_balance_init();
+
+	DEBUG("exchange module initialized");
+
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Stop exchange subsystem.
+ */
+static void
+exchange_stop()
+{
+	if (!EXCHANGE_IS_RUNNING()) {
+		WARNING("exchange is already stopped");
+		return;
+	}
+
+	// Unguarded state change, but this should be ok.
+	g_exchange.sys_state = AS_EXCHANGE_SYS_STATE_SHUTTING_DOWN;
+
+	// Wait for the timer thread to finish.
+	pthread_join(g_exchange.timer_tid, NULL);
+
+	EXCHANGE_LOCK();
+
+	g_exchange.sys_state = AS_EXCHANGE_SYS_STATE_STOPPED;
+
+	DEBUG("exchange module stopped");
+
+	EXCHANGE_UNLOCK();
+
+	external_event_publisher_stop();
+}
+
+/**
+ * Start the exchange subsystem.
+ */
+static void
+exchange_start()
+{
+	EXCHANGE_LOCK();
+
+	if (EXCHANGE_IS_RUNNING()) {
+		// Shutdown the exchange subsystem.
+ exchange_stop(); + } + + g_exchange.sys_state = AS_EXCHANGE_SYS_STATE_RUNNING; + + // Start the timer thread. + if (0 + != pthread_create(&g_exchange.timer_tid, 0, exchange_timer_thr, + &g_exchange)) { + CRASH("could not create exchange thread: %s", cf_strerror(errno)); + } + + DEBUG("exchange module started"); + + EXCHANGE_UNLOCK(); + + exchange_external_event_publisher_start(); +} + +/* + * ---------------------------------------------------------------------------- + * Public API + * ---------------------------------------------------------------------------- + */ +/** + * Initialize exchange subsystem. + */ +void +as_exchange_init() +{ + exchange_init(); +} + +/** + * Start exchange subsystem. + */ +void +as_exchange_start() +{ + exchange_start(); +} + +/** + * Stop exchange subsystem. + */ +void +as_exchange_stop() +{ +} + +/** + * Register to receive cluster-changed events. + * TODO - may replace with simple static list someday. + */ +void +as_exchange_register_listener(as_exchange_cluster_changed_cb cb, void* udata) +{ + exchange_external_event_listener_register(cb, udata); +} + +/** + * Dump exchange state to log. + */ +void +as_exchange_dump(bool verbose) +{ + exchange_dump(CF_INFO, verbose); +} + +/** + * Member-access method. + */ +uint64_t +as_exchange_cluster_key() +{ + return (uint64_t)g_exchange.committed_cluster_key; +} + +/** + * Member-access method. + */ +uint32_t +as_exchange_cluster_size() +{ + return g_exchange.committed_cluster_size; +} + +/** + * Copy over the committed succession list. + * Ensure the input vector has enough capacity. + */ +void +as_exchange_succession(cf_vector* succession) +{ + EXCHANGE_LOCK(); + vector_copy(succession, &g_exchange.committed_succession_list); + EXCHANGE_UNLOCK(); +} + +/** + * Return the committed succession list. + */ +cf_node* +as_exchange_succession_unsafe() +{ + return vector_to_array(&g_exchange.committed_succession_list); +} + +/** + * Return the committed succession list as a string in a dyn-buf. + */ +void +as_exchange_info_get_succession(cf_dyn_buf* db) +{ + EXCHANGE_LOCK(); + + cf_node* nodes = vector_to_array(&g_exchange.committed_succession_list); + + for (uint32_t i = 0; i < g_exchange.committed_cluster_size; i++) { + cf_dyn_buf_append_uint64_x(db, nodes[i]); + cf_dyn_buf_append_char(db, ','); + } + + if (g_exchange.committed_cluster_size != 0) { + cf_dyn_buf_chomp(db); + } + + // Always succeeds. + cf_dyn_buf_append_string(db, "\nok"); + + EXCHANGE_UNLOCK(); +} + +/** + * Member-access method. + */ +cf_node +as_exchange_principal() +{ + return g_exchange.committed_principal; +} + +/** + * Lock before setting or getting exchanged info from non-exchange thread. + */ +void +as_exchange_info_lock() +{ + pthread_mutex_lock(&g_exchanged_info_lock); +} + +/** + * Unlock after setting or getting exchanged info from non-exchange thread. + */ +void +as_exchange_info_unlock() +{ + pthread_mutex_unlock(&g_exchanged_info_lock); +} diff --git a/as/src/fabric/fabric.c b/as/src/fabric/fabric.c new file mode 100644 index 00000000..3974acda --- /dev/null +++ b/as/src/fabric/fabric.c @@ -0,0 +1,2943 @@ +/* + * fabric.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+// Object Management:
+// ------------------
+//
+// Node and FC objects are reference counted. Correct bookkeeping of object
+// references is vital to system operation.
+//
+// Holders of FC references:
+// (1) node->fc_hash
+// (2) node->send_idle_fc_queue
+// (3) (epoll_event ev).data.ptr
+//
+// For sending, (2) and (3) are mutually exclusive.
+// Refs between (2) and (3) are passed virtually whenever possible, without
+// needing to explicitly call reserve/release.
+// (3) takes ref on rearm.
+// (3) gives ref to calling thread when epoll triggers, due to ONESHOT.
+// Thread will either rearm or give ref to (2). Never do both.
+//
+// FCs are created in two places: fabric_node_connect() and
+// run_fabric_accept().
+//
+// Holders of Node references:
+// * fc->node
+// * g_fabric.node_hash
+
+
+//==========================================================
+// Includes.
+//
+
+#include "fabric/fabric.h"
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_ll.h"
+#include "citrusleaf/cf_queue.h"
+#include "citrusleaf/cf_rchash.h"
+#include "citrusleaf/cf_vector.h"
+
+#include "fault.h"
+#include "msg.h"
+#include "node.h"
+#include "shash.h"
+#include "socket.h"
+#include "tls.h"
+
+#include "base/cfg.h"
+#include "base/stats.h"
+#include "fabric/endpoint.h"
+#include "fabric/hb.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+#define FABRIC_BUFFER_MEM_SZ (1024 * 1024) // bytes
+#define FABRIC_BUFFER_MAX_SZ (128 * 1024 * 1024) // used simply for validation
+#define FABRIC_EPOLL_SEND_EVENTS 16
+#define FABRIC_EPOLL_RECV_EVENTS 1
+
+typedef enum {
+	// These values go on the wire, so mind backward compatibility if changing.
+	FS_FIELD_NODE,
+	FS_UNUSED1, // used to be FS_ADDR
+	FS_UNUSED2, // used to be FS_PORT
+	FS_UNUSED3, // used to be FS_ANV
+	FS_UNUSED4, // used to be FS_ADDR_EX
+	FS_CHANNEL,
+
+	NUM_FS_FIELDS
+} fs_msg_fields;
+
+static const msg_template fabric_mt[] = {
+	{ FS_FIELD_NODE, M_FT_UINT64 },
+	{ FS_UNUSED1, M_FT_UINT32 },
+	{ FS_UNUSED2, M_FT_UINT32 },
+	{ FS_UNUSED3, M_FT_BUF },
+	{ FS_UNUSED4, M_FT_BUF },
+	{ FS_CHANNEL, M_FT_UINT32 },
+};
+
+COMPILER_ASSERT(sizeof(fabric_mt) / sizeof(msg_template) == NUM_FS_FIELDS);
+
+#define FS_MSG_SCRATCH_SIZE 128
+
+#define DEFAULT_EVENTS (EPOLLERR | EPOLLHUP | EPOLLRDHUP | EPOLLONESHOT)
+
+// Block size for allocating fabric hb plugin data.
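+// (Presumably allocations are sized in multiples of this block - e.g. a
+// 200-byte endpoint list would occupy a 256-byte, two-block buffer.)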
+#define HB_PLUGIN_DATA_BLOCK_SIZE 128 + +typedef struct fabric_recv_thread_pool_s { + cf_vector threads; + cf_poll poll; + uint32_t pool_id; +} fabric_recv_thread_pool; + +typedef struct send_entry_s { + struct send_entry_s *next; + uint32_t id; + uint32_t count; + cf_poll poll; +} send_entry; + +typedef struct fabric_state_s { + as_fabric_msg_fn msg_cb[M_TYPE_MAX]; + void *msg_udata[M_TYPE_MAX]; + + cf_queue msg_pool_queue[M_TYPE_MAX]; // a pool of reusable msgs + cf_vector fb_free; + + fabric_recv_thread_pool recv_pool[AS_FABRIC_N_CHANNELS]; + + pthread_mutex_t send_lock; + send_entry *sends; + send_entry *send_head; + + pthread_mutex_t node_hash_lock; + cf_rchash *node_hash; // key is cf_node, value is (fabric_node *) +} fabric_state; + +typedef struct fabric_buffer_s { + uint8_t *buf; + uint8_t *progress; + const uint8_t *end; + uint8_t membuf[FABRIC_BUFFER_MEM_SZ]; +} fabric_buffer; + +typedef struct fabric_node_s { + cf_node node_id; // remote node + bool live; // set to false on shutdown + uint32_t connect_count[AS_FABRIC_N_CHANNELS]; + bool connect_full; + + pthread_mutex_t connect_lock; + + pthread_mutex_t fc_hash_lock; + cf_shash *fc_hash; // key is (fabric_connection *), value unused + + pthread_mutex_t send_idle_fc_queue_lock; + cf_queue send_idle_fc_queue[AS_FABRIC_N_CHANNELS]; + + cf_queue send_queue[AS_FABRIC_N_CHANNELS]; + + uint8_t send_counts[]; +} fabric_node; + +typedef struct fabric_connection_s { + cf_socket sock; + cf_sock_addr peer; + fabric_node *node; + + bool failed; + bool started_via_connect; + + fabric_buffer s_buf; + msg *s_msg_in_progress; + size_t s_count; + + fabric_buffer *r_buf_in_progress; + uint32_t r_msg_size; + msg_type r_type; + uint64_t benchmark_time; + + // The send_ptr != NULL means that the FC's sock has registered with + // send_poll. This is needed because epoll's API doesn't allow registering + // a socket without event triggers (ERR and HUP are enabled even when + // unspecified). + send_entry *send_ptr; + fabric_recv_thread_pool *pool; + + uint64_t s_bytes; + uint64_t s_bytes_last; + uint64_t r_bytes; + uint64_t r_bytes_last; +} fabric_connection; + +typedef struct node_list_s { + uint32_t count; + cf_node nodes[AS_CLUSTER_SZ]; // must support the maximum cluster size. +} node_list; + +const char *CHANNEL_NAMES[] = { + [AS_FABRIC_CHANNEL_RW] = "rw", + [AS_FABRIC_CHANNEL_CTRL] = "ctrl", + [AS_FABRIC_CHANNEL_BULK] = "bulk", + [AS_FABRIC_CHANNEL_META] = "meta", +}; + +COMPILER_ASSERT(sizeof(CHANNEL_NAMES) / sizeof(const char *) == + AS_FABRIC_N_CHANNELS); + + +//========================================================== +// Globals. +// + +cf_serv_cfg g_fabric_bind = { .n_cfgs = 0 }; +cf_tls_info *g_fabric_tls; + +static fabric_state g_fabric; +static cf_poll g_accept_poll; + +static as_endpoint_list *g_published_endpoint_list; +static bool g_published_endpoint_list_ipv4_only; + +// Max connections formed via connect. Others are formed via accept. +static uint32_t g_fabric_connect_limit[AS_FABRIC_N_CHANNELS]; + + +//========================================================== +// Forward declarations. +// + +// Support functions. 
+static void send_entry_insert(send_entry **se_pp, send_entry *se); + +static void fabric_published_serv_cfg_fill(const cf_serv_cfg *bind_cfg, cf_serv_cfg *published_cfg, bool ipv4_only); +static bool fabric_published_endpoints_refresh(void); + +// fabric_node +static fabric_node *fabric_node_create(cf_node node_id); +static fabric_node *fabric_node_get(cf_node node_id); +static fabric_node *fabric_node_get_or_create(cf_node node_id); +static fabric_node *fabric_node_pop(cf_node node_id); +static int fabric_node_disconnect_reduce_fn(const void *key, void *data, void *udata); +static void fabric_node_disconnect(cf_node node_id); + +static fabric_connection *fabric_node_connect(fabric_node *node, uint32_t ch); +static int fabric_node_send(fabric_node *node, msg *m, as_fabric_channel channel); +static void fabric_node_connect_all(fabric_node *node); +static void fabric_node_destructor(void *pnode); +inline static void fabric_node_reserve(fabric_node *node); +inline static void fabric_node_release(fabric_node *node); +static bool fabric_node_add_connection(fabric_node *node, fabric_connection *fc); +static uint8_t fabric_node_find_min_send_count(const fabric_node *node); +static bool fabric_node_is_connect_full(const fabric_node *node); + +static int fabric_get_node_list_fn(const void *key, uint32_t keylen, void *data, void *udata); +static uint32_t fabric_get_node_list(node_list *nl); + +// fabric_buffer +static fabric_buffer *fabric_buffer_create(size_t sz); +static void fabric_buffer_init(fabric_buffer *fb, size_t sz); +static void fabric_buffer_destroy(fabric_buffer *fb); +inline static void fabric_buffer_free_extra(fabric_buffer *fb); +inline static bool fabric_buffer_resize(fabric_buffer *fb, size_t sz); + +// fabric_connection +fabric_connection *fabric_connection_create(cf_socket *sock, cf_sock_addr *peer); +static bool fabric_connection_accept_tls(fabric_connection *fc); +static bool fabric_connection_connect_tls(fabric_connection *fc); +inline static void fabric_connection_reserve(fabric_connection *fc); +static void fabric_connection_release(fabric_connection *fc); +inline static cf_node fabric_connection_get_id(const fabric_connection *fc); + +static void fabric_connection_send_assign(fabric_connection *fc); +static void fabric_connection_send_unassign(fabric_connection *fc); +inline static void fabric_connection_recv_rearm(fabric_connection *fc); +inline static void fabric_connection_send_rearm(fabric_connection *fc); +static void fabric_connection_disconnect(fabric_connection *fc); +static void fabric_connection_set_keepalive_options(fabric_connection *fc); + +static void fabric_connection_reroute_msg(fabric_connection *fc); +static void fabric_connection_send_progress(fabric_connection *fc, bool is_last); +static bool fabric_connection_process_writable(fabric_connection *fc); + +static bool fabric_connection_process_fabric_msg(fabric_connection *fc, const msg *m); +static bool fabric_connection_read_fabric_msg(fabric_connection *fc); + +static bool fabric_connection_process_msg(fabric_connection *fc, bool do_rearm); +static bool fabric_connection_process_readable(fabric_connection *fc); + +// fabric_recv_thread_pool +static void fabric_recv_thread_pool_init(fabric_recv_thread_pool *pool, uint32_t size, uint32_t pool_id); +static void fabric_recv_thread_pool_set_size(fabric_recv_thread_pool *pool, uint32_t size); +static void fabric_recv_thread_pool_add_fc(fabric_recv_thread_pool *pool, fabric_connection *fc); + +// fabric_endpoint +static bool fabric_endpoint_list_get(cf_node 
nodeid, as_endpoint_list *endpoint_list, size_t *endpoint_list_size); +static bool fabric_connect_endpoint_filter(const as_endpoint *endpoint, void *udata); + +// Thread functions. +static void *run_fabric_recv(void *arg); +static void run_fabric_recv_cleanup(void *arg); +static void *run_fabric_send(void *arg); +static void *run_fabric_accept(void *arg); + +// Ticker helpers. +static int fabric_rate_node_reduce_fn(const void *key, uint32_t keylen, void *data, void *udata); +static int fabric_rate_fc_reduce_fn(const void *key, void *data, void *udata); + +// Heartbeat. +static void fabric_hb_plugin_set_fn(msg *m); +static void fabric_hb_plugin_parse_data_fn(msg *m, cf_node source, as_hb_plugin_node_data *plugin_data); +static void fabric_heartbeat_event(int nevents, as_hb_event_node *events, void *udata); + + +//========================================================== +// Public API. +// + +//------------------------------------------------ +// msg +// + +msg * +as_fabric_msg_get(msg_type type) +{ + if (type >= M_TYPE_MAX) { + return NULL; + } + + msg *m = NULL; + + if (cf_queue_pop(&g_fabric.msg_pool_queue[type], &m, CF_QUEUE_NOWAIT) != + CF_QUEUE_OK) { + m = msg_create(type); + } + else { + msg_incr_ref(m); + } + + return m; +} + +void +as_fabric_msg_put(msg *m) +{ + int cnt = cf_rc_release(m); + + if (cnt == 0) { + msg_reset(m); + + if (cf_queue_sz(&g_fabric.msg_pool_queue[m->type]) > 128) { + msg_put(m); + } + else { + cf_queue_push(&g_fabric.msg_pool_queue[m->type], &m); + } + } + else if (cnt < 0) { + msg_dump(m, "extra put"); + cf_crash(AS_FABRIC, "extra put for msg type %d", m->type); + } +} + +// Log information about existing "msg" objects and queues. +void +as_fabric_msg_queue_dump() +{ + cf_info(AS_FABRIC, "All currently-existing msg types:"); + + int total_q_sz = 0; + int total_alloced_msgs = 0; + + for (int i = 0; i < M_TYPE_MAX; i++) { + int q_sz = cf_queue_sz(&g_fabric.msg_pool_queue[i]); + int num_of_type = cf_atomic_int_get(g_num_msgs_by_type[i]); + + total_alloced_msgs += num_of_type; + + if (q_sz || num_of_type) { + cf_info(AS_FABRIC, "|msgq[%d]| = %d ; alloc'd = %d", i, q_sz, num_of_type); + total_q_sz += q_sz; + } + } + + int num_msgs = cf_atomic_int_get(g_num_msgs); + + if (abs(num_msgs - total_alloced_msgs) > 2) { + cf_warning(AS_FABRIC, "num msgs (%d) != total alloc'd msgs (%d)", num_msgs, total_alloced_msgs); + } + + cf_info(AS_FABRIC, "Total num. msgs = %d ; Total num. queued = %d ; Delta = %d", num_msgs, total_q_sz, num_msgs - total_q_sz); +} + +//------------------------------------------------ +// as_fabric +// + +int +as_fabric_init() +{ + for (uint32_t i = 0; i < AS_FABRIC_N_CHANNELS; i++) { + g_fabric_connect_limit[i] = g_config.n_fabric_channel_fds[i]; + + fabric_recv_thread_pool_init(&g_fabric.recv_pool[i], + g_config.n_fabric_channel_recv_threads[i], i); + } + + pthread_mutex_init(&g_fabric.send_lock, 0); + + as_fabric_register_msg_fn(M_TYPE_FABRIC, fabric_mt, sizeof(fabric_mt), + FS_MSG_SCRATCH_SIZE, NULL, NULL); + + pthread_mutex_init(&g_fabric.node_hash_lock, 0); + + cf_rchash_create(&g_fabric.node_hash, cf_nodeid_rchash_fn, + fabric_node_destructor, sizeof(cf_node), 128, 0); + + for (int i = 0; i < M_TYPE_MAX; i++) { + cf_queue_init(&g_fabric.msg_pool_queue[i], sizeof(msg *), + CF_QUEUE_ALLOCSZ, true); + } + + cf_vector_init(&g_fabric.fb_free, sizeof(fabric_buffer *), 64, + VECTOR_FLAG_BIGLOCK); + + g_published_endpoint_list = NULL; + g_published_endpoint_list_ipv4_only = cf_ip_addr_legacy_only(); + + if (! 
fabric_published_endpoints_refresh()) {
+		cf_crash(AS_FABRIC, "error creating fabric published endpoint list");
+	}
+
+	as_hb_plugin fabric_plugin;
+
+	memset(&fabric_plugin, 0, sizeof(fabric_plugin));
+	fabric_plugin.id = AS_HB_PLUGIN_FABRIC;
+	fabric_plugin.wire_size_fixed = 0; // includes the size for the protocol version
+	as_endpoint_list_sizeof(g_published_endpoint_list,
+			&fabric_plugin.wire_size_fixed);
+	fabric_plugin.wire_size_per_node = 0; // size per node in succession list
+	fabric_plugin.set_fn = fabric_hb_plugin_set_fn;
+	fabric_plugin.parse_fn = fabric_hb_plugin_parse_data_fn;
+	fabric_plugin.change_listener = NULL;
+	as_hb_plugin_register(&fabric_plugin);
+
+	as_hb_register_listener(fabric_heartbeat_event, &g_fabric);
+
+	as_fabric_transact_init();
+
+	return 0;
+}
+
+int
+as_fabric_start()
+{
+	pthread_t thread;
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	g_fabric.sends =
+			cf_malloc(sizeof(send_entry) * g_config.n_fabric_send_threads);
+	g_fabric.send_head = g_fabric.sends;
+
+	cf_info(AS_FABRIC, "starting %u fabric send threads", g_config.n_fabric_send_threads);
+
+	for (int i = 0; i < g_config.n_fabric_send_threads; i++) {
+		cf_poll_create(&g_fabric.sends[i].poll);
+		g_fabric.sends[i].id = i;
+		g_fabric.sends[i].count = 0;
+		g_fabric.sends[i].next = g_fabric.sends + i + 1;
+
+		if (pthread_create(&thread, &attrs, run_fabric_send,
+				&g_fabric.sends[i]) != 0) {
+			cf_crash(AS_FABRIC, "could not create fabric send thread");
+		}
+	}
+
+	g_fabric.sends[g_config.n_fabric_send_threads - 1].next = NULL;
+
+	for (uint32_t i = 0; i < AS_FABRIC_N_CHANNELS; i++) {
+		cf_info(AS_FABRIC, "starting %u fabric %s channel recv threads", g_config.n_fabric_channel_recv_threads[i], CHANNEL_NAMES[i]);
+
+		fabric_recv_thread_pool_set_size(&g_fabric.recv_pool[i],
+				g_config.n_fabric_channel_recv_threads[i]);
+	}
+
+	cf_info(AS_FABRIC, "starting fabric accept thread");
+
+	if (pthread_create(&thread, &attrs, run_fabric_accept, NULL) != 0) {
+		cf_crash(AS_FABRIC, "could not create fabric accept thread");
+	}
+
+	return 0;
+}
+
+void
+as_fabric_set_recv_threads(as_fabric_channel channel, uint32_t count)
+{
+	g_config.n_fabric_channel_recv_threads[channel] = count;
+
+	fabric_recv_thread_pool_set_size(&g_fabric.recv_pool[channel], count);
+}
+
+int
+as_fabric_send(cf_node node_id, msg *m, as_fabric_channel channel)
+{
+	m->benchmark_time = g_config.fabric_benchmarks_enabled ? cf_getns() : 0;
+
+	if (g_config.self_node == node_id) {
+		cf_assert(g_fabric.msg_cb[m->type], AS_FABRIC, "m->type %d not registered", m->type);
+		(g_fabric.msg_cb[m->type])(node_id, m, g_fabric.msg_udata[m->type]);
+
+		return AS_FABRIC_SUCCESS;
+	}
+
+	fabric_node *node = fabric_node_get(node_id);
+	int ret = fabric_node_send(node, m, channel);
+
+	if (node) {
+		fabric_node_release(node); // from fabric_node_get
+	}
+
+	return ret;
+}
+
+int
+as_fabric_send_list(const cf_node *nodes, uint32_t node_count, msg *m,
+		as_fabric_channel channel)
+{
+	if (! nodes) {
+		node_list nl;
+
+		fabric_get_node_list(&nl);
+		return as_fabric_send_list(nl.nodes, nl.count, m, channel);
+	}
+
+	int ret = AS_FABRIC_SUCCESS;
+
+	for (uint32_t i = 0; i < node_count; i++) {
+		msg_incr_ref(m);
+
+		if ((ret = as_fabric_send(nodes[i], m, channel)) != AS_FABRIC_SUCCESS) {
+			// Leave the reference for the sake of the caller.
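+			// (The msg_incr_ref() above was not consumed by the failed
+			// send, so after the main reference is released below the
+			// caller is still left holding one reference to m.)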
+ break; + } + } + + as_fabric_msg_put(m); // release main reference + + return ret; +} + +// TODO - make static registration +void +as_fabric_register_msg_fn(msg_type type, const msg_template *mt, size_t mt_sz, + size_t scratch_sz, as_fabric_msg_fn msg_cb, void *msg_udata) +{ + msg_type_register(type, mt, mt_sz, scratch_sz); + + g_fabric.msg_cb[type] = msg_cb; + g_fabric.msg_udata[type] = msg_udata; +} + +void +as_fabric_info_peer_endpoints_get(cf_dyn_buf *db) +{ + node_list nl; + fabric_get_node_list(&nl); + + for (uint32_t i = 0; i < nl.count; i++) { + if (nl.nodes[i] == g_config.self_node) { + continue; + } + + fabric_node *node = fabric_node_get(nl.nodes[i]); + + if (! node) { + cf_info(AS_FABRIC, "\tnode %lx not found in hash although reported available", nl.nodes[i]); + continue; + } + + size_t endpoint_list_capacity = 1024; + bool retry = true; + + while (true) { + uint8_t stack_mem[endpoint_list_capacity]; + as_endpoint_list *endpoint_list = (as_endpoint_list *)stack_mem; + + if (! fabric_endpoint_list_get(node->node_id, endpoint_list, + &endpoint_list_capacity)) { + if (errno == ENOENT) { + // No entry present for this node in heartbeat. + cf_detail(AS_FABRIC, "could not get endpoint list for %lx", node->node_id); + break; + } + + if (! retry) { + break; + } + + retry = false; + continue; + } + + cf_dyn_buf_append_string(db, "fabric.peer="); + cf_dyn_buf_append_string(db, "node-id="); + cf_dyn_buf_append_uint64_x(db, node->node_id); + cf_dyn_buf_append_string(db, ":"); + as_endpoint_list_info(endpoint_list, db); + cf_dyn_buf_append_string(db, ";"); + break; + } + + fabric_node_release(node); + } +} + +bool +as_fabric_is_published_endpoint_list(const as_endpoint_list *list) +{ + return as_endpoint_lists_are_equal(g_published_endpoint_list, list); +} + +// Used by heartbeat subsystem only, for duplicate node-id detection. +as_endpoint_list * +as_fabric_hb_plugin_get_endpoint_list(as_hb_plugin_node_data *plugin_data) +{ + return (plugin_data && plugin_data->data_size != 0) ? + (as_endpoint_list *)plugin_data->data : NULL; +} + +void +as_fabric_rate_capture(fabric_rate *rate) +{ + pthread_mutex_lock(&g_fabric.node_hash_lock); + cf_rchash_reduce(g_fabric.node_hash, fabric_rate_node_reduce_fn, rate); + pthread_mutex_unlock(&g_fabric.node_hash_lock); +} + +void +as_fabric_dump(bool verbose) +{ + node_list nl; + fabric_get_node_list(&nl); + + cf_info(AS_FABRIC, " Fabric Dump: nodes known %d", nl.count); + + for (uint32_t i = 0; i < nl.count; i++) { + if (nl.nodes[i] == g_config.self_node) { + cf_info(AS_FABRIC, "\tnode %lx is self", nl.nodes[i]); + continue; + } + + fabric_node *node = fabric_node_get(nl.nodes[i]); + + if (! node) { + cf_info(AS_FABRIC, "\tnode %lx not found in hash although reported available", nl.nodes[i]); + continue; + } + + pthread_mutex_lock(&node->fc_hash_lock); + cf_info(AS_FABRIC, "\tnode %lx fds {via_connect={h=%d m=%d l=%d} all=%d} live %d q {h=%d m=%d l=%d}", + node->node_id, + node->connect_count[AS_FABRIC_CHANNEL_CTRL], + node->connect_count[AS_FABRIC_CHANNEL_RW], + node->connect_count[AS_FABRIC_CHANNEL_BULK], + cf_shash_get_size(node->fc_hash), node->live, + cf_queue_sz(&node->send_queue[AS_FABRIC_CHANNEL_CTRL]), + cf_queue_sz(&node->send_queue[AS_FABRIC_CHANNEL_RW]), + cf_queue_sz(&node->send_queue[AS_FABRIC_CHANNEL_BULK])); + pthread_mutex_unlock(&node->fc_hash_lock); + + fabric_node_release(node); // node_get + } +} + + +//========================================================== +// Support functions. 
+// + +static void +send_entry_insert(send_entry **se_pp, send_entry *se) +{ + while (*se_pp && se->count > (*se_pp)->count) { + se_pp = &(*se_pp)->next; + } + + se->next = *se_pp; + *se_pp = se; +} + +// Get addresses to publish as serv config. Expand "any" addresses. +static void +fabric_published_serv_cfg_fill(const cf_serv_cfg *bind_cfg, + cf_serv_cfg *published_cfg, bool ipv4_only) +{ + cf_serv_cfg_init(published_cfg); + + cf_sock_cfg sock_cfg; + + for (int i = 0; i < bind_cfg->n_cfgs; i++) { + cf_sock_cfg_copy(&bind_cfg->cfgs[i], &sock_cfg); + + // Expand "any" address to all interfaces. + if (cf_ip_addr_is_any(&sock_cfg.addr)) { + cf_ip_addr all_addrs[CF_SOCK_CFG_MAX]; + uint32_t n_all_addrs = CF_SOCK_CFG_MAX; + + if (cf_inter_get_addr_all(all_addrs, &n_all_addrs) != 0) { + cf_warning(AS_FABRIC, "error getting all interface addresses"); + n_all_addrs = 0; + } + + for (int j = 0; j < n_all_addrs; j++) { + // Skip local address if any is specified. + if (cf_ip_addr_is_local(&all_addrs[j]) || + (ipv4_only && ! cf_ip_addr_is_legacy(&all_addrs[j]))) { + continue; + } + + cf_ip_addr_copy(&all_addrs[j], &sock_cfg.addr); + + if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) { + cf_crash(AS_FABRIC, "error initializing published address list"); + } + } + } + else { + if (ipv4_only && ! cf_ip_addr_is_legacy(&bind_cfg->cfgs[i].addr)) { + continue; + } + + if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) { + cf_crash(AS_FABRIC, "error initializing published address list"); + } + } + } +} + +// Refresh the fabric published endpoint list. +// Return true on success. +static bool +fabric_published_endpoints_refresh() +{ + if (g_published_endpoint_list && + g_published_endpoint_list_ipv4_only == cf_ip_addr_legacy_only()) { + return true; + } + + // The global flag has changed, refresh the published address list. + if (g_published_endpoint_list) { + // Free the obsolete list. 
+ cf_free(g_published_endpoint_list); + } + + cf_serv_cfg published_cfg; + fabric_published_serv_cfg_fill(&g_fabric_bind, &published_cfg, + g_published_endpoint_list_ipv4_only); + + g_published_endpoint_list = as_endpoint_list_from_serv_cfg(&published_cfg); + cf_assert(g_published_endpoint_list, AS_FABRIC, "error initializing mesh published address list"); + + g_published_endpoint_list_ipv4_only = cf_ip_addr_legacy_only(); + + if (g_published_endpoint_list->n_endpoints == 0) { + if (g_published_endpoint_list_ipv4_only) { + cf_warning(AS_FABRIC, "no IPv4 addresses configured for fabric"); + } + else { + cf_warning(AS_FABRIC, "no addresses configured for fabric"); + } + + return false; + } + + char endpoint_list_str[512]; + as_endpoint_list_to_string(g_published_endpoint_list, endpoint_list_str, + sizeof(endpoint_list_str)); + + cf_info(AS_FABRIC, "updated fabric published address list to {%s}", endpoint_list_str); + + return true; +} + + +//========================================================== +// fabric_node +// + +static fabric_node * +fabric_node_create(cf_node node_id) +{ + size_t size = sizeof(fabric_node) + + (sizeof(uint8_t) * g_config.n_fabric_send_threads); + fabric_node *node = cf_rc_alloc(size); + + memset(node, 0, size); + + node->node_id = node_id; + node->live = true; + + if (pthread_mutex_init(&node->send_idle_fc_queue_lock, NULL) != 0) { + cf_crash(AS_FABRIC, "fabric_node_create(%lx) failed to init send_idle_fc_queue_lock", node_id); + } + + for (int i = 0; i < AS_FABRIC_N_CHANNELS; i++) { + cf_queue_init(&node->send_idle_fc_queue[i], sizeof(fabric_connection *), + CF_QUEUE_ALLOCSZ, false); + + cf_queue_init(&node->send_queue[i], sizeof(msg *), CF_QUEUE_ALLOCSZ, + true); + } + + if (pthread_mutex_init(&node->connect_lock, NULL) != 0) { + cf_crash(AS_FABRIC, "fabric_node_create(%lx) failed to init connect_lock", node_id); + } + + if (pthread_mutex_init(&node->fc_hash_lock, NULL) != 0) { + cf_crash(AS_FABRIC, "fabric_node_create(%lx) failed to init fc_hash_lock", node_id); + } + + node->fc_hash = cf_shash_create(cf_shash_fn_ptr, + sizeof(fabric_connection *), 0, 32, 0); + + cf_detail(AS_FABRIC, "fabric_node_create(%lx) node %p", node_id, node); + + return node; +} + +static fabric_node * +fabric_node_get(cf_node node_id) +{ + fabric_node *node; + + pthread_mutex_lock(&g_fabric.node_hash_lock); + int rv = cf_rchash_get(g_fabric.node_hash, &node_id, sizeof(cf_node), + (void **)&node); + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + if (rv != CF_RCHASH_OK) { + return NULL; + } + + return node; +} + +static fabric_node * +fabric_node_get_or_create(cf_node node_id) +{ + fabric_node *node; + + pthread_mutex_lock(&g_fabric.node_hash_lock); + + if (cf_rchash_get(g_fabric.node_hash, &node_id, sizeof(cf_node), + (void **)&node) == CF_RCHASH_OK) { + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + fabric_node_connect_all(node); + + return node; + } + + node = fabric_node_create(node_id); + + if (cf_rchash_put_unique(g_fabric.node_hash, &node_id, sizeof(cf_node), + node) != CF_RCHASH_OK) { + cf_crash(AS_FABRIC, "fabric_node_get_or_create(%lx)", node_id); + } + + fabric_node_reserve(node); // for return + + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + fabric_node_connect_all(node); + + return node; +} + +static fabric_node * +fabric_node_pop(cf_node node_id) +{ + fabric_node *node = NULL; + + pthread_mutex_lock(&g_fabric.node_hash_lock); + + if (cf_rchash_get(g_fabric.node_hash, &node_id, sizeof(cf_node), + (void **)&node) == CF_RCHASH_OK) { + if 
(cf_rchash_delete(g_fabric.node_hash, &node_id, sizeof(node_id)) != + CF_RCHASH_OK) { + cf_crash(AS_FABRIC, "fabric_node_pop(%lx)", node_id); + } + } + + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + return node; +} + +static int +fabric_node_disconnect_reduce_fn(const void *key, void *data, void *udata) +{ + fabric_connection *fc = *(fabric_connection **)key; + + cf_assert(fc, AS_FABRIC, "fc == NULL, don't put NULLs into fc_hash"); + cf_socket_shutdown(&fc->sock); + fabric_connection_release(fc); // for delete from node->fc_hash + + return CF_SHASH_REDUCE_DELETE; +} + +static void +fabric_node_disconnect(cf_node node_id) +{ + fabric_node *node = fabric_node_pop(node_id); + + if (! node) { + cf_warning(AS_FABRIC, "fabric_node_disconnect(%lx) not connected", node_id); + return; + } + + cf_info(AS_FABRIC, "fabric_node_disconnect(%lx)", node_id); + + pthread_mutex_lock(&node->fc_hash_lock); + + node->live = false; + // Clean up all fc's attached to this node. + cf_shash_reduce(node->fc_hash, fabric_node_disconnect_reduce_fn, NULL); + + pthread_mutex_unlock(&node->fc_hash_lock); + + pthread_mutex_lock(&node->send_idle_fc_queue_lock); + + for (int i = 0; i < AS_FABRIC_N_CHANNELS; i++) { + while (true) { + fabric_connection *fc; + + int rv = cf_queue_pop(&node->send_idle_fc_queue[i], &fc, + CF_QUEUE_NOWAIT); + + if (rv != CF_QUEUE_OK) { + break; + } + + fabric_connection_send_unassign(fc); + fabric_connection_release(fc); + } + } + + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + + fabric_node_release(node); // from fabric_node_pop() +} + +static fabric_connection * +fabric_node_connect(fabric_node *node, uint32_t ch) +{ + cf_detail(AS_FABRIC, "fabric_node_connect(%p, %u)", node, ch); + + pthread_mutex_lock(&node->connect_lock); + + uint32_t fds = node->connect_count[ch] + 1; + + if (fds > g_fabric_connect_limit[ch]) { + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + cf_socket sock; + cf_sock_addr addr; + size_t endpoint_list_capacity = 1024; + int tries_remaining = 3; + + while (tries_remaining--) { + uint8_t endpoint_list_mem[endpoint_list_capacity]; + as_endpoint_list *endpoint_list = (as_endpoint_list *)endpoint_list_mem; + + if (fabric_endpoint_list_get(node->node_id, endpoint_list, + &endpoint_list_capacity)) { + char endpoint_list_str[1024]; + + as_endpoint_list_to_string(endpoint_list, endpoint_list_str, + sizeof(endpoint_list_str)); + cf_detail(AS_FABRIC, "fabric_node_connect(%p, %u) node_id %lx with endpoints {%s}", node, ch, node->node_id, endpoint_list_str); + + // Initiate connect to the remote endpoint. + const as_endpoint *connected_endpoint = as_endpoint_connect_any( + endpoint_list, fabric_connect_endpoint_filter, NULL, 0, + &sock); + + if (! connected_endpoint) { + cf_detail(AS_FABRIC, "fabric_node_connect(%p, %u) node_id %lx failed for endpoints {%s}", node, ch, node->node_id, endpoint_list_str); + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + as_endpoint_to_sock_addr(connected_endpoint, &addr); + + if (as_endpoint_capability_is_supported(connected_endpoint, + AS_ENDPOINT_TLS_MASK)) { + tls_socket_prepare_client(g_fabric_tls, &sock); + } + + break; // read success + } + + if (errno == ENOENT) { + // No entry present for this node in heartbeat. + cf_detail(AS_FABRIC, "fabric_node_connect(%p, %u) unknown remote node %lx", node, ch, node->node_id); + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + // The list capacity was not enough. Retry with suggested list size. 
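+		// (fabric_endpoint_list_get() updated endpoint_list_capacity, so the
+		// stack buffer declared at the top of this loop grows on the retry.)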
+ } + + if (tries_remaining < 0) { + cf_warning(AS_FABRIC,"fabric_node_connect(%p, %u) List get error for remote node %lx", node, ch, node->node_id); + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + msg *m = as_fabric_msg_get(M_TYPE_FABRIC); + + cf_atomic64_incr(&g_stats.fabric_connections_opened); + msg_set_uint64(m, FS_FIELD_NODE, g_config.self_node); + msg_set_uint32(m, FS_CHANNEL, ch); + m->benchmark_time = g_config.fabric_benchmarks_enabled ? cf_getns() : 0; + + fabric_connection *fc = fabric_connection_create(&sock, &addr); + + fc->s_msg_in_progress = m; + fc->started_via_connect = true; + fc->pool = &g_fabric.recv_pool[ch]; + + if (! fabric_node_add_connection(node, fc)) { + fabric_connection_release(fc); + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + node->connect_count[ch]++; + node->connect_full = fabric_node_is_connect_full(node); + + pthread_mutex_unlock(&node->connect_lock); + + return fc; +} + +static int +fabric_node_send(fabric_node *node, msg *m, as_fabric_channel channel) +{ + if (! node || ! node->live) { + return AS_FABRIC_ERR_NO_NODE; + } + + while (true) { + // Sync with fabric_connection_process_writable() to avoid non-empty + // send_queue with every fc being in send_idle_fc_queue. + pthread_mutex_lock(&node->send_idle_fc_queue_lock); + + fabric_connection *fc; + int rv = cf_queue_pop(&node->send_idle_fc_queue[(int)channel], &fc, + CF_QUEUE_NOWAIT); + + if (rv != CF_QUEUE_OK) { + cf_queue_push(&node->send_queue[(int)channel], &m); + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + + if (! node->connect_full) { + fabric_node_connect_all(node); + } + + break; + } + + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + + if ((! cf_socket_exists(&fc->sock)) || fc->failed) { + fabric_connection_release(fc); // send_idle_fc_queue + continue; + } + + fc->s_msg_in_progress = m; + + // Wake up. + if (fc->send_ptr) { + fabric_connection_send_rearm(fc); // takes fc ref + } + else { + fabric_connection_send_assign(fc); // takes fc ref + } + + break; + } + + return AS_FABRIC_SUCCESS; +} + +static void +fabric_node_connect_all(fabric_node *node) +{ + if (! node->live) { + return; + } + + for (uint32_t ch = 0; ch < AS_FABRIC_N_CHANNELS; ch++) { + uint32_t n = g_fabric_connect_limit[ch] - node->connect_count[ch]; + + for (uint32_t i = 0; i < n; i++) { + fabric_connection *fc = fabric_node_connect(node, ch); + + if (! fc) { + break; + } + + // TLS connections are one-way. Outgoing connections are for + // outgoing data. + if (fc->sock.state == CF_SOCKET_STATE_NON_TLS) { + fabric_recv_thread_pool_add_fc(&g_fabric.recv_pool[ch], fc); + cf_detail(AS_FABRIC, "{%16lX, %u} activated", fabric_connection_get_id(fc), fc->sock.fd); + } + + // Takes the remaining ref for send_poll and idle queue. + fabric_connection_send_assign(fc); + } + } +} + +static void +fabric_node_destructor(void *pnode) +{ + fabric_node *node = (fabric_node *)pnode; + cf_detail(AS_FABRIC, "fabric_node_destructor(%p)", node); + + for (int i = 0; i < AS_FABRIC_N_CHANNELS; i++) { + // send_idle_fc_queue section. + cf_assert(cf_queue_sz(&node->send_idle_fc_queue[i]) == 0, AS_FABRIC, "send_idle_fc_queue not empty as expected"); + cf_queue_destroy(&node->send_idle_fc_queue[i]); + + // send_queue section. 
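+		// Drain and release any msgs still queued on this channel.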
+ while (true) { + msg *m; + + if (cf_queue_pop(&node->send_queue[i], &m, CF_QUEUE_NOWAIT) != + CF_QUEUE_OK) { + break; + } + + as_fabric_msg_put(m); + } + + cf_queue_destroy(&node->send_queue[i]); + } + + pthread_mutex_destroy(&node->send_idle_fc_queue_lock); + + // connection_hash section. + cf_assert(cf_shash_get_size(node->fc_hash) == 0, AS_FABRIC, "fc_hash not empty as expected"); + cf_shash_destroy(node->fc_hash); + + pthread_mutex_destroy(&node->fc_hash_lock); +} + +inline static void +fabric_node_reserve(fabric_node *node) { + cf_rc_reserve(node); +} + +inline static void +fabric_node_release(fabric_node *node) +{ + int cnt = cf_rc_release(node); + + if (cnt == 0) { + fabric_node_destructor(node); + cf_rc_free(node); + } + else if (cnt < 0) { + cf_crash(AS_FABRIC, "fabric_node_release(%p) extra call", node); + } +} + +static bool +fabric_node_add_connection(fabric_node *node, fabric_connection *fc) +{ + pthread_mutex_lock(&node->fc_hash_lock); + + if (! node->live) { + pthread_mutex_unlock(&node->fc_hash_lock); + return false; + } + + fabric_node_reserve(node); + fc->node = node; + + fabric_connection_set_keepalive_options(fc); + fabric_connection_reserve(fc); // for put into node->fc_hash + + uint8_t value = 0; + int rv = cf_shash_put_unique(node->fc_hash, &fc, &value); + + cf_assert(rv == CF_SHASH_OK, AS_FABRIC, "fabric_node_add_connection(%p, %p) failed to add with rv %d", node, fc, rv); + + pthread_mutex_unlock(&node->fc_hash_lock); + + return true; +} + +static uint8_t +fabric_node_find_min_send_count(const fabric_node *node) +{ + uint8_t min = node->send_counts[0]; + + for (uint32_t i = 1; i < g_config.n_fabric_send_threads; i++) { + if (node->send_counts[i] < min) { + min = node->send_counts[i]; + } + } + + return min; +} + +static bool +fabric_node_is_connect_full(const fabric_node *node) +{ + for (int ch = 0; ch < AS_FABRIC_N_CHANNELS; ch++) { + if (node->connect_count[ch] < g_fabric_connect_limit[ch]) { + return false; + } + } + + return true; +} + + +static int +fabric_get_node_list_fn(const void *key, uint32_t keylen, void *data, + void *udata) +{ + node_list *nl = (node_list *)udata; + + if (nl->count == AS_CLUSTER_SZ) { + return 0; + } + + nl->nodes[nl->count] = *(const cf_node *)key; + nl->count++; + + return 0; +} + +// Get a list of all the nodes - use a dynamic array, which requires inline. 
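+// A minimal (hypothetical) caller sketch:
+//
+//    node_list nl;
+//    fabric_get_node_list(&nl); // nl.nodes[0] is always the local node
+//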
+static uint32_t +fabric_get_node_list(node_list *nl) +{ + nl->count = 1; + nl->nodes[0] = g_config.self_node; + + pthread_mutex_lock(&g_fabric.node_hash_lock); + cf_rchash_reduce(g_fabric.node_hash, fabric_get_node_list_fn, nl); + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + return nl->count; +} + + +//========================================================== +// fabric_buffer +// + +static fabric_buffer * +fabric_buffer_create(size_t sz) +{ + fabric_buffer *fb; + + if (cf_vector_pop(&g_fabric.fb_free, &fb) != 0) { + fb = cf_malloc(sizeof(fabric_buffer)); + } + + fabric_buffer_init(fb, sz); + + return fb; +} + +static void +fabric_buffer_init(fabric_buffer *fb, size_t sz) +{ + if (sz > FABRIC_BUFFER_MEM_SZ) { + fb->buf = (uint8_t *)cf_malloc(sz); + } + else { + fb->buf = fb->membuf; + } + + fb->progress = fb->buf; + fb->end = fb->buf + sz; +} + +static void +fabric_buffer_destroy(fabric_buffer *fb) +{ + fabric_buffer_free_extra(fb); + + if (cf_vector_size(&g_fabric.fb_free) > 64) { + cf_free(fb); + } + else if (cf_vector_append(&g_fabric.fb_free, &fb) != 0) { + cf_crash(AS_FABRIC, "push into %p failed on fb %p", &g_fabric.fb_free, fb); + } +} + +inline static void +fabric_buffer_free_extra(fabric_buffer *fb) +{ + if (fb->buf != fb->membuf) { + cf_free(fb->buf); + } +} + +// Resize fb after we know the msg_size. +inline static bool +fabric_buffer_resize(fabric_buffer *fb, size_t sz) +{ + if (sz > FABRIC_BUFFER_MEM_SZ) { + if (sz > FABRIC_BUFFER_MAX_SZ) { + return false; + } + + cf_assert(fb->buf == fb->membuf, AS_FABRIC, "function misuse"); + + size_t old_sz = fb->progress - fb->membuf; + + fb->buf = (uint8_t *)cf_malloc(sz); + + memcpy(fb->buf, fb->membuf, old_sz); + fb->progress = fb->buf + old_sz; + } + + fb->end = fb->buf + sz; + return true; +} + + +//========================================================== +// fabric_connection +// + +fabric_connection * +fabric_connection_create(cf_socket *sock, cf_sock_addr *peer) +{ + fabric_connection *fc = cf_rc_alloc(sizeof(fabric_connection)); + + memset(fc, 0, sizeof(fabric_connection)); + + cf_socket_copy(sock, &fc->sock); + cf_sock_addr_copy(peer, &fc->peer); + + fc->r_buf_in_progress = fabric_buffer_create(sizeof(msg_hdr)); + fc->r_type = M_TYPE_FABRIC; + + return fc; +} + +static bool +fabric_connection_accept_tls(fabric_connection *fc) +{ + int32_t tls_ev = tls_socket_accept(&fc->sock); + + if (tls_ev == EPOLLERR) { + cf_warning(AS_FABRIC, "fabric TLS server handshake with %s failed", cf_sock_addr_print(&fc->peer)); + return false; + } + + if (tls_ev == 0) { + tls_socket_must_not_have_data(&fc->sock, "fabric server handshake"); + tls_ev = EPOLLIN; + } + + cf_poll_modify_socket(g_accept_poll, &fc->sock, + tls_ev | EPOLLERR | EPOLLHUP | EPOLLRDHUP, fc); + return true; +} + +static bool +fabric_connection_connect_tls(fabric_connection *fc) +{ + int32_t tls_ev = tls_socket_connect(&fc->sock); + + if (tls_ev == EPOLLERR) { + cf_warning(AS_FABRIC, "fabric TLS client handshake with %s failed", cf_sock_addr_print(&fc->peer)); + return false; + } + + if (tls_ev == 0) { + tls_socket_must_not_have_data(&fc->sock, "fabric client handshake"); + tls_ev = EPOLLOUT; + } + + cf_poll_modify_socket(fc->send_ptr->poll, &fc->sock, + tls_ev | DEFAULT_EVENTS, fc); + return true; +} + +inline static void +fabric_connection_reserve(fabric_connection *fc) +{ + cf_rc_reserve(fc); +} + +static void +fabric_connection_release(fabric_connection *fc) +{ + int cnt = cf_rc_release(fc); + + if (cnt == 0) { + if (fc->s_msg_in_progress) { + // First message 
(s_count == 0) is initial M_TYPE_FABRIC message + // and does not need to be saved. + if (! fc->started_via_connect || fc->s_count != 0) { + cf_queue_push(&fc->node->send_queue[fc->pool->pool_id], + &fc->s_msg_in_progress); + } + else { + as_fabric_msg_put(fc->s_msg_in_progress); + } + } + + if (fc->node) { + fabric_node_release(fc->node); + fc->node = NULL; + } + else { + cf_detail(AS_FABRIC, "releasing fc %p not attached to a node", fc); + } + + cf_socket_close(&fc->sock); + cf_socket_term(&fc->sock); + cf_atomic64_incr(&g_stats.fabric_connections_closed); + + fabric_buffer_destroy(fc->r_buf_in_progress); + fabric_buffer_free_extra(&fc->s_buf); + + cf_rc_free(fc); + } + else if (cnt < 0) { + cf_crash(AS_FABRIC, "extra fabric_connection_release %p", fc); + } +} + +inline static cf_node +fabric_connection_get_id(const fabric_connection *fc) +{ + if (fc->node) { + return fc->node->node_id; + } + + return 0; +} + +// epoll takes the reference of fc. +static void +fabric_connection_send_assign(fabric_connection *fc) +{ + pthread_mutex_lock(&g_fabric.send_lock); + + send_entry **pp = &g_fabric.send_head; + uint8_t min = fabric_node_find_min_send_count(fc->node); + + while (true) { + uint32_t send_id = (*pp)->id; + + if (fc->node->send_counts[send_id] == min) { + break; + } + + cf_assert((*pp)->next, AS_FABRIC, "fabric_connection_send_assign() invalid send_count state"); + + pp = &(*pp)->next; + } + + send_entry *se = *pp; + + se->count++; + fc->node->send_counts[se->id]++; + + if (se->next && se->next->count < se->count) { + *pp = se->next; + send_entry_insert(pp, se); + } + + fc->send_ptr = se; + + pthread_mutex_unlock(&g_fabric.send_lock); + + cf_poll_add_socket(se->poll, &fc->sock, EPOLLOUT | DEFAULT_EVENTS, fc); +} + +static void +fabric_connection_send_unassign(fabric_connection *fc) +{ + pthread_mutex_lock(&g_fabric.send_lock); + + if (! fc->send_ptr) { + pthread_mutex_unlock(&g_fabric.send_lock); + return; + } + + send_entry **pp = &g_fabric.send_head; + send_entry *se = fc->send_ptr; + + while (*pp != se) { + cf_assert((*pp)->next, AS_FABRIC, "fabric_connection_send_unassign() invalid send_count state"); + + pp = &(*pp)->next; + } + + cf_assert(se->count != 0 || fc->node->send_counts[se->id] != 0, AS_FABRIC, "invalid send_count accounting se %p id %u count %u node send_count %u", + se, se->id, se->count, fc->node->send_counts[se->id]); + + se->count--; + fc->node->send_counts[se->id]--; + + *pp = se->next; + send_entry_insert(&g_fabric.send_head, se); + + fc->send_ptr = NULL; + + pthread_mutex_unlock(&g_fabric.send_lock); +} + +inline static void +fabric_connection_recv_rearm(fabric_connection *fc) +{ + cf_poll_modify_socket(fc->pool->poll, &fc->sock, + EPOLLIN | DEFAULT_EVENTS, fc); +} + +// epoll takes the reference of fc. +inline static void +fabric_connection_send_rearm(fabric_connection *fc) +{ + cf_poll_modify_socket(fc->send_ptr->poll, &fc->sock, + EPOLLOUT | DEFAULT_EVENTS, fc); +} + +static void +fabric_connection_disconnect(fabric_connection *fc) +{ + fc->failed = true; + cf_socket_shutdown(&fc->sock); + + fabric_node *node = fc->node; + + if (! 
node) { + return; + } + + pthread_mutex_lock(&node->fc_hash_lock); + + if (cf_shash_delete(node->fc_hash, &fc) != CF_SHASH_OK) { + cf_detail(AS_FABRIC, "fc %p is not in (node %p)->fc_hash", fc, node); + pthread_mutex_unlock(&node->fc_hash_lock); + return; + } + + pthread_mutex_unlock(&node->fc_hash_lock); + + if (fc->started_via_connect) { + pthread_mutex_lock(&node->connect_lock); + + cf_atomic32_decr(&node->connect_count[fc->pool->pool_id]); + node->connect_full = false; + + pthread_mutex_unlock(&node->connect_lock); + } + + pthread_mutex_lock(&node->send_idle_fc_queue_lock); + + if (cf_queue_delete(&node->send_idle_fc_queue[fc->pool->pool_id], &fc, + true) == CF_QUEUE_OK) { + fabric_connection_release(fc); // for delete from send_idle_fc_queue + } + + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + + cf_detail(AS_FABRIC, "fabric_connection_disconnect(%p) {pool=%u id=%lx fd=%u}", + fc, fc->pool ? fc->pool->pool_id : 0, + node ? node->node_id : (cf_node)0, fc->sock.fd); + + fabric_connection_release(fc); // for delete from node->fc_hash +} + +static void +fabric_connection_set_keepalive_options(fabric_connection *fc) +{ + if (g_config.fabric_keepalive_enabled) { + cf_socket_keep_alive(&fc->sock, g_config.fabric_keepalive_time, + g_config.fabric_keepalive_intvl, + g_config.fabric_keepalive_probes); + } +} + +static void +fabric_connection_reroute_msg(fabric_connection *fc) +{ + if (! fc->s_msg_in_progress) { + return; + } + + // Don't reroute initial M_TYPE_FABRIC message. + if ((fc->started_via_connect && fc->s_count == 0) || + fabric_node_send(fc->node, fc->s_msg_in_progress, + fc->pool->pool_id) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(fc->s_msg_in_progress); + } + + fc->s_msg_in_progress = NULL; +} + +static void +fabric_connection_send_progress(fabric_connection *fc, bool is_last) +{ + uint8_t *send_progress; + size_t send_full; + + if (fc->s_buf.buf) { + // Partially sent msg. + send_progress = fc->s_buf.progress; + send_full = fc->s_buf.end - send_progress; + } + else { + // Fresh msg. + msg *m = fc->s_msg_in_progress; + + send_full = msg_get_wire_size(m); + fabric_buffer_init(&fc->s_buf, send_full); + + send_progress = fc->s_buf.progress; + msg_to_wire(m, send_progress); + + if (m->benchmark_time != 0) { + m->benchmark_time = histogram_insert_data_point( + g_stats.fabric_send_init_hists[fc->pool->pool_id], + m->benchmark_time); + } + } + + int32_t flags = MSG_NOSIGNAL | (is_last ? 0 : MSG_MORE); + int32_t send_sz = cf_socket_send(&fc->sock, send_progress, send_full, + flags); + + if (send_sz < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + fc->failed = true; + cf_socket_write_shutdown(&fc->sock); + return; + } + + send_sz = 0; // treat as sending 0 + } + + if (fc->s_msg_in_progress->benchmark_time != 0) { + fc->s_msg_in_progress->benchmark_time = histogram_insert_data_point( + g_stats.fabric_send_fragment_hists[fc->pool->pool_id], + fc->s_msg_in_progress->benchmark_time); + } + + fc->s_bytes += send_sz; + + if ((size_t)send_sz == send_full) { + // Complete send. + as_fabric_msg_put(fc->s_msg_in_progress); + fc->s_msg_in_progress = NULL; + fabric_buffer_free_extra(&fc->s_buf); + fc->s_buf.buf = NULL; + fc->s_count++; + } + else { + // Partial send. + fc->s_buf.progress += send_sz; + } +} + +// Must rearm or place into idle queue on success. +static bool +fabric_connection_process_writable(fabric_connection *fc) +{ + // Strategy with MSG_MORE to prevent small packets during migration. 
+ // Case 1 - socket buffer not full: + // Send all messages except last with MSG_MORE. Last message flushes + // buffer. + // Case 2 - socket buffer full: + // All messages get sent with MSG_MORE but because buffer full, small + // packets still won't happen. + fabric_node *node = fc->node; + uint32_t pool = fc->pool->pool_id; + + if (! fc->s_msg_in_progress) { + // TODO - Change to load op when atomic API is ready. + // Also should be rare or not even happen in x86_64. + cf_warning(AS_FABRIC, "fc(%p)->s_msg_in_progress NULL on entry", fc); + return false; + } + + while (fc->s_msg_in_progress) { + msg *pending = NULL; + + cf_queue_pop(&node->send_queue[pool], &pending, CF_QUEUE_NOWAIT); + fabric_connection_send_progress(fc, ! pending); + + if (fc->s_msg_in_progress) { + if (pending) { + cf_queue_push_head(&node->send_queue[pool], &pending); + } + + fabric_connection_send_rearm(fc); + return true; + } + + fc->s_msg_in_progress = pending; + } + + if (! fc->node->live || fc->failed) { + return false; + } + + // Try with bigger lock block to sync with as_fabric_send(). + pthread_mutex_lock(&node->send_idle_fc_queue_lock); + + if (! fc->node->live || fc->failed) { + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + return false; + } + + if (cf_queue_pop(&node->send_queue[pool], &fc->s_msg_in_progress, + CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) { + cf_queue_push(&node->send_idle_fc_queue[pool], &fc); + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + return true; + } + + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + + fabric_connection_send_rearm(fc); + + return true; +} + +// Return true on success. +static bool +fabric_connection_process_fabric_msg(fabric_connection *fc, const msg *m) +{ + cf_poll_delete_socket(g_accept_poll, &fc->sock); + + cf_node node_id; + + if (msg_get_uint64(m, FS_FIELD_NODE, &node_id) != 0) { + cf_warning(AS_FABRIC, "process_fabric_msg: failed to read M_TYPE_FABRIC node"); + return false; + } + + cf_detail(AS_FABRIC, "process_fabric_msg: M_TYPE_FABRIC from node %lx", node_id); + + fabric_node *node = fabric_node_get_or_create(node_id); + + if (! fabric_node_add_connection(node, fc)) { + fabric_node_release(node); // from cf_rchash_get + return false; + } + + uint32_t pool_id = AS_FABRIC_N_CHANNELS; // illegal value + + msg_get_uint32(m, FS_CHANNEL, &pool_id); + + if (pool_id >= AS_FABRIC_N_CHANNELS) { + fabric_node_release(node); // from cf_rchash_get + return false; + } + + fabric_buffer_free_extra(fc->r_buf_in_progress); + fabric_buffer_init(fc->r_buf_in_progress, sizeof(msg_hdr)); + fc->r_msg_size = 0; + + // fc->pool needs to be set before placing into send_idle_fc_queue. + fabric_recv_thread_pool_add_fc(&g_fabric.recv_pool[pool_id], fc); + + // TLS connections are one-way. Incoming connections are for + // incoming data. + if (fc->sock.state == CF_SOCKET_STATE_NON_TLS) { + pthread_mutex_lock(&node->send_idle_fc_queue_lock); + + if (node->live && ! 
fc->failed) { + fabric_connection_reserve(fc); // for send poll & idleQ + + if (cf_queue_pop(&node->send_queue[pool_id], &fc->s_msg_in_progress, + CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) { + cf_queue_push(&node->send_idle_fc_queue[pool_id], &fc); + } + else { + fabric_connection_send_assign(fc); + } + } + + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + } + + fabric_node_release(node); // from cf_rchash_get + fabric_connection_release(fc); // from g_accept_poll + + return true; +} + +static bool +fabric_connection_read_fabric_msg(fabric_connection *fc) +{ + fabric_buffer *fb = fc->r_buf_in_progress; + + while (true) { + size_t recv_full = fb->end - fb->progress; + int32_t recv_sz = cf_socket_recv(&fc->sock, fb->progress, recv_full, 0); + + if (recv_sz < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + cf_warning(AS_FABRIC, "fabric_connection_read_fabric_msg() recv_sz %d errno %d %s", recv_sz, errno, cf_strerror(errno)); + return false; + } + + break; + } + + if (recv_sz == 0) { + cf_detail(AS_FABRIC, "fabric_connection_read_fabric_msg(%p) fb=%p recv_sz == 0 / %zu", fc, fb, recv_full); + return false; + } + + fb->progress += recv_sz; + fc->r_bytes += recv_sz; + + if ((size_t)recv_sz < recv_full) { + tls_socket_must_not_have_data(&fc->sock, "partial fabric read"); + break; + } + + if (fc->r_msg_size == 0) { + size_t hdr_sz = fb->progress - fb->buf; + + if (msg_get_initial( + &fc->r_msg_size, &fc->r_type, fb->buf, hdr_sz) != 0) { + cf_crash(AS_FABRIC, "fb->end was not initialized correctly"); + } + + if (! fabric_buffer_resize(fb, fc->r_msg_size)) { + cf_warning(AS_FABRIC, "fabric_connection_read_fabric_msg(%p) invalid msg_size %u remote 0x%lx", fc, fc->r_msg_size, fabric_connection_get_id(fc)); + return false; + } + + continue; + } + + tls_socket_must_not_have_data(&fc->sock, "full fabric read"); + + if (fc->r_type != M_TYPE_FABRIC) { + cf_warning(AS_FABRIC, "fabric_connection_read_fabric_msg() expected type M_TYPE_FABRIC(%d) got type %d", M_TYPE_FABRIC, fc->r_type); + return false; + } + + msg *m = as_fabric_msg_get(M_TYPE_FABRIC); + + if (msg_parse(m, fb->buf, fc->r_msg_size) != 0) { + cf_warning(AS_FABRIC, "msg_parse failed for fc %p fb %p", fc, fb); + as_fabric_msg_put(m); + return false; + } + + bool ret = fabric_connection_process_fabric_msg(fc, m); + as_fabric_msg_put(m); + + return ret; + } + + return true; +} + +// Return true on success. +// Must have re-armed on success. +static bool +fabric_connection_process_msg(fabric_connection *fc, bool do_rearm) +{ + msg *m = as_fabric_msg_get(fc->r_type); + + if (! m) { + cf_warning(AS_FABRIC, "Failed to create message for type %d (max %d)", fc->r_type, M_TYPE_MAX); + return false; + } + + fabric_buffer *fb = fc->r_buf_in_progress; + + if (msg_parse(m, fb->buf, fc->r_msg_size) != 0) { + cf_warning(AS_FABRIC, "msg_parse failed for fc %p fb %p", fc, fb); + as_fabric_msg_put(m); + return false; + } + + cf_assert(fc->node, AS_FABRIC, "process_msg: no node assigned"); + + // Save some state for after re-arm. + cf_node node = fc->node->node_id; + uint64_t bt = fc->benchmark_time; + uint32_t ch = fc->pool->pool_id; + + fc->r_msg_size = 0; + + if (do_rearm) { + // Re-arm for next message (possibly handled in another thread). 
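+		// This thread still owns the old buffer fb - it is destroyed below,
+		// after the msg callback runs.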
+ fc->r_buf_in_progress = fabric_buffer_create(sizeof(msg_hdr)); + fabric_connection_recv_rearm(fc); // do not use fc after this point + } + + if (g_fabric.msg_cb[m->type]) { + (g_fabric.msg_cb[m->type])(node, m, g_fabric.msg_udata[m->type]); + + if (bt != 0) { + histogram_insert_data_point(g_stats.fabric_recv_cb_hists[ch], bt); + } + } + else { + cf_warning(AS_FABRIC, "process_msg: could not deliver message type %d", m->type); + as_fabric_msg_put(m); + } + + if (do_rearm) { + fabric_buffer_destroy(fb); + } + + return true; +} + +// Return true on success. +// Must have re-armed on success. +static bool +fabric_connection_process_readable(fabric_connection *fc) +{ + fabric_buffer *fb = fc->r_buf_in_progress; + size_t recv_all = 0; + + while (true) { + size_t recv_full = fb->end - fb->progress; + int32_t recv_sz = cf_socket_recv(&fc->sock, fb->progress, recv_full, 0); + + if (recv_sz < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + cf_warning(AS_FABRIC, "fabric_connection_process_readable() recv_sz %d errno %d %s", recv_sz, errno, cf_strerror(errno)); + return false; + } + + break; + } + + if (recv_sz == 0) { + cf_detail(AS_FABRIC, "fabric_connection_process_readable(%p) fb=%p recv_sz == 0 / %zu", fc, fb, recv_full); + return false; + } + + fb->progress += recv_sz; + fc->r_bytes += recv_sz; + recv_all += recv_sz; + + if (fc->r_msg_size == 0) { + fc->benchmark_time = g_config.fabric_benchmarks_enabled ? + cf_getns() : 0; + } + + if ((size_t)recv_sz < recv_full) { + if (fc->benchmark_time != 0) { + fc->benchmark_time = histogram_insert_data_point( + g_stats.fabric_recv_fragment_hists[fc->pool->pool_id], + fc->benchmark_time); + } + + break; + } + + if (fc->r_msg_size == 0) { + size_t hdr_sz = fb->progress - fb->buf; + + if (msg_get_initial( + &fc->r_msg_size, &fc->r_type, fb->buf, hdr_sz) != 0) { + cf_crash(AS_FABRIC, "fb->end was not initialized correctly"); + } + + if (! fabric_buffer_resize(fb, fc->r_msg_size)) { + cf_warning(AS_FABRIC, "fabric_connection_process_readable(%p) invalid msg_size %u remote 0x%lx", fc, fc->r_msg_size, fabric_connection_get_id(fc)); + return false; + } + + continue; + } + + bool do_rearm = recv_all > (size_t)g_config.fabric_recv_rearm_threshold; + + if (! fabric_connection_process_msg(fc, do_rearm)) { + return false; + } + + if (do_rearm) { + // Already rearmed. + return true; + } + + fabric_buffer_free_extra(fc->r_buf_in_progress); + fabric_buffer_init(fc->r_buf_in_progress, sizeof(msg_hdr)); + } + + fabric_connection_recv_rearm(fc); + return true; +} + + +//========================================================== +// fabric_recv_thread_pool +// + +static void +fabric_recv_thread_pool_init(fabric_recv_thread_pool *pool, uint32_t size, + uint32_t pool_id) +{ + cf_vector_init(&pool->threads, sizeof(pthread_t), size, 0); + cf_poll_create(&pool->poll); + pool->pool_id = pool_id; +} + +// Called only at startup or under set-config lock. Caller has checked size. 
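+// Shrinks the pool by canceling excess threads, grows it by creating new
+// detached threads.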
+static void
+fabric_recv_thread_pool_set_size(fabric_recv_thread_pool *pool, uint32_t size)
+{
+	while (size < cf_vector_size(&pool->threads)) {
+		pthread_t th;
+		cf_vector_pop(&pool->threads, &th);
+		pthread_cancel(th);
+	}
+
+	pthread_t thread;
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	while (size > cf_vector_size(&pool->threads)) {
+		if (pthread_create(&thread, &attrs, run_fabric_recv, pool) != 0) {
+			cf_crash(AS_FABRIC, "could not create fabric recv thread");
+		}
+
+		cf_vector_append(&pool->threads, &thread);
+	}
+}
+
+static void
+fabric_recv_thread_pool_add_fc(fabric_recv_thread_pool *pool,
+		fabric_connection *fc)
+{
+	fabric_connection_reserve(fc); // extra ref for poll
+	fc->pool = pool;
+
+	uint32_t recv_events = EPOLLIN | DEFAULT_EVENTS;
+
+	cf_poll_add_socket(pool->poll, &fc->sock, recv_events, fc);
+}
+
+
+//==========================================================
+// fabric_endpoint
+//
+
+// Get the endpoint list to connect to the remote node.
+// Returns true on success. On failure, errno is set to ENOENT if no endpoint
+// list could be obtained for this node, or to ENOMEM if the input
+// endpoint_list_size is less than the actual size, in which case
+// endpoint_list_size is updated with the required capacity.
+static bool
+fabric_endpoint_list_get(cf_node nodeid, as_endpoint_list *endpoint_list,
+		size_t *endpoint_list_size)
+{
+	as_hb_plugin_node_data plugin_data = {
+		.data_capacity = *endpoint_list_size,
+		.data = endpoint_list,
+		.data_size = 0,
+	};
+
+	if (as_hb_plugin_data_get(nodeid, AS_HB_PLUGIN_FABRIC, &plugin_data, NULL,
+			NULL) == 0) {
+		return plugin_data.data_size != 0;
+	}
+
+	if (errno == ENOENT) {
+		return false;
+	}
+
+	// Not enough allocated memory.
+	*endpoint_list_size = plugin_data.data_size;
+
+	return false;
+}
+
+// Filter out endpoints not matching this node's capabilities.
+static bool
+fabric_connect_endpoint_filter(const as_endpoint *endpoint, void *udata)
+{
+	if (cf_ip_addr_legacy_only() &&
+			endpoint->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6) {
+		return false;
+	}
+
+	// If we don't offer TLS, then we won't connect via TLS, either.
+	if (g_config.tls_fabric.bind_port == 0 &&
+			as_endpoint_capability_is_supported(endpoint,
+					AS_ENDPOINT_TLS_MASK)) {
+		return false;
+	}
+
+	return true;
+}
+
+
+//==========================================================
+// Thread functions.
+//
+
+static void *
+run_fabric_recv(void *arg)
+{
+	int oldstate;
+	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
+
+	fabric_recv_thread_pool *pool = (fabric_recv_thread_pool *)arg;
+	static int worker_id_counter = 0;
+	uint64_t worker_id = worker_id_counter++;
+	cf_poll poll = pool->poll;
+
+	cf_detail(AS_FABRIC, "run_fabric_recv() created index %lu", worker_id);
+
+	pthread_cleanup_push(run_fabric_recv_cleanup, (void *)worker_id);
+
+	while (true) {
+		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
+		pthread_testcancel();
+		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
+
+		cf_poll_event events[FABRIC_EPOLL_RECV_EVENTS];
+		int32_t n = cf_poll_wait(poll, events, FABRIC_EPOLL_RECV_EVENTS, -1);
+
+		for (int32_t i = 0; i < n; i++) {
+			fabric_connection *fc = events[i].data;
+
+			if (fc->node && ! fc->node->live) {
+				fabric_connection_disconnect(fc);
+				fabric_connection_release(fc);
+				continue;
+			}
+
+			// Handle remote close, socket errors.
+			// Also triggered by call to cf_socket_shutdown(fc->sock), but only
+			// first call.
+			// Not triggered by cf_socket_close(fc->sock), which automatically
+			// does EPOLL_CTL_DEL.
+			if (events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) {
+				cf_detail(AS_FABRIC, "%lu: epoll : error, will close: fc %p fd %d errno %d signal {err:%d, hup:%d, rdhup:%d}",
+						worker_id,
+						fc, CSFD(&fc->sock), errno,
+						((events[i].events & EPOLLERR) ? 1 : 0),
+						((events[i].events & EPOLLHUP) ? 1 : 0),
+						((events[i].events & EPOLLRDHUP) ? 1 : 0));
+				fabric_connection_disconnect(fc);
+				fabric_connection_release(fc);
+				continue;
+			}
+
+			cf_assert(events[i].events == EPOLLIN, AS_FABRIC, "epoll not setup correctly for %p", fc);
+
+			if (! fabric_connection_process_readable(fc)) {
+				fabric_connection_disconnect(fc);
+				fabric_connection_release(fc);
+				continue;
+			}
+		}
+	}
+
+	pthread_cleanup_pop(0);
+	return NULL;
+}
+
+static void
+run_fabric_recv_cleanup(void *arg)
+{
+	uint64_t worker_id = (uint64_t)arg;
+
+	cf_detail(AS_FABRIC, "run_fabric_recv() canceling index %lu", worker_id);
+}
+
+static void *
+run_fabric_send(void *arg)
+{
+	send_entry *se = (send_entry *)arg;
+	cf_poll poll = se->poll;
+
+	cf_detail(AS_FABRIC, "run_fabric_send() fd %d id %u", poll.fd, se->id);
+
+	while (true) {
+		cf_poll_event events[FABRIC_EPOLL_SEND_EVENTS];
+		int32_t n = cf_poll_wait(poll, events, FABRIC_EPOLL_SEND_EVENTS, -1);
+
+		for (int32_t i = 0; i < n; i++) {
+			fabric_connection *fc = events[i].data;
+
+			if (fc->node && ! fc->node->live) {
+				fabric_connection_disconnect(fc);
+				fabric_connection_send_unassign(fc);
+				fabric_connection_release(fc);
+				continue;
+			}
+
+			// Handle remote close, socket errors. Also triggered by call to
+			// cf_socket_shutdown(fc->sock), but only first call. Not triggered
+			// by cf_socket_close(fc->sock), which automatically EPOLL_CTL_DEL.
+			if (events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) {
+				cf_detail(AS_FABRIC, "epoll : error, will close: fc %p fd %d errno %d signal {err:%d, hup:%d, rdhup:%d}",
+						fc, CSFD(&fc->sock), errno,
+						((events[i].events & EPOLLERR) ? 1 : 0),
+						((events[i].events & EPOLLHUP) ? 1 : 0),
+						((events[i].events & EPOLLRDHUP) ? 1 : 0));
+				fabric_connection_disconnect(fc);
+				fabric_connection_send_unassign(fc);
+				fabric_connection_reroute_msg(fc);
+				fabric_connection_release(fc);
+				continue;
+			}
+
+			if (tls_socket_needs_handshake(&fc->sock)) {
+				if (! fabric_connection_connect_tls(fc)) {
+					fabric_connection_disconnect(fc);
+					fabric_connection_send_unassign(fc);
+					fabric_connection_reroute_msg(fc);
+					fabric_connection_release(fc);
+				}
+
+				continue;
+			}
+
+			cf_assert(events[i].events == EPOLLOUT, AS_FABRIC, "epoll not setup correctly for %p", fc);
+
+			if (! fabric_connection_process_writable(fc)) {
+				fabric_connection_disconnect(fc);
+				fabric_connection_send_unassign(fc);
+				fabric_connection_reroute_msg(fc);
+				fabric_connection_release(fc);
+				continue;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void *
+run_fabric_accept(void *arg)
+{
+	cf_sockets sockset;
+
+	if (cf_socket_init_server(&g_fabric_bind, &sockset) < 0) {
+		cf_crash(AS_FABRIC, "Could not create fabric listener socket - check configuration");
+	}
+
+	cf_poll_create(&g_accept_poll);
+	cf_poll_add_sockets(g_accept_poll, &sockset, EPOLLIN | EPOLLERR | EPOLLHUP);
+	cf_socket_show_server(AS_FABRIC, "fabric", &sockset);
+
+	while (true) {
+		// Accept new connections on the service socket.
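+		// The accept poll also tracks accepted sockets until their initial
+		// M_TYPE_FABRIC msg arrives and they move to a recv thread pool.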
+		cf_poll_event events[64];
+		int32_t n = cf_poll_wait(g_accept_poll, events, 64, -1);
+
+		for (int32_t i = 0; i < n; i++) {
+			cf_socket *ssock = events[i].data;
+
+			if (cf_sockets_has_socket(&sockset, ssock)) {
+				cf_socket csock;
+				cf_sock_addr sa;
+
+				if (cf_socket_accept(ssock, &csock, &sa) < 0) {
+					if (errno == EMFILE) {
+						cf_warning(AS_FABRIC, "low on file descriptors");
+						continue;
+					}
+					else {
+						cf_crash(AS_FABRIC, "cf_socket_accept: %d %s", errno, cf_strerror(errno));
+					}
+				}
+
+				cf_detail(AS_FABRIC, "fabric_accept: accepting new sock %d", CSFD(&csock));
+				cf_atomic64_incr(&g_stats.fabric_connections_opened);
+
+				fabric_connection *fc = fabric_connection_create(&csock, &sa);
+
+				cf_sock_cfg *cfg = ssock->cfg;
+
+				if (cfg->owner == CF_SOCK_OWNER_FABRIC_TLS) {
+					tls_socket_prepare_server(g_fabric_tls, &fc->sock);
+				}
+
+				uint32_t events = EPOLLIN | EPOLLERR | EPOLLHUP | EPOLLRDHUP;
+				cf_poll_add_socket(g_accept_poll, &fc->sock, events, fc);
+			}
+			else {
+				fabric_connection *fc = events[i].data;
+
+				if (events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) {
+					fabric_connection_release(fc);
+					continue;
+				}
+
+				if (tls_socket_needs_handshake(&fc->sock)) {
+					if (! fabric_connection_accept_tls(fc)) {
+						fabric_connection_release(fc);
+					}
+
+					continue;
+				}
+
+				if (! fabric_connection_read_fabric_msg(fc)) {
+					fabric_connection_release(fc);
+					continue;
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int
+fabric_rate_node_reduce_fn(const void *key, uint32_t keylen, void *data,
+		void *udata)
+{
+	fabric_node *node = (fabric_node *)data;
+	fabric_rate *rate = (fabric_rate *)udata;
+
+	pthread_mutex_lock(&node->fc_hash_lock);
+	cf_shash_reduce(node->fc_hash, fabric_rate_fc_reduce_fn, rate);
+	pthread_mutex_unlock(&node->fc_hash_lock);
+
+	return 0;
+}
+
+static int
+fabric_rate_fc_reduce_fn(const void *key, void *data, void *udata)
+{
+	fabric_connection *fc = *(fabric_connection **)key;
+	fabric_rate *rate = (fabric_rate *)udata;
+
+	if (! fc->pool) {
+		return 0;
+	}
+
+	uint32_t pool_id = fc->pool->pool_id;
+	uint64_t r_bytes = fc->r_bytes;
+	uint64_t s_bytes = fc->s_bytes;
+
+	rate->r_bytes[pool_id] += r_bytes - fc->r_bytes_last;
+	rate->s_bytes[pool_id] += s_bytes - fc->s_bytes_last;
+
+	fc->r_bytes_last = r_bytes;
+	fc->s_bytes_last = s_bytes;
+
+	return 0;
+}
+
+
+//==========================================================
+// Heartbeat.
+//
+
+// Set the fabric advertised endpoints.
+static void
+fabric_hb_plugin_set_fn(msg *m)
+{
+	if (m->type == M_TYPE_HEARTBEAT_V2) {
+		// In v1 and v2, fabric does not advertise its endpoints - those
+		// protocols do not support plugged-in data.
+		return;
+	}
+
+	if (! fabric_published_endpoints_refresh()) {
+		cf_warning(AS_FABRIC, "No published addresses found for fabric.");
+		return;
+	}
+
+	size_t payload_size = 0;
+
+	if (as_endpoint_list_sizeof(
+			g_published_endpoint_list, &payload_size) != 0) {
+		cf_crash(AS_FABRIC, "Error getting endpoint list size for published addresses.");
+	}
+
+	if (msg_set_buf(m, AS_HB_MSG_FABRIC_DATA,
+			(uint8_t *)g_published_endpoint_list, payload_size,
+			MSG_SET_COPY) != 0) {
+		cf_crash(AS_FABRIC, "Error setting fabric endpoint list on msg.");
+	}
+}
+
+// Plugin function that parses the fabric endpoint list out of a heartbeat
+// pulse message.
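+// The stored plugin data is the raw wire form of the sender's endpoint list.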
+static void
+fabric_hb_plugin_parse_data_fn(msg *m, cf_node source,
+		as_hb_plugin_node_data *plugin_data)
+{
+	if (m->type == M_TYPE_HEARTBEAT_V2) {
+		plugin_data->data_size = 0;
+		return;
+	}
+
+	uint8_t *payload = NULL;
+	size_t payload_size = 0;
+
+	if (msg_get_buf(m, AS_HB_MSG_FABRIC_DATA, &payload, &payload_size,
+			MSG_GET_DIRECT) != 0) {
+		cf_warning(AS_FABRIC, "Unable to read fabric published endpoint list from heartbeat from node %lx", source);
+		return;
+	}
+
+	if (payload_size > plugin_data->data_capacity) {
+		// Round up to nearest multiple of block size to prevent very frequent
+		// reallocation.
+		size_t data_capacity = ((payload_size + HB_PLUGIN_DATA_BLOCK_SIZE - 1) /
+				HB_PLUGIN_DATA_BLOCK_SIZE) * HB_PLUGIN_DATA_BLOCK_SIZE;
+
+		// Reallocate since we have outgrown existing capacity.
+		plugin_data->data = cf_realloc(plugin_data->data, data_capacity);
+
+		plugin_data->data_capacity = data_capacity;
+	}
+
+	plugin_data->data_size = payload_size;
+
+	memcpy(plugin_data->data, payload, payload_size);
+}
+
+// Called when a new node is created or destroyed on the heartbeat system.
+// This will insert or remove an element in the hashtable that keeps track of
+// all TCP connections.
+static void
+fabric_heartbeat_event(int nevents, as_hb_event_node *events, void *udata)
+{
+	if ((nevents < 1) || (nevents > AS_CLUSTER_SZ) || ! events) {
+		cf_warning(AS_FABRIC, "fabric: received event count of %d", nevents);
+		return;
+	}
+
+	for (int i = 0; i < nevents; i++) {
+		switch (events[i].evt) {
+		case AS_HB_NODE_ARRIVE: {
+			fabric_node *node = fabric_node_get_or_create(events[i].nodeid);
+			fabric_node_release(node); // for node_get_or_create()
+
+			cf_info(AS_FABRIC, "fabric: node %lx arrived", events[i].nodeid);
+		}
+			break;
+		case AS_HB_NODE_DEPART:
+			cf_info(AS_FABRIC, "fabric: node %lx departed", events[i].nodeid);
+			fabric_node_disconnect(events[i].nodeid);
+			break;
+		case AS_HB_NODE_ADJACENCY_CHANGED:
+			// Not relevant to fabric.
+			break;
+		default:
+			cf_warning(AS_FABRIC, "fabric: received unknown event type %d %lx", events[i].evt, events[i].nodeid);
+			break;
+		}
+	}
+}
+
+
+//==============================================================================
+// Fabric transact.
+//
+
+//==========================================================
+// Constants and typedefs.
+//
+
+typedef enum {
+	TRANSACT_CODE_REQUEST = 1,
+	TRANSACT_CODE_RESPONSE = 2,
+} transact_code;
+
+// Operation to be performed on transaction in retransmission hash.
+typedef enum {
+	TRANSACT_OP_TIMEOUT = 1,
+	TRANSACT_OP_RETRANSMIT = 2,
+} transact_op;
+
+typedef struct fabric_transact_xmit_s {
+	uint64_t tid;
+	cf_node node_id;
+	msg *m;
+	pthread_mutex_t lock;
+
+	uint64_t deadline_ms;
+	uint64_t retransmit_ms;
+	int retransmit_wait;
+
+	as_fabric_transact_complete_fn cb;
+	void *udata;
+} fabric_transact_xmit;
+
+typedef struct fabric_transact_recv_s {
+	cf_node node_id; // where it came from
+	uint64_t tid; // inbound tid
+} fabric_transact_recv;
+
+typedef struct transact_recv_key_s {
+	uint64_t tid;
+	cf_node node_id;
+} __attribute__ ((__packed__)) transact_recv_key;
+
+typedef struct ll_fabric_transact_xmit_element_s {
+	cf_ll_element ll_e;
+	int op;
+	uint64_t tid;
+} ll_fabric_transact_xmit_element;
+
+
+//==========================================================
+// Globals.
+//
+
+static cf_atomic64 g_fabric_transact_tid = 0;
+static cf_rchash *g_fabric_transact_xmit_hash = NULL;
+static as_fabric_transact_recv_fn fabric_transact_recv_cb[M_TYPE_MAX] = { 0 };
+static void *fabric_transact_recv_udata[M_TYPE_MAX] = { 0 };
+
+
+//==========================================================
+// Forward declarations and inlines.
+//
+
+static void fabric_transact_xmit_destructor(void *object);
+static void fabric_transact_xmit_release(fabric_transact_xmit *ft);
+static int fabric_transact_msg_fn(cf_node node_id, msg *m, void *udata);
+static void *run_fabric_transact(void *arg);
+static void ll_ftx_destructor_fn(cf_ll_element *e);
+static int fabric_transact_xmit_reduce_fn(const void *key, uint32_t keylen, void *o, void *udata);
+static int ll_ftx_reduce_fn(cf_ll_element *le, void *udata);
+
+inline static transact_code
+tid_code_get(uint64_t tid)
+{
+	return (transact_code)(tid >> 56);
+}
+
+inline static uint64_t
+tid_code_set(uint64_t tid, transact_code code)
+{
+	return tid | (((uint64_t)code) << 56);
+}
+
+inline static uint64_t
+tid_code_clear(uint64_t tid)
+{
+	return tid & 0xFFffffFFFFffff;
+}
+
+
+//==========================================================
+// Public API.
+//
+
+void
+as_fabric_transact_init()
+{
+	cf_rchash_create(&g_fabric_transact_xmit_hash, cf_rchash_fn_u32,
+			fabric_transact_xmit_destructor, sizeof(uint64_t), 64,
+			CF_RCHASH_MANY_LOCK);
+
+	pthread_t thread;
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	if (pthread_create(&thread, &attrs, run_fabric_transact, NULL) != 0) {
+		cf_crash(AS_FABRIC, "could not create fabric transact thread");
+	}
+}
+
+void
+as_fabric_transact_start(cf_node dest, msg *m, int timeout_ms,
+		as_fabric_transact_complete_fn cb, void *udata)
+{
+	// TODO - could check it against the list of global message ids.
+
+	if (msg_field_get_type(m, 0) != M_FT_UINT64) {
+		// error
+		cf_warning(AS_FABRIC, "as_fabric_transact: first field must be uint64");
+		(cb)(NULL, udata, AS_FABRIC_ERR_UNKNOWN);
+		return;
+	}
+
+	fabric_transact_xmit *ft = cf_rc_alloc(sizeof(fabric_transact_xmit));
+
+	ft->tid = cf_atomic64_incr(&g_fabric_transact_tid);
+	ft->node_id = dest;
+	ft->m = m;
+
+	pthread_mutex_init(&ft->lock, NULL);
+	uint64_t now = cf_getms();
+
+	ft->deadline_ms = now + timeout_ms;
+	ft->retransmit_wait = 10; // 10 ms start
+	ft->retransmit_ms = now + ft->retransmit_wait; // hard start at 10 milliseconds
+	ft->cb = cb;
+	ft->udata = udata;
+
+	uint64_t xmit_tid = tid_code_set(ft->tid, TRANSACT_CODE_REQUEST);
+
+	// Set message tid.
+	msg_set_uint64(m, 0, xmit_tid);
+
+	// Put will take the reference, need to keep one around for the send.
+	cf_rc_reserve(ft);
+	cf_rchash_put(g_fabric_transact_xmit_hash, &ft->tid, sizeof(ft->tid), ft);
+
+	// Transmit the initial message.
+	msg_incr_ref(m);
+
+	if (as_fabric_send(ft->node_id, ft->m, AS_FABRIC_CHANNEL_META) !=
+			AS_FABRIC_SUCCESS) {
+		as_fabric_msg_put(m);
+	}
+
+	fabric_transact_xmit_release(ft);
+
+	return;
+}
+
+// Registers this message type as a transaction message type, handled via the
+// main fabric message callback.
+int
+as_fabric_transact_register(msg_type type, const msg_template *mt, size_t mt_sz,
+		size_t scratch_sz, as_fabric_transact_recv_fn cb, void *udata)
+{
+	// Put details in the global structure.
+	fabric_transact_recv_cb[type] = cb;
+	fabric_transact_recv_udata[type] = udata;
+
+	// Register my internal callback with the main message callback.
+ as_fabric_register_msg_fn(type, mt, mt_sz, scratch_sz, + fabric_transact_msg_fn, NULL); + + return 0; +} + +int +as_fabric_transact_reply(msg *m, void *transact_data) +{ + fabric_transact_recv *ftr = (fabric_transact_recv *)transact_data; + + // This is a response - overwrite tid with response code etc. + uint64_t xmit_tid = tid_code_set(ftr->tid, TRANSACT_CODE_RESPONSE); + msg_set_uint64(m, 0, xmit_tid); + + if (as_fabric_send(ftr->node_id, m, AS_FABRIC_CHANNEL_META) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + + return 0; +} + + +//========================================================== +// Local helpers - various initializers and destructors. +// + +static void +fabric_transact_xmit_release(fabric_transact_xmit *ft) +{ + if (cf_rc_release(ft) == 0) { + fabric_transact_xmit_destructor(ft); + cf_rc_free(ft); + } +} + +// Received a message. Could be a response to an outgoing message, or a new +// incoming transaction message. +static int +fabric_transact_msg_fn(cf_node node_id, msg *m, void *udata) +{ + // Assume m->type is correct. + + // Received a message, make sure we have a registered callback. + if (fabric_transact_recv_cb[m->type] == 0) { + cf_warning(AS_FABRIC, "transact: received message for transact with bad type %d, internal error", m->type); + as_fabric_msg_put(m); // return to pool unexamined + return 0; + } + + // Check to see that we have an outstanding request (only cb once!). + uint64_t tid = 0; + + if (msg_get_uint64(m, 0 /*field_id*/, &tid) != 0) { + cf_warning(AS_FABRIC, "transact_msg: received message with no tid"); + as_fabric_msg_put(m); + return 0; + } + + transact_code code = tid_code_get(tid); + tid = tid_code_clear(tid); + + // If it's a response, check against what you sent. + if (code == TRANSACT_CODE_RESPONSE) { + fabric_transact_xmit *ft; + + if (cf_rchash_get(g_fabric_transact_xmit_hash, &tid, sizeof(tid), + (void **)&ft) != CF_RCHASH_OK) { + cf_detail(AS_FABRIC, "transact_msg: {%lu} no fabric transmit structure in global hash", tid); + as_fabric_msg_put(m); + return 0; + } + + if (cf_rchash_delete(g_fabric_transact_xmit_hash, &tid, sizeof(tid)) == + CF_RCHASH_ERR_NOT_FOUND) { + cf_detail(AS_FABRIC, "transact_msg: {%lu} concurrent thread has already removed transaction", tid); + fabric_transact_xmit_release(ft); + as_fabric_msg_put(m); + return 0; + } + + pthread_mutex_lock(&ft->lock); + + // Make sure we haven't notified some other way, then notify caller. + if (ft->cb) { + (ft->cb)(m, ft->udata, AS_FABRIC_SUCCESS); + ft->cb = NULL; + } + + pthread_mutex_unlock(&ft->lock); + + // This will often be the final release. + fabric_transact_xmit_release(ft); + } + else if (code == TRANSACT_CODE_REQUEST) { + fabric_transact_recv *ftr = cf_malloc(sizeof(fabric_transact_recv)); + + ftr->tid = tid; // has already been cleared + ftr->node_id = node_id; + + // Notify caller - they will likely respond inline. + (*fabric_transact_recv_cb[m->type])(node_id, m, ftr, + fabric_transact_recv_udata[m->type]); + cf_free(ftr); + } + else { + cf_warning(AS_FABRIC, "transact_msg: {%lu} bad code on incoming message: %d", tid, code); + as_fabric_msg_put(m); + } + + return 0; +} + +static void +fabric_transact_xmit_destructor(void *object) +{ + fabric_transact_xmit *ft = object; + as_fabric_msg_put(ft->m); +} + +// Long running thread for transaction maintenance. +static void * +run_fabric_transact(void *arg) +{ + // Create a list of transactions to be processed in each pass. + cf_ll ll_fabric_transact_xmit; + // Initialize list to empty list. 
+ // This list is processed by single thread. No need of a lock. + cf_ll_init(&ll_fabric_transact_xmit, &ll_ftx_destructor_fn, false); + + while (true) { + usleep(10000); // 10 ms for now + + // Visit each entry in g_fabric_transact_xmit_hash and select entries to + // be retransmitted or timed out. Add that transaction id (tid) in the + // linked list 'll_fabric_transact_xmit'. + cf_rchash_reduce(g_fabric_transact_xmit_hash, + fabric_transact_xmit_reduce_fn, + (void *)&ll_fabric_transact_xmit); + + if (cf_ll_size(&ll_fabric_transact_xmit)) { + // There are transactions to be processed. + // Process each transaction in list. + cf_ll_reduce(&ll_fabric_transact_xmit, true /*forward*/, + ll_ftx_reduce_fn, NULL); + } + } + + return 0; +} + +static void +ll_ftx_destructor_fn(cf_ll_element *e) +{ + cf_free(e); +} + +static int +fabric_transact_xmit_reduce_fn(const void *key, uint32_t keylen, void *o, + void *udata) +{ + fabric_transact_xmit *ftx = (fabric_transact_xmit *)o; + int op = 0; + + uint64_t now = cf_getms(); + + pthread_mutex_lock(&ftx->lock); + + if (now > ftx->deadline_ms) { + // Expire and remove transactions that are timed out. + // Need to call application: we've timed out. + op = (int)TRANSACT_OP_TIMEOUT; + } + else if (now > ftx->retransmit_ms) { + // Retransmit, update time counters, etc. + ftx->retransmit_ms = now + ftx->retransmit_wait; + ftx->retransmit_wait *= 2; + op = (int)TRANSACT_OP_RETRANSMIT; + } + + if (op > 0) { + // Add the transaction in linked list of transactions to be processed. + // Process such transactions *outside* retransmit hash lock, because... + // + // Fabric short circuits the message to self by directly calling + // receiver function of corresponding module. Receiver constructs + // "reply" and hands over to fabric to deliver. + // + // On receiving "reply", fabric removes original message, for which + // this is a reply, from retransmit hash. + // + // "fabric_transact_xmit_reduce_fn" is invoked by reduce_delete, which + // holds the lock over corresponding hash (here "retransmit hash"). If + // the message, sent by this function, is short circuited by fabric, + // the same thread will again try to get lock over "retransmit hash", + // resulting in deadlock. + + cf_ll *ll_fabric_transact_xmit = (cf_ll *)udata; + + // Create new node for list. + ll_fabric_transact_xmit_element *ll_ftx_ele = + (ll_fabric_transact_xmit_element *) + cf_malloc(sizeof(ll_fabric_transact_xmit_element)); + + ll_ftx_ele->tid = ftx->tid; + ll_ftx_ele->op = op; + // Append into list. + cf_ll_append(ll_fabric_transact_xmit, (cf_ll_element *)ll_ftx_ele); + } + + pthread_mutex_unlock(&ftx->lock); + + return 0; +} + +static int +ll_ftx_reduce_fn(cf_ll_element *le, void *udata) +{ + const ll_fabric_transact_xmit_element *ll_ftx_ele = + (const ll_fabric_transact_xmit_element *)le; + fabric_transact_xmit *ftx; + uint64_t tid = ll_ftx_ele->tid; + + // cf_rchash_get increments ref count on transaction ftx. + int rv = cf_rchash_get(g_fabric_transact_xmit_hash, &tid, sizeof(tid), + (void **)&ftx); + + if (rv != 0) { + cf_warning(AS_FABRIC, "No fabric transmit structure in global hash for fabric transaction-id %lu", tid); + return CF_LL_REDUCE_DELETE; + } + + if (ll_ftx_ele->op == (int)TRANSACT_OP_TIMEOUT) { + // Call application: we've timed out. + if (ftx->cb) { + (ftx->cb)(0, ftx->udata, AS_FABRIC_ERR_TIMEOUT); + ftx->cb = NULL; + } + + cf_detail(AS_FABRIC, "fabric transact: %lu timed out", tid); + // cf_rchash_delete removes ftx from hash and decrements ref count. 
+		cf_rchash_delete(g_fabric_transact_xmit_hash, &tid, sizeof(tid));
+		// This should be the final release of transaction ftx. On final
+		// release, it also decrements the message ref count taken by the
+		// initial fabric send.
+		fabric_transact_xmit_release(ftx);
+	}
+	else if (ll_ftx_ele->op == (int)TRANSACT_OP_RETRANSMIT) {
+		if (ftx->m) {
+			msg_incr_ref(ftx->m);
+
+			msg *m = ftx->m;
+			cf_node node = ftx->node_id;
+
+			if (as_fabric_send(node, m, AS_FABRIC_CHANNEL_META) != 0) {
+				cf_detail(AS_FABRIC, "fabric: transact: %lu retransmit send failed", tid);
+				as_fabric_msg_put(m);
+			}
+			else {
+				cf_detail(AS_FABRIC, "fabric: transact: %lu retransmit send success", tid);
+			}
+		}
+
+		// Decrement ref count, incremented by cf_rchash_get.
+		fabric_transact_xmit_release(ftx);
+	}
+
+	// Remove it from the linked list.
+	return CF_LL_REDUCE_DELETE;
+}
diff --git a/as/src/fabric/hb.c b/as/src/fabric/hb.c
new file mode 100644
index 00000000..b293846f
--- /dev/null
+++ b/as/src/fabric/hb.c
@@ -0,0 +1,9055 @@
+/*
+ * hb.c
+ *
+ * Copyright (C) 2012-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "fabric/hb.h"
+
+#include <alloca.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+#include <zlib.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_hash_math.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "fault.h"
+#include "node.h"
+#include "shash.h"
+#include "socket.h"
+
+#include "base/cfg.h"
+#include "base/stats.h"
+#include "base/thr_info.h"
+#include "fabric/endpoint.h"
+#include "fabric/fabric.h"
+#include "fabric/partition_balance.h"
+
+/*
+ * Overview
+ * ========
+ * The heartbeat subsystem is a core clustering module that discovers nodes in
+ * the cluster and monitors connectivity to them. This subsystem maintains an
+ * "adjacency list", which is the list of nodes deemed to be alive and connected
+ * at any instant in time.
+ *
+ * The heartbeat subsystem is divided into four sub modules
+ * 1. Config
+ * 2. Channel
+ * 3. Mesh
+ * 4. Main
+ *
+ * Config
+ * ------
+ * This sub module deals with overall heartbeat subsystem configuration and
+ * dynamic updates to configuration.
+ *
+ * Channel
+ * -------
+ * This sub module is responsible for maintaining a channel between this node
+ * and all known nodes. The channel sub module provides the ability to broadcast
+ * or unicast messages to known nodes.
+ *
+ * Other modules interact with the channel sub module primarily through events
+ * raised by the channel sub module. The events help other sub modules infer
+ * connectivity status to known nodes and react to incoming heartbeat messages
+ * from other nodes.
+ *
+ * Depending on the configured mode (mesh, multicast) the channels between this
+ * node and other nodes could be
+ * 1. TCP and hence unicast. One per pair of nodes.
+ * 2. Multicast with UDP. One per cluster.
+ *
+ * Mesh
+ * ----
+ * This sub module is responsible for discovering cluster members. New nodes are
+ * discovered via the adjacency lists published in the heartbeats of known
+ * nodes. The mesh module boots up using configured seed nodes.
+ *
+ * Main
+ * ----
+ * This sub module orchestrates the other modules, hence the name "main". Its
+ * primary responsibility is to maintain the adjacency list.
+ *
+ * Heartbeat messages
+ * ==================
+ *
+ * Every heartbeat message contains
+ * 1. the source node's nodeid
+ * 2. the source node's published ip address
+ * 3. the source node's published port.
+ *
+ * There are the following types of heartbeat messages
+ * 1. Pulse - messages sent at periodic intervals. Will contain the current
+ * adjacency list.
+ * 2. Info request - message sent in the mesh mode, to a known mesh node,
+ * in order to get the ip address and port of a newly discovered node.
+ * 3. Info reply - message sent in response to an info request. Returns
+ * the node's ip address and port.
+ *
+ * Message conventions
+ * -------------------
+ * 1. A published adjacency list will always contain the source node.
+ *
+ * Design philosophy
+ * =================
+ *
+ * Locking vs single threaded event loop.
+ * --------------------------------------
+ * This first cut leans toward using locks instead of single threaded event
+ * loops to protect critical data. The choice is driven by the fact that
+ * synchronous external and inter-sub module interaction looked like more work
+ * with single threaded event loops. The design chooses simplicity over
+ * performance given the lower volume of events that need to be processed here
+ * as compared to the transaction processing code. The locks are coarse - one
+ * per sub module - and re-entrant. They are used generously, and no function
+ * assumes that its callers already hold any locks.
+ *
+ * Inter-module interactions in some cases are via synchronous function calls,
+ * which run the risk of deadlocks. For now, deadlocks should not happen.
+ * However, if this approach complicates the code, inter-module interaction will
+ * be rewritten to use asynchronous event queues.
+ *
+ * Locking policy
+ * ==============
+ *
+ * 1. Lock as much as you can. The locks are re-entrant. This is not a critical
+ * high volume code path, and hence correctness with simplicity is preferred.
+ * Any read / write access to module state should be under a lock.
+ * 2. Preventing deadlocks
+ * a. The enforced lock order is
+ * 1. Protocol lock (SET_PROTOCOL_LOCK) Used to ensure a protocol change is
+ * atomic.
+ * 2. Main module (HB_LOCK)
+ * 3. Mesh and multicast modules (MESH_LOCK)
+ * 4. Channel (CHANNEL_LOCK)
+ * 5. Config (HB_CONFIG_LOCK)
+ * Always make sure every thread acquires locks in this order ONLY. In terms
+ * of function calls, only lower numbered modules can call functions from the
+ * higher numbered modules while holding onto their locks. (See the
+ * illustrative example at the end of this comment.)
+ * 3. Events raised / messages passed to listeners should happen outside the
+ * module's lock.
+ *
+ * Guidelines for message plugins
+ * ==============================
+ * The parse data functions should NOT hold any locks, to avert deadlocks.
+ *
+ * TODO
+ * ====
+ * 1. Extend to allow hostnames in mesh mode across the board.
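+ *
+ * Example (illustrative only): a function that needs both main module state
+ * and channel state takes the locks in the order listed above and releases
+ * them in reverse:
+ *
+ *     HB_LOCK();
+ *     CHANNEL_LOCK();
+ *     // ... read / update main and channel state ...
+ *     CHANNEL_UNLOCK();
+ *     HB_UNLOCK();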
+ */ + +/* + * ---------------------------------------------------------------------------- + * Macros + * ---------------------------------------------------------------------------- + */ + +/* + * ---------------------------------------------------------------------------- + * Channel + * ---------------------------------------------------------------------------- + */ + +/** + * Size of the poll events set. + */ +#define POLL_SZ 1024 + +/** + * The number of bytes for the message length on the wire. + */ +#define MSG_WIRE_LENGTH_SIZE 4 + +/** + * Channel idle interval after which check for inactive channel is triggered. + */ +#define CHANNEL_IDLE_CHECK_PERIOD (CHANNEL_NODE_READ_IDLE_TIMEOUT() / 2) + +/** + * A channel times out if there is no msg received from a node in this interval. + * Set to a fraction of node timeout so that a new channel could be set up to + * recover from a potentially bad connection before the node times out. + */ +#define CHANNEL_NODE_READ_IDLE_TIMEOUT() \ +(PULSE_TRANSMIT_INTERVAL() \ + * MAX(2, config_max_intervals_missed_get() / 3)) + +/** + * Acquire a lock on the entire channel sub module. + */ +#define CHANNEL_LOCK() (pthread_mutex_lock(&g_channel_lock)) + +/** + * Relinquish the lock on the entire channel sub module. + */ +#define CHANNEL_UNLOCK() (pthread_mutex_unlock(&g_channel_lock)) + +/* + * ---------------------------------------------------------------------------- + * Mesh and Multicast + * ---------------------------------------------------------------------------- + */ + +/** + * Read write timeout (in ms). + */ +#define MESH_RW_TIMEOUT 5 + +/** + * Size of the network header. + * + * Maximum size of IPv4 header - 20 bytes (assuming no variable length fields) + * Fixed size of IPv6 header - 40 bytes (assuming no extension headers) + * Maximum size of TCP header - 60 Bytes + * Size of UDP header (fixed) - 8 bytes + * So maximum size of empty TCP datagram - 60 + 20 = 80 bytes + * So maximum size of empty IPv4 UDP datagram - 20 + 8 = 28 bytes + * So maximum size of empty IPv6 UDP datagram - 40 + 8 = 48 bytes + * + * Being conservative and assuming 30 bytes for IPv4 UDP header and 50 bytes for + * IPv6 UDP header. + */ +#define UDP_HEADER_SIZE_MAX 50 + +/** + * Expected ratio - (input size) / (compressed size). Assuming 40% decrease in + * size after compression. + */ +#define MSG_COMPRESSION_RATIO (1.0 / 0.60) + +/** + * Mesh timeout for pending nodes. + */ +#define MESH_PENDING_TIMEOUT (CONNECT_TIMEOUT()) + +/** + * Mesh inactive timeout after which a mesh node will be forgotten. + */ +#define MESH_INACTIVE_TIMEOUT (10 * HB_NODE_TIMEOUT()) + +/** + * Mesh timeout for getting the endpoint for a node after which this node will + * be forgotten. + */ +#define MESH_ENDPOINT_UNKNOWN_TIMEOUT (HB_NODE_TIMEOUT()) + +/** + * Intervals at which mesh tender runs. + */ +#define MESH_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL()) + +/** + * Intervals at which attempts to resolve unresolved seed hostname will be made. + */ +#define MESH_SEED_RESOLVE_ATTEMPT_INTERVAL() (HB_NODE_TIMEOUT()) + +/** + * Intervals at which conflict checks is enabled. + */ +#define MESH_CONFLICT_CHECK_INTERVAL() (5 * HB_NODE_TIMEOUT()) + +/** + * Duration for which conflicts are checked. + */ +#define MESH_CONFLICT_CHECK_DURATION() (MESH_CONFLICT_CHECK_INTERVAL() / 5) + +/** + * Acquire a lock on the entire mesh sub module. + */ +#define MESH_LOCK() (pthread_mutex_lock(&g_mesh_lock)) + +/** + * Relinquish the lock on the entire mesh sub module. 
+ */
+#define MESH_UNLOCK() (pthread_mutex_unlock(&g_mesh_lock))
+
+/**
+ * Acquire a lock on the entire multicast sub module.
+ */
+#define MULTICAST_LOCK() (pthread_mutex_lock(&g_multicast_lock))
+
+/**
+ * Relinquish the lock on the entire multicast sub module.
+ */
+#define MULTICAST_UNLOCK() (pthread_mutex_unlock(&g_multicast_lock))
+
+/*
+ * ----------------------------------------------------------------------------
+ * Main
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * The identifier for heartbeat protocol version 3.
+ */
+#define HB_PROTOCOL_V3_IDENTIFIER 0x6864
+
+/**
+ * Maximum length of hb protocol string.
+ */
+#define HB_PROTOCOL_STR_MAX_LEN 16
+
+/**
+ * Default allocation size for plugin data.
+ */
+#define HB_PLUGIN_DATA_DEFAULT_SIZE 128
+
+/**
+ * Block size for allocating node plugin data. Ensure the allocation is in
+ * multiples of 128 bytes, allowing expansion to 16 nodes without reallocating.
+ */
+#define HB_PLUGIN_DATA_BLOCK_SIZE 128
+
+/**
+ * Message scratch size for v3 HB messages. To accommodate a 64 node cluster.
+ */
+#define AS_HB_MSG_SCRATCH_SIZE 1024
+
+/**
+ * A soft limit for the maximum cluster size. Meant to optimize hash and list
+ * data structures and not to limit the number of nodes.
+ */
+#define AS_HB_CLUSTER_MAX_SIZE_SOFT 200
+
+/**
+ * Maximum event listeners.
+ */
+#define AS_HB_EVENT_LISTENER_MAX 7
+
+/**
+ * Maximum permissible cluster-name mismatches per node.
+ */
+#define CLUSTER_NAME_MISMATCH_MAX 2
+
+/**
+ * Timeout for deeming a node dead based on received heartbeats.
+ */
+#define HB_NODE_TIMEOUT() \
+((config_max_intervals_missed_get() * config_tx_interval_get()))
+
+/**
+ * Interval at which heartbeats are sent.
+ */
+#define PULSE_TRANSMIT_INTERVAL() \
+(MAX(config_tx_interval_get(), AS_HB_TX_INTERVAL_MS_MIN))
+
+/**
+ * Interval at which the adjacency tender runs.
+ */
+#define ADJACENCY_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL())
+
+/**
+ * Interval at which the adjacency tender runs in anticipation of additional
+ * node depart events.
+ */
+#define ADJACENCY_FAST_TEND_INTERVAL (MIN(ADJACENCY_TEND_INTERVAL, 10))
+
+/**
+ * Acquire a lock on the external event publisher.
+ */
+#define EXTERNAL_EVENT_PUBLISH_LOCK() \
+(pthread_mutex_lock(&g_external_event_publish_lock))
+
+/**
+ * Relinquish the lock on the external event publisher.
+ */
+#define EXTERNAL_EVENT_PUBLISH_UNLOCK() \
+(pthread_mutex_unlock(&g_external_event_publish_lock))
+
+/**
+ * Acquire a lock on the heartbeat main module.
+ */
+#define HB_LOCK() (pthread_mutex_lock(&g_hb_lock))
+
+/**
+ * Relinquish the lock on the heartbeat main module.
+ */
+#define HB_UNLOCK() (pthread_mutex_unlock(&g_hb_lock))
+
+/**
+ * Weight of the current latency sample over the current moving average. For
+ * now weigh recent values heavily over older values.
+ */
+#define ALPHA (0.65)
+
+/*
+ * ----------------------------------------------------------------------------
+ * Common
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * The default MTU for multicast in case device discovery fails.
+ */
+#define DEFAULT_MIN_MTU 1500
+
+/**
+ * Maximum memory size allocated on the call stack.
+ */
+#define STACK_ALLOC_LIMIT (16 * 1024)
+
+/**
+ * Max string length for an endpoint list converted to a string.
+ */
+#define ENDPOINT_LIST_STR_SIZE 1024
+
+/**
+ * A hard limit on the buffer size for parsing incoming messages.
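+ * For example, given this 10 MiB limit and the 16 KiB STACK_ALLOC_LIMIT
+ * above, MSG_BUFF_ALLOC (defined below) returns stack memory via alloca()
+ * for a 1 KiB message, heap memory via cf_malloc() for a 64 KiB message,
+ * and NULL for anything larger than the limit.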
+ */ +#define MSG_BUFFER_MAX_SIZE (10 * 1024 * 1024) + +#ifndef ASC +#define ASC (2 << 2) +#endif + +/** + * Connection initiation timeout, Capped at 100 ms. + */ +#define CONNECT_TIMEOUT() (MIN(100, config_tx_interval_get())) + +/** + * Allocate a buffer for heart beat messages. Larger buffers are heap allocated + * to prevent stack overflows. + */ +#define MSG_BUFF_ALLOC(size) ( \ + (size) <= MSG_BUFFER_MAX_SIZE ? \ + (((size) > STACK_ALLOC_LIMIT) ? \ + cf_malloc(size) : alloca(size)) : NULL) + +/** + * Allocate a buffer for heart beat messages. Larger buffers are heap allocated + * to prevent stack overflows. Crashes the process on failure to allocate the + * buffer. + */ +#define MSG_BUFF_ALLOC_OR_DIE(size, crash_msg, ...) \ +({ \ + uint8_t* retval = MSG_BUFF_ALLOC((size)); \ + if (!retval) { \ + CRASH(crash_msg, ##__VA_ARGS__); \ + } \ + retval; \ +}) + +/** + * Free the buffer allocated by MSG_BUFF_ALLOC + */ +#define MSG_BUFF_FREE(buffer, size) \ +if (((size) > STACK_ALLOC_LIMIT) && buffer) {cf_free(buffer);} + +/** + * Acquire a lock on the entire config sub module. + */ +#define HB_CONFIG_LOCK() (pthread_mutex_lock(&g_hb_config_lock)) + +/** + * Relinquish the lock on the entire config sub module. + */ +#define HB_CONFIG_UNLOCK() (pthread_mutex_unlock(&g_hb_config_lock)) + +/** + * Acquire a lock while setting heartbeat protocol dynamically. + */ +#define SET_PROTOCOL_LOCK() (pthread_mutex_lock(&g_set_protocol_lock)) + +/** + * Relinquish the lock after setting heartbeat protocol dynamically. + */ +#define SET_PROTOCOL_UNLOCK() (pthread_mutex_unlock(&g_set_protocol_lock)) + +/** + * Logging macros. + */ +#define CRASH(format, ...) cf_crash(AS_HB, format, ##__VA_ARGS__) +#define CRASH_NOSTACK(format, ...) cf_crash_nostack(AS_HB, format, ##__VA_ARGS__) +#define WARNING(format, ...) cf_warning(AS_HB, format, ##__VA_ARGS__) +#define TICKER_WARNING(format, ...) \ +cf_ticker_warning(AS_HB, format, ##__VA_ARGS__) +#define INFO(format, ...) cf_info(AS_HB, format, ##__VA_ARGS__) +#define DEBUG(format, ...) cf_debug(AS_HB, format, ##__VA_ARGS__) +#define DETAIL(format, ...) cf_detail(AS_HB, format, ##__VA_ARGS__) +#define ASSERT(expression, message, ...) \ +if (!(expression)) {WARNING(message, ##__VA_ARGS__);} + +/* + * ---------------------------------------------------------------------------- + * Private internal data structures + * ---------------------------------------------------------------------------- + */ + +/* + * ---------------------------------------------------------------------------- + * Common + * ---------------------------------------------------------------------------- + */ + +/** + * Heartbeat subsystem state. + */ +typedef enum +{ + AS_HB_STATUS_UNINITIALIZED, + AS_HB_STATUS_RUNNING, + AS_HB_STATUS_SHUTTING_DOWN, + AS_HB_STATUS_STOPPED +} as_hb_status; + +/* + * ---------------------------------------------------------------------------- + * Mesh related + * ---------------------------------------------------------------------------- + */ + +/** + * Mesh node status enum. + */ +typedef enum +{ + /** + * The mesh node has an active channel. + */ + AS_HB_MESH_NODE_CHANNEL_ACTIVE, + + /** + * The mesh node is waiting for an active channel. + */ + AS_HB_MESH_NODE_CHANNEL_PENDING, + + /** + * The mesh node does not have an active channel. + */ + AS_HB_MESH_NODE_CHANNEL_INACTIVE, + + /** + * The ip address and port for this node are not yet known. + */ + AS_HB_MESH_NODE_ENDPOINT_UNKNOWN, + + /** + * The sentinel value. Should be the last in the enum. 
+	 */
+	AS_HB_MESH_NODE_STATUS_SENTINEL
+} as_hb_mesh_node_status;
+
+/**
+ * The info payload for a single node.
+ */
+typedef struct as_hb_mesh_info_reply_s
+{
+	/**
+	 * The nodeid of the node for which this info reply is sent.
+	 */
+	cf_node nodeid;
+
+	/**
+	 * The advertised endpoint list for this node. A flexible array member, to
+	 * allow a variable size endpoint list. Always access as
+	 * reply.endpoint_list[0].
+	 */
+	as_endpoint_list endpoint_list[];
+}__attribute__((__packed__)) as_hb_mesh_info_reply;
+
+/**
+ * Mesh tend reduce function udata.
+ */
+typedef struct as_hb_mesh_tend_reduce_udata_s
+{
+	/**
+	 * The new endpoint lists to connect to. Each list has endpoints for a
+	 * single remote peer.
+	 */
+	as_endpoint_list** to_connect;
+
+	/**
+	 * The capacity of the to_connect array.
+	 */
+	size_t to_connect_capacity;
+
+	/**
+	 * The count of endpoints to connect.
+	 */
+	size_t to_connect_count;
+
+	/**
+	 * Pointers to seeds that need matching.
+	 */
+	cf_vector* inactive_seeds_p;
+} as_hb_mesh_tend_reduce_udata;
+
+/**
+ * Mesh endpoint search udata.
+ */
+typedef struct
+{
+	/**
+	 * The endpoint to search.
+	 */
+	cf_sock_addr* to_search;
+
+	/**
+	 * Indicates if a match was found.
+	 */
+	bool found;
+} as_hb_endpoint_list_addr_find_udata;
+
+/**
+ * Mesh endpoint list search udata.
+ */
+typedef struct as_hb_mesh_endpoint_list_reduce_udata_s
+{
+	/**
+	 * The endpoint to search.
+	 */
+	as_endpoint_list* to_search;
+
+	/**
+	 * Indicates if a match was found.
+	 */
+	bool found;
+
+	/**
+	 * The matched key if found.
+	 */
+	cf_node* matched_nodeid;
+} as_hb_mesh_endpoint_list_reduce_udata;
+
+/**
+ * Information maintained for configured mesh seed nodes.
+ */
+typedef struct as_hb_mesh_seed_s
+{
+	/**
+	 * The name / ip address of this seed mesh host.
+	 */
+	char seed_host_name[HOST_NAME_MAX];
+
+	/**
+	 * The port of this seed mesh host.
+	 */
+	cf_ip_port seed_port;
+
+	/**
+	 * Identifies TLS mesh seed hosts.
+	 */
+	bool seed_tls;
+
+	/**
+	 * The heap allocated endpoint list for this seed host, resolved using the
+	 * seed's hostname. Will be NULL if the hostname cannot be resolved.
+	 */
+	as_endpoint_list* resolved_endpoint_list;
+
+	/**
+	 * Timestamp when the seed hostname was resolved into the endpoint list.
+	 * Used to perform periodic refresh of the endpoint list.
+	 */
+	cf_clock resolved_endpoint_list_ts;
+
+	/**
+	 * The state of this seed in terms of established channel.
+	 */
+	as_hb_mesh_node_status status;
+
+	/**
+	 * The last time the state of this node was updated.
+	 */
+	cf_clock last_status_updated;
+
+	/**
+	 * The node id for a matching mesh node entry. A value of zero indicates
+	 * that no matching mesh node entry exists.
+	 */
+	cf_node mesh_nodeid;
+
+	/**
+	 * Timestamp indicating when the matching mesh node's endpoint was updated.
+	 * Used to detect endpoint changes to the matching mesh node entry if it
+	 * exists.
+	 */
+	as_hlc_timestamp mesh_node_endpoint_change_ts;
+} as_hb_mesh_seed;
+
+/**
+ * Information maintained for discovered mesh endpoints.
+ */
+typedef struct as_hb_mesh_node_s
+{
+	/**
+	 * The heap allocated endpoint list for this mesh host. Should be freed
+	 * once the last mesh entry is removed from the mesh state.
+	 */
+	as_endpoint_list* endpoint_list;
+
+	/**
+	 * Timestamp when the mesh node was last updated.
+	 */
+	as_hlc_timestamp endpoint_change_ts;
+
+	/**
+	 * The state of this node in terms of established channel.
+	 */
+	as_hb_mesh_node_status status;
+
+	/**
+	 * The last time the state of this node was updated.
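+	 * Illustrative timing (derived from the macros above): with e.g. a 150 ms
+	 * heartbeat interval and 10 allowed missed intervals, HB_NODE_TIMEOUT()
+	 * is 1500 ms, and an inactive mesh node is forgotten after
+	 * MESH_INACTIVE_TIMEOUT = 10 * HB_NODE_TIMEOUT() = 15 seconds.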
+	 */
+	cf_clock last_status_updated;
+
+	/**
+	 * The time this node's channel became inactive.
+	 */
+	cf_clock inactive_since;
+} as_hb_mesh_node;
+
+/**
+ * State maintained for the mesh mode.
+ */
+typedef struct as_hb_mesh_state_s
+{
+	/**
+	 * The sockets on which this instance accepts heartbeat tcp connections.
+	 */
+	cf_sockets listening_sockets;
+
+	/**
+	 * Indicates if the published endpoint list is ipv4 only.
+	 */
+	bool published_endpoint_list_ipv4_only;
+
+	/**
+	 * The published endpoint list.
+	 */
+	as_endpoint_list* published_endpoint_list;
+
+	/**
+	 * Mesh seed data.
+	 */
+	cf_vector seeds;
+
+	/**
+	 * A map from a cf_node key to a mesh node.
+	 */
+	cf_shash* nodeid_to_mesh_node;
+
+	/**
+	 * Thread id for the mesh tender thread.
+	 */
+	pthread_t mesh_tender_tid;
+
+	/**
+	 * The status of the mesh module.
+	 */
+	as_hb_status status;
+
+	/**
+	 * The mtu on the listening device. This is extrapolated to all nodes and
+	 * paths in the cluster. This limits the possible cluster size.
+	 */
+	int min_mtu;
+
+	/**
+	 * Indicates if new nodes have been discovered. An optimization to start
+	 * the mesh tend earlier than the normal tend interval when new nodes are
+	 * discovered.
+	 */
+	bool nodes_discovered;
+} as_hb_mesh_state;
+
+/*
+ * ----------------------------------------------------------------------------
+ * Multicast data structures
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * State maintained for the multicast mode.
+ */
+typedef struct as_hb_multicast_state_s
+{
+	/**
+	 * The sockets associated with multicast mode.
+	 */
+	cf_mserv_cfg cfg;
+
+	/**
+	 * Multicast listening sockets.
+	 */
+	cf_sockets listening_sockets;
+
+	/**
+	 * The mtu on the listening device. This is extrapolated to all nodes and
+	 * paths in the cluster. This limits the possible cluster size.
+	 */
+	int min_mtu;
+} as_hb_multicast_state;
+
+/*
+ * ----------------------------------------------------------------------------
+ * Channel state
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * The type of a channel event.
+ */
+typedef enum
+{
+	/**
+	 * The endpoint has a tx/rx channel associated with it.
+	 */
+	AS_HB_CHANNEL_NODE_CONNECTED,
+
+	/**
+	 * The endpoint had a tx/rx channel that went down.
+	 */
+	AS_HB_CHANNEL_NODE_DISCONNECTED,
+
+	/**
+	 * A message was received on a connected channel. The message in the event
+	 * is guaranteed to have passed basic sanity checks, like having a protocol
+	 * id, type and source nodeid.
+	 */
+	AS_HB_CHANNEL_MSG_RECEIVED,
+
+	/**
+	 * Channel found a node whose cluster name does not match.
+	 */
+	AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH
+} as_hb_channel_event_type;
+
+/**
+ * Status for reads from a channel.
+ */
+typedef enum
+{
+	/**
+	 * The message was read successfully and parsed.
+	 */
+	AS_HB_CHANNEL_MSG_READ_SUCCESS,
+
+	/**
+	 * The message was read successfully, but parsing failed.
+	 */
+	AS_HB_CHANNEL_MSG_PARSE_FAIL,
+
+	/**
+	 * The message read failed due to network I/O.
+	 */
+	AS_HB_CHANNEL_MSG_CHANNEL_FAIL,
+
+	/**
+	 * Sentinel default value.
+	 */
+	AS_HB_CHANNEL_MSG_READ_UNDEF
+} as_hb_channel_msg_read_status;
+
+typedef struct
+{
+	/**
+	 * The endpoint address to search channel by.
+	 */
+	as_endpoint_list* endpoint_list;
+
+	/**
+	 * Indicates if the endpoint was found.
+	 */
+	bool found;
+
+	/**
+	 * The matching socket, if found.
+	 */
+	cf_socket* socket;
+} as_hb_channel_endpoint_reduce_udata;
+
+typedef struct
+{
+	/**
+	 * The endpoint address to search channel by.
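+	 * (A single socket address here, as opposed to the endpoint list used by
+	 * the reduce udata above.)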
+ */ + cf_sock_addr* addr_to_search; + + /** + * Indicates if the endpoint was found. + */ + bool found; +} as_hb_channel_endpoint_iterate_udata; + +typedef struct +{ + /** + * The message buffer to send. + */ + uint8_t* buffer; + + /** + * The buffer length. + */ + size_t buffer_len; +} as_hb_channel_buffer_udata; + +/** + * A channel represents a medium to send and receive messages. + */ +typedef struct as_hb_channel_s +{ + /** + * Indicates if this channel is a multicast channel. + */ + bool is_multicast; + + /** + * Indicates if this channel is inbound. Not relevant for multicast + * channels. + */ + bool is_inbound; + + /** + * The id of the associated node. In mesh / unicast case this will initially + * be zero and filled in when the nodeid for the node at the other end is + * learnt. In multicast case this will be zero. + */ + cf_node nodeid; + + /** + * The address of the peer. Will always be specified for outbound channels. + */ + cf_sock_addr endpoint_addr; + + /** + * The last time a message was received from this node. + */ + cf_clock last_received; + + /** + * Time when this channel won a socket resolution. Zero if this channel + * never won resolution. In compatibility mode with older code its possible + * we will keep allowing the same socket to win and enter an infinite loop + * of closing the sockets. + */ + cf_clock resolution_win_ts; +} as_hb_channel; + +/** + * State maintained per heartbeat channel. + */ +typedef struct as_hb_channel_state_s +{ + /** + * The poll handle. All IO wait across all heartbeat connections happens on + * this handle. + */ + cf_poll poll; + + /** + * Channel status. + */ + as_hb_status status; + + /** + * Maps a socket to an as_hb_channel. + */ + cf_shash* socket_to_channel; + + /** + * Maps a nodeid to a channel specific node data structure. This association + * will be made only on receiving the first heartbeat message from the node + * on a channel. + */ + cf_shash* nodeid_to_socket; + + /** + * Sockets accumulated by the channel tender to close at the end of every + * epoll loop. + */ + cf_queue socket_close_queue; + + /** + * The sockets on which heartbeat subsystem listens. + */ + cf_sockets* listening_sockets; + + /** + * Clock to keep track of last time idle connections were checked. + */ + cf_clock last_channel_idle_check; + + /** + * Enables / disables publishing channel events. Events should be disabled + * only when the state changes are temporary / transient and hence would not + * change the overall channel state from an external perspective. + */ + bool events_enabled; + + /** + * Events are batched and published to reduce cluster transitions. Queue of + * unpublished heartbeat events. + */ + cf_queue events_queue; + + /** + * Thread id for the socket tender thread. + */ + pthread_t channel_tender_tid; +} as_hb_channel_state; + +/** + * Entry queued up for socket close. + */ +typedef struct as_hb_channel_socket_close_entry_s +{ + /** + * The node for which this event was generated. + */ + cf_socket* socket; + /** + * Indicates if this close is a remote close. + */ + bool is_remote; + /** + * True if close of this entry should generate a disconnect event. + */ + bool raise_close_event; +} as_hb_channel_socket_close_entry; + +/** + * An event generated by the channel sub module. + */ +typedef struct as_hb_channel_event_s +{ + /** + * The channel event type. + */ + as_hb_channel_event_type type; + + /** + * The node for which this event was generated. + */ + cf_node nodeid; + + /** + * The received message if any over this endpoint. 
Valid for incoming message type events. The message, if not NULL, must
+	 * never be edited or copied over.
+	 */
+	msg* msg;
+
+	/**
+	 * The hlc timestamp for message receipt.
+	 */
+	as_hlc_msg_timestamp msg_hlc_ts;
+} as_hb_channel_event;
+
+/*
+ * ----------------------------------------------------------------------------
+ * Main sub module state
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Heartbeat message types.
+ */
+typedef enum
+{
+	AS_HB_MSG_TYPE_PULSE,
+	AS_HB_MSG_TYPE_INFO_REQUEST,
+	AS_HB_MSG_TYPE_INFO_REPLY,
+	AS_HB_MSG_TYPE_COMPRESSED
+} as_hb_msg_type;
+
+/**
+ * Events published by the heartbeat subsystem.
+ */
+typedef enum
+{
+	AS_HB_INTERNAL_NODE_ARRIVE,
+	AS_HB_INTERNAL_NODE_DEPART,
+	AS_HB_INTERNAL_NODE_EVICT,
+	AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED
+} as_hb_internal_event_type;
+
+/**
+ * State maintained by the heartbeat subsystem for the selected mode.
+ */
+typedef struct as_hb_mode_state_s
+{
+	/**
+	 * The mesh / multicast state.
+	 */
+	union
+	{
+		as_hb_mesh_state mesh_state;
+		as_hb_multicast_state multicast_state;
+	};
+} as_hb_mode_state;
+
+/**
+ * Plugin data iterate reduce udata.
+ */
+typedef struct
+{
+	/**
+	 * The plugin id.
+	 */
+	as_hb_plugin_id pluginid;
+
+	/**
+	 * The iterate function.
+	 */
+	as_hb_plugin_data_iterate_fn iterate_fn;
+
+	/**
+	 * The udata for the iterate function.
+	 */
+	void* udata;
+} as_hb_adjacecny_iterate_reduce_udata;
+
+/**
+ * Information tracked for an adjacent node.
+ */
+typedef struct as_hb_adjacent_node_s
+{
+	/**
+	 * The heartbeat protocol version.
+	 */
+	uint32_t protocol_version;
+
+	/**
+	 * The remote node's endpoint list.
+	 */
+	as_endpoint_list* endpoint_list;
+
+	/**
+	 * Used to cycle between the two copies of plugin data.
+	 */
+	int plugin_data_cycler;
+
+	/**
+	 * Plugin specific data accumulated by the heartbeat subsystem. The data is
+	 * heap allocated and should be destroyed the moment this element entry is
+	 * unused. There are two copies of the plugin data - the current copy and
+	 * the previous copy. The previous copy is used to generate data change
+	 * notifications.
+	 */
+	as_hb_plugin_node_data plugin_data[AS_HB_PLUGIN_SENTINEL][2];
+
+	/**
+	 * The monotonic local time node information was last updated.
+	 */
+	cf_clock last_updated_monotonic_ts;
+
+	/**
+	 * HLC timestamp for the last pulse message.
+	 */
+	as_hlc_msg_timestamp last_msg_hlc_ts;
+
+	/**
+	 * Track the number of consecutive cluster-name mismatches.
+	 */
+	uint32_t cluster_name_mismatch_count;
+
+	/**
+	 * Moving average of the latency in ms.
+	 */
+	uint64_t avg_latency;
+
+	/**
+	 * A shift register tracking change of endpoints. On receipt of a
+	 * heartbeat, if the source node's endpoints have changed, 1 is inserted at
+	 * the LSB, else 0 is inserted at the LSB.
+	 */
+	uint64_t endpoint_change_tracker;
+} as_hb_adjacent_node;
+
+/**
+ * Internal storage for external event listeners.
+ */
+typedef struct as_hb_event_listener_s
+{
+	/**
+	 * Registered callback function.
+	 */
+	as_hb_event_fn event_callback;
+
+	/**
+	 * Arguments for the listeners.
+	 */
+	void* udata;
+} as_hb_event_listener;
+
+/**
+ * Heartbeat subsystem internal state.
+ */
+typedef struct as_hb_s
+{
+	/**
+	 * The status of the subsystem.
+	 */
+	as_hb_status status;
+
+	/**
+	 * The adjacency dictionary. The key is the nodeid. The value is an
+	 * instance of as_hb_adjacent_node.
+	 */
+	cf_shash* adjacency;
+
+	/**
+	 * The probation dictionary, holding nodes that display unexpected
+	 * behavior.
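+	 * For example, a peer whose node-id duplicates another node's is parked
+	 * here rather than in the adjacency list (see as_hb_info_duplicates_get()
+	 * below, which reports probation entries as cluster_duplicate_nodes).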
+ * Nodeids under probation and adjacency hash are always exclusive. The key + * is the nodeid. The value is an instance of as_hb_adjacent_node. + */ + cf_shash* on_probation; + + /** + * Temporary nodeid to index hash used to compute nodes to evict from a + * clique. + */ + cf_shash* nodeid_to_index; + + /** + * The mode specific state. + */ + as_hb_mode_state mode_state; + + /** + * The channel state. + */ + as_hb_channel_state channel_state; + + /** + * Self node accumulated stats used primarily to detect duplicate node-ids. + */ + as_hb_adjacent_node self_node; + + /** + * Indicates self node-id has duplicates. + */ + bool self_is_duplicate; + + /** + * Monotonic timestamp of when a self duplicate was detected. + */ + cf_clock self_duplicate_detected_ts; + + /** + * The plugin dictionary. The key is the as_hb_plugin entry and the value an + * instance of as_hb_plugin. + */ + as_hb_plugin plugins[AS_HB_PLUGIN_SENTINEL]; + + /** + * Thread id for the transmitter thread. + */ + pthread_t transmitter_tid; + + /** + * Thread id for the thread expiring nodes from the adjacency list. + */ + pthread_t adjacency_tender_tid; +} as_hb; + +/** + * Registered heartbeat listeners. + */ +typedef struct as_hb_external_events_s +{ + /** + * Events are batched and published. Queue of unpublished heartbeat events. + */ + cf_queue external_events_queue; + + /** + * Count of event listeners. + */ + int event_listener_count; + + /** + * External event listeners. + */ + as_hb_event_listener event_listeners[AS_HB_EVENT_LISTENER_MAX]; +} as_hb_external_events; + +/** + * Shash reduce function to read current adjacency list. + */ +typedef struct as_hb_adjacency_reduce_udata_s +{ + /** + * The target adjacency list. + */ + cf_node* adj_list; + + /** + * Count of elements in the adjacency list. + */ + int adj_count; +} as_hb_adjacency_reduce_udata; + +/** + * Udata for finding nodes in the adjacency list not in the input succession + * list. + */ +typedef struct +{ + /** + * Number of events generated. + */ + int event_count; + + /** + * List of generated events. + */ + as_hb_event_node* events; + + /** + * Limit on number of generated events. + */ + int max_events; + + /** + * Current succession list. + */ + cf_node* succession; + + /** + * Number of nodes in succession list. + */ + int succession_size; +} as_hb_find_new_nodes_reduce_udata; + +/** + * Shash reduce function to read current adjacency list. + */ +typedef struct as_hb_adjacency_tender_udata_s +{ + /** + * The list of expired nodes. + */ + cf_node* dead_nodes; + + /** + * Count of elements in the dead node list. + */ + int dead_node_count; + + /** + * The list of evicted nodes , e.g. due to cluster name mismatch. + */ + cf_node* evicted_nodes; + + /** + * Count of elements in the evicted node list. + */ + int evicted_node_count; +} as_hb_adjacency_tender_udata; + +/** + * Udata for tip clear. + */ +typedef struct as_hb_mesh_tip_clear_udata_s +{ + /** + * Host IP or DNS name to be cleared from seed list. + */ + char host[HOST_NAME_MAX]; + /** + * Listening port of the host. + */ + int port; + + /** + * Node id if a specific node-id needs to be removed as well. + */ + cf_node nodeid; + + /** + * Tip-clear status + */ + bool entry_deleted; +} as_hb_mesh_tip_clear_udata; + +/** + * Convert endpoint list to string in a process function. + */ +typedef struct endpoint_list_to_string_udata_s +{ + /** + * The endpoint list in string format. + */ + char* endpoint_list_str; + + /** + * The size of enpoint list. 
+ */ + size_t endpoint_list_str_capacity; +} endpoint_list_to_string_udata; + +/** + * Udata to fill an endpoint list into a message. + */ +typedef struct endpoint_list_to_msg_udata_s +{ + /** + * The target message. + */ + msg* msg; + + /** + * Indicates if we are running in mesh mode. + */ + bool is_mesh; +} endpoint_list_to_msg_udata; + +/** + * Udata to test if this endpoint list overlaps with other endpoint list. + */ +typedef struct endpoint_list_equal_check_udata_s +{ + /** + * The endpoint list of the new node. + */ + as_endpoint_list* other; + + /** + * Output. Indicates if the lists are equal. + */ + bool are_equal; +} endpoint_list_equal_check_udata; + +/** + * Endpoint list process function. + * @param endpoint current endpoint in the iteration. + * @param udata udata passed through from the invoker of the iterate function. + */ +typedef void +(*endpoint_list_process_fn)(const as_endpoint_list* endpoint_list, void* udata); + +/** + * Seed host list reduce udata. + */ +typedef struct as_hb_seed_host_list_udata_s +{ + /** + * The buffer to receive the list. + */ + cf_dyn_buf* db; + + /** + * Selects TLS seed nodes. + */ + bool tls; +} as_hb_seed_host_list_udata; + +/* + * ---------------------------------------------------------------------------- + * Globals + * ---------------------------------------------------------------------------- + */ + +/** + * Global heartbeat instance. + */ +static as_hb g_hb; + +/** + * Global heartbeat events listener instance. + */ +static as_hb_external_events g_hb_event_listeners; + +/** + * The big fat lock for all external event publishing. This ensures that a batch + * of external events are published atomically to preserve the order of external + * events. + */ +static pthread_mutex_t g_external_event_publish_lock = + PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * Global lock to serialize all read and writes to the heartbeat subsystem. + */ +static pthread_mutex_t g_hb_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The big fat lock for all channel state. + */ +static pthread_mutex_t g_channel_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The big fat lock for all mesh state. + */ +static pthread_mutex_t g_mesh_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The big fat lock for all multicast state. + */ +static pthread_mutex_t g_multicast_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The global lock for all heartbeat configuration. + */ +static pthread_mutex_t g_hb_config_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The lock used while setting heartbeat protocol. + */ +static pthread_mutex_t g_set_protocol_lock = + PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * Message templates for heartbeat messages. + */ +static msg_template g_hb_msg_template[] = { + +{ AS_HB_MSG_ID, M_FT_UINT32 }, + +{ AS_HB_MSG_TYPE, M_FT_UINT32 }, + +{ AS_HB_MSG_NODE, M_FT_UINT64 }, + +{ AS_HB_MSG_CLUSTER_NAME, M_FT_STR }, + +{ AS_HB_MSG_HLC_TIMESTAMP, M_FT_UINT64 }, + +{ AS_HB_MSG_ENDPOINTS, M_FT_BUF }, + +{ AS_HB_MSG_COMPRESSED_PAYLOAD, M_FT_BUF }, + +{ AS_HB_MSG_INFO_REQUEST, M_FT_BUF }, + +{ AS_HB_MSG_INFO_REPLY, M_FT_BUF }, + +{ AS_HB_MSG_FABRIC_DATA, M_FT_BUF }, + +{ AS_HB_MSG_HB_DATA, M_FT_BUF }, + +{ AS_HB_MSG_PAXOS_DATA, M_FT_BUF }, + +{ AS_HB_MSG_SKEW_MONITOR_DATA, M_FT_UINT64 } }; + +/* + * ---------------------------------------------------------------------------- + * Private internal function forward declarations. 
+ * ---------------------------------------------------------------------------- + */ + +static void info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list); +static uint32_t round_up_pow2(uint32_t v); +static int vector_find(cf_vector* vector, const void* element); + +static void endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src); +static void endpoint_list_to_string_process(const as_endpoint_list* endpoint_list, void* udata); +static void endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata); + +static int msg_compression_threshold(int mtu); +static int msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list); +static int msg_id_get(msg* msg, uint32_t* id); +static int msg_nodeid_get(msg* msg, cf_node* nodeid); +static int msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts); +static int msg_type_get(msg* msg, as_hb_msg_type* type); +static int msg_cluster_name_get(msg* msg, char** cluster_name); +static int msg_node_list_get(msg* msg, int field_id, cf_node** adj_list, size_t* adj_length); +static int msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length); +static void msg_node_list_set(msg* msg, int field_id, cf_node* node_list, size_t node_length); +static void msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length); +static int msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count); +static void msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list, void* udata); +static void msg_src_fields_fill(msg* msg); +static void msg_type_set(msg* msg, as_hb_msg_type msg_type); + +static int config_mcsize(); +static const cf_serv_cfg* config_bind_cfg_get(); +static const cf_mserv_cfg* config_multicast_group_cfg_get(); +static uint32_t config_tx_interval_get(); +static void config_tx_interval_set(uint32_t new_interval); +static uint32_t config_override_mtu_get(); +static void config_override_mtu_set(uint32_t mtu); +static uint32_t config_max_intervals_missed_get(); +static void config_max_intervals_missed_set(uint32_t new_max); +static unsigned char config_multicast_ttl_get(); +static as_hb_protocol config_protocol_get(); +static void config_protocol_set(as_hb_protocol new_protocol); +static cf_node config_self_nodeid_get(); +static as_hb_mode config_mode_get(); +static void config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg, cf_serv_cfg* published_cfg, bool ipv4_only); +static bool config_binding_is_valid(char** error, as_hb_protocol protocol); + +static void channel_init_channel(as_hb_channel* channel); +static void channel_event_init(as_hb_channel_event* event); +static bool channel_is_running(); +static bool channel_is_stopped(); +static uint32_t channel_win_grace_ms(); +static void channel_events_enabled_set(bool enabled); +static bool channel_are_events_enabled(); +static void channel_event_queue(as_hb_channel_event* event); +static void channel_event_publish_pending(); +static int channel_get_channel(cf_socket* socket, as_hb_channel* result); +static void channel_socket_shutdown(cf_socket* socket); +static int channel_socket_get(cf_node nodeid, cf_socket** socket); +static bool channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find); +static void channel_socket_destroy(cf_socket* sock); +static void channel_socket_close(cf_socket* socket, bool remote_close, bool raise_close_event); +static void channel_sockets_close(cf_vector* sockets); +static void channel_socket_close_queue(cf_socket* socket, bool is_remote_close, bool 
raise_close_event); +static void channel_socket_close_pending(); +static void channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound, cf_sock_addr* endpoint_addr); +static void channel_accept_connection(cf_socket* lsock); +static as_hb_channel_msg_read_status channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len); +static void channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* udata); +static int channel_endpoint_search_reduce(const void* key, void* data, void* udata); +static bool channel_endpoint_is_connected(as_endpoint_list* endpoint_list); +static as_hb_channel_msg_read_status channel_multicast_msg_read(cf_socket* socket, msg* msg); +static as_hb_channel_msg_read_status channel_mesh_msg_read(cf_socket* socket, msg* msg); +static void channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid); +static bool channel_socket_should_live(cf_socket* socket, as_hb_channel* channel); +static cf_socket* channel_socket_resolve(cf_socket* socket1, cf_socket* socket2); +static int channel_msg_sanity_check(as_hb_channel_event* msg_event); +static int channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event); +static void channel_msg_read(cf_socket* socket); +static void channel_channels_idle_check(); +void* channel_tender(void* arg); +static bool channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata); +static void channel_mesh_channel_establish(as_endpoint_list** endpoint_lists, int endpoint_list_count); +static int channel_node_disconnect(cf_node nodeid); +static void channel_mesh_listening_socks_register(cf_sockets* listening_sockets); +static void channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets); +static void channel_multicast_listening_socks_register(cf_sockets* listening_sockets); +static void channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets); +static void channel_init(); +static void channel_start(); +static int channel_sockets_get_reduce(const void* key, void* data, void* udata); +static void channel_stop(); +static int channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length); +static int channel_multicast_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length); +static bool channel_msg_is_compression_required(msg* msg, int wire_size, int mtu); +static int channel_msg_buffer_size_get(int wire_size, int mtu); +static size_t channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu, uint8_t* buffer, size_t buffer_len); +static int channel_msg_unicast(cf_node dest, msg* msg); +static int channel_msg_broadcast_reduce(const void* key, void* data, void* udata); +static int channel_msg_broadcast(msg* msg); +static void channel_clear(); +static int channel_dump_reduce(const void* key, void* data, void* udata); +static void channel_dump(bool verbose); + +static bool mesh_is_running(); +static bool mesh_is_stopped(); +static void mesh_published_endpoints_process(endpoint_list_process_fn process_fn, void* udata); +static const char* mesh_node_status_string(as_hb_mesh_node_status status); +static int mesh_seed_delete_unsafe(int seed_index); +static int mesh_seed_find_unsafe(char* host, int port); +static void mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata, int mesh_node_count); +static void mesh_node_status_change(as_hb_mesh_node* mesh_node, as_hb_mesh_node_status new_status); +static void mesh_listening_sockets_close(); +static void mesh_seed_host_list_get(cf_dyn_buf* 
db, bool tls); +static void mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p); +static void mesh_stop(); +static int mesh_tend_reduce(const void* key, void* data, void* udata); +void* mesh_tender(void* arg); +static void mesh_node_destroy(as_hb_mesh_node* mesh_node); +static void mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata); +static bool mesh_node_is_discovered(cf_node nodeid); +static bool mesh_node_endpoint_list_is_valid(cf_node nodeid); +static int mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node); +static void mesh_channel_on_node_disconnect(as_hb_channel_event* event); +static bool mesh_node_check_fix_self_msg(as_hb_channel_event* event); +static void mesh_node_data_update(as_hb_channel_event* event); +static int mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count, size_t* reply_size); +static void mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply, size_t reply_count); +static msg* mesh_info_msg_init(as_hb_msg_type msg_type); +static void mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover, size_t to_discover_count); +static void mesh_channel_on_pulse(msg* msg); +static void mesh_channel_on_info_request(msg* msg); +static void mesh_channel_on_info_reply(msg* msg); +static int mesh_tip(char* host, int port, bool tls); +static void mesh_channel_event_process(as_hb_channel_event* event); +static void mesh_init(); +static int mesh_free_node_data_reduce(const void* key, void* data, void* udata); +static int mesh_tip_clear_reduce(const void* key, void* data, void* udata); +static int mesh_peer_endpoint_reduce(const void* key, void* data, void* udata); +static void mesh_clear(); +static void mesh_listening_sockets_open(); +static void mesh_start(); +static int mesh_dump_reduce(const void* key, void* data, void* udata); +static void mesh_dump(bool verbose); + +static void multicast_init(); +static void multicast_clear(); +static void multicast_listening_sockets_open(); +static void multicast_start(); +static void multicast_listening_sockets_close(); +static void multicast_stop(); +static void multicast_dump(bool verbose); +static int multicast_supported_cluster_size_get(); + +static bool hb_is_initialized(); +static bool hb_is_running(); +static bool hb_is_stopped(); +static void hb_mode_init(); +static void hb_mode_start(); +static int hb_mtu(); +static void hb_msg_init(); +static uint32_t hb_protocol_identifier_get(); +static cf_clock hb_node_depart_time(cf_clock detect_time); +static bool hb_is_mesh(); +static void hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes, int node_count); +static void hb_event_publish_pending(); +static int hb_adjacency_free_data_reduce(const void* key, void* data, void* udata); +static void hb_clear(); +static int hb_adjacency_iterate_reduce(const void* key, void* data, void* udata); +static void hb_plugin_set_fn(msg* msg); +static void hb_plugin_parse_data_fn(msg* msg, cf_node source, as_hb_plugin_node_data* plugin_data); +static msg* hb_msg_get(); +static void hb_msg_return(msg* msg); +static void hb_plugin_msg_fill(msg* msg); +static void hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node, as_hb_plugin* plugins, bool plugin_data_changed[]); +static void hb_plugin_init(); +void* hb_transmitter(void* arg); +static int hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node); +static void hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node, as_hb_plugin_id plugin_id, void** 
plugin_data, size_t* plugin_data_size); +static void hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node, cf_node** adjacency_list, size_t* adjacency_length); +static bool hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node); +static bool hb_self_is_duplicate(); +static void hb_self_duplicate_update(); +static void hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node); +static int hb_adjacency_tend_reduce(const void* key, void* data, void* udata); +void* hb_adjacency_tender(void* arg); +static void hb_tx_start(); +static void hb_tx_stop(); +static void hb_adjacency_tender_start(); +static void hb_adjacency_tender_stop(); +static void hb_init(); +static void hb_start(); +static void hb_stop(); +static void hb_plugin_register(as_hb_plugin* plugin); +static bool hb_msg_is_obsolete(as_hb_channel_event* event, as_hlc_timestamp send_ts); +static void hb_endpoint_change_tracker_update(uint64_t* tracker, bool endpoint_changed); +static bool hb_endpoint_change_tracker_is_normal(uint64_t tracker); +static bool hb_endpoint_change_tracker_has_changed(uint64_t tracker); +static void hb_adjacent_node_update(as_hb_channel_event* msg_event, as_hb_adjacent_node* adjacent_node, bool plugin_data_changed[]); +static bool hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node); +static void hb_channel_on_self_pulse(as_hb_channel_event* msg_event); +static void hb_channel_on_pulse(as_hb_channel_event* msg_event); +static void hb_channel_on_msg_rcvd(as_hb_channel_event* event); +static void hb_handle_cluster_name_mismatch(as_hb_channel_event* event); +static void hb_channel_event_process(as_hb_channel_event* event); +static void hb_mode_dump(bool verbose); +static int hb_dump_reduce(const void* key, void* data, void* udata); +static void hb_dump(bool verbose); +static void hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph); +static void hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict); +static int hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata); +static void hb_plugin_data_iterate_all(as_hb_plugin_id pluginid, + as_hb_plugin_data_iterate_fn iterate_fn, void* udata); + +/* + * ---------------------------------------------------------------------------- + * Public functions. + * ---------------------------------------------------------------------------- + */ +/** + * Initialize the heartbeat subsystem. + */ +void +as_hb_init() +{ + // Initialize hb subsystem. + hb_init(); + + // Add the mesh seed nodes. + // Using one time seed config outside the config module. + if (hb_is_mesh()) { + for (int i = 0; i < AS_CLUSTER_SZ; i++) { + if (g_config.hb_config.mesh_seed_addrs[i]) { + mesh_tip(g_config.hb_config.mesh_seed_addrs[i], + g_config.hb_config.mesh_seed_ports[i], + g_config.hb_config.mesh_seed_tls[i]); + } + else { + break; + } + } + } +} + +/** + * Start the heartbeat subsystem. + */ +void +as_hb_start() +{ + hb_start(); +} + +/** + * Shut down the heartbeat subsystem. + */ +void +as_hb_shutdown() +{ + hb_stop(); +} + +/** + * Indicates if self node is a duplicate + */ +bool +as_hb_self_is_duplicate() +{ + return hb_self_is_duplicate(); +} + +/** + * Free the data structures of heart beat. + */ +void +as_hb_destroy() +{ + // Destroy the main module. + hb_clear(); +} + +/** + * Return a string representation of a heartbeat protocol type. 
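+ * For example, AS_HB_PROTOCOL_V3 yields "v3".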
+ *
+ * @param protocol the protocol for which the string is computed
+ * @param protocol_s string representation of the protocol
+ */
+void
+as_hb_protocol_get_s(as_hb_protocol protocol, char* protocol_s)
+{
+	char *str;
+	switch (protocol) {
+	case AS_HB_PROTOCOL_V3:
+		str = "v3";
+		break;
+	case AS_HB_PROTOCOL_NONE:
+		str = "none";
+		break;
+	case AS_HB_PROTOCOL_RESET:
+		str = "reset";
+		break;
+	default:
+		str = "undefined";
+	}
+
+	sprintf(protocol_s, "%s", str);
+}
+
+/**
+ * Get heartbeat protocol version.
+ */
+as_hb_protocol
+as_hb_protocol_get()
+{
+	return config_protocol_get();
+}
+
+/**
+ * Set heartbeat protocol version.
+ */
+int
+as_hb_protocol_set(as_hb_protocol new_protocol)
+{
+	SET_PROTOCOL_LOCK();
+	int rv = 0;
+	if (config_protocol_get() == new_protocol) {
+		INFO("no heartbeat protocol change needed");
+		rv = 0;
+		goto Exit;
+	}
+	char old_protocol_s[HB_PROTOCOL_STR_MAX_LEN];
+	char new_protocol_s[HB_PROTOCOL_STR_MAX_LEN];
+	as_hb_protocol_get_s(config_protocol_get(), old_protocol_s);
+	as_hb_protocol_get_s(new_protocol, new_protocol_s);
+	switch (new_protocol) {
+	case AS_HB_PROTOCOL_V3:
+		if (hb_is_running()) {
+			INFO("disabling current heartbeat protocol %s", old_protocol_s);
+			hb_stop();
+		}
+		INFO("setting heartbeat protocol version number to %s", new_protocol_s);
+		config_protocol_set(new_protocol);
+		hb_start();
+		INFO("heartbeat protocol version set to %s", new_protocol_s);
+		break;
+
+	case AS_HB_PROTOCOL_NONE:
+		INFO("setting heartbeat protocol version to none");
+		hb_stop();
+		config_protocol_set(new_protocol);
+		INFO("heartbeat protocol set to none");
+		break;
+
+	case AS_HB_PROTOCOL_RESET:
+		if (config_protocol_get() == AS_HB_PROTOCOL_NONE) {
+			INFO("heartbeat messaging disabled - not resetting");
+			rv = -1;
+			goto Exit;
+		}
+
+		// NB: "protocol" is never actually set to "RESET" - it is simply a
+		// trigger for the reset action.
+		INFO("resetting heartbeat messaging");
+
+		hb_stop();
+
+		hb_clear();
+
+		hb_start();
+
+		break;
+
+	default:
+		WARNING("unknown heartbeat protocol version number: %d", new_protocol);
+		rv = -1;
+		goto Exit;
+	}
+
+Exit:
+	SET_PROTOCOL_UNLOCK();
+	return rv;
+}
+
+/**
+ * Register a heartbeat plugin.
+ */
+void
+as_hb_plugin_register(as_hb_plugin* plugin)
+{
+	if (!hb_is_initialized()) {
+		WARNING(
+				"main heartbeat module uninitialized - not registering the plugin");
+		return;
+	}
+	hb_plugin_register(plugin);
+}
+
+/**
+ * Register a heartbeat node event listener.
+ */
+void
+as_hb_register_listener(as_hb_event_fn event_callback, void* udata)
+{
+	if (!hb_is_initialized()) {
+		WARNING(
+				"main heartbeat module uninitialized - not registering the listener");
+		return;
+	}
+
+	HB_LOCK();
+
+	if (g_hb_event_listeners.event_listener_count >=
+			AS_HB_EVENT_LISTENER_MAX) {
+		CRASH("cannot register more than %d event listeners",
+				AS_HB_EVENT_LISTENER_MAX);
+	}
+
+	g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].event_callback =
+			event_callback;
+	g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].udata =
+			udata;
+	g_hb_event_listeners.event_listener_count++;
+
+	HB_UNLOCK();
+}
+
+/**
+ * Validate heartbeat config.
+ */
+void
+as_hb_config_validate()
+{
+	char *error;
+	// Validate clustering and heartbeat version compatibility.
+	as_hb_protocol hb_protocol = config_protocol_get();
+
+	if (hb_protocol != AS_HB_PROTOCOL_V3
+			&& hb_protocol != AS_HB_PROTOCOL_NONE) {
+		CRASH_NOSTACK("clustering protocol v5 requires heartbeat version v3");
+	}
+
+	if (!config_binding_is_valid(&error, hb_protocol)) {
+		CRASH_NOSTACK("%s", error);
+	}
+}
+
+/**
+ * Override the computed MTU for the network interface used by heartbeat.
+ */
+void
+as_hb_override_mtu_set(int mtu)
+{
+	config_override_mtu_set(mtu);
+}
+
+/**
+ * Get the heartbeat pulse transmit interval.
+ */
+uint32_t
+as_hb_tx_interval_get()
+{
+	return config_tx_interval_get();
+}
+
+/**
+ * Set the heartbeat pulse transmit interval.
+ */
+int
+as_hb_tx_interval_set(uint32_t new_interval)
+{
+	if (new_interval < AS_HB_TX_INTERVAL_MS_MIN
+			|| new_interval > AS_HB_TX_INTERVAL_MS_MAX) {
+		WARNING("heartbeat interval must be >= %u and <= %u - ignoring %u",
+				AS_HB_TX_INTERVAL_MS_MIN, AS_HB_TX_INTERVAL_MS_MAX,
+				new_interval);
+		return (-1);
+	}
+	config_tx_interval_set(new_interval);
+	return (0);
+}
+
+/**
+ * Set the maximum number of missed heartbeat intervals after which a node is
+ * considered expired.
+ */
+int
+as_hb_max_intervals_missed_set(uint32_t new_max)
+{
+	if (new_max < AS_HB_MAX_INTERVALS_MISSED_MIN) {
+		WARNING("heartbeat timeout must be >= %u - ignoring %u",
+				AS_HB_MAX_INTERVALS_MISSED_MIN, new_max);
+		return (-1);
+	}
+	config_max_intervals_missed_set(new_max);
+	return (0);
+}
+
+/**
+ * Get the timeout interval to consider a node dead / expired in milliseconds
+ * if no heartbeat pulse messages are received.
+ */
+uint32_t
+as_hb_node_timeout_get()
+{
+	return HB_NODE_TIMEOUT();
+}
+
+/**
+ * Populate the buffer with heartbeat configuration.
+ */
+void
+as_hb_info_config_get(cf_dyn_buf* db)
+{
+	if (hb_is_mesh()) {
+		info_append_string(db, "heartbeat.mode", "mesh");
+		info_append_addrs(db, "heartbeat.address", &g_config.hb_serv_spec.bind);
+		info_append_uint32(db, "heartbeat.port",
+				(uint32_t)g_config.hb_serv_spec.bind_port);
+		mesh_seed_host_list_get(db, false);
+		info_append_addrs(db, "heartbeat.tls-address",
+				&g_config.hb_tls_serv_spec.bind);
+		info_append_uint32(db, "heartbeat.tls-port",
+				g_config.hb_tls_serv_spec.bind_port);
+		info_append_string_safe(db, "heartbeat.tls-name",
+				g_config.hb_tls_serv_spec.tls_our_name);
+		mesh_seed_host_list_get(db, true);
+	}
+	else {
+		info_append_string(db, "heartbeat.mode", "multicast");
+		info_append_addrs(db, "heartbeat.address", &g_config.hb_serv_spec.bind);
+		info_append_addrs(db, "heartbeat.multicast-group",
+				&g_config.hb_multicast_groups);
+		info_append_uint32(db, "heartbeat.port",
+				(uint32_t)g_config.hb_serv_spec.bind_port);
+	}
+
+	info_append_uint32(db, "heartbeat.interval", config_tx_interval_get());
+	info_append_uint32(db, "heartbeat.timeout",
+			config_max_intervals_missed_get());
+
+	info_append_int(db, "heartbeat.mtu", hb_mtu());
+
+	char protocol_s[HB_PROTOCOL_STR_MAX_LEN];
+	as_hb_protocol_get_s(config_protocol_get(), protocol_s);
+
+	info_append_string(db, "heartbeat.protocol", protocol_s);
+}
+
+/**
+ * Populate heartbeat endpoints.
+ */
+void
+as_hb_info_endpoints_get(cf_dyn_buf* db)
+{
+	const cf_serv_cfg *cfg = config_bind_cfg_get();
+
+	if (cfg->n_cfgs == 0) {
+		// Will never happen in practice.
+ return; + } + + info_append_int(db, "heartbeat.port", g_config.hb_serv_spec.bind_port); + + char *string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT); + info_append_string(db, "heartbeat.addresses", string); + cf_free(string); + + info_append_int(db, "heartbeat.tls-port", + g_config.hb_tls_serv_spec.bind_port); + + string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT_TLS); + info_append_string(db, "heartbeat.tls-addresses", string); + cf_free(string); + + if (hb_is_mesh()) { + MESH_LOCK(); + cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, + mesh_peer_endpoint_reduce, db); + MESH_UNLOCK(); + } + else { + // Output multicast groups. + const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get(); + if (multicast_cfg->n_cfgs == 0) { + return; + } + + cf_dyn_buf_append_string(db, "heartbeat.multicast-groups="); + uint32_t count = 0; + for (uint32_t i = 0; i < multicast_cfg->n_cfgs; ++i) { + if (count > 0) { + cf_dyn_buf_append_char(db, ','); + } + + cf_dyn_buf_append_string(db, + cf_ip_addr_print(&multicast_cfg->cfgs[i].addr)); + ++count; + } + cf_dyn_buf_append_char(db, ';'); + } +} + +/** + * Generate a string for listening address and port in format ip_address:port + * and return the heartbeat mode. + * + * @param mode (output) current heartbeat subsystem mode. + * @param addr_port (output) listening ip address and port formatted as + * ip_address:port + * @param addr_port_capacity the capacity of the addr_port input. + */ +void +as_hb_info_listen_addr_get(as_hb_mode* mode, char* addr_port, + size_t addr_port_capacity) +{ + *mode = hb_is_mesh() ? AS_HB_MODE_MESH : AS_HB_MODE_MULTICAST; + if (hb_is_mesh()) { + endpoint_list_to_string_udata udata; + udata.endpoint_list_str = addr_port; + udata.endpoint_list_str_capacity = addr_port_capacity; + mesh_published_endpoints_process(endpoint_list_to_string_process, + &udata); + } + else { + const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get(); + + char* write_ptr = addr_port; + int remaining = addr_port_capacity; + + // Ensure we leave space for the terminating NULL delimiter. + for (int i = 0; i < multicast_cfg->n_cfgs && remaining > 1; i++) { + cf_sock_addr temp; + cf_ip_addr_copy(&multicast_cfg->cfgs[i].addr, &temp.addr); + temp.port = multicast_cfg->cfgs[i].port; + int rv = cf_sock_addr_to_string(&temp, write_ptr, remaining); + if (rv <= 0) { + // We exhausted the write buffer. + // Ensure NULL termination. + addr_port[addr_port_capacity - 1] = 0; + return; + } + + write_ptr += rv; + remaining -= rv; + + if (i != multicast_cfg->n_cfgs - 1 && remaining > 1) { + *write_ptr = ','; + write_ptr++; + remaining--; + } + } + + // Ensure NULL termination. + *write_ptr = 0; + } +} + +/** + * Populate the buffer with duplicate nodeids. 
+ */
+void
+as_hb_info_duplicates_get(cf_dyn_buf* db)
+{
+	cf_dyn_buf_append_string(db, "cluster_duplicate_nodes=");
+
+	HB_LOCK();
+	bool self_is_duplicate = hb_self_is_duplicate();
+	int num_probation = cf_shash_get_size(g_hb.on_probation);
+	cf_node duplicate_list[num_probation + 1];
+
+	if (!self_is_duplicate && num_probation == 0) {
+		cf_dyn_buf_append_string(db, "null");
+		goto Exit;
+	}
+
+	as_hb_adjacency_reduce_udata probation_reduce_udata = { duplicate_list, 0 };
+
+	cf_shash_reduce(g_hb.on_probation, hb_adjacency_iterate_reduce,
+			&probation_reduce_udata);
+
+	if (hb_self_is_duplicate()) {
+		duplicate_list[probation_reduce_udata.adj_count++] =
+				config_self_nodeid_get();
+	}
+
+	int num_duplicates = probation_reduce_udata.adj_count;
+	qsort(duplicate_list, num_duplicates, sizeof(cf_node),
+			cf_node_compare_desc);
+
+	for (int i = 0; i < num_duplicates; i++) {
+		cf_dyn_buf_append_uint64_x(db, duplicate_list[i]);
+		cf_dyn_buf_append_char(db, ',');
+	}
+	cf_dyn_buf_chomp(db);
+
+Exit:
+	HB_UNLOCK();
+	cf_dyn_buf_append_char(db, ';');
+}
+
+/*
+ * -----------------------------------------------------------------
+ * Mesh mode public API
+ * -----------------------------------------------------------------
+ */
+
+/**
+ * Add an aerospike instance to the mesh seed list.
+ */
+int
+as_hb_mesh_tip(char* host, int port, bool tls)
+{
+	if (!hb_is_mesh()) {
+		WARNING("tip not applicable for multicast");
+		return (-1);
+	}
+
+	return mesh_tip(host, port, tls);
+}
+
+/**
+ * Remove a mesh node instance from the mesh list.
+ */
+int
+as_hb_mesh_tip_clear(char* host, int port)
+{
+	if (!hb_is_mesh()) {
+		WARNING("tip clear not applicable for multicast");
+		return (-1);
+	}
+
+	if (host == NULL || host[0] == '\0'
+			|| strnlen(host, HOST_NAME_MAX) == HOST_NAME_MAX) {
+		WARNING("incorrect host or port");
+		return (-1);
+	}
+
+	MESH_LOCK();
+	DETAIL("executing tip clear for %s:%d", host, port);
+
+	// FIXME: Removing the mesh host entry and closing the channel was
+	// apparently done to meet AER-5241. tip-clear is not a mechanism to throw
+	// a connected node out of the cluster, and we should not be required to
+	// use it that way now. tip-clear should only be used to clean up the seed
+	// list after decommissioning an ip.
+	as_hb_mesh_tip_clear_udata mesh_tip_clear_reduce_udata;
+	strncpy(mesh_tip_clear_reduce_udata.host, host, HOST_NAME_MAX);
+	mesh_tip_clear_reduce_udata.port = port;
+	mesh_tip_clear_reduce_udata.entry_deleted = false;
+	mesh_tip_clear_reduce_udata.nodeid = 0;
+
+	int seed_index = mesh_seed_find_unsafe(host, port);
+	if (seed_index >= 0) {
+		as_hb_mesh_seed* seed = cf_vector_getp(
+				&g_hb.mode_state.mesh_state.seeds, seed_index);
+		mesh_tip_clear_reduce_udata.nodeid = seed->mesh_nodeid;
+	}
+
+	// Refresh the mapping between the seeds and the mesh hosts.
+	mesh_seed_inactive_refresh_get_unsafe(NULL);
+	cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+			mesh_tip_clear_reduce, &mesh_tip_clear_reduce_udata);
+
+	// Remove the seed entry in case we do not find a matching mesh entry.
+	// Will happen trivially if this seed could not be connected.
+	mesh_tip_clear_reduce_udata.entry_deleted |= mesh_seed_delete_unsafe(
+			mesh_seed_find_unsafe(host, port)) == 0;
+
+	MESH_UNLOCK();
+	return mesh_tip_clear_reduce_udata.entry_deleted ? 0 : -1;
+}
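+
+/*
+ * Illustrative sketch (not part of this change): a hypothetical caller - for
+ * example an info command handler - seeding mesh with a peer at the
+ * documentation address 192.0.2.10:3002, without TLS:
+ *
+ *   if (as_hb_mesh_tip("192.0.2.10", 3002, false) != 0) {
+ *       // Seeding failed - e.g. heartbeat is running in multicast mode.
+ *   }
+ *
+ *   // Later, after decommissioning that ip, remove the seed entry:
+ *   as_hb_mesh_tip_clear("192.0.2.10", 3002);
+ */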
+
+/**
+ * Clear the entire mesh list.
+ */
+int
+as_hb_mesh_tip_clear_all(uint32_t* cleared)
+{
+	if (!hb_is_mesh()) {
+		WARNING("tip clear not applicable for multicast");
+		return (-1);
+	}
+
+	MESH_LOCK();
+	*cleared = cf_shash_get_size(
+			g_hb.mode_state.mesh_state.nodeid_to_mesh_node);
+
+	// Refresh the mapping between the seeds and the mesh hosts.
+	mesh_seed_inactive_refresh_get_unsafe(NULL);
+	cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+			mesh_tip_clear_reduce, NULL);
+
+	// Remove all entries that did not have a matching mesh endpoint.
+	cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+	int element_count = cf_vector_size(seeds);
+	for (int i = 0; i < element_count; i++) {
+		if (mesh_seed_delete_unsafe(i) == 0) {
+			i--;
+			element_count--;
+		}
+		else {
+			// Should not happen in practice.
+			as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
+			CRASH("error deleting mesh seed entry %s:%d", seed->seed_host_name,
+					seed->seed_port);
+		}
+	}
+
+	MESH_UNLOCK();
+	return (0);
+}
+
+/**
+ * Read the plugin data for a node in the adjacency list. The plugin_data->data
+ * input param should be pre allocated and plugin_data->data_capacity should
+ * indicate its capacity.
+ *
+ * @param nodeid the node id
+ * @param plugin the plugin identifier.
+ * @param plugin_data (input/output) on success plugin_data->data will be the
+ * plugin's data for the node and plugin_data->data_size will be the data size.
+ * plugin_data->data_size will be 0 if there is no plugin data for the node.
+ * @param msg_hlc_ts (output) if not NULL will be filled with the timestamp of
+ * when the hb message for this data was received.
+ * @param recv_monotonic_ts (output) if not NULL will be filled with monotonic
+ * wall clock receive timestamp for this plugin data.
+ * @return 0 on success and -1 on error, where errno will be set to ENOENT if
+ * there is no entry for this node and ENOMEM if the input plugin data's
+ * capacity is less than plugin's data. In ENOMEM case plugin_data->data_size
+ * will be set to the required capacity.
+ */
+int
+as_hb_plugin_data_get(cf_node nodeid, as_hb_plugin_id plugin,
+		as_hb_plugin_node_data* plugin_data, as_hlc_msg_timestamp* msg_hlc_ts,
+		cf_clock* recv_monotonic_ts)
+{
+	int rv = 0;
+
+	HB_LOCK();
+
+	as_hb_adjacent_node adjacent_node;
+	if (hb_adjacent_node_get(nodeid, &adjacent_node) != 0) {
+		rv = -1;
+		plugin_data->data_size = 0;
+		errno = ENOENT;
+		goto Exit;
+	}
+
+	as_hb_plugin_node_data* plugin_data_internal =
+			&adjacent_node.plugin_data[plugin][adjacent_node.plugin_data_cycler
+					% 2];
+
+	if (plugin_data_internal->data && plugin_data_internal->data_size) {
+		// Set the plugin data size.
+		plugin_data->data_size = plugin_data_internal->data_size;
+
+		if (plugin_data_internal->data_size > plugin_data->data_capacity) {
+			rv = -1;
+			errno = ENOMEM;
+			goto Exit;
+		}
+
+		// Copy over the stored copy of the plugin data.
+		memcpy(plugin_data->data, plugin_data_internal->data,
+				plugin_data_internal->data_size);
+
+		// Copy the message timestamp.
+		if (msg_hlc_ts) {
+			memcpy(msg_hlc_ts, &adjacent_node.last_msg_hlc_ts,
+					sizeof(as_hlc_msg_timestamp));
+		}
+
+		if (recv_monotonic_ts) {
+			*recv_monotonic_ts = adjacent_node.last_updated_monotonic_ts;
+		}
+
+		rv = 0;
+	}
+	else {
+		// No plugin data set.
+		plugin_data->data_size = 0;
+		if (recv_monotonic_ts) {
+			*recv_monotonic_ts = 0;
+		}
+		if (msg_hlc_ts) {
+			memset(msg_hlc_ts, 0, sizeof(as_hlc_msg_timestamp));
+		}
+		rv = 0;
+	}
+
+Exit:
+	HB_UNLOCK();
+	return rv;
+}
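+
+/*
+ * Illustrative sketch (not part of this change): the ENOMEM contract of
+ * as_hb_plugin_data_get implies a grow-and-retry pattern for callers. The
+ * nodeid and plugin id below are hypothetical.
+ *
+ *   as_hb_plugin_node_data plugin_data = { 0 };
+ *   plugin_data.data_capacity = 1024;
+ *   plugin_data.data = cf_malloc(plugin_data.data_capacity);
+ *
+ *   while (as_hb_plugin_data_get(nodeid, plugin_id, &plugin_data, NULL,
+ *           NULL) != 0) {
+ *       if (errno != ENOMEM) {
+ *           break; // e.g. ENOENT - node not in the adjacency list.
+ *       }
+ *       // data_size now holds the required capacity - grow and retry.
+ *       plugin_data.data_capacity = plugin_data.data_size;
+ *       plugin_data.data = cf_realloc(plugin_data.data,
+ *               plugin_data.data_capacity);
+ *   }
+ */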
+
+/**
+ * Call the iterate method on plugin data for all nodes in the input vector.
+ * The iterate function will be invoked for all nodes in the input vector even
+ * if they are not in the adjacency list or they have no plugin data. Plugin
+ * data will be NULL with size zero in such cases.
+ *
+ * @param nodes the nodes to iterate on.
+ * @param plugin the plugin identifier.
+ * @param iterate_fn the iterate function invoked for plugin data for every
+ * node.
+ * @param udata passed as is to the iterate function. Useful for getting results
+ * out of the iteration.
+ */
+void
+as_hb_plugin_data_iterate(cf_vector* nodes, as_hb_plugin_id plugin,
+		as_hb_plugin_data_iterate_fn iterate_fn, void* udata)
+
+{
+	HB_LOCK();
+
+	int size = cf_vector_size(nodes);
+
+	for (int i = 0; i < size; i++) {
+		cf_node* nodeid = cf_vector_getp(nodes, i);
+
+		if (nodeid == NULL || *nodeid == 0) {
+			continue;
+		}
+
+		as_hb_adjacent_node nodeinfo;
+
+		if (hb_adjacent_node_get(*nodeid, &nodeinfo) == 0) {
+			size_t data_size = 0;
+			void* data = NULL;
+
+			hb_adjacent_node_plugin_data_get(&nodeinfo, plugin, &data,
+					&data_size);
+
+			iterate_fn(*nodeid, data, data_size,
+					nodeinfo.last_updated_monotonic_ts,
+					&nodeinfo.last_msg_hlc_ts, udata);
+		}
+		else {
+			// This node is not known to the heartbeat subsystem.
+			iterate_fn(*nodeid, NULL, 0, 0, NULL, udata);
+		}
+	}
+
+	HB_UNLOCK();
+}
+
+/**
+ * Call the iterate method on all nodes in current adjacency list. Note plugin
+ * data can still be NULL if the plugin failed to parse the data.
+ *
+ * @param pluginid the plugin identifier.
+ * @param iterate_fn the iterate function invoked for plugin data for every
+ * node.
+ * @param udata passed as is to the iterate function. Useful for getting results
+ * out of the iteration.
+ */
+void
+as_hb_plugin_data_iterate_all(as_hb_plugin_id pluginid,
+		as_hb_plugin_data_iterate_fn iterate_fn, void* udata)
+{
+	hb_plugin_data_iterate_all(pluginid, iterate_fn, udata);
+}
+
+/**
+ * Log the state of the heartbeat module.
+ */
+void
+as_hb_dump(bool verbose)
+{
+	INFO("Heartbeat Dump:");
+
+	as_hb_mode mode;
+	char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+	as_hb_info_listen_addr_get(&mode, endpoint_list_str,
+			sizeof(endpoint_list_str));
+
+	// Dump the config.
+	INFO("HB Mode: %s (%d)",
+			(mode == AS_HB_MODE_MULTICAST ?
+					"multicast" :
+					(mode == AS_HB_MODE_MESH ? "mesh" : "undefined")), mode);
+
+	INFO("HB Addresses: {%s}", endpoint_list_str);
+	INFO("HB MTU: %d", hb_mtu());
+
+	INFO("HB Interval: %d", config_tx_interval_get());
+	INFO("HB Timeout: %d", config_max_intervals_missed_get());
+	char protocol_s[HB_PROTOCOL_STR_MAX_LEN];
+	as_hb_protocol_get_s(config_protocol_get(), protocol_s);
+	INFO("HB Protocol: %s (%d)", protocol_s, config_protocol_get());
+
+	// Dump mode specific state.
+	hb_mode_dump(verbose);
+
+	// Dump the channel state.
+	channel_dump(verbose);
+
+	// Dump the adjacency list.
+	hb_dump(verbose);
+}
+
+/**
+ * Indicates if a node is alive.
+ */
+bool
+as_hb_is_alive(cf_node nodeid)
+{
+	bool is_alive;
+	HB_LOCK();
+
+	as_hb_adjacent_node adjacent_node;
+	is_alive = (nodeid == config_self_nodeid_get())
+			|| (hb_adjacent_node_get(nodeid, &adjacent_node) == 0);
+
+	HB_UNLOCK();
+	return is_alive;
+}
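+
+/*
+ * Illustrative sketch (not part of this change): a hypothetical iterate
+ * function matching the as_hb_plugin_data_iterate_fn invocation above,
+ * counting nodes that supplied plugin data. The NULL / size-zero case must be
+ * handled, per the documented contract. example_plugin_id is hypothetical.
+ *
+ *   static void
+ *   example_count_iterate_fn(cf_node nodeid, void* data, size_t data_size,
+ *           cf_clock recv_monotonic_ts, as_hlc_msg_timestamp* msg_hlc_ts,
+ *           void* udata)
+ *   {
+ *       uint32_t* count = (uint32_t*)udata;
+ *       if (data != NULL && data_size > 0) {
+ *           (*count)++;
+ *       }
+ *   }
+ *
+ *   uint32_t count = 0;
+ *   as_hb_plugin_data_iterate_all(example_plugin_id,
+ *           example_count_iterate_fn, &count);
+ */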
+
+/**
+ * Compute the nodes to evict from the input nodes so that remaining nodes form
+ * a clique, based on adjacency lists. Self nodeid is never considered for
+ * eviction.
+ *
+ * @param nodes input cf_node vector.
+ * @param nodes_to_evict (output) an initialized cf_node vector that will be
+ * populated with the nodes to evict.
+ */
+void
+as_hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict)
+{
+	hb_maximal_clique_evict(nodes, nodes_to_evict);
+}
+
+/**
+ * Read the hlc timestamp for the message.
+ * Note: A protected API for the sole benefit of skew monitor.
+ *
+ * @param msg the incoming message.
+ * @param send_ts the output hlc timestamp.
+ * @return 0 if the time stamp could be parsed, -1 on failure.
+ */
+int
+as_hb_msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts)
+{
+	return msg_send_hlc_ts_get(msg, send_ts);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Common sub module.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * ----------------------------------------------------------------------------
+ * Utility
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Round up input int to the nearest power of two.
+ */
+static uint32_t
+round_up_pow2(uint32_t v)
+{
+	v--;
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	v++;
+	return v;
+}
+
+/**
+ * Generate a hash code for a cf_socket.
+ */
+static uint32_t
+hb_socket_hash_fn(const void* key)
+{
+	const cf_socket** socket = (const cf_socket**)key;
+	return cf_hash_jen32((const uint8_t*)socket, sizeof(cf_socket*));
+}
+
+/**
+ * Reduce function to delete all entries in a map.
+ */
+static int
+hb_delete_all_reduce(const void* key, void* data, void* udata)
+{
+	return CF_SHASH_REDUCE_DELETE;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Info call related
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Append an address spec to a cf_dyn_buf.
+ */
+static void
+info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list)
+{
+	for (uint32_t i = 0; i < list->n_addrs; ++i) {
+		info_append_string(db, name, list->addrs[i]);
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Vector operations
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * TODO: Move this to cf_vector.
+ * Find the index of an element in the vector. Equality is based on mem compare.
+ *
+ * @param vector the source vector.
+ * @param element the element to find.
+ * @return the index if the element is found, -1 otherwise.
+ */
+static int
+vector_find(cf_vector* vector, const void* element)
+{
+	int element_count = cf_vector_size(vector);
+	size_t value_len = cf_vector_element_size(vector);
+	for (int i = 0; i < element_count; i++) {
+		// No null check required since we are iterating under a lock and within
+		// vector bounds.
+		void* src_element = cf_vector_getp(vector, i);
+		if (src_element) {
+			if (memcmp(element, src_element, value_len) == 0) {
+				return i;
+			}
+		}
+	}
+	return -1;
+}
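+
+/*
+ * Illustrative sketch (not part of this change): vector_find compares raw
+ * element bytes, so it suits fixed-size values such as cf_node. A
+ * hypothetical lookup, performed under the lock protecting the vector:
+ *
+ *   cf_node to_find = nodeid;
+ *   int found_index = vector_find(&nodeid_vector, &to_find);
+ *   if (found_index >= 0) {
+ *       // Element found at found_index.
+ *   }
+ */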
+
+/*
+ * ----------------------------------------------------------------------------
+ * Endpoint list related
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Copy an endpoint list to the destination, possibly reallocating the
+ * destination space.
+ * @param dest the double pointer to the destination list, because it might need
+ * reallocation to accommodate a larger source list.
+ * @param src the source endpoint list.
+ */
+static void
+endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src)
+{
+	size_t src_size;
+
+	if (as_endpoint_list_sizeof(src, &src_size) != 0) {
+		// Bad endpoint list passed.
+		CRASH("invalid adjacency list passed for copying");
+	}
+
+	*dest = cf_realloc(*dest, src_size);
+
+	memcpy(*dest, src, src_size);
+}
+
+/**
+ * Process function to convert endpoint list to a string.
+ */
+static void
+endpoint_list_to_string_process(const as_endpoint_list* endpoint_list,
+		void* udata)
+{
+	endpoint_list_to_string_udata* to_string_udata =
+			(endpoint_list_to_string_udata*)udata;
+	as_endpoint_list_to_string(endpoint_list,
+			to_string_udata->endpoint_list_str,
+			to_string_udata->endpoint_list_str_capacity);
+}
+
+/**
+ * Process function to check if endpoint lists overlap.
+ */
+static void
+endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata)
+{
+	endpoint_list_equal_check_udata* equal_udata =
+			(endpoint_list_equal_check_udata*)udata;
+
+	equal_udata->are_equal |= as_endpoint_lists_are_equal(endpoint_list,
+			equal_udata->other);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Message related
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * The size of a buffer beyond which compression should be applied. For now set
+ * to 60% of the interface mtu.
+ */
+static int
+msg_compression_threshold(int mtu)
+{
+	return (int)(mtu * 0.6);
+}
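+
+/*
+ * Illustrative note (not part of this change): with a typical ethernet mtu of
+ * 1500 bytes, msg_compression_threshold(1500) yields 900, so heartbeat
+ * payloads larger than 900 bytes become candidates for compression.
+ */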
+
+/**
+ * Read advertised endpoint list from an incoming message.
+ * @param msg the incoming message.
+ * @param endpoint_list the output endpoint list. It will point to a location
+ * internal to the input message and should not be freed.
+ * @return 0 on success, -1 on failure.
+ */
+static int
+msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list)
+{
+	size_t endpoint_list_size;
+	if (msg_get_buf(msg, AS_HB_MSG_ENDPOINTS, (uint8_t**)endpoint_list,
+			&endpoint_list_size, MSG_GET_DIRECT) != 0) {
+		return -1;
+	}
+
+	size_t parsed_size;
+	if (as_endpoint_list_nsizeof(*endpoint_list, &parsed_size,
+			endpoint_list_size) || parsed_size != endpoint_list_size) {
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * Read the protocol identifier for this heartbeat message. These functions can
+ * get called multiple times for a single message. Hence they do not increment
+ * error counters.
+ *
+ * @param msg the incoming message.
+ * @param id the output id.
+ * @return 0 if the id could be parsed, -1 on failure.
+ */
+static int
+msg_id_get(msg* msg, uint32_t* id)
+{
+	if (msg_get_uint32(msg, AS_HB_MSG_ID, id) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Read the source nodeid for a node. These functions can get called multiple
+ * times for a single message. Hence they do not increment error counters.
+ * @param msg the incoming message.
+ * @param nodeid the output nodeid.
+ * @return 0 if the nodeid could be parsed, -1 on failure.
+ */
+static int
+msg_nodeid_get(msg* msg, cf_node* nodeid)
+{
+	if (msg_get_uint64(msg, AS_HB_MSG_NODE, nodeid) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Read the HLC send timestamp for the message. These functions can get called
+ * multiple times for a single message. Hence they do not increment error
+ * counters.
+ * @param msg the incoming message.
+ * @param send_ts the output hlc timestamp.
+ * @return 0 if the time stamp could be parsed, -1 on failure.
+ */
+static int
+msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts)
+{
+	if (msg_get_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, send_ts) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Read the message type. These functions can get called multiple times for a
+ * single message. Hence they do not increment error counters.
+ * @param msg the incoming message.
+ * @param type the output message type.
+ * @return 0 if the type could be parsed, -1 on failure.
+ */
+static int
+msg_type_get(msg* msg, as_hb_msg_type* type)
+{
+	if (msg_get_uint32(msg, AS_HB_MSG_TYPE, type) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Read the cluster name.
+ * @param msg the incoming message.
+ * @param cluster_name (output) the cluster name.
+ * @return 0 if the cluster name could be parsed, -1 on failure.
+ */
+static int
+msg_cluster_name_get(msg* msg, char** cluster_name)
+{
+	if (msg_get_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name, NULL,
+			MSG_GET_DIRECT) != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Get a pointer to a node list in the message.
+ *
+ * @param msg the incoming message.
+ * @param field_id the field id.
+ * @param adj_list output. on success will point to the adjacency list in the
+ * message.
+ * @param adj_length output. on success will contain the length of the
+ * adjacency list.
+ * @return 0 on success. -1 if the adjacency list is absent.
+ */
+static int
+msg_node_list_get(msg* msg, int field_id, cf_node** adj_list,
+		size_t* adj_length)
+{
+	if (msg_get_buf(msg, field_id, (uint8_t**)adj_list, adj_length,
+			MSG_GET_DIRECT) != 0) {
+		return -1;
+	}
+
+	// Correct the adjacency list length.
+	*adj_length /= sizeof(cf_node);
+
+	return 0;
+}
+
+/**
+ * Get a pointer to the adjacency list in the message.
+ *
+ * @param msg the incoming message.
+ * @param adj_list output. on success will point to the adjacency list in the
+ * message.
+ * @param adj_length output. on success will contain the length of the
+ * adjacency list.
+ * @return 0 on success. -1 if the adjacency list is absent.
+ */
+static int
+msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length)
+{
+	return msg_node_list_get(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length);
+}
+
+/**
+ * Set a node list field on an outgoing message.
+ *
+ * @param msg the outgoing message.
+ * @param field_id the id of the list field.
+ * @param node_list the adjacency list to set.
+ * @param node_length the length of the adjacency list.
+ */
+static void
+msg_node_list_set(msg* msg, int field_id, cf_node* node_list,
+		size_t node_length)
+{
+	if (msg_set_buf(msg, field_id, (uint8_t*)node_list,
+			sizeof(cf_node) * node_length, MSG_SET_COPY) != 0) {
+		CRASH("error setting adjacency list on msg");
+	}
+
+	return;
+}
+
+/**
+ * Set the adjacency list on an outgoing message.
+ *
+ * @param msg the outgoing message.
+ * @param adj_list the adjacency list to set.
+ * @param adj_length the length of the adjacency list.
+ */
+static void
+msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length)
+{
+	msg_node_list_set(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length);
+}
+
+/**
+ * Set the info reply on an outgoing message.
+ *
+ * @param msg the outgoing message.
+ * @param response the response list to set.
+ * @param response_count the length of the response list.
+ */ +static void +msg_info_reply_set(msg* msg, as_hb_mesh_info_reply* response, + size_t response_count) +{ + size_t response_size = 0; + if (mesh_info_reply_sizeof(response, response_count, &response_size)) { + CRASH("error setting info reply on msg"); + } + + if (msg_set_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t*)response, + response_size, MSG_SET_COPY) != 0) { + CRASH("error setting info reply on msg"); + } + + return; +} + +/** + * Get a pointer to the info reply list in the message. + * + * @param msg the incoming message. + * @param reply output. on success will point to the reply list in the message. + * @param reply_count output. on success will contain the length of the reply + * list. + * @return 0 on success. -1 if the reply list is absent. + */ +static int +msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count) +{ + size_t reply_size; + if (msg_get_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t**)reply, &reply_size, + MSG_GET_DIRECT) != 0) { + return -1; + } + + *reply_count = 0; + + // Go over reply and compute the count of replies and also validate the + // endpoint lists. + uint8_t* start_ptr = (uint8_t*)*reply; + int64_t remaining_size = reply_size; + + while (remaining_size > 0) { + as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; + remaining_size -= sizeof(as_hb_mesh_info_reply); + start_ptr += sizeof(as_hb_mesh_info_reply); + if (remaining_size <= 0) { + // Incomplete / garbled info reply message. + *reply_count = 0; + return -1; + } + + size_t endpoint_list_size = 0; + if (as_endpoint_list_nsizeof(reply_ptr->endpoint_list, + &endpoint_list_size, remaining_size) != 0) { + // Incomplete / garbled info reply message. + *reply_count = 0; + return -1; + } + + remaining_size -= endpoint_list_size; + start_ptr += endpoint_list_size; + (*reply_count)++; + } + + return 0; +} + +/** + * Fill a message with an endpoint list. + */ +static void +msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list, + void* udata) +{ + endpoint_list_to_msg_udata* to_msg_udata = + (endpoint_list_to_msg_udata*)udata; + msg* msg = to_msg_udata->msg; + bool is_mesh = to_msg_udata->is_mesh; + + if (!published_endpoint_list) { + if (is_mesh) { + // Something is messed up. Except for v3 multicast, + // published list should not be empty. + WARNING("published endpoint list is empty"); + } + return; + } + + // Makes sense only for mesh. + if (is_mesh && published_endpoint_list) { + // Set the source address + size_t endpoint_list_size = 0; + as_endpoint_list_sizeof(published_endpoint_list, &endpoint_list_size); + if (msg_set_buf(msg, AS_HB_MSG_ENDPOINTS, + (uint8_t*)published_endpoint_list, endpoint_list_size, + MSG_SET_COPY) != 0) { + CRASH("error setting heartbeat address on msg"); + } + } +} + +/** + * Fill source fields for the message. + * @param msg the message to fill the source fields into. + */ +static void +msg_src_fields_fill(msg* msg) +{ + bool is_mesh = hb_is_mesh(); + + // Set the hb protocol id / version. + if (msg_set_uint32(msg, AS_HB_MSG_ID, hb_protocol_identifier_get()) != 0) { + CRASH("error setting heartbeat protocol on msg"); + } + + // Set the source node. + if (msg_set_uint64(msg, AS_HB_MSG_NODE, config_self_nodeid_get()) != 0) { + CRASH("error setting node id on msg"); + } + + endpoint_list_to_msg_udata udata; + udata.msg = msg; + udata.is_mesh = is_mesh; + + if (is_mesh) { + // Endpoint list only valid for mesh mode. 
+		mesh_published_endpoints_process(msg_published_endpoints_fill, &udata);
+	}
+
+	// Set the send hlc timestamp.
+	if (msg_set_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, as_hlc_timestamp_now())
+			!= 0) {
+		CRASH("error setting send timestamp on msg");
+	}
+}
+
+/**
+ * Set the type for an outgoing message.
+ * @param msg the outgoing message.
+ * @param msg_type the type to set.
+ */
+static void
+msg_type_set(msg* msg, as_hb_msg_type msg_type)
+{
+	// Set the message type.
+	if (msg_set_uint32(msg, AS_HB_MSG_TYPE, msg_type) != 0) {
+		CRASH("error setting type on msg");
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Config sub module.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Get the maximum supported cluster size (mcsize).
+ */
+static int
+config_mcsize()
+{
+	int mode_cluster_size = 0;
+	if (hb_is_mesh()) {
+		// Only bounded by available memory. But let's say it's infinite.
+		mode_cluster_size = INT_MAX;
+	}
+	else {
+		mode_cluster_size = multicast_supported_cluster_size_get();
+	}
+
+	// Ensure we are always upper bounded by the absolute max cluster size.
+	int supported_cluster_size = MIN(ASC, mode_cluster_size);
+
+	DETAIL("supported cluster size %d", supported_cluster_size);
+	return supported_cluster_size;
+}
+
+/**
+ * Get the binding addresses for the heartbeat subsystem.
+ */
+static const cf_serv_cfg*
+config_bind_cfg_get()
+{
+	// Not protected by config_lock because it is not changed.
+	return &g_config.hb_config.bind_cfg;
+}
+
+/**
+ * Get the multicast groups for the multicast mode.
+ */
+static const cf_mserv_cfg*
+config_multicast_group_cfg_get()
+{
+	// Not protected by config_lock. Never updated after config parsing.
+	return &g_config.hb_config.multicast_group_cfg;
+}
+
+/**
+ * Get the heartbeat pulse transmit interval.
+ */
+static uint32_t
+config_tx_interval_get()
+{
+	HB_CONFIG_LOCK();
+	uint32_t interval = g_config.hb_config.tx_interval;
+	HB_CONFIG_UNLOCK();
+	return interval;
+}
+
+/**
+ * Set the heartbeat pulse transmit interval.
+ */
+static void
+config_tx_interval_set(uint32_t new_interval)
+{
+	HB_CONFIG_LOCK();
+	INFO("changing value of interval from %d to %d ",
+			g_config.hb_config.tx_interval, new_interval);
+	g_config.hb_config.tx_interval = new_interval;
+	HB_CONFIG_UNLOCK();
+}
+
+/**
+ * Get the configured MTU override.
+ */
+static uint32_t
+config_override_mtu_get()
+{
+	HB_CONFIG_LOCK();
+	uint32_t override_mtu = g_config.hb_config.override_mtu;
+	HB_CONFIG_UNLOCK();
+	return override_mtu;
+}
+
+/**
+ * Set the MTU override.
+ */
+static void
+config_override_mtu_set(uint32_t mtu)
+{
+	HB_CONFIG_LOCK();
+	INFO("changing value of override mtu from %d to %d ",
+			g_config.hb_config.override_mtu, mtu);
+	g_config.hb_config.override_mtu = mtu;
+	HB_CONFIG_UNLOCK();
+	INFO("max supported cluster size is %d", config_mcsize());
+}
+
+/**
+ * Get the maximum number of missed heartbeat intervals after which a node is
+ * considered expired.
+ */
+static uint32_t
+config_max_intervals_missed_get()
+{
+	uint32_t rv = 0;
+	HB_CONFIG_LOCK();
+	rv = g_config.hb_config.max_intervals_missed;
+	HB_CONFIG_UNLOCK();
+	return rv;
+}
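+
+/*
+ * Illustrative note (not part of this change): the effective node timeout is
+ * the transmit interval multiplied by the allowed missed intervals. For
+ * example, with heartbeat.interval = 150 (ms) and heartbeat.timeout = 10
+ * (intervals), a silent node is expired after roughly 150 * 10 = 1500 ms.
+ */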
+
+/**
+ * Get the number of intervals endpoints should be tracked for.
+ */
+static uint32_t
+config_endpoint_track_intervals_get()
+{
+	// Allow a grace period of half the heartbeat timeout, but lower bounded
+	// to at least 3.
+	return MAX(3, config_max_intervals_missed_get() / 2);
+}
+
+/**
+ * Get the maximum number of allowed changes, per endpoint track intervals.
+ */
+static uint32_t
+config_endpoint_changes_allowed_get()
+{
+	// Allow no change to the endpoint list for now.
+	return 0;
+}
+
+/**
+ * Set the maximum number of missed heartbeat intervals after which a node is
+ * considered expired.
+ */
+static void
+config_max_intervals_missed_set(uint32_t new_max)
+{
+	HB_CONFIG_LOCK();
+	INFO("changing value of timeout from %d to %d ",
+			g_config.hb_config.max_intervals_missed, new_max);
+	g_config.hb_config.max_intervals_missed = new_max;
+	HB_CONFIG_UNLOCK();
+}
+
+/**
+ * Return ttl for multicast packets. Set to zero for default TTL.
+ */
+static unsigned char
+config_multicast_ttl_get()
+{
+	return g_config.hb_config.multicast_ttl;
+}
+
+/**
+ * Return the current heartbeat protocol.
+ */
+static as_hb_protocol
+config_protocol_get()
+{
+	as_hb_protocol rv = 0;
+	HB_CONFIG_LOCK();
+	rv = g_config.hb_config.protocol;
+	HB_CONFIG_UNLOCK();
+	return rv;
+}
+
+/**
+ * Set the heartbeat protocol.
+ */
+static void
+config_protocol_set(as_hb_protocol new_protocol)
+{
+	HB_CONFIG_LOCK();
+	g_config.hb_config.protocol = new_protocol;
+	HB_CONFIG_UNLOCK();
+}
+
+/**
+ * The nodeid for this node.
+ */
+static cf_node
+config_self_nodeid_get()
+{
+	// Not protected by config_lock. Never updated after config parsing.
+	return g_config.self_node;
+}
+
+/**
+ * Return the heartbeat subsystem mode.
+ */
+static as_hb_mode
+config_mode_get()
+{
+	// Not protected by config_lock. Never updated after config parsing.
+	return g_config.hb_config.mode;
+}
+
+/**
+ * Expand "any" binding addresses to actual interface addresses.
+ * @param bind_cfg the binding configuration.
+ * @param published_cfg (output) the server configuration to expand.
+ * @param ipv4_only indicates if only legacy addresses should be allowed.
+ */
+static void
+config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg,
+		cf_serv_cfg* published_cfg, bool ipv4_only)
+{
+	cf_serv_cfg_init(published_cfg);
+	cf_sock_cfg sock_cfg;
+
+	for (int i = 0; i < bind_cfg->n_cfgs; i++) {
+		cf_sock_cfg_copy(&bind_cfg->cfgs[i], &sock_cfg);
+
+		// Expand "any" address to all interfaces.
+		if (cf_ip_addr_is_any(&sock_cfg.addr)) {
+			cf_ip_addr all_addrs[CF_SOCK_CFG_MAX];
+			uint32_t n_all_addrs = CF_SOCK_CFG_MAX;
+			if (cf_inter_get_addr_all(all_addrs, &n_all_addrs) != 0) {
+				WARNING("error getting all interface addresses");
+				n_all_addrs = 0;
+			}
+
+			for (int j = 0; j < n_all_addrs; j++) {
+				// Skip local address if any is specified.
+				if (cf_ip_addr_is_local(&all_addrs[j])
+						|| (ipv4_only && !cf_ip_addr_is_legacy(&all_addrs[j]))) {
+					continue;
+				}
+
+				cf_ip_addr_copy(&all_addrs[j], &sock_cfg.addr);
+				if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) {
+					CRASH("error initializing published address list");
+				}
+			}
+
+			// TODO: Does not look like the right warning or the right message.
+			if (published_cfg->n_cfgs == 0) {
+				WARNING(
+						"no network interface addresses detected for heartbeat access");
+			}
+		}
+		else {
+			if (ipv4_only && !cf_ip_addr_is_legacy(&bind_cfg->cfgs[i].addr)) {
+				continue;
+			}
+
+			if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) {
+				CRASH("error initializing published address list");
+			}
+		}
+	}
+}
+
+/**
+ * Checks if the heartbeat binding configuration is valid.
+ * @param error pointer to a static error message if validation fails, else will
+ * be set to NULL.
+ */
+static bool
+config_binding_is_valid(char** error, as_hb_protocol protocol)
+{
+	const cf_serv_cfg* bind_cfg = config_bind_cfg_get();
+	const cf_mserv_cfg* multicast_group_cfg = config_multicast_group_cfg_get();
+
+	if (hb_is_mesh()) {
+		if (bind_cfg->n_cfgs == 0) {
+			// Should not happen in practice.
+			*error = "no bind addresses found for heartbeat";
+			return false;
+		}
+
+		// Ensure we have a valid port for all bind endpoints.
+		for (int i = 0; i < bind_cfg->n_cfgs; i++) {
+			if (bind_cfg->cfgs[i].port == 0) {
+				*error = "invalid mesh listening port";
+				return false;
+			}
+		}
+
+		cf_serv_cfg publish_serv_cfg;
+		cf_serv_cfg_init(&publish_serv_cfg);
+
+		if (multicast_group_cfg->n_cfgs != 0) {
+			*error =
+					"invalid config option: multicast-group not supported in mesh mode";
+			return false;
+		}
+	}
+	else {
+		const cf_mserv_cfg* multicast_group_cfg =
+				config_multicast_group_cfg_get();
+
+		if (multicast_group_cfg->n_cfgs == 0) {
+			*error = "no multicast groups specified";
+			return false;
+		}
+
+		// Ensure multicast groups have valid ports.
+		// TODO: We could check if the address is valid multicast.
+		for (int i = 0; i < multicast_group_cfg->n_cfgs; i++) {
+			if (multicast_group_cfg->cfgs[i].port == 0) {
+				*error = "invalid multicast port";
+				return false;
+			}
+		}
+
+		if (g_config.hb_config.mesh_seed_addrs[0]) {
+			*error =
+					"invalid config option: mesh-seed-address-port not supported for multicast mode";
+			return false;
+		}
+
+		cf_serv_cfg publish_serv_cfg;
+		cf_serv_cfg_init(&publish_serv_cfg);
+	}
+
+	*error = NULL;
+	return true;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Channel sub module.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Initialize the channel structure.
+ */
+static void
+channel_init_channel(as_hb_channel* channel)
+{
+	memset(channel, 0, sizeof(as_hb_channel));
+	cf_ip_addr_set_any(&channel->endpoint_addr.addr);
+}
+
+/**
+ * Initialize the channel event structure.
+ */
+static void
+channel_event_init(as_hb_channel_event* event)
+{
+	memset(event, 0, sizeof(as_hb_channel_event));
+}
+
+/**
+ * Is channel running.
+ */
+static bool
+channel_is_running()
+{
+	CHANNEL_LOCK();
+	bool retval = (g_hb.channel_state.status == AS_HB_STATUS_RUNNING);
+	CHANNEL_UNLOCK();
+	return retval;
+}
+
+/**
+ * Is channel stopped.
+ */
+static bool
+channel_is_stopped()
+{
+	CHANNEL_LOCK();
+	bool retval = (g_hb.channel_state.status == AS_HB_STATUS_STOPPED);
+	CHANNEL_UNLOCK();
+	return retval;
+}
+
+/**
+ * Keep a winning socket as a winner for at least this amount of time to prevent
+ * constant flip flopping and give the winning socket a chance to send
+ * heartbeats.
+ */
+static uint32_t
+channel_win_grace_ms()
+{
+	return 3 * config_tx_interval_get();
+}
+
+/**
+ * Enable / disable events.
+ */
+static void
+channel_events_enabled_set(bool enabled)
+{
+	CHANNEL_LOCK();
+	g_hb.channel_state.events_enabled = enabled;
+	CHANNEL_UNLOCK();
+}
+
+/**
+ * Check if events are enabled.
+ */
+static bool
+channel_are_events_enabled()
+{
+	bool result;
+	CHANNEL_LOCK();
+	result = g_hb.channel_state.events_enabled;
+	CHANNEL_UNLOCK();
+	return result;
+}
+
+/**
+ * Discard an event that has been processed.
+ */
+static void
+channel_event_discard(as_hb_channel_event* event)
+{
+	// Free the message structure for message received events.
+	if (event->type == AS_HB_CHANNEL_MSG_RECEIVED) {
+		hb_msg_return(event->msg);
+	}
+}
+
+/**
+ * Queues a channel event for publishing by the channel tender.
+ */
+static void
+channel_event_queue(as_hb_channel_event* event)
+{
+	if (!channel_are_events_enabled()) {
+		channel_event_discard(event);
+		DETAIL(
+				"events disabled. Ignoring event of type %d with nodeid %" PRIx64,
+				event->type, event->nodeid);
+		return;
+	}
+
+	DETAIL("queuing channel event of type %d for node %" PRIx64, event->type,
+			event->nodeid);
+	cf_queue_push(&g_hb.channel_state.events_queue, event);
+}
+
+/**
+ * Publish queued up channel events. Should be called outside a channel lock to
+ * prevent deadlocks.
+ */
+static void
+channel_event_publish_pending()
+{
+	// No channel lock here to prevent deadlocks.
+	as_hb_channel_event event;
+	while (cf_queue_pop(&g_hb.channel_state.events_queue, &event, 0)
+			== CF_QUEUE_OK) {
+		// Nothing elaborate, using hardcoded list of event recipients.
+		mesh_channel_event_process(&event);
+		hb_channel_event_process(&event);
+
+		channel_event_discard(&event);
+	}
+}
+
+/**
+ * Return the endpoint associated with this socket if it exists.
+ *
+ * @param socket the socket to query for.
+ * @param result the output result.
+ * @return 0 if the socket was found and the result value is filled. -1 if a
+ * mapping for the socket could not be found.
+ */
+static int
+channel_get_channel(cf_socket* socket, as_hb_channel* result)
+{
+	int status;
+	CHANNEL_LOCK();
+
+	if (cf_shash_get(g_hb.channel_state.socket_to_channel, &socket, result)
+			== CF_SHASH_OK) {
+		status = 0;
+	}
+	else {
+		status = -1;
+	}
+
+	CHANNEL_UNLOCK();
+	return status;
+}
+
+/**
+ * Shutdown a channel socket without closing, forcing the channel tender to
+ * cleanup associated data structures.
+ */
+static void
+channel_socket_shutdown(cf_socket* socket)
+{
+	cf_socket_shutdown(socket);
+}
+
+/**
+ * Return the socket associated with this node.
+ * Returns 0 on success and -1 if there is no socket attached to this node.
+ */
+static int
+channel_socket_get(cf_node nodeid, cf_socket** socket)
+{
+	int rv = -1;
+	CHANNEL_LOCK();
+	if (cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid, socket)
+			== CF_SHASH_ERR_NOT_FOUND) {
+		rv = -1;
+	}
+	else {
+		rv = 0;
+	}
+
+	CHANNEL_UNLOCK();
+	return rv;
+}
+
+/**
+ * Indicate if a socket is present in a sockets list.
+ */
+static bool
+channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find)
+{
+	for (int i = 0; i < sockets->n_socks; i++) {
+		if (&sockets->socks[i] == to_find) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/**
+ * Destroy an allocated socket.
+ */
+static void
+channel_socket_destroy(cf_socket* sock)
+{
+	cf_socket_close(sock);
+	cf_socket_term(sock);
+	cf_free(sock);
+}
+
+/**
+ * Close a channel socket. Precondition is that the socket is registered with
+ * the channel module using channel_socket_register.
+ */
+static void
+channel_socket_close(cf_socket* socket, bool remote_close,
+		bool raise_close_event)
+{
+	if (remote_close) {
+		DEBUG("remote close: fd %d event", CSFD(socket));
+	}
+
+	CHANNEL_LOCK();
+
+	if (channel_cf_sockets_contains(g_hb.channel_state.listening_sockets,
+			socket)) {
+		// Listening sockets will be closed by the mode (mesh/multicast)
+		// modules.
+		goto Exit;
+	}
+
+	// Clean up data structures.
+	as_hb_channel channel;
+	int status = channel_get_channel(socket, &channel);
+
+	if (status == 0) {
+		if (channel.nodeid != 0) {
+			cf_socket* node_socket;
+			if (channel_socket_get(channel.nodeid, &node_socket) == 0
+					&& node_socket == socket) {
+				// Remove associated node for this socket.
+				cf_shash_delete(g_hb.channel_state.nodeid_to_socket,
+						&channel.nodeid);
+
+				if (!channel.is_multicast && raise_close_event) {
+					as_hb_channel_event event;
+					channel_event_init(&event);
+
+					// Notify others that this node is no longer connected.
+					event.type = AS_HB_CHANNEL_NODE_DISCONNECTED;
+					event.nodeid = channel.nodeid;
+					event.msg = NULL;
+
+					channel_event_queue(&event);
+				}
+			}
+		}
+
+		DETAIL("removed channel associated with fd %d polarity %s Type: %s",
+				CSFD(socket), channel.is_inbound ? "inbound" : "outbound",
+				channel.is_multicast ? "multicast" : "mesh");
+		// Remove associated channel.
+		cf_shash_delete(g_hb.channel_state.socket_to_channel, &socket);
+	}
+	else {
+		// Will only happen if we are closing this socket twice. Cannot
+		// dereference the underlying fd because the socket has been freed.
+		WARNING("found a socket %p without an associated channel", socket);
+		goto Exit;
+	}
+
+	static int32_t err_ok[] = { ENOENT, EBADF, EPERM };
+	int32_t err = cf_poll_delete_socket_forgiving(g_hb.channel_state.poll,
+			socket, sizeof(err_ok) / sizeof(int32_t), err_ok);
+
+	if (err == ENOENT) {
+		// There is no valid code path where epoll ctl should fail.
+		CRASH("unable to remove fd %d from epoll fd list: %s", CSFD(socket),
+				cf_strerror(errno));
+		goto Exit;
+	}
+
+	cf_atomic_int_incr(&g_stats.heartbeat_connections_closed);
+	DEBUG("closing channel with fd %d", CSFD(socket));
+
+	channel_socket_destroy(socket);
+
+Exit:
+	CHANNEL_UNLOCK();
+}
+
+/**
+ * Close multiple sockets. Should be invoked only by channel stop.
+ * @param sockets the vector consisting of sockets to be closed.
+ */
+static void
+channel_sockets_close(cf_vector* sockets)
+{
+	uint32_t socket_count = cf_vector_size(sockets);
+	for (int index = 0; index < socket_count; index++) {
+		cf_socket* socket;
+		if (cf_vector_get(sockets, index, &socket) != 0) {
+			WARNING("error finding the fd %d to be deleted", CSFD(socket));
+			continue;
+		}
+		channel_socket_close(socket, false, true);
+	}
+}
+
+/**
+ * Queues a socket for closing by the channel tender. Should be used by all code
+ * paths other than the channel stop code path.
+ */
+static void
+channel_socket_close_queue(cf_socket* socket, bool is_remote_close,
+		bool raise_close_event)
+{
+	as_hb_channel_socket_close_entry close_entry = {
+		socket,
+		is_remote_close,
+		raise_close_event };
+	DETAIL("queuing close of fd %d", CSFD(socket));
+	cf_queue_push(&g_hb.channel_state.socket_close_queue, &close_entry);
+}
+
+/**
+ * Close queued up sockets.
+ */
+static void
+channel_socket_close_pending()
+{
+	// No channel lock required here.
+	as_hb_channel_socket_close_entry close_entry;
+	while (cf_queue_pop(&g_hb.channel_state.socket_close_queue, &close_entry, 0)
+			== CF_QUEUE_OK) {
+		channel_socket_close(close_entry.socket, close_entry.is_remote,
+				close_entry.raise_close_event);
+	}
+}
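+
+/*
+ * Illustrative sketch (not part of this change): code paths that detect a bad
+ * socket while the channel tender may still be polling it should defer the
+ * close rather than closing inline, e.g.
+ *
+ *   channel_socket_close_queue(socket, false, true);
+ *
+ * The tender later drains the queue via channel_socket_close_pending(), so
+ * sockets are unregistered and destroyed on a single thread.
+ */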
+
+/**
+ * Register a new socket.
+ *
+ * @param socket the socket.
+ * @param is_multicast indicates if this socket is a multicast socket.
+ * @param is_inbound indicates if this socket is inbound or outbound.
+ * @param endpoint_addr peer endpoint this socket connects to. Will be NULL for
+ * inbound sockets.
+ */
+static void
+channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound,
+		cf_sock_addr* endpoint_addr)
+{
+	CHANNEL_LOCK();
+
+	as_hb_channel channel;
+	channel_init_channel(&channel);
+
+	// This socket should not be part of the socket to channel map.
+	ASSERT(channel_get_channel(socket, &channel) == -1,
+			"error the channel already exists for fd %d", CSFD(socket));
+
+	channel.is_multicast = is_multicast;
+	channel.is_inbound = is_inbound;
+	channel.last_received = cf_getms();
+
+	if (endpoint_addr) {
+		memcpy(&channel.endpoint_addr, endpoint_addr, sizeof(*endpoint_addr));
+	}
+
+	// Add socket to poll list.
+	cf_poll_add_socket(g_hb.channel_state.poll, socket,
+			EPOLLIN | EPOLLERR | EPOLLRDHUP, socket);
+
+	cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel);
+
+	DEBUG("channel created for fd %d - polarity %s type: %s", CSFD(socket),
+			channel.is_inbound ? "inbound" : "outbound",
+			channel.is_multicast ? "multicast" : "mesh");
+
+	CHANNEL_UNLOCK();
+}
+
+/**
+ * Accept an incoming tcp connection. For now this is relevant only to the mesh
+ * mode.
+ * @param lsock the listening socket that received the connection.
+ */
+static void
+channel_accept_connection(cf_socket* lsock)
+{
+	if (!hb_is_mesh()) {
+		// We do not accept connections in non mesh modes.
+		return;
+	}
+
+	cf_socket csock;
+	cf_sock_addr caddr;
+
+	if (cf_socket_accept(lsock, &csock, &caddr) < 0) {
+		if ((errno == EMFILE) || (errno == ENFILE) || (errno == ENOMEM)
+				|| (errno == ENOBUFS)) {
+			TICKER_WARNING(
+					"failed to accept heartbeat connection due to error : %s",
+					cf_strerror(errno));
+			// We are in an extreme situation where we ran out of system
+			// resources (file/mem). We should rather lie low and not do too
+			// much activity. So, sleep. We should not sleep too long as this
+			// same function is supposed to send heartbeat also.
+			usleep(MAX(AS_HB_TX_INTERVAL_MS_MIN, 1) * 1000);
+			return;
+		}
+		else {
+			// TODO: Find out what these errors are.
+			WARNING("accept failed: %s", cf_strerror(errno));
+			return;
+		}
+	}
+
+	// Update the stats to reflect a new connection opened.
+	cf_atomic_int_incr(&g_stats.heartbeat_connections_opened);
+
+	char caddr_str[HOST_NAME_MAX];
+	cf_sock_addr_to_string_safe(&caddr, caddr_str, sizeof(caddr_str));
+	DEBUG("new connection from %s", caddr_str);
+
+	cf_sock_cfg *cfg = lsock->cfg;
+
+	if (cfg->owner == CF_SOCK_OWNER_HEARTBEAT_TLS) {
+		tls_socket_prepare_server(g_config.hb_config.tls, &csock);
+
+		if (tls_socket_accept_block(&csock) != 1) {
+			WARNING("heartbeat TLS server handshake with %s failed", caddr_str);
+			cf_socket_close(&csock);
+			cf_socket_term(&csock);
+
+			cf_atomic_int_incr(&g_stats.heartbeat_connections_closed);
+			return;
+		}
+	}
+
+	// Allocate a new socket.
+	cf_socket* sock = cf_malloc(sizeof(cf_socket));
+	cf_socket_init(sock);
+	cf_socket_copy(&csock, sock);
+
+	// Register this socket with the channel subsystem.
+	channel_socket_register(sock, false, true, NULL);
+}
+
+/**
+ * Parse compressed buffer into a message.
+ *
+ * @param msg the input parsed compressed message and also the output heartbeat
+ * message.
+ * @param buffer the input buffer.
+ * @param buffer_content_len the length of the content in the buffer.
+ * @return the status of parsing the message.
+ */
+static as_hb_channel_msg_read_status
+channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len)
+{
+	// This is a direct pointer inside the buffer parameter. No allocation
+	// required.
+	uint8_t* compressed_buffer = NULL;
+	size_t compressed_buffer_length = 0;
+	int parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL;
+	void* uncompressed_buffer = NULL;
+	size_t uncompressed_buffer_length = 0;
+
+	if (msg_get_buf(msg, AS_HB_MSG_COMPRESSED_PAYLOAD, &compressed_buffer,
+			&compressed_buffer_length, MSG_GET_DIRECT) != 0) {
+		parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL;
+		goto Exit;
+	}
+
+	// Assume a compression ratio of 3. We will expand the buffer if needed.
+	uncompressed_buffer_length = round_up_pow2(3 * compressed_buffer_length);
+
+	// Keep trying till we allocate enough memory for the uncompressed buffer.
+	while (true) {
+		uncompressed_buffer = MSG_BUFF_ALLOC_OR_DIE(uncompressed_buffer_length,
+				"error allocating memory size %zu for decompressing message",
+				uncompressed_buffer_length);
+
+		int uncompress_rv = uncompress(uncompressed_buffer,
+				&uncompressed_buffer_length, compressed_buffer,
+				compressed_buffer_length);
+
+		if (uncompress_rv == Z_OK) {
+			// Decompression was successful.
+			break;
+		}
+
+		if (uncompress_rv == Z_BUF_ERROR) {
+			// The uncompressed buffer is not large enough. Free the current
+			// buffer and allocate a new one.
+			MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length);
+
+			// Give the uncompressed buffer more space.
+			uncompressed_buffer_length *= 2;
+			continue;
+		}
+
+		// Decompression failed. Clean up and exit.
+		parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL;
+		goto Exit;
+	}
+
+	// Reset the message to prepare for parsing the uncompressed buffer. We have
+	// no issues losing the compressed buffer because we have an uncompressed
+	// copy.
+	msg_reset(msg);
+
+	// Parse the uncompressed buffer.
+	parsed = msg_parse(msg, uncompressed_buffer, uncompressed_buffer_length)
+			== 0 ?
+			AS_HB_CHANNEL_MSG_READ_SUCCESS : AS_HB_CHANNEL_MSG_PARSE_FAIL;
+
+	if (parsed == AS_HB_CHANNEL_MSG_READ_SUCCESS) {
+		// Copying the buffer content to ensure that the message and the buffer
+		// can have separate life cycles and we never get into races. The
+		// frequency of heartbeat messages is low enough to make this not
+		// matter much unless we have massive clusters.
+		msg_preserve_all_fields(msg);
+	}
+
+Exit:
+	MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length);
+	return parsed;
+}
+
+/**
+ * Parse the buffer into a message.
+ *
+ * @param msg the output heartbeat message.
+ * @param buffer the input buffer.
+ * @param buffer_content_len the length of the content in the buffer.
+ * @return the status of parsing the message.
+ */
+static as_hb_channel_msg_read_status
+channel_message_parse(msg* msg, void* buffer, int buffer_content_len)
+{
+	// Peek into the buffer to get hold of the message type.
+	msg_type type = 0;
+	uint32_t msg_size = 0;
+	if (msg_get_initial(&msg_size, &type, (uint8_t*)buffer, buffer_content_len)
+			!= 0 || type != msg->type) {
+		// Pre check because msg_parse considers this a warning but this would
+		// be common when protocol versions between nodes do not match.
+		DEBUG("message type mismatch - expected:%d received:%d", msg->type,
+				type);
+		return AS_HB_CHANNEL_MSG_PARSE_FAIL;
+	}
+
+	bool parsed = msg_parse(msg, buffer, buffer_content_len) == 0;
+
+	if (parsed) {
+		if (msg_is_set(msg, AS_HB_MSG_COMPRESSED_PAYLOAD)) {
+			// This is a compressed message.
+			return channel_compressed_message_parse(msg, buffer,
+					buffer_content_len);
+		}
+
+		// This is an uncompressed message. Copying the buffer content to
+		// ensure that the message and the buffer can have separate life
+		// cycles and we never get into races.
+		// The frequency of heartbeat messages is low enough to make this not
+		// matter much unless we have massive clusters.
+		msg_preserve_all_fields(msg);
+	}
+
+	return parsed ?
+			AS_HB_CHANNEL_MSG_READ_SUCCESS : AS_HB_CHANNEL_MSG_PARSE_FAIL;
+}
+
+/**
+ * Iterate over an endpoint list and see if there is a matching socket address.
+ */
+static void
+channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* udata)
+{
+	cf_sock_addr sock_addr;
+	as_hb_channel_endpoint_iterate_udata* iterate_data =
+			(as_hb_channel_endpoint_iterate_udata*)udata;
+	if (as_endpoint_to_sock_addr(endpoint, &sock_addr) != 0) {
+		return;
+	}
+
+	if (cf_sock_addr_is_any(&sock_addr)) {
+		return;
+	}
+
+	iterate_data->found |= (cf_sock_addr_compare(&sock_addr,
+			iterate_data->addr_to_search) == 0);
+}
+
+/**
+ * Reduce function to find a matching endpoint.
+ */
+static int
+channel_endpoint_search_reduce(const void* key, void* data, void* udata)
+{
+	cf_socket** socket = (cf_socket**)key;
+	as_hb_channel* channel = (as_hb_channel*)data;
+	as_hb_channel_endpoint_reduce_udata* endpoint_reduce_udata =
+			(as_hb_channel_endpoint_reduce_udata*)udata;
+
+	as_hb_channel_endpoint_iterate_udata iterate_udata;
+	iterate_udata.addr_to_search = &channel->endpoint_addr;
+	iterate_udata.found = false;
+
+	as_endpoint_list_iterate(endpoint_reduce_udata->endpoint_list,
+			channel_endpoint_find_iterate_fn, &iterate_udata);
+
+	if (iterate_udata.found) {
+		endpoint_reduce_udata->found = true;
+		endpoint_reduce_udata->socket = *socket;
+		// Stop the reduce, we have found a match.
+		return CF_SHASH_ERR_FOUND;
+	}
+
+	return CF_SHASH_OK;
+}
+
+/**
+ * Indicates if any endpoint from the input endpoint list is already connected.
+ * @param endpoint_list the endpoint list to check.
+ * @return true if at least one endpoint is already connected to, false
+ * otherwise.
+ */
+static bool
+channel_endpoint_is_connected(as_endpoint_list* endpoint_list)
+{
+	CHANNEL_LOCK();
+	// Linear search. This will in practice not be a very frequent operation.
+	as_hb_channel_endpoint_reduce_udata udata;
+	memset(&udata, 0, sizeof(udata));
+	udata.endpoint_list = endpoint_list;
+
+	cf_shash_reduce(g_hb.channel_state.socket_to_channel,
+			channel_endpoint_search_reduce, &udata);
+
+	CHANNEL_UNLOCK();
+	return udata.found;
+}
+
+/**
+ * Read a message from the multicast socket.
+ *
+ * @param socket the multicast socket to read from.
+ * @param msg the message to read into.
+ *
+ * @return the status of the read operation.
+ */
+static as_hb_channel_msg_read_status
+channel_multicast_msg_read(cf_socket* socket, msg* msg)
+{
+	CHANNEL_LOCK();
+
+	as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF;
+
+	int buffer_len = MAX(hb_mtu(), STACK_ALLOC_LIMIT);
+	uint8_t* buffer = MSG_BUFF_ALLOC(buffer_len);
+
+	if (!buffer) {
+		WARNING(
+				"error allocating space for multicast recv buffer of size %d on fd %d",
+				buffer_len, CSFD(socket));
+		goto Exit;
+	}
+
+	cf_sock_addr from;
+
+	int num_rcvd = cf_socket_recv_from(socket, buffer, buffer_len, 0, &from);
+
+	if (num_rcvd <= 0) {
+		DEBUG("multicast packet read failed on fd %d", CSFD(socket));
+		rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL;
+		goto Exit;
+	}
+
+	rv = channel_message_parse(msg, buffer, num_rcvd);
+	if (rv != AS_HB_CHANNEL_MSG_READ_SUCCESS) {
+		goto Exit;
+	}
+
+	rv = AS_HB_CHANNEL_MSG_READ_SUCCESS;
+
+Exit:
+	MSG_BUFF_FREE(buffer, buffer_len);
+
+	CHANNEL_UNLOCK();
+	return rv;
+}
+
+/**
+ * Read a message from a tcp mesh socket.
+ *
+ * @param socket the tcp socket to read from.
+ * @param msg the message to read into.
+ *
+ * @return status of the read operation.
+ */
+static as_hb_channel_msg_read_status
+channel_mesh_msg_read(cf_socket* socket, msg* msg)
+{
+	CHANNEL_LOCK();
+
+	uint32_t buffer_len = 0;
+	uint8_t* buffer = NULL;
+
+	as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF;
+	uint8_t len_buff[MSG_WIRE_LENGTH_SIZE];
+
+	if (cf_socket_recv_all(socket, len_buff, MSG_WIRE_LENGTH_SIZE, 0,
+			MESH_RW_TIMEOUT) < 0) {
+		WARNING("mesh size recv failed fd %d : %s", CSFD(socket),
+				cf_strerror(errno));
+		rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL;
+		goto Exit;
+	}
+
+	buffer_len = ntohl(*((uint32_t*)len_buff)) + 6;
+
+	buffer = MSG_BUFF_ALLOC(buffer_len);
+
+	if (!buffer) {
+		WARNING(
+				"error allocating space for mesh recv buffer of size %d on fd %d",
+				buffer_len, CSFD(socket));
+		goto Exit;
+	}
+
+	memcpy(buffer, len_buff, MSG_WIRE_LENGTH_SIZE);
+
+	if (cf_socket_recv_all(socket, buffer + MSG_WIRE_LENGTH_SIZE,
+			buffer_len - MSG_WIRE_LENGTH_SIZE, 0, MESH_RW_TIMEOUT) < 0) {
+		DETAIL("mesh recv failed fd %d : %s", CSFD(socket), cf_strerror(errno));
+		rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL;
+		goto Exit;
+	}
+
+	DETAIL("mesh recv success fd %d message size %d", CSFD(socket), buffer_len);
+
+	rv = channel_message_parse(msg, buffer, buffer_len);
+
+Exit:
+	MSG_BUFF_FREE(buffer, buffer_len);
+
+	CHANNEL_UNLOCK();
+	return rv;
+}
+
+/**
+ * Associate a socket with a nodeid and notify listeners about a node being
+ * connected, effective only for mesh channels.
+ *
+ * For multicast channels this function is a no-op. The reason being additional
+ * machinery would be required to clean up the node to channel mapping on node
+ * expiry.
+ *
+ * @param socket the socket.
+ * @param channel the channel to associate.
+ * @param nodeid the nodeid associated with this socket.
+ */
+static void
+channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid)
+{
+	// For now node to socket mapping is not maintained for multicast channels.
+	if (channel->is_multicast) {
+		return;
+	}
+
+	CHANNEL_LOCK();
+
+	// Update the node information for the channel.
+	// This is the first time this node has a connection. Record the mapping.
+	cf_shash_put(g_hb.channel_state.nodeid_to_socket, &nodeid, &socket);
+
+	channel->nodeid = nodeid;
+	cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, channel);
+
+	DEBUG("attached fd %d to node %" PRIx64, CSFD(socket), nodeid);
+
+	CHANNEL_UNLOCK();
+
+	// Publish an event to let others know that a new node has a channel now.
+	as_hb_channel_event node_connected_event;
+	channel_event_init(&node_connected_event);
+	node_connected_event.nodeid = nodeid;
+	node_connected_event.type = AS_HB_CHANNEL_NODE_CONNECTED;
+	channel_event_queue(&node_connected_event);
+}
+
+/**
+ * Indicates if a channel should be allowed to continue to win and live because
+ * of a winning grace period.
+ */
+static bool
+channel_socket_should_live(cf_socket* socket, as_hb_channel* channel)
+{
+	if (channel->resolution_win_ts > 0
+			&& channel->resolution_win_ts + channel_win_grace_ms()
+					> cf_getms()) {
+		// Losing socket was a previous winner. Allow it time to do some work
+		// before knocking it off.
+		INFO("giving unresolved fd %d some grace time", CSFD(socket));
+		return true;
+	}
+	return false;
+}
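+
+/*
+ * Illustrative note (not part of this change): the resolution rule below is
+ * deterministic on both ends. Say node A (nodeid 0xA) and node B (nodeid 0xB)
+ * connect to each other simultaneously, creating two redundant channels. For
+ * the channel accepted by A the acceptor nodeid is 0xA; for the channel
+ * accepted by B it is 0xB. Both nodes keep the channel with the higher
+ * acceptor nodeid - the connection B accepted - and drop the other, so
+ * exactly one connection survives without extra coordination.
+ */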
+ * + * @param socket1 one of the sockets + * @param socket2 one of the sockets + * @return resolved socket on success, NULL if resolution fails. + */ +static cf_socket* +channel_socket_resolve(cf_socket* socket1, cf_socket* socket2) +{ + cf_socket* rv = NULL; + CHANNEL_LOCK(); + + DEBUG("resolving between fd %d and %d", CSFD(socket1), CSFD(socket2)); + + as_hb_channel channel1; + if (channel_get_channel(socket1, &channel1) < 0) { + // Should not happen in practice. + WARNING("resolving fd %d without channel", CSFD(socket1)); + rv = socket2; + goto Exit; + } + + as_hb_channel channel2; + if (channel_get_channel(socket2, &channel2) < 0) { + // Should not happen in practice. + WARNING("resolving fd %d without channel", CSFD(socket2)); + rv = socket1; + goto Exit; + } + + if (channel_socket_should_live(socket1, &channel1)) { + rv = socket1; + goto Exit; + } + + if (channel_socket_should_live(socket2, &channel2)) { + rv = socket2; + goto Exit; + } + + cf_node remote_nodeid = + channel1.nodeid != 0 ? channel1.nodeid : channel2.nodeid; + + if (remote_nodeid == 0) { + // Should not happen in practice. + WARNING("remote node id unknown for fds %d and %d", CSFD(socket1), + CSFD(socket2)); + rv = NULL; + goto Exit; + } + + // Choose the socket with the highest acceptor nodeid. + cf_node acceptor_nodeid1 = + channel1.is_inbound ? config_self_nodeid_get() : remote_nodeid; + cf_node acceptor_nodeid2 = + channel2.is_inbound ? config_self_nodeid_get() : remote_nodeid; + + as_hb_channel* winner_channel = NULL; + cf_socket* winner_socket = NULL; + if (acceptor_nodeid1 > acceptor_nodeid2) { + winner_channel = &channel1; + winner_socket = socket1; + } + else if (acceptor_nodeid1 < acceptor_nodeid2) { + winner_channel = &channel2; + winner_socket = socket2; + } + else { + // Both connections have the same acceptor. Should not happen in + // practice. Despair and report resolution failure. + INFO( + "found redundant connections to same node, fds %d %d - choosing at random", + CSFD(socket1), CSFD(socket2)); + + if (cf_getms() % 2 == 0) { + winner_channel = &channel1; + winner_socket = socket1; + } + else { + winner_channel = &channel2; + winner_socket = socket2; + } + } + + cf_clock now = cf_getms(); + if (winner_channel->resolution_win_ts == 0) { + winner_channel->resolution_win_ts = now; + // Update the winning count of the winning channel in the channel data + // structures. + cf_shash_put(g_hb.channel_state.socket_to_channel, &winner_socket, + winner_channel); + } + + if (winner_channel->resolution_win_ts > now + channel_win_grace_ms()) { + // The winner has been winning a lot, most likely the other side has us + // with a seed address different from our published address. + // + // Break the cycle here and choose the loosing channel as the winner. + INFO("breaking socket resolve loop dropping winning fd %d", + CSFD(winner_socket)); + winner_channel = (winner_channel == &channel1) ? &channel2 : &channel1; + winner_socket = (socket1 == winner_socket) ? socket2 : socket1; + } + + rv = winner_socket; + +Exit: + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Basic sanity check for a message. + * @param msg_event the message event. + * @return 0 if the message passes basic sanity tests. -1 on failure. 
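+ *
+ * The checks, in order: presence of a source nodeid, a matching heartbeat
+ * protocol identifier, a message type, an advertised endpoint list (mesh
+ * only), an HLC send timestamp and, for pulse messages, a matching cluster
+ * name.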
+ */ +static int +channel_msg_sanity_check(as_hb_channel_event* msg_event) +{ + msg* msg = msg_event->msg; + uint32_t id = 0; + + as_hb_msg_type type = 0; + cf_node src_nodeid = 0; + + int rv = 0; + + if (msg_nodeid_get(msg, &src_nodeid) != 0) { + TICKER_WARNING("received message without a source node"); + rv = -1; + } + + // Validate the fact that we have a valid source nodeid. + if (src_nodeid == 0) { + // Event nodeid is zero. Not a valid source nodeid. This will happen in + // compatibility mode if the info request from a new node arrives before + // the pulse message. Can be ignored. + TICKER_WARNING("received a message from node with unknown nodeid"); + rv = -1; + } + + if (msg_id_get(msg, &id) != 0) { + TICKER_WARNING( + "received message without heartbeat protocol identifier from node %" PRIx64, + src_nodeid); + rv = -1; + } + else { + DETAIL( + "received message with heartbeat protocol identifier %d from node %" PRIx64, + id, src_nodeid); + + // Ignore the message if the protocol of the incoming message does not + // match. + if (id != hb_protocol_identifier_get()) { + TICKER_WARNING( + "received message with different heartbeat protocol identifier from node %" PRIx64, + src_nodeid); + rv = -1; + } + } + + if (msg_type_get(msg, &type) != 0) { + TICKER_WARNING( + "received message without message type from node %" PRIx64, + src_nodeid); + rv = -1; + } + + as_endpoint_list* endpoint_list; + if (hb_is_mesh()) { + // Check only applies to v3 mesh. + // v3 multicast protocol does not advertise endpoint list. + if (msg_endpoint_list_get(msg, &endpoint_list) != 0 + || endpoint_list->n_endpoints <= 0) { + TICKER_WARNING( + "received message without address/port from node %" PRIx64, + src_nodeid); + rv = -1; + } + } + + as_hlc_timestamp send_ts; + if (msg_send_hlc_ts_get(msg, &send_ts) != 0) { + TICKER_WARNING("received message without HLC time from node %" PRIx64, + src_nodeid); + rv = -1; + } + + if (type == AS_HB_MSG_TYPE_PULSE) { + char* remote_cluster_name = NULL; + if (msg_cluster_name_get(msg, &remote_cluster_name) != 0) { + remote_cluster_name = ""; + } + + if (!as_config_cluster_name_matches(remote_cluster_name)) { + // Generate cluster-name mismatch event. + as_hb_channel_event mismatch_event; + channel_event_init(&mismatch_event); + + // Notify hb about cluster-name mismatch. + mismatch_event.type = AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH; + mismatch_event.nodeid = src_nodeid; + mismatch_event.msg = NULL; + memcpy(&mismatch_event.msg_hlc_ts, &msg_event->msg_hlc_ts, + sizeof(msg_event->msg_hlc_ts)); + + channel_event_queue(&mismatch_event); + + TICKER_WARNING("ignoring message from %"PRIX64" with different cluster name(%s)", + src_nodeid, remote_cluster_name[0] == '\0' ? "null" : remote_cluster_name ); + rv = -1; + } + } + + DETAIL("received message of type %d from node %" PRIx64, type, src_nodeid); + + return rv; +} + +/** + * Process incoming message to possibly update channel state. + * + * @param socket the socket on which the message is received. + * @param event the message wrapped around in a channel event. + * @return 0 if the message can be further processed, -1 if the message should + * be discarded. + */ +static int +channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event) +{ + // Basic sanity check for the inbound message. 
+ if (channel_msg_sanity_check(event) != 0) { + DETAIL("sanity check failed for message on fd %d", CSFD(socket)); + return -1; + } + + int rv = -1; + CHANNEL_LOCK(); + + as_hb_channel channel; + if (channel_get_channel(socket, &channel) < 0) { + // This is a bug and should not happen. Be paranoid and try fixing it ? + WARNING("received a message on an unregistered fd %d - closing the fd", + CSFD(socket)); + channel_socket_close_queue(socket, false, true); + rv = -1; + goto Exit; + } + + if (channel.is_multicast) { + rv = 0; + goto Exit; + } + + cf_node nodeid = event->nodeid; + + if (channel.nodeid != 0 && channel.nodeid != nodeid) { + // The event nodeid does not match previously know event id. Something + // seriously wrong here. + WARNING("received a message from node with incorrect nodeid - expected %" PRIx64 " received %" PRIx64 "on fd %d", + channel.nodeid, nodeid, CSFD(socket)); + rv = -1; + goto Exit; + } + + // Update the last received time for this node + channel.last_received = cf_getms(); + + cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel); + + cf_socket* existing_socket; + int get_result = cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid, + &existing_socket); + + if (get_result == CF_SHASH_ERR_NOT_FOUND) { + // Associate this socket with the node. + channel_node_attach(socket, &channel, nodeid); + } + else if (existing_socket != socket) { + // Somehow the other node and this node discovered each other together + // both connected via two tcp connections. Choose one and close the + // other. + cf_socket* resolved = channel_socket_resolve(socket, existing_socket); + + if (!resolved) { + DEBUG( + "resolving between fd %d and %d failed - closing both connections", + CSFD(socket), CSFD(existing_socket)); + + // Resolution failed. Should not happen but there is a window where + // the same node initiated two connections. + // Close both connections and try again. + channel_socket_close_queue(socket, false, true); + channel_socket_close_queue(existing_socket, false, true); + + // Nothing wrong with the message. Let it through. + rv = 0; + goto Exit; + } + + DEBUG("resolved fd %d between redundant fd %d and %d for node %" PRIx64, + CSFD(resolved), CSFD(socket), CSFD(existing_socket), nodeid); + + if (resolved == existing_socket) { + // The node to socket mapping is correct, just close this socket and + // this node will still be connected to the remote node. Do not + // raise any event for this closure. + channel_socket_close_queue(socket, false, false); + } + else { + // We need to close the existing socket. Disable channel events + // because we make the node appear to be not connected. Do not raise + // any event for this closure. + channel_socket_close_queue(existing_socket, false, false); + // Associate this socket with the node. + channel_node_attach(socket, &channel, nodeid); + } + } + + rv = 0; + +Exit: + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Read a message from a socket that has data. + * @param socket the socket having data to be read. + */ +static void +channel_msg_read(cf_socket* socket) +{ + CHANNEL_LOCK(); + + as_hb_channel_msg_read_status status; + as_hb_channel channel; + + bool free_msg = true; + + msg* msg = hb_msg_get(); + + if (channel_get_channel(socket, &channel) != 0) { + // Would happen if the channel was closed in the same epoll loop. 
+ DEBUG("error the channel does not exist for fd %d", CSFD(socket)); + goto Exit; + } + + if (channel.is_multicast) { + status = channel_multicast_msg_read(socket, msg); + } + else { + status = channel_mesh_msg_read(socket, msg); + } + + switch (status) { + case AS_HB_CHANNEL_MSG_READ_SUCCESS: { + break; + } + + case AS_HB_CHANNEL_MSG_PARSE_FAIL: { + TICKER_WARNING("unable to parse heartbeat message on fd %d", + CSFD(socket)); + goto Exit; + } + + case AS_HB_CHANNEL_MSG_CHANNEL_FAIL: // Falling through + default: { + DEBUG("could not read message from fd %d", CSFD(socket)); + if (!channel.is_multicast) { + // Shut down only mesh socket. + channel_socket_shutdown(socket); + } + goto Exit; + } + } + + as_hb_channel_event event; + channel_event_init(&event); + + if (msg_get_uint64(msg, AS_HB_MSG_NODE, &event.nodeid) < 0) { + // Node id missing from the message. Assume this message to be corrupt. + TICKER_WARNING("message with invalid nodeid received on fd %d", + CSFD(socket)); + goto Exit; + } + + event.msg = msg; + event.type = AS_HB_CHANNEL_MSG_RECEIVED; + + // Update hlc and store update message timestamp for the event. + as_hlc_timestamp send_ts = 0; + msg_send_hlc_ts_get(msg, &send_ts); + as_hlc_timestamp_update(event.nodeid, send_ts, &event.msg_hlc_ts); + + // Process received message to update channel state. + if (channel_msg_event_process(socket, &event) == 0) { + // The message needs to be delivered to the listeners. Prevent a free. + free_msg = false; + channel_event_queue(&event); + } + +Exit: + CHANNEL_UNLOCK(); + + // release the message. + if (free_msg) { + hb_msg_return(msg); + } +} + +/** + * Reduce function to remove faulty channels / nodes. Shutdown associated socket + * to have channel tender cleanup. + */ +static int +channel_channels_tend_reduce(const void* key, void* data, void* udata) +{ + cf_socket** socket = (cf_socket**)key; + as_hb_channel* channel = (as_hb_channel*)data; + + DETAIL("tending channel fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s", + CSFD(*socket), channel->nodeid, channel->last_received, + cf_sock_addr_print(&channel->endpoint_addr)); + + if (channel->last_received + CHANNEL_NODE_READ_IDLE_TIMEOUT() + < cf_getms()) { + // Shutdown associated socket if it is not a multicast socket. + if (!channel->is_multicast) { + DEBUG("channel shutting down idle fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s", + CSFD(*socket), channel->nodeid, channel->last_received, + cf_sock_addr_print(&channel->endpoint_addr)); + channel_socket_shutdown(*socket); + } + } + + return CF_SHASH_OK; +} + +/** + * Tend channel specific node information to remove channels that are faulty (or + * TODO: attached to misbehaving nodes). + */ +static void +channel_channels_idle_check() +{ + CHANNEL_LOCK(); + + cf_clock now = cf_getms(); + if (g_hb.channel_state.last_channel_idle_check + CHANNEL_IDLE_CHECK_PERIOD + <= now) { + cf_shash_reduce(g_hb.channel_state.socket_to_channel, + channel_channels_tend_reduce, NULL); + g_hb.channel_state.last_channel_idle_check = now; + } + + CHANNEL_UNLOCK(); +} + +/** + * Socket tending thread. Manages heartbeat receive as well. 
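+ * Each iteration polls the registered sockets, accepts new mesh connections,
+ * reads messages that are ready, expires idle channels, closes queued-up
+ * sockets and publishes pending channel events outside the channel lock.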
+ */ +void* +channel_tender(void* arg) +{ + DETAIL("channel tender started"); + + while (channel_is_running()) { + cf_poll_event events[POLL_SZ]; + int32_t nevents = cf_poll_wait(g_hb.channel_state.poll, events, POLL_SZ, + AS_HB_TX_INTERVAL_MS_MIN); + + DETAIL("tending channel"); + + for (int32_t i = 0; i < nevents; i++) { + cf_socket* socket = events[i].data; + if (channel_cf_sockets_contains( + g_hb.channel_state.listening_sockets, socket) + && hb_is_mesh()) { + // Accept a new connection. + channel_accept_connection(socket); + } + else if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { + channel_socket_close_queue(socket, true, true); + } + else if (events[i].events & EPOLLIN) { + // Read a message for the socket that is ready. + channel_msg_read(socket); + } + } + + // Tend channels to discard stale channels. + channel_channels_idle_check(); + + // Close queued up socket. + channel_socket_close_pending(); + + // Publish pending events. Should be outside channel lock. + channel_event_publish_pending(); + + DETAIL("done tending channel"); + } + + DETAIL("channel tender shut down"); + return NULL; +} + +/* + * ---------------------------------------------------------------------------- + * Channel public API + * ---------------------------------------------------------------------------- + */ + +/** + * Filter out endpoints not matching this node's capabilities. + */ +static bool +channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata) +{ + if ((cf_ip_addr_legacy_only()) + && endpoint->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6) { + return false; + } + + // If we don't offer TLS, then we won't connect via TLS, either. + if (g_config.hb_tls_serv_spec.bind_port == 0 + && as_endpoint_capability_is_supported(endpoint, + AS_ENDPOINT_TLS_MASK)) { + return false; + } + + return true; +} + +/** + * Try and connect to a set of endpoint_lists. + */ +static void +channel_mesh_channel_establish(as_endpoint_list** endpoint_lists, + int endpoint_list_count) +{ + for (int i = 0; i < endpoint_list_count; i++) { + char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; + as_endpoint_list_to_string(endpoint_lists[i], endpoint_list_str, + sizeof(endpoint_list_str)); + + if (channel_endpoint_is_connected(endpoint_lists[i])) { + DEBUG( + "duplicate endpoint connect request - ignoring endpoint list {%s}", + endpoint_list_str); + continue; + } + + DEBUG("attempting to connect mesh host at {%s}", endpoint_list_str); + + cf_socket* sock = (cf_socket*)cf_malloc(sizeof(cf_socket)); + + const as_endpoint* connected_endpoint = as_endpoint_connect_any( + endpoint_lists[i], channel_mesh_endpoint_filter, NULL, + CONNECT_TIMEOUT(), sock); + + if (connected_endpoint) { + cf_atomic_int_incr(&g_stats.heartbeat_connections_opened); + + cf_sock_addr endpoint_addr; + memset(&endpoint_addr, 0, sizeof(endpoint_addr)); + cf_ip_addr_set_any(&endpoint_addr.addr); + if (as_endpoint_to_sock_addr(connected_endpoint, &endpoint_addr) + != 0) { + // Should never happen in practice. 
+ WARNING("error converting endpoint to socket address"); + channel_socket_destroy(sock); + sock = NULL; + + cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); + continue; + } + + if (as_endpoint_capability_is_supported(connected_endpoint, + AS_ENDPOINT_TLS_MASK)) { + tls_socket_prepare_client(g_config.hb_config.tls, sock); + + if (tls_socket_connect_block(sock) != 1) { + WARNING("heartbeat TLS client handshake with {%s} failed", + endpoint_list_str); + channel_socket_destroy(sock); + sock = NULL; + + cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); + return; + } + } + + channel_socket_register(sock, false, false, &endpoint_addr); + } + else { + TICKER_WARNING("could not create heartbeat connection to node {%s}", + endpoint_list_str); + if (sock) { + cf_free(sock); + sock = NULL; + } + } + } +} + +/** + * Disconnect a node from the channel list. + * @param nodeid the nodeid of the node whose channel should be disconnected. + * @return 0 if the node had a channel and was disconnected. -1 otherwise. + */ +static int +channel_node_disconnect(cf_node nodeid) +{ + int rv = -1; + + CHANNEL_LOCK(); + + cf_socket* socket; + if (channel_socket_get(nodeid, &socket) != 0) { + // not found + rv = -1; + goto Exit; + } + + DEBUG("disconnecting the channel attached to node %" PRIx64, nodeid); + + channel_socket_close_queue(socket, false, true); + + rv = 0; + +Exit: + CHANNEL_UNLOCK(); + + return rv; +} + +/** + * Register mesh listening sockets. + */ +static void +channel_mesh_listening_socks_register(cf_sockets* listening_sockets) +{ + CHANNEL_LOCK(); + g_hb.channel_state.listening_sockets = listening_sockets; + + cf_poll_add_sockets(g_hb.channel_state.poll, + g_hb.channel_state.listening_sockets, + EPOLLIN | EPOLLERR | EPOLLHUP); + cf_socket_show_server(AS_HB, "mesh heartbeat", + g_hb.channel_state.listening_sockets); + + // We do not need a separate channel to cover this socket because IO will + // not happen on these sockets. + CHANNEL_UNLOCK(); +} + +/** + * Deregister mesh listening socket from epoll event. + * @param socket the listening socket socket. + */ +static void +channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets) +{ + CHANNEL_LOCK(); + cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets); + CHANNEL_UNLOCK(); +} + +/** + * Register the multicast listening socket. + * @param socket the listening socket. + * @param endpoint the endpoint on which multicast io happens. + */ +static void +channel_multicast_listening_socks_register(cf_sockets* listening_sockets) +{ + CHANNEL_LOCK(); + g_hb.channel_state.listening_sockets = listening_sockets; + + // Create a new multicast channel for each multicast socket. + for (uint32_t i = 0; + i < g_hb.mode_state.multicast_state.listening_sockets.n_socks; + ++i) { + channel_socket_register(&g_hb.channel_state.listening_sockets->socks[i], + true, false, NULL); + } + + cf_socket_mcast_show(AS_HB, "multicast heartbeat", + g_hb.channel_state.listening_sockets); + CHANNEL_UNLOCK(); +} + +/** + * Deregister multicast listening socket from epoll event. + * @param socket the listening socket socket. + */ +static void +channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets) +{ + CHANNEL_LOCK(); + cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets); + CHANNEL_UNLOCK(); +} + +/** + * Initialize the channel sub module. + */ +static void +channel_init() +{ + CHANNEL_LOCK(); + + // Disable events till initialization is complete. 
+ channel_events_enabled_set(false); + + // Initialize unpublished event queue. + cf_queue_init(&g_hb.channel_state.events_queue, sizeof(as_hb_channel_event), + AS_HB_CLUSTER_MAX_SIZE_SOFT, true); + + // Initialize sockets to close queue. + cf_queue_init(&g_hb.channel_state.socket_close_queue, + sizeof(as_hb_channel_socket_close_entry), + AS_HB_CLUSTER_MAX_SIZE_SOFT, true); + + // Initialize the nodeid to socket hash. + g_hb.channel_state.nodeid_to_socket = cf_shash_create(cf_nodeid_shash_fn, + sizeof(cf_node), sizeof(cf_socket*), AS_HB_CLUSTER_MAX_SIZE_SOFT, + 0); + + // Initialize the socket to channel state hash. + g_hb.channel_state.socket_to_channel = cf_shash_create(hb_socket_hash_fn, + sizeof(cf_socket*), sizeof(as_hb_channel), + AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); + + g_hb.channel_state.status = AS_HB_STATUS_STOPPED; + + CHANNEL_UNLOCK(); +} + +/** + * Start channel sub module. Kicks off the channel tending thread. + */ +static void +channel_start() +{ + CHANNEL_LOCK(); + + if (channel_is_running()) { + WARNING("heartbeat channel already started"); + goto Exit; + } + + // create the epoll socket. + cf_poll_create(&g_hb.channel_state.poll); + + DEBUG("created epoll fd %d", CEFD(g_hb.channel_state.poll)); + + // Disable events till initialization is complete. + channel_events_enabled_set(false); + + // Data structures have been initialized. + g_hb.channel_state.status = AS_HB_STATUS_RUNNING; + + // Initialization complete enable events. + channel_events_enabled_set(true); + + // Start the channel tender. + if (pthread_create(&g_hb.channel_state.channel_tender_tid, 0, + channel_tender, &g_hb) != 0) { + CRASH("could not create channel tender thread: %s", cf_strerror(errno)); + } + +Exit: + CHANNEL_UNLOCK(); +} + +/** + * Get all sockets. + */ +static int +channel_sockets_get_reduce(const void* key, void* data, void* udata) +{ + cf_vector* sockets = (cf_vector*)udata; + cf_vector_append(sockets, key); + return CF_SHASH_OK; +} + +/** + * Stop the channel sub module called on hb_stop. + */ +static void +channel_stop() +{ + if (!channel_is_running()) { + WARNING("heartbeat channel already stopped"); + return; + } + + DEBUG("stopping the channel"); + + // Unguarded state change but this should be OK. + g_hb.channel_state.status = AS_HB_STATUS_SHUTTING_DOWN; + + // Wait for the channel tender thread to finish. + pthread_join(g_hb.channel_state.channel_tender_tid, NULL); + + CHANNEL_LOCK(); + + cf_vector sockets; + cf_socket buff[cf_shash_get_size(g_hb.channel_state.socket_to_channel)]; + cf_vector_init_smalloc(&sockets, sizeof(cf_socket*), (uint8_t*)buff, + sizeof(buff), VECTOR_FLAG_INITZERO); + + cf_shash_reduce(g_hb.channel_state.socket_to_channel, + channel_sockets_get_reduce, &sockets); + + channel_sockets_close(&sockets); + + // Disable events. + channel_events_enabled_set(false); + + cf_vector_destroy(&sockets); + + // Close epoll socket. + cf_poll_destroy(g_hb.channel_state.poll); + EFD(g_hb.channel_state.poll) = -1; + + // Disable the channel thread. + g_hb.channel_state.status = AS_HB_STATUS_STOPPED; + + DEBUG("channel Stopped"); + + CHANNEL_UNLOCK(); +} + +/** + * Send heartbeat protocol message retries in case of EAGAIN and EWOULDBLOCK + * @param socket the socket to send the buffer over. + * @param buff the data buffer. + * @param buffer_length the number of bytes in the buffer to send. 
+ * @return 0 on successful send -1 on failure + */ +static int +channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length) +{ + CHANNEL_LOCK(); + int rv; + + if (cf_socket_send_all(socket, buff, buffer_length, 0, + MESH_RW_TIMEOUT) < 0) { + as_hb_channel channel; + if (channel_get_channel(socket, &channel) == 0) { + // Would happen if the channel was closed in the same epoll loop. + TICKER_WARNING("sending mesh message to %"PRIx64" on fd %d failed : %s", + channel.nodeid, CSFD(socket), cf_strerror(errno)); + } + else { + TICKER_WARNING("sending mesh message on fd %d failed : %s", + CSFD(socket), cf_strerror(errno)); + } + + channel_socket_shutdown(socket); + rv = -1; + } + else { + rv = 0; + } + + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Send heartbeat protocol message retries in case of EAGAIN and EWOULDBLOCK + * @param socket the socket to send the buffer over. + * @param buff the data buffer. + * @param buffer_length the number of bytes in the buffer to send. + * @return 0 on successful send -1 on failure + */ +static int +channel_multicast_msg_send(cf_socket* socket, uint8_t* buff, + size_t buffer_length) +{ + CHANNEL_LOCK(); + int rv = 0; + DETAIL("sending udp heartbeat to fd %d: msg size %zu", CSFD(socket), + buffer_length); + + int mtu = hb_mtu(); + if (buffer_length > mtu) { + TICKER_WARNING("mtu breach, sending udp heartbeat to fd %d: mtu %d", + CSFD(socket), mtu); + } + + cf_msock_cfg* socket_cfg = (cf_msock_cfg*)(socket->cfg); + cf_sock_addr dest; + dest.port = socket_cfg->port; + cf_ip_addr_copy(&socket_cfg->addr, &dest.addr); + + if (cf_socket_send_to(socket, buff, buffer_length, 0, &dest) < 0) { + TICKER_WARNING("multicast message send failed on fd %d %s", + CSFD(socket), cf_strerror(errno)); + rv = -1; + } + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Indicates if this msg requires compression. + */ +static bool +channel_msg_is_compression_required(msg* msg, int wire_size, int mtu) +{ + return wire_size > msg_compression_threshold(mtu); +} + +/** + * Estimate the size of the buffer required to fill out the serialized message. + * @param msg the input message. + * @param mtu the underlying network mtu. + * @return the size of the buffer required. + */ +static int +channel_msg_buffer_size_get(int wire_size, int mtu) +{ + return round_up_pow2(MAX(wire_size, compressBound(wire_size))); +} + +/** + * Fills the buffer with the serialized message. + * @param original_msg the original message to serialize. + * @param wire_size the message wire size. + * @param mtu the underlying network mtu. + * @param buffer the destination buffer. + * @param buffer_len the buffer length. + * + * @return length of the serialized message. + */ +static size_t +channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu, + uint8_t* buffer, size_t buffer_len) +{ + // This is output by msg_to_wire. Using a separate variable so that we do + // not lose the actual buffer length needed for compression later on. + size_t msg_size = msg_to_wire(original_msg, buffer); + + if (channel_msg_is_compression_required(original_msg, msg_size, mtu)) { + // Compression is required. 
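+		// (For clarity: zlib's compress2() takes the destination capacity in
+		// compressed_msg_size and overwrites it with the actual compressed
+		// length on success; Z_BUF_ERROR means the compressed output would
+		// not fit in the destination buffer.)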
+ const size_t compressed_buffer_len = buffer_len; + uint8_t* compressed_buffer = MSG_BUFF_ALLOC_OR_DIE( + compressed_buffer_len, + "error allocating memory size %zu for compressing message", + compressed_buffer_len); + + size_t compressed_msg_size = compressed_buffer_len; + int compress_rv = compress2(compressed_buffer, &compressed_msg_size, + buffer, wire_size, Z_BEST_COMPRESSION); + + if (compress_rv == Z_BUF_ERROR) { + // Compression result going to be larger than original input buffer. + // Skip compression and try to send the message as is. + DETAIL( + "skipping compression - compressed size larger than input size %zu", + msg_size); + } + else { + msg* temp_msg = hb_msg_get(); + + msg_set_buf(temp_msg, AS_HB_MSG_COMPRESSED_PAYLOAD, + compressed_buffer, compressed_msg_size, MSG_SET_COPY); + msg_size = msg_to_wire(temp_msg, buffer); + + hb_msg_return(temp_msg); + } + + MSG_BUFF_FREE(compressed_buffer, compressed_buffer_len); + + } + + return msg_size; +} + +/** + * Send a message to a destination node. + */ +static int +channel_msg_unicast(cf_node dest, msg* msg) +{ + size_t buffer_len = 0; + uint8_t* buffer = NULL; + if (!hb_is_mesh()) { + // Can't send a unicast message in the multicast mode. + WARNING("ignoring sending unicast message in multicast mode"); + return -1; + } + + CHANNEL_LOCK(); + + int rv = -1; + cf_socket* connected_socket; + + if (channel_socket_get(dest, &connected_socket) != 0) { + DEBUG("failing message send to disconnected node %" PRIx64, dest); + rv = -1; + goto Exit; + } + + // Read the message to a buffer. + int mtu = hb_mtu(); + int wire_size = msg_get_wire_size(msg); + buffer_len = channel_msg_buffer_size_get(wire_size, mtu); + buffer = + MSG_BUFF_ALLOC_OR_DIE(buffer_len, + "error allocating memory size %zu for sending message to node %" PRIx64, + buffer_len, dest); + + size_t msg_size = channel_msg_buffer_fill(msg, wire_size, mtu, buffer, + buffer_len); + + // Send over the buffer. + rv = channel_mesh_msg_send(connected_socket, buffer, msg_size); + +Exit: + MSG_BUFF_FREE(buffer, buffer_len); + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Shash reduce function to walk over the socket to channel hash and broadcast + * the message in udata. + */ +static int +channel_msg_broadcast_reduce(const void* key, void* data, void* udata) +{ + CHANNEL_LOCK(); + cf_socket** socket = (cf_socket**)key; + as_hb_channel* channel = (as_hb_channel*)data; + as_hb_channel_buffer_udata* buffer_udata = + (as_hb_channel_buffer_udata*)udata; + + if (!channel->is_multicast) { + DETAIL( + "broadcasting message of length %zu on channel %d assigned to node %" PRIx64, + buffer_udata->buffer_len, CSFD(*socket), channel->nodeid); + + channel_mesh_msg_send(*socket, buffer_udata->buffer, + buffer_udata->buffer_len); + } + else { + channel_multicast_msg_send(*socket, buffer_udata->buffer, + buffer_udata->buffer_len); + } + + CHANNEL_UNLOCK(); + + return CF_SHASH_OK; +} + +/** + * Broadcast a message over all channels. + */ +static int +channel_msg_broadcast(msg* msg) +{ + CHANNEL_LOCK(); + + int rv = -1; + + // Read the message to a buffer. + int mtu = hb_mtu(); + int wire_size = msg_get_wire_size(msg); + size_t buffer_len = channel_msg_buffer_size_get(wire_size, mtu); + uint8_t* buffer = MSG_BUFF_ALLOC_OR_DIE(buffer_len, + "error allocating memory size %zu for sending broadcast message", + buffer_len); + + as_hb_channel_buffer_udata udata; + udata.buffer = buffer; + + // Note this is the length of buffer to send. 
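+	// It is serialized once and re-used for every channel in the reduce
+	// below - mesh channels send it over TCP, multicast channels over UDP.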
+ udata.buffer_len = channel_msg_buffer_fill(msg, wire_size, mtu, buffer, + buffer_len); + + cf_shash_reduce(g_hb.channel_state.socket_to_channel, + channel_msg_broadcast_reduce, &udata); + + MSG_BUFF_FREE(buffer, buffer_len); + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Clear all channel state. + */ +static void +channel_clear() +{ + if (!channel_is_stopped()) { + WARNING("attempted channel clear without stopping the channel"); + return; + } + + CHANNEL_LOCK(); + + // Free the unpublished event queue. + cf_queue_delete_all(&g_hb.channel_state.events_queue); + + // Delete nodeid to socket hash. + cf_shash_reduce(g_hb.channel_state.nodeid_to_socket, hb_delete_all_reduce, + NULL); + + // Delete the socket_to_channel hash. + cf_shash_reduce(g_hb.channel_state.socket_to_channel, hb_delete_all_reduce, + NULL); + + DETAIL("cleared channel information"); + CHANNEL_UNLOCK(); +} + +/** + * Reduce function to dump channel node info to log file. + */ +static int +channel_dump_reduce(const void* key, void* data, void* udata) +{ + cf_socket** socket = (cf_socket**)key; + as_hb_channel* channel = (as_hb_channel*)data; + + INFO("\tHB Channel (%s): node-id %" PRIx64 " fd %d endpoint %s polarity %s last-received %" PRIu64, + channel->is_multicast ? "multicast" : "mesh", channel->nodeid, + CSFD(*socket), (cf_sock_addr_is_any(&channel->endpoint_addr)) + ? "unknown" + : cf_sock_addr_print(&channel->endpoint_addr), + channel->is_inbound ? "inbound" : "outbound", + channel->last_received); + + return CF_SHASH_OK; +} + +/** + * Dump channel state to logs. + * @param verbose enables / disables verbose logging. + */ +static void +channel_dump(bool verbose) +{ + CHANNEL_LOCK(); + + INFO("HB Channel Count %d", + cf_shash_get_size(g_hb.channel_state.socket_to_channel)); + + if (verbose) { + cf_shash_reduce(g_hb.channel_state.socket_to_channel, + channel_dump_reduce, NULL); + } + + CHANNEL_UNLOCK(); +} + +/* + * ---------------------------------------------------------------------------- + * Mesh sub module. + * ---------------------------------------------------------------------------- + */ + +/** + * Is mesh running. + */ +static bool +mesh_is_running() +{ + MESH_LOCK(); + bool retval = + (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_RUNNING) ? + true : false; + MESH_UNLOCK(); + return retval; +} + +/** + * Is mesh stopped. + */ +static bool +mesh_is_stopped() +{ + MESH_LOCK(); + bool retval = + (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_STOPPED) ? + true : false; + MESH_UNLOCK(); + return retval; +} + +/** + * Refresh the mesh published endpoint list. + * @return 0 on successful list creation, -1 otherwise. + */ +static int +mesh_published_endpoint_list_refresh() +{ + int rv = -1; + MESH_LOCK(); + + // TODO: Add interface addresses change detection logic here as well. + if (g_hb.mode_state.mesh_state.published_endpoint_list != NULL + && g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only + == cf_ip_addr_legacy_only()) { + rv = 0; + goto Exit; + } + + // The global flag has changed, refresh the published address list. + if (g_hb.mode_state.mesh_state.published_endpoint_list) { + // Free the obsolete list. 
+		cf_free(g_hb.mode_state.mesh_state.published_endpoint_list);
+	}
+
+	const cf_serv_cfg* bind_cfg = config_bind_cfg_get();
+	cf_serv_cfg published_cfg;
+
+	config_bind_serv_cfg_expand(bind_cfg, &published_cfg,
+			g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only);
+
+	g_hb.mode_state.mesh_state.published_endpoint_list =
+			as_endpoint_list_from_serv_cfg(&published_cfg);
+
+	if (!g_hb.mode_state.mesh_state.published_endpoint_list) {
+		CRASH("error initializing mesh published address list");
+	}
+
+	g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only =
+			cf_ip_addr_legacy_only();
+
+	rv = 0;
+
+	char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+	as_endpoint_list_to_string(
+			g_hb.mode_state.mesh_state.published_endpoint_list,
+			endpoint_list_str, sizeof(endpoint_list_str));
+	INFO("updated heartbeat published address list to {%s}", endpoint_list_str);
+
+Exit:
+	MESH_UNLOCK();
+	return rv;
+}
+
+/**
+ * Read the published endpoint list via a callback. The callback pattern
+ * prevents access to the published list outside the mesh lock.
+ * @param process_fn the list process function. The list passed to the process
+ * function can be NULL.
+ * @param udata passed as is to the process function.
+ */
+static void
+mesh_published_endpoints_process(endpoint_list_process_fn process_fn,
+		void* udata)
+{
+	MESH_LOCK();
+
+	as_endpoint_list* rv = NULL;
+	if (mesh_published_endpoint_list_refresh()) {
+		WARNING("error creating mesh published endpoint list");
+		rv = NULL;
+	}
+	else {
+		rv = g_hb.mode_state.mesh_state.published_endpoint_list;
+	}
+
+	(process_fn)(rv, udata);
+
+	MESH_UNLOCK();
+}
+
+/**
+ * Convert mesh status to a string.
+ */
+static const char*
+mesh_node_status_string(as_hb_mesh_node_status status)
+{
+	static char* status_str[] = {
+		"active",
+		"pending",
+		"inactive",
+		"endpoint-unknown" };
+
+	if (status > AS_HB_MESH_NODE_STATUS_SENTINEL) {
+		return "corrupted";
+	}
+	return status_str[status];
+}
+
+/**
+ * Change the status of a mesh seed. Note: memset the seed to zero before
+ * calling status change for the first time.
+ */
+static void
+mesh_seed_status_change(as_hb_mesh_seed* seed,
+		as_hb_mesh_node_status new_status)
+{
+	seed->status = new_status;
+	seed->last_status_updated = cf_getms();
+}
+
+/**
+ * Destroy a mesh seed.
+ */
+static void
+mesh_seed_destroy(as_hb_mesh_seed* seed)
+{
+	MESH_LOCK();
+	if (seed->resolved_endpoint_list) {
+		cf_free(seed->resolved_endpoint_list);
+		seed->resolved_endpoint_list = NULL;
+	}
+	MESH_UNLOCK();
+}
+
+/**
+ * Fill the endpoint list for a mesh seed using the mesh seed hostname and port.
+ * @param seed the mesh seed.
+ * @return 0 on success. -1 if a valid endpoint list does not exist and it could
+ * not be generated.
+ */
+static int
+mesh_seed_endpoint_list_fill(as_hb_mesh_seed* seed)
+{
+	if (seed->resolved_endpoint_list != NULL
+			&& seed->resolved_endpoint_list->n_endpoints > 0) {
+		// A valid endpoint list already exists. For now we resolve only once.
+		return 0;
+	}
+
+	cf_clock now = cf_getms();
+	if (now
+			< seed->resolved_endpoint_list_ts
+					+ MESH_SEED_RESOLVE_ATTEMPT_INTERVAL()) {
+		// We have just resolved this seed entry unsuccessfully. Don't try again
+		// for some time.
+		return -1;
+	}
+
+	uint32_t n_resolved_addresses = CF_SOCK_CFG_MAX;
+	cf_ip_addr resolved_addresses[n_resolved_addresses];
+
+	// Resolve and get all IPv4/IPv6 IP addresses.
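+	// Stamp the attempt time before resolving, so that a failed lookup is
+	// not retried until MESH_SEED_RESOLVE_ATTEMPT_INTERVAL() has elapsed.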
+ seed->resolved_endpoint_list_ts = now; + if (cf_ip_addr_from_string_multi(seed->seed_host_name, resolved_addresses, + &n_resolved_addresses) != 0 || n_resolved_addresses == 0) { + TICKER_WARNING("failed resolving mesh seed hostname %s", + seed->seed_host_name); + + // Hostname resolution failed. + return -1; + } + + // Convert resolved addresses to an endpoint list. + cf_serv_cfg temp_serv_cfg; + cf_serv_cfg_init(&temp_serv_cfg); + + cf_sock_cfg sock_cfg; + cf_sock_cfg_init(&sock_cfg, + seed->seed_tls ? + CF_SOCK_OWNER_HEARTBEAT_TLS : CF_SOCK_OWNER_HEARTBEAT); + sock_cfg.port = seed->seed_port; + + for (int i = 0; i < n_resolved_addresses; i++) { + cf_ip_addr_copy(&resolved_addresses[i], &sock_cfg.addr); + if (cf_serv_cfg_add_sock_cfg(&temp_serv_cfg, &sock_cfg)) { + CRASH("error initializing resolved address list"); + } + + DETAIL("resolved mesh node hostname %s to %s", seed->seed_host_name, + cf_ip_addr_print(&resolved_addresses[i])); + } + + seed->resolved_endpoint_list = as_endpoint_list_from_serv_cfg( + &temp_serv_cfg); + return seed->resolved_endpoint_list != NULL ? 0 : -1; +} + +/** + * Find a mesh seed in the seed list that has an overlapping endpoint and return + * an internal pointer. Assumes this function is called within mesh lock to + * prevent invalidating the returned index after function return. + * + * @param endpoint_list the endpoint list to find the endpoint by. + * @return index to matching seed entry if found, else -1 + */ +static int +mesh_seed_endpoint_list_overlapping_find_unsafe(as_endpoint_list* endpoint_list) +{ + MESH_LOCK(); + + int match_index = -1; + if (!endpoint_list) { + // Null / empty endpoint list. + goto Exit; + } + cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; + int element_count = cf_vector_size(seeds); + for (int i = 0; i < element_count; i++) { + as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); + + // Ensure the seed hostname is resolved. + mesh_seed_endpoint_list_fill(seed); + + if (as_endpoint_lists_are_overlapping(endpoint_list, + seed->resolved_endpoint_list, true)) { + match_index = i; + break; + } + } + +Exit: + MESH_UNLOCK(); + return match_index; +} + +/** + * Remove a seed entry from the seed list. + * Assumes this function is called within mesh lock to prevent invalidating the + * used index during a function call. + * @param seed_index the index of the seed element. + * @return 0 on success -1 on failure. + */ +static int +mesh_seed_delete_unsafe(int seed_index) +{ + int rv = -1; + MESH_LOCK(); + cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; + if (seed_index >= 0) { + as_hb_mesh_seed* seed = cf_vector_getp(seeds, seed_index); + mesh_seed_destroy(seed); + rv = cf_vector_delete(seeds, seed_index); + if (rv == 0) { + INFO("removed mesh seed host:%s port %d", seed->seed_host_name, + seed->seed_port); + } + } + MESH_UNLOCK(); + return rv; +} + +/** + * Find a mesh seed in the seed list with exactly matching hostname and port. + * Assumes this function is called within mesh lock to prevent invalidating the + * returned index after function return. 
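+ *
+ * Matching is on the literal hostname string and port, with no DNS
+ * resolution, so two seed entries whose hostnames resolve to the same
+ * address are still distinct seeds here.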
+ *
+ * @param host the seed hostname
+ * @param port the seed port
+ * @return index to matching seed entry if found, else -1
+ */
+static int
+mesh_seed_find_unsafe(char* host, int port)
+{
+	MESH_LOCK();
+
+	int match_index = -1;
+	cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+	int element_count = cf_vector_size(seeds);
+	for (int i = 0; i < element_count; i++) {
+		as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
+		if (strncmp(seed->seed_host_name, host, sizeof(seed->seed_host_name))
+				== 0 && seed->seed_port == port) {
+			match_index = i;
+			break;
+		}
+	}
+
+	MESH_UNLOCK();
+	return match_index;
+}
+
+/**
+ * Ensure mesh tend udata has enough space for current mesh nodes.
+ */
+static void
+mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata,
+		int mesh_node_count)
+{
+	// Ensure capacity for nodes to connect.
+	if (tend_reduce_udata->to_connect_capacity < mesh_node_count) {
+		uint32_t alloc_size = round_up_pow2(
+				mesh_node_count * sizeof(as_endpoint_list*));
+		int old_capacity = tend_reduce_udata->to_connect_capacity;
+		tend_reduce_udata->to_connect_capacity = alloc_size
+				/ sizeof(as_endpoint_list*);
+		tend_reduce_udata->to_connect = cf_realloc(
+				tend_reduce_udata->to_connect, alloc_size);
+
+		// NULL out newly allocated elements.
+		for (int i = old_capacity; i < tend_reduce_udata->to_connect_capacity;
+				i++) {
+			tend_reduce_udata->to_connect[i] = NULL;
+		}
+	}
+}
+
+/**
+ * Change the state of a mesh node. Note: memset the mesh node to zero before
+ * calling state change for the first time.
+ */
+static void
+mesh_node_status_change(as_hb_mesh_node* mesh_node,
+		as_hb_mesh_node_status new_status)
+{
+	as_hb_mesh_node_status old_status = mesh_node->status;
+	mesh_node->status = new_status;
+
+	if ((new_status != AS_HB_MESH_NODE_CHANNEL_ACTIVE
+			&& old_status == AS_HB_MESH_NODE_CHANNEL_ACTIVE)
+			|| mesh_node->last_status_updated == 0) {
+		mesh_node->inactive_since = cf_getms();
+	}
+	mesh_node->last_status_updated = cf_getms();
+}
+
+/**
+ * Close mesh listening sockets.
+ */
+static void
+mesh_listening_sockets_close()
+{
+	MESH_LOCK();
+	INFO("closing mesh heartbeat sockets");
+	cf_sockets_close(&g_hb.mode_state.mesh_state.listening_sockets);
+	DEBUG("closed mesh heartbeat sockets");
+	MESH_UNLOCK();
+}
+
+/**
+ * Populate the buffer with the mesh seed list.
+ */
+static void
+mesh_seed_host_list_get(cf_dyn_buf* db, bool tls)
+{
+	if (!hb_is_mesh()) {
+		return;
+	}
+
+	MESH_LOCK();
+
+	cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+	int element_count = cf_vector_size(seeds);
+	for (int i = 0; i < element_count; i++) {
+		as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
+		const char* info_key =
+				seed->seed_tls ?
+						"heartbeat.tls-mesh-seed-address-port=" :
+						"heartbeat.mesh-seed-address-port=";
+
+		cf_dyn_buf_append_string(db, info_key);
+		cf_dyn_buf_append_string(db, seed->seed_host_name);
+		cf_dyn_buf_append_char(db, ':');
+		cf_dyn_buf_append_uint32(db, seed->seed_port);
+		cf_dyn_buf_append_char(db, ';');
+	}
+
+	MESH_UNLOCK();
+}
+
+/**
+ * Checks if the match between a mesh seed and a mesh node is valid.
+ * The match is invalid if the mesh node's endpoint has been updated after the
+ * match was made, or if there has been no match at all.
+ */
+static bool
+mesh_seed_mesh_node_check(as_hb_mesh_seed* seed)
+{
+	if (seed->status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) {
+		return false;
+	}
+
+	as_hb_mesh_node node;
+	if (mesh_node_get(seed->mesh_nodeid, &node) != 0) {
+		// The matched node has vanished.
+		return false;
+	}
+
+	return seed->mesh_node_endpoint_change_ts == node.endpoint_change_ts;
+}
+
+/**
+ * Refresh the matching between seeds and mesh nodes and get inactive seeds.
+ * Should be invoked under a mesh lock to ensure the validity of returned
+ * pointers.
+ * @param inactive_seeds_p output vector of inactive seed pointers. Can be NULL
+ * if inactive nodes need not be returned.
+ */
+static void
+mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p)
+{
+	MESH_LOCK();
+
+	cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+	int element_count = cf_vector_size(seeds);
+	if (inactive_seeds_p) {
+		cf_vector_clear(inactive_seeds_p);
+	}
+
+	// Mark seeds that do not have a matching mesh node and transitively do not
+	// have a matching channel.
+	cf_clock now = cf_getms();
+	for (int i = 0; i < element_count; i++) {
+		as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
+		if (mesh_seed_mesh_node_check(seed)) {
+			continue;
+		}
+
+		seed->mesh_nodeid = 0;
+		seed->mesh_node_endpoint_change_ts = 0;
+
+		// The mesh node is being connected. Skip.
+		if (seed->status == AS_HB_MESH_NODE_CHANNEL_PENDING) {
+			if (seed->last_status_updated + MESH_PENDING_TIMEOUT > now) {
+				// Spare the pending seeds, since we are attempting to connect
+				// to the seed host.
+				continue;
+			}
+
+			// Flip to inactive if we have been in pending state for a long
+			// time.
+			mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
+		}
+
+		if (seed->status != AS_HB_MESH_NODE_CHANNEL_PENDING) {
+			mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
+			if (inactive_seeds_p) {
+				cf_vector_append(inactive_seeds_p, &seed);
+			}
+		}
+	}
+
+	MESH_UNLOCK();
+}
+
+/**
+ * Match input seeds to a mesh node using its endpoint address, and mark
+ * matching seeds active.
+ */
+static void
+mesh_seeds_mesh_node_match_update(cf_vector* inactive_seeds_p,
+		as_hb_mesh_node* mesh_node, cf_node mesh_nodeid)
+{
+	if (mesh_node->status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN
+			|| mesh_node->endpoint_list == NULL) {
+		return;
+	}
+
+	int element_count = cf_vector_size(inactive_seeds_p);
+	for (int i = 0; i < element_count; i++) {
+		// No null check required since we are iterating under a lock and within
+		// vector bounds.
+		as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp(
+				inactive_seeds_p, i);
+		if (as_endpoint_lists_are_overlapping(seed->resolved_endpoint_list,
+				mesh_node->endpoint_list, true)) {
+			// We found a matching mesh node for the seed - flip its status to
+			// active.
+			seed->mesh_nodeid = mesh_nodeid;
+			seed->mesh_node_endpoint_change_ts = mesh_node->endpoint_change_ts;
+			mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_ACTIVE);
+			DEBUG("seed entry %s:%d connected", seed->seed_host_name,
+					seed->seed_port);
+		}
+	}
+}
+
+/**
+ * Determines if a mesh entry should be connected to or expired and deleted.
+ */
+static int
+mesh_tend_reduce(const void* key, void* data, void* udata)
+{
+	MESH_LOCK();
+
+	int rv = CF_SHASH_OK;
+	cf_node nodeid = *(cf_node*)key;
+	as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
+	as_hb_mesh_tend_reduce_udata* tend_reduce_udata =
+			(as_hb_mesh_tend_reduce_udata*)udata;
+
+	DETAIL("tending mesh node %"PRIx64" with status %s", nodeid,
+			mesh_node_status_string(mesh_node->status));
+
+	mesh_seeds_mesh_node_match_update(tend_reduce_udata->inactive_seeds_p,
+			mesh_node, nodeid);
+
+	if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_ACTIVE) {
+		// The mesh node is connected. Skip.
+ goto Exit; + } + + cf_clock now = cf_getms(); + + if (!mesh_node->endpoint_list) { + // Will happen if node discover and disconnect happen close together. + mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_ENDPOINT_UNKNOWN); + } + + if (mesh_node->inactive_since + MESH_INACTIVE_TIMEOUT <= now) { + DEBUG("mesh forgetting node %" PRIx64" because it could not be connected since %" PRIx64, + nodeid, mesh_node->inactive_since); + rv = CF_SHASH_REDUCE_DELETE; + goto Exit; + } + + if (mesh_node->status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) { + if (mesh_node->last_status_updated + MESH_ENDPOINT_UNKNOWN_TIMEOUT + > now) { + DEBUG("mesh forgetting node %"PRIx64" ip address/port undiscovered since %"PRIu64, + nodeid, mesh_node->last_status_updated); + + rv = CF_SHASH_REDUCE_DELETE; + } + // Skip connecting with a node with unknown endpoint. + goto Exit; + } + + if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_PENDING) { + // The mesh node is being connected. Skip. + if (mesh_node->last_status_updated + MESH_PENDING_TIMEOUT > now) { + goto Exit; + } + + // Flip to inactive if we have been in pending state for a long time. + mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE); + } + + // Channel for this node is inactive. Prompt the channel sub module to + // connect to this node. + if (tend_reduce_udata->to_connect_count + >= tend_reduce_udata->to_connect_capacity) { + // New nodes found but we are out of capacity. Ultra defensive coding. + // This will never happen under the locks. + WARNING("skipping connecting to node %" PRIx64" - not enough memory allocated", + nodeid); + goto Exit; + } + + endpoint_list_copy( + &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count], + mesh_node->endpoint_list); + tend_reduce_udata->to_connect_count++; + + // Flip status to pending. + mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_PENDING); + +Exit: + if (rv == CF_SHASH_REDUCE_DELETE) { + // Clear all internal allocated memory. + mesh_node_destroy(mesh_node); + } + + MESH_UNLOCK(); + + return rv; +} + +/** + * Add inactive seeds to to_connect array. + * Should be invoked under mesh lock to prevent invalidating the array of seed + * node pointers. + * @param seed_p vector of seed pointers. + * @param tend reduce udata having the to connect endpoint list. + */ +void +mesh_seeds_inactive_add_to_connect(cf_vector* seeds_p, + as_hb_mesh_tend_reduce_udata* tend_reduce_udata) +{ + MESH_LOCK(); + int element_count = cf_vector_size(seeds_p); + for (int i = 0; i < element_count; i++) { + as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp(seeds_p, i); + if (seed->status != AS_HB_MESH_NODE_CHANNEL_INACTIVE) { + continue; + } + + // Channel for this node is inactive. Prompt the channel sub module to + // connect to this node. + if (tend_reduce_udata->to_connect_count + >= tend_reduce_udata->to_connect_capacity) { + // New nodes found but we are out of capacity. Ultra defensive + // coding. + // This will never happen under the locks. + WARNING( + "skipping connecting to %s:%d - not enough memory allocated", + seed->seed_host_name, seed->seed_port); + return; + } + + // Ensure the seed hostname is resolved. + if (mesh_seed_endpoint_list_fill(seed) != 0) { + continue; + } + + endpoint_list_copy( + &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count], + seed->resolved_endpoint_list); + tend_reduce_udata->to_connect_count++; + + // Flip status to pending. 
+		mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_PENDING);
+	}
+	MESH_UNLOCK();
+}
+
+/**
+ * Tends the mesh host list to discover and remove nodes. Should never invoke a
+ * channel call while holding a mesh lock.
+ */
+void*
+mesh_tender(void* arg)
+{
+	DETAIL("mesh tender started");
+	// Figure out which nodes need to be connected to.
+	// Collect nodes to connect to and remove dead nodes.
+	as_hb_mesh_tend_reduce_udata tend_reduce_udata = { NULL, 0, 0 };
+
+	// Vector of pointers to inactive seeds.
+	cf_vector inactive_seeds_p;
+	cf_vector_init(&inactive_seeds_p, sizeof(as_hb_mesh_seed*),
+			AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO);
+
+	cf_clock last_time = 0;
+
+	while (hb_is_mesh() && mesh_is_running()) {
+		cf_clock curr_time = cf_getms();
+
+		// Unlocked access, but this should be all right. Read the discovered
+		// flag.
+		bool nodes_discovered = g_hb.mode_state.mesh_state.nodes_discovered;
+		if ((curr_time - last_time) < MESH_TEND_INTERVAL && !nodes_discovered) {
+			// The tend interval has not been reached yet - sleep and retry.
+			usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_time +
+					MESH_TEND_INTERVAL) - curr_time) * 1000);
+			continue;
+		}
+		last_time = curr_time;
+
+		DETAIL("tending mesh list");
+
+		MESH_LOCK();
+		// Unset the discovered flag.
+		g_hb.mode_state.mesh_state.nodes_discovered = false;
+
+		// Update the list of inactive seeds.
+		mesh_seed_inactive_refresh_get_unsafe(&inactive_seeds_p);
+
+		// Make sure the udata has enough capacity.
+		int connect_count_max = cf_shash_get_size(
+				g_hb.mode_state.mesh_state.nodeid_to_mesh_node)
+				+ cf_vector_size(&inactive_seeds_p);
+		mesh_tend_udata_capacity_ensure(&tend_reduce_udata, connect_count_max);
+
+		tend_reduce_udata.to_connect_count = 0;
+		tend_reduce_udata.inactive_seeds_p = &inactive_seeds_p;
+		cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+				mesh_tend_reduce, &tend_reduce_udata);
+
+		// Add inactive seeds for connection.
+		mesh_seeds_inactive_add_to_connect(&inactive_seeds_p,
+				&tend_reduce_udata);
+
+		MESH_UNLOCK();
+
+		// Connect can be time consuming, especially in failure cases.
+		// Connect outside of the mesh lock to prevent hogging the lock.
+		if (tend_reduce_udata.to_connect_count > 0) {
+			// Try connecting the newer nodes.
+			channel_mesh_channel_establish(tend_reduce_udata.to_connect,
+					tend_reduce_udata.to_connect_count);
+		}
+
+		DETAIL("done tending mesh list");
+	}
+
+	if (tend_reduce_udata.to_connect) {
+		// Free space allocated for endpoint lists.
+		for (int i = 0; i < tend_reduce_udata.to_connect_capacity; i++) {
+			if (tend_reduce_udata.to_connect[i]) {
+				cf_free(tend_reduce_udata.to_connect[i]);
+			}
+		}
+		cf_free(tend_reduce_udata.to_connect);
+	}
+
+	cf_vector_destroy(&inactive_seeds_p);
+
+	DETAIL("mesh tender shut down");
+	return NULL;
+}
+
+/**
+ * Add or update a mesh node in the mesh node list.
+ */
+static void
+mesh_node_add_update(cf_node nodeid, as_hb_mesh_node* mesh_node)
+{
+	MESH_LOCK();
+	cf_shash_put(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid,
+			mesh_node);
+	MESH_UNLOCK();
+}
+
+/**
+ * Destroy a mesh node.
+ */
+static void
+mesh_node_destroy(as_hb_mesh_node* mesh_node)
+{
+	MESH_LOCK();
+	if (mesh_node->endpoint_list) {
+		cf_free(mesh_node->endpoint_list);
+		mesh_node->endpoint_list = NULL;
+	}
+	MESH_UNLOCK();
+}
+
+/**
+ * Endpoint list iterate function to find an endpoint matching a socket
+ * address.
+ */ +static void +mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata) +{ + cf_sock_addr endpoint_addr; + if (as_endpoint_to_sock_addr(endpoint, &endpoint_addr) != 0) { + return; + } + + as_hb_endpoint_list_addr_find_udata* endpoint_reduce_udata = + (as_hb_endpoint_list_addr_find_udata*)udata; + + if (cf_sock_addr_compare(&endpoint_addr, endpoint_reduce_udata->to_search) + == 0) { + endpoint_reduce_udata->found = true; + } +} + +/** + * Indicates if a give node is discovered. + * @param nodeid the input nodeid. + * @return true if discovered, false otherwise. + */ +static bool +mesh_node_is_discovered(cf_node nodeid) +{ + if (nodeid == config_self_nodeid_get()) { + // Assume this node knows itself. + return true; + } + + as_hb_mesh_node mesh_node; + return mesh_node_get(nodeid, &mesh_node) == 0; +} + +/** + * Indicates if a give node has a valid endpoint list. + * @param nodeid the input nodeid. + * @return true if node has valid endpoint list, false otherwise. + */ +static bool +mesh_node_endpoint_list_is_valid(cf_node nodeid) +{ + if (nodeid == config_self_nodeid_get()) { + // Assume this node knows itself. + return true; + } + + as_hb_mesh_node mesh_node; + return mesh_node_get(nodeid, &mesh_node) == 0 + && mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN + && mesh_node.endpoint_list; +} + +/** + * Get the mesh node associated with this node. + * @param nodeid the nodeid to search for. + * @param is_real_nodeid indicates if the query is for a real or fake nodeid. + * @param mesh_node the output mesh node. + * @return 0 on success -1 if there is mesh node attached. + */ +static int +mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node) +{ + int rv = -1; + + MESH_LOCK(); + if (cf_shash_get(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid, + mesh_node) == CF_SHASH_OK) { + rv = 0; + } + else { + // The node not found. + rv = -1; + } + MESH_UNLOCK(); + return rv; +} + +/** + * Handle the event when the channel reports a node as disconnected. + */ +static void +mesh_channel_on_node_disconnect(as_hb_channel_event* event) +{ + MESH_LOCK(); + + as_hb_mesh_node mesh_node; + if (mesh_node_get(event->nodeid, &mesh_node) != 0) { + // Again should not happen in practice. But not really bad. + DEBUG("unknown mesh node disconnected %" PRIx64, event->nodeid); + goto Exit; + } + + DEBUG("mesh setting node %" PRIx64" status as inactive on loss of channel", + event->nodeid); + + // Mark this node inactive and move on. Mesh tender should remove this node + // after it has been inactive for a while. + mesh_node_status_change(&mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE); + + // Update the mesh entry. + mesh_node_add_update(event->nodeid, &mesh_node); + +Exit: + MESH_UNLOCK(); +} + +/** + * Check and fix the case where we received a self incoming message probably + * because one of our non loop back interfaces was used as a seed address. + * + * @return true if this message is a self message, false otherwise. + */ +static bool +mesh_node_check_fix_self_msg(as_hb_channel_event* event) +{ + if (event->nodeid == config_self_nodeid_get()) { + // Handle self message. Will happen if the seed node address on this + // node does not match the listen / publish address. + as_endpoint_list* msg_endpoint_list; + msg_endpoint_list_get(event->msg, &msg_endpoint_list); + + MESH_LOCK(); + + // Check if this node has published an endpoint list matching self node. 
+ endpoint_list_equal_check_udata udata = { 0 }; + udata.are_equal = false; + udata.other = msg_endpoint_list; + mesh_published_endpoints_process(endpoint_list_equal_process, &udata); + + if (udata.are_equal) { + // Definitely pulse message from self node. + int self_seed_index = + mesh_seed_endpoint_list_overlapping_find_unsafe( + msg_endpoint_list); + if (self_seed_index >= 0) { + as_hb_mesh_seed* self_seed = cf_vector_getp( + &g_hb.mode_state.mesh_state.seeds, self_seed_index); + INFO("removing self seed entry host:%s port:%d", + self_seed->seed_host_name, self_seed->seed_port); + as_hb_mesh_tip_clear(self_seed->seed_host_name, + self_seed->seed_port); + } + } + MESH_UNLOCK(); + return true; + } + return false; +} + +/** + * Update mesh node status based on an incoming message. + */ +static void +mesh_node_data_update(as_hb_channel_event* event) +{ + if (mesh_node_check_fix_self_msg(event)) { + // Message from self, can be ignored. + return; + } + + MESH_LOCK(); + as_hb_mesh_node existing_mesh_node = { 0 }; + as_endpoint_list* msg_endpoint_list = NULL; + msg_endpoint_list_get(event->msg, &msg_endpoint_list); + + // Search for existing entry. + bool needs_update = mesh_node_get(event->nodeid, &existing_mesh_node) != 0; + + // Update the endpoint list to be the message endpoint list if the seed ip + // list and the published ip list differ + if (!as_endpoint_lists_are_equal(existing_mesh_node.endpoint_list, + msg_endpoint_list)) { + char endpoint_list_str1[ENDPOINT_LIST_STR_SIZE]; + endpoint_list_str1[0] = 0; + + as_endpoint_list_to_string(existing_mesh_node.endpoint_list, + endpoint_list_str1, sizeof(endpoint_list_str1)); + + char endpoint_list_str2[ENDPOINT_LIST_STR_SIZE]; + as_endpoint_list_to_string(msg_endpoint_list, endpoint_list_str2, + sizeof(endpoint_list_str2)); + + if (existing_mesh_node.endpoint_list) { + INFO("for node %"PRIx64" updating mesh endpoint address from {%s} to {%s}",event->nodeid, + endpoint_list_str1, endpoint_list_str2); + } + + // Update the endpoints. + endpoint_list_copy(&existing_mesh_node.endpoint_list, + msg_endpoint_list); + existing_mesh_node.endpoint_change_ts = as_hlc_timestamp_now(); + + needs_update = true; + } + + if (existing_mesh_node.status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) { + // Update status to active. + mesh_node_status_change(&existing_mesh_node, + AS_HB_MESH_NODE_CHANNEL_ACTIVE); + needs_update = true; + } + + if (needs_update) { + // Apply the update. + mesh_node_add_update(event->nodeid, &existing_mesh_node); + } + + MESH_UNLOCK(); +} + +/** + * Return the in memory and on wire size of an info reply array. + * @param reply the info reply. + * @param reply_count the number of replies. + * @param reply_size the wire size of the message. + * @return 0 on successful reply count computation, -1 otherwise, + */ +static int +mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count, + size_t* reply_size) +{ + // Go over reply and compute the count of replies and also validate the + // endpoint lists. + uint8_t* start_ptr = (uint8_t*)reply; + *reply_size = 0; + + for (int i = 0; i < reply_count; i++) { + as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; + *reply_size += sizeof(as_hb_mesh_info_reply); + start_ptr += sizeof(as_hb_mesh_info_reply); + + size_t endpoint_list_size = 0; + if (as_endpoint_list_sizeof(&reply_ptr->endpoint_list[0], + &endpoint_list_size)) { + // Incomplete / garbled info reply message. 
+            *reply_size = 0;
+            return -1;
+        }
+
+        *reply_size += endpoint_list_size;
+        start_ptr += endpoint_list_size;
+    }
+
+    return 0;
+}
+
+/**
+ * Send an info reply in response to an info request.
+ * @param dest the destination node to send the info reply to.
+ * @param reply the array of node ids and endpoints.
+ * @param reply_count the count of replies.
+ */
+static void
+mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply,
+        size_t reply_count)
+{
+    // Create the info reply message.
+    msg* msg = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REPLY);
+
+    // Set the reply.
+    msg_info_reply_set(msg, reply, reply_count);
+
+    DEBUG("sending info reply to node %" PRIx64, dest);
+
+    // Send the info reply.
+    if (channel_msg_unicast(dest, msg) != 0) {
+        TICKER_WARNING("error sending info reply message to node %" PRIx64,
+                dest);
+    }
+
+    hb_msg_return(msg);
+}
+
+/**
+ * Initialize an info msg buffer of the given type.
+ */
+static msg*
+mesh_info_msg_init(as_hb_msg_type msg_type)
+{
+    msg* msg = hb_msg_get();
+    msg_src_fields_fill(msg);
+    msg_type_set(msg, msg_type);
+    return msg;
+}
+
+/**
+ * Send an info request for all undiscovered nodes.
+ * @param in_msg the incoming message that triggered discovery (unused).
+ * @param dest the destination node to send the discover message to.
+ * @param to_discover array of node ids to discover.
+ * @param to_discover_count the count of nodes in the array.
+ */
+static void
+mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover,
+        size_t to_discover_count)
+{
+    // Create the info request message.
+    msg* info_req = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REQUEST);
+
+    // Set the list of nodes to discover.
+    msg_node_list_set(info_req, AS_HB_MSG_INFO_REQUEST, to_discover,
+            to_discover_count);
+
+    DEBUG("sending info request to node %" PRIx64, dest);
+
+    // Send the info request.
+    if (channel_msg_unicast(dest, info_req) != 0) {
+        TICKER_WARNING("error sending info request message to node %" PRIx64,
+                dest);
+    }
+    hb_msg_return(info_req);
+}
+
+/**
+ * Handle an incoming pulse message to discover new neighbours.
+ */
+static void
+mesh_channel_on_pulse(msg* msg)
+{
+    cf_node* adj_list;
+    size_t adj_length;
+
+    cf_node source;
+
+    // Channel has validated the source. Don't bother checking here.
+    msg_nodeid_get(msg, &source);
+    if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) {
+        // Adjacency list absent.
+        WARNING("received message from %" PRIx64" without adjacency list",
+                source);
+        return;
+    }
+
+    cf_node to_discover[adj_length];
+    size_t num_to_discover = 0;
+
+    // TODO: Track already queried nodes so that we do not retry immediately.
+    // Will need a separate state, pending query.
+    MESH_LOCK();
+
+    // Try and discover new nodes from this message's adjacency list.
+    for (int i = 0; i < adj_length; i++) {
+        if (!mesh_node_is_discovered(adj_list[i])) {
+            DEBUG("discovered new mesh node %" PRIx64, adj_list[i]);
+
+            as_hb_mesh_node new_node;
+            memset(&new_node, 0, sizeof(new_node));
+            mesh_node_status_change(&new_node,
+                    AS_HB_MESH_NODE_ENDPOINT_UNKNOWN);
+
+            // Add as a new node.
+            mesh_node_add_update(adj_list[i], &new_node);
+        }
+
+        if (!mesh_node_endpoint_list_is_valid(adj_list[i])) {
+            to_discover[num_to_discover++] = adj_list[i];
+        }
+    }
+
+    MESH_UNLOCK();
+
+    // Discover these nodes outside the lock.
+    if (num_to_discover) {
+        mesh_nodes_send_info_request(msg, source, to_discover, num_to_discover);
+    }
+}
+
+/**
+ * Handle an incoming info request message.
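+ *
+ * An illustrative request / reply exchange (node-id and endpoint are
+ * hypothetical):
+ *
+ *   A -> B : INFO_REQUEST [ bb9040011ac4202 ]
+ *   B -> A : INFO_REPLY   [ bb9040011ac4202 -> {10.0.0.2:3002} ]
+ *
+ * after which A can add the node to its mesh node hash and heartbeat it
+ * directly.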
+ */
+static void
+mesh_channel_on_info_request(msg* msg)
+{
+    cf_node* query_nodeids;
+    size_t query_count;
+
+    cf_node source;
+    msg_nodeid_get(msg, &source);
+
+    if (msg_node_list_get(msg, AS_HB_MSG_INFO_REQUEST, &query_nodeids,
+            &query_count) != 0) {
+        TICKER_WARNING("got an info request without query nodes from %" PRIx64,
+                source);
+        return;
+    }
+
+    MESH_LOCK();
+
+    // Compute the entire response size.
+    size_t reply_size = 0;
+
+    for (int i = 0; i < query_count; i++) {
+        as_hb_mesh_node mesh_node;
+
+        if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) {
+            if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN
+                    && mesh_node.endpoint_list) {
+                size_t endpoint_list_size = 0;
+                as_endpoint_list_sizeof(mesh_node.endpoint_list,
+                        &endpoint_list_size);
+                reply_size += sizeof(as_hb_mesh_info_reply)
+                        + endpoint_list_size;
+            }
+        }
+    }
+
+    as_hb_mesh_info_reply* replies = alloca(reply_size);
+    uint8_t* reply_ptr = (uint8_t*)replies;
+    size_t reply_count = 0;
+
+    DEBUG("received info request from node %" PRIx64, source);
+    DEBUG("preparing a reply for %zu requests", query_count);
+
+    for (int i = 0; i < query_count; i++) {
+        as_hb_mesh_node mesh_node;
+
+        DEBUG("mesh received info request for node %" PRIx64, query_nodeids[i]);
+
+        if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) {
+            if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN
+                    && mesh_node.endpoint_list) {
+                as_hb_mesh_info_reply* reply = (as_hb_mesh_info_reply*)reply_ptr;
+
+                reply->nodeid = query_nodeids[i];
+
+                size_t endpoint_list_size = 0;
+                as_endpoint_list_sizeof(mesh_node.endpoint_list,
+                        &endpoint_list_size);
+
+                memcpy(&reply->endpoint_list[0], mesh_node.endpoint_list,
+                        endpoint_list_size);
+
+                reply_ptr += sizeof(as_hb_mesh_info_reply) + endpoint_list_size;
+
+                reply_count++;
+            }
+        }
+    }
+
+    MESH_UNLOCK();
+
+    // Send the reply.
+    if (reply_count > 0) {
+        mesh_nodes_send_info_reply(source, replies, reply_count);
+    }
+}
+
+/**
+ * Handle an incoming info reply.
+ */
+static void
+mesh_channel_on_info_reply(msg* msg)
+{
+    as_hb_mesh_info_reply* reply = NULL;
+    size_t reply_count = 0;
+    cf_node source = 0;
+    msg_nodeid_get(msg, &source);
+    if (msg_info_reply_get(msg, &reply, &reply_count) != 0
+            || reply_count == 0) {
+        TICKER_WARNING("got an info reply without reply nodes from %" PRIx64,
+                source);
+        return;
+    }
+
+    DEBUG("received info reply from node %" PRIx64, source);
+
+    MESH_LOCK();
+
+    uint8_t* start_ptr = (uint8_t*)reply;
+    for (int i = 0; i < reply_count; i++) {
+        as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr;
+        as_hb_mesh_node existing_node;
+        if (mesh_node_get(reply_ptr->nodeid, &existing_node) != 0) {
+            // Somehow the node was removed from the mesh hash. Maybe a timeout.
+            goto NextReply;
+        }
+
+        // Update the state of this node.
+        if (existing_node.status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) {
+            // Update the endpoint.
+            endpoint_list_copy(&existing_node.endpoint_list,
+                    reply_ptr->endpoint_list);
+
+            mesh_node_status_change(&existing_node,
+                    AS_HB_MESH_NODE_CHANNEL_INACTIVE);
+            // Set the discovered flag.
+            g_hb.mode_state.mesh_state.nodes_discovered = true;
+
+            char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+            as_endpoint_list_to_string(existing_node.endpoint_list,
+                    endpoint_list_str, sizeof(endpoint_list_str));
+
+            DEBUG("for node %" PRIx64" discovered endpoints {%s}",
+                    reply_ptr->nodeid, endpoint_list_str);
+
+            // Update the hash.
+            mesh_node_add_update(reply_ptr->nodeid, &existing_node);
+        }
+
+    NextReply:
+        start_ptr += sizeof(as_hb_mesh_info_reply);
+        size_t endpoint_list_size = 0;
+        as_endpoint_list_sizeof(reply_ptr->endpoint_list, &endpoint_list_size);
+        start_ptr += endpoint_list_size;
+    }
+
+    MESH_UNLOCK();
+}
+
+/**
+ * Handle the case when a message is received on a channel.
+ */
+static void
+mesh_channel_on_msg_rcvd(as_hb_channel_event* event)
+{
+    // Update the mesh node status.
+    mesh_node_data_update(event);
+
+    as_hb_msg_type msg_type;
+    msg_type_get(event->msg, &msg_type);
+
+    switch (msg_type) {
+    case AS_HB_MSG_TYPE_PULSE: // A pulse message. Try and discover new nodes.
+        mesh_channel_on_pulse(event->msg);
+        break;
+    case AS_HB_MSG_TYPE_INFO_REQUEST: // Send back an info reply.
+        mesh_channel_on_info_request(event->msg);
+        break;
+    case AS_HB_MSG_TYPE_INFO_REPLY: // Update the list of mesh nodes, if this
+                                    // is an undiscovered node.
+        mesh_channel_on_info_reply(event->msg);
+        break;
+    default:
+        WARNING("received a message of unknown type - ignoring");
+        // Ignore other messages.
+        break;
+    }
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Mesh public API
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Add a host / port to the mesh seed list.
+ * @param host the seed node hostname / ip address.
+ * @param port the seed node port.
+ * @param tls indicates TLS support.
+ * @return 0 on success, -1 on failure.
+ */
+static int
+mesh_tip(char* host, int port, bool tls)
+{
+    MESH_LOCK();
+
+    int rv = -1;
+    as_hb_mesh_seed new_seed = { { 0 } };
+
+    // Check validity of the hostname and port.
+    int hostname_len = strnlen(host, HOST_NAME_MAX);
+    if (hostname_len <= 0 || hostname_len == HOST_NAME_MAX) {
+        // Invalid hostname.
+        WARNING("mesh seed host %s is empty or exceeds the allowed %d characters",
+                host, HOST_NAME_MAX);
+        goto Exit;
+    }
+    if (port <= 0 || port > USHRT_MAX) {
+        WARNING("mesh seed port %s:%d is invalid - should be between 1 and %d",
+                host, port, USHRT_MAX);
+        goto Exit;
+    }
+
+    // Check if we already have a match for this seed.
+    if (mesh_seed_find_unsafe(host, port) >= 0) {
+        WARNING("mesh seed host %s:%d already in seed list", host, port);
+        goto Exit;
+    }
+
+    mesh_seed_status_change(&new_seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
+    strncpy(new_seed.seed_host_name, host, sizeof(new_seed.seed_host_name));
+    new_seed.seed_port = port;
+    new_seed.seed_tls = tls;
+
+    cf_vector_append(&g_hb.mode_state.mesh_state.seeds, &new_seed);
+
+    INFO("added new mesh seed %s:%d", host, port);
+    rv = 0;
+
+Exit:
+    if (rv != 0) {
+        // Ensure endpoint allocated space is freed.
+        mesh_seed_destroy(&new_seed);
+    }
+
+    MESH_UNLOCK();
+    return rv;
+}
+
+/**
+ * Handle a channel event on an endpoint.
+ */
+static void
+mesh_channel_event_process(as_hb_channel_event* event)
+{
+    // Skip if we are not in mesh mode.
+    if (!hb_is_mesh()) {
+        return;
+    }
+
+    MESH_LOCK();
+    switch (event->type) {
+    case AS_HB_CHANNEL_NODE_CONNECTED:
+        // Ignore this event. The subsequent message event will be used for
+        // determining mesh node active status.
+        break;
+    case AS_HB_CHANNEL_NODE_DISCONNECTED:
+        mesh_channel_on_node_disconnect(event);
+        break;
+    case AS_HB_CHANNEL_MSG_RECEIVED:
+        mesh_channel_on_msg_rcvd(event);
+        break;
+    case AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH: // Ignore this event. HB module
+                                              // will handle it.
+        break;
+    }
+
+    MESH_UNLOCK();
+}
+
+/**
+ * Initialize mesh mode data structures.
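+ *
+ * Creates the nodeid -> mesh node hash and the seed vector, each sized up
+ * front for AS_HB_CLUSTER_MAX_SIZE_SOFT entries.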
+ */
+static void
+mesh_init()
+{
+    if (!hb_is_mesh()) {
+        return;
+    }
+
+    MESH_LOCK();
+
+    g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED;
+
+    // Initialize the mesh node hash.
+    g_hb.mode_state.mesh_state.nodeid_to_mesh_node = cf_shash_create(
+            cf_nodeid_shash_fn, sizeof(cf_node), sizeof(as_hb_mesh_node),
+            AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);
+
+    // Initialize the seed list.
+    cf_vector_init(&g_hb.mode_state.mesh_state.seeds, sizeof(as_hb_mesh_seed),
+            AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO);
+
+    MESH_UNLOCK();
+}
+
+/**
+ * Reduce function to destroy mesh node data and delete all entries from the
+ * mesh node hash.
+ */
+static int
+mesh_free_node_data_reduce(const void* key, void* data, void* udata)
+{
+    as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
+    mesh_node_destroy(mesh_node);
+    return CF_SHASH_REDUCE_DELETE;
+}
+
+/**
+ * Reduce function to remove a host / port from the mesh node hash.
+ */
+static int
+mesh_tip_clear_reduce(const void* key, void* data, void* udata)
+{
+    int rv = CF_SHASH_OK;
+
+    MESH_LOCK();
+
+    cf_node nodeid = *(cf_node*)key;
+    as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
+    as_hb_mesh_tip_clear_udata* tip_clear_udata =
+            (as_hb_mesh_tip_clear_udata*)udata;
+
+    if (tip_clear_udata == NULL || nodeid == tip_clear_udata->nodeid) {
+        // Handling tip clear all or clear of a specific node.
+        rv = CF_SHASH_REDUCE_DELETE;
+        goto Exit;
+    }
+
+    // See if the address matches any one of the endpoints in the node's
+    // endpoint list.
+    cf_ip_addr addrs[CF_SOCK_CFG_MAX];
+    uint32_t n_addrs = CF_SOCK_CFG_MAX;
+
+    if (cf_ip_addr_from_string_multi(tip_clear_udata->host, addrs, &n_addrs)
+            == 0) {
+        for (int i = 0; i < n_addrs; i++) {
+            cf_sock_addr sock_addr;
+            cf_ip_addr_copy(&addrs[i], &sock_addr.addr);
+            sock_addr.port = tip_clear_udata->port;
+            as_hb_endpoint_list_addr_find_udata udata;
+            udata.found = false;
+            udata.to_search = &sock_addr;
+
+            as_endpoint_list_iterate(mesh_node->endpoint_list,
+                    mesh_endpoint_addr_find_iterate, &udata);
+
+            if (udata.found) {
+                rv = CF_SHASH_REDUCE_DELETE;
+                goto Exit;
+            }
+        }
+
+        // Not found by endpoint.
+        rv = CF_SHASH_OK;
+    }
+
+Exit:
+    if (rv == CF_SHASH_REDUCE_DELETE) {
+        char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+        as_endpoint_list_to_string(mesh_node->endpoint_list, endpoint_list_str,
+                sizeof(endpoint_list_str));
+
+        // Find all seed entries matching this mesh entry and delete them.
+        cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+        int element_count = cf_vector_size(seeds);
+        for (int i = 0; i < element_count; i++) {
+            as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
+            if (seed->mesh_nodeid != nodeid) {
+                // Does not match this mesh entry.
+                continue;
+            }
+            if (mesh_seed_delete_unsafe(i) == 0) {
+                i--;
+                element_count--;
+            }
+            else {
+                // Should not happen in practice.
+                CRASH("error deleting mesh seed entry %s:%d",
+                        seed->seed_host_name, seed->seed_port);
+            }
+        }
+
+        if (channel_node_disconnect(nodeid) != 0) {
+            WARNING("unable to disconnect the channel to node %" PRIx64,
+                    nodeid);
+        }
+
+        mesh_node_destroy(mesh_node);
+        if (tip_clear_udata != NULL) {
+            tip_clear_udata->entry_deleted = true;
+        }
+    }
+
+    MESH_UNLOCK();
+    return rv;
+}
+
+/**
+ * Output heartbeat endpoints of peers.
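+ *
+ * Emits one entry per peer in the form (node-id and endpoint list info are
+ * illustrative):
+ *
+ *   heartbeat.peer=node-id=bb9040011ac4202:<endpoint list info>;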
+ */
+static int
+mesh_peer_endpoint_reduce(const void* key, void* data, void* udata)
+{
+    int rv = CF_SHASH_OK;
+    MESH_LOCK();
+    cf_node nodeid = *(cf_node*)key;
+    as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
+    cf_dyn_buf* db = (cf_dyn_buf*)udata;
+
+    cf_dyn_buf_append_string(db, "heartbeat.peer=");
+    cf_dyn_buf_append_string(db, "node-id=");
+    cf_dyn_buf_append_uint64_x(db, nodeid);
+    cf_dyn_buf_append_string(db, ":");
+    as_endpoint_list_info(mesh_node->endpoint_list, db);
+    cf_dyn_buf_append_string(db, ";");
+
+    MESH_UNLOCK();
+    return rv;
+}
+
+/**
+ * Free the mesh mode data structures.
+ */
+static void
+mesh_clear()
+{
+    if (!mesh_is_stopped()) {
+        WARNING(
+                "attempted to clear the mesh module without stopping it - skipping mesh clear");
+        return;
+    }
+
+    MESH_LOCK();
+    // Delete the elements from the map.
+    cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+            mesh_free_node_data_reduce, NULL);
+
+    // Reset the seeds to the inactive state.
+    cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+    int element_count = cf_vector_size(seeds);
+    for (int i = 0; i < element_count; i++) {
+        as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
+        seed->mesh_nodeid = 0;
+        mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
+    }
+
+    MESH_UNLOCK();
+}
+
+/**
+ * Open mesh listening sockets. Crashes if the open fails.
+ */
+static void
+mesh_listening_sockets_open()
+{
+    MESH_LOCK();
+
+    const cf_serv_cfg* bind_cfg = config_bind_cfg_get();
+
+    // Compute min MTU across all binding interfaces.
+    int min_mtu = -1;
+    char addr_string[HOST_NAME_MAX];
+    for (uint32_t i = 0; i < bind_cfg->n_cfgs; ++i) {
+        const cf_sock_cfg* sock_cfg = &bind_cfg->cfgs[i];
+        cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_string,
+                sizeof(addr_string));
+
+        INFO("initializing mesh heartbeat socket: %s:%d", addr_string,
+                sock_cfg->port);
+
+        int bind_interface_mtu =
+                !cf_ip_addr_is_any(&sock_cfg->addr) ?
+                        cf_inter_mtu(&sock_cfg->addr) : cf_inter_min_mtu();
+
+        if (min_mtu == -1 || min_mtu > bind_interface_mtu) {
+            min_mtu = bind_interface_mtu;
+        }
+    }
+
+    if (cf_socket_init_server((cf_serv_cfg*)bind_cfg,
+            &g_hb.mode_state.mesh_state.listening_sockets) != 0) {
+        CRASH("couldn't initialize unicast heartbeat sockets");
+    }
+
+    for (uint32_t i = 0;
+            i < g_hb.mode_state.mesh_state.listening_sockets.n_socks; ++i) {
+        DEBUG("opened mesh heartbeat socket: %d",
+                CSFD(&g_hb.mode_state.mesh_state.listening_sockets.socks[i]));
+    }
+
+    if (min_mtu == -1) {
+        WARNING("error getting the min MTU - using the default %d",
+                DEFAULT_MIN_MTU);
+        min_mtu = DEFAULT_MIN_MTU;
+    }
+
+    g_hb.mode_state.mesh_state.min_mtu = min_mtu;
+    INFO("mtu of the network is %d", min_mtu);
+
+    MESH_UNLOCK();
+}
+
+/**
+ * Start mesh threads.
+ */
+static void
+mesh_start()
+{
+    if (!hb_is_mesh()) {
+        return;
+    }
+
+    MESH_LOCK();
+
+    mesh_listening_sockets_open();
+    channel_mesh_listening_socks_register(
+            &g_hb.mode_state.mesh_state.listening_sockets);
+
+    g_hb.mode_state.mesh_state.status = AS_HB_STATUS_RUNNING;
+
+    // Start the mesh tender thread.
+    if (pthread_create(&g_hb.mode_state.mesh_state.mesh_tender_tid, 0,
+            mesh_tender, &g_hb) != 0) {
+        CRASH("could not create mesh tender thread: %s", cf_strerror(errno));
+    }
+
+    MESH_UNLOCK();
+}
+
+/**
+ * Stop the mesh module.
+ */
+static void
+mesh_stop()
+{
+    if (!mesh_is_running()) {
+        WARNING("mesh is already stopped");
+        return;
+    }
+
+    // Unguarded state, but this should be OK.
+    g_hb.mode_state.mesh_state.status = AS_HB_STATUS_SHUTTING_DOWN;
+
+    // Wait for the mesh tender thread to finish.
+    pthread_join(g_hb.mode_state.mesh_state.mesh_tender_tid, NULL);
+
+    MESH_LOCK();
+
+    channel_mesh_listening_socks_deregister(
+            &g_hb.mode_state.mesh_state.listening_sockets);
+
+    mesh_listening_sockets_close();
+
+    g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED;
+
+    // Clear allocated state if any.
+    if (g_hb.mode_state.mesh_state.published_endpoint_list) {
+        cf_free(g_hb.mode_state.mesh_state.published_endpoint_list);
+        g_hb.mode_state.mesh_state.published_endpoint_list = NULL;
+    }
+
+    MESH_UNLOCK();
+}
+
+/**
+ * Reduce function to dump mesh node info to the log file.
+ */
+static int
+mesh_dump_reduce(const void* key, void* data, void* udata)
+{
+    cf_node nodeid = *(cf_node*)key;
+    as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
+
+    char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+    as_endpoint_list_to_string(mesh_node->endpoint_list, endpoint_list_str,
+            sizeof(endpoint_list_str));
+
+    INFO("\tHB Mesh Node: node-id %" PRIx64" status %s last-updated %" PRIu64 " endpoints {%s}",
+            nodeid, mesh_node_status_string(mesh_node->status),
+            mesh_node->last_status_updated, endpoint_list_str);
+
+    return CF_SHASH_OK;
+}
+
+/**
+ * Dump mesh state to logs.
+ * @param verbose enables / disables verbose logging.
+ */
+static void
+mesh_dump(bool verbose)
+{
+    if (!hb_is_mesh() || !verbose) {
+        return;
+    }
+
+    MESH_LOCK();
+    cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+    int element_count = cf_vector_size(seeds);
+    INFO("HB Seed Count %d", element_count);
+    for (int i = 0; i < element_count; i++) {
+        as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
+        char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+        as_endpoint_list_to_string(seed->resolved_endpoint_list,
+                endpoint_list_str, sizeof(endpoint_list_str));
+        INFO("\tHB Mesh Seed: host %s port %d node-id %" PRIx64" status %s endpoints {%s}",
+                seed->seed_host_name, seed->seed_port, seed->mesh_nodeid,
+                mesh_node_status_string(seed->status), endpoint_list_str);
+    }
+
+    INFO("HB Mesh Nodes Count %d",
+            cf_shash_get_size(g_hb.mode_state.mesh_state.nodeid_to_mesh_node));
+    cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+            mesh_dump_reduce, NULL);
+    MESH_UNLOCK();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Multicast sub module.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Initialize multicast data structures.
+ */
+static void
+multicast_init()
+{
+}
+
+/**
+ * Clear multicast data structures.
+ */
+static void
+multicast_clear()
+{
+    // Free multicast data structures. Nothing to do.
+}
+
+/**
+ * Open multicast sockets. Crashes if the open fails.
+ */
+static void
+multicast_listening_sockets_open()
+{
+    MULTICAST_LOCK();
+
+    const cf_mserv_cfg* mserv_cfg = config_multicast_group_cfg_get();
+
+    // Compute min MTU across all binding interfaces.
+    int min_mtu = -1;
+    char addr_string[HOST_NAME_MAX];
+    for (uint32_t i = 0; i < mserv_cfg->n_cfgs; ++i) {
+        const cf_msock_cfg* sock_cfg = &mserv_cfg->cfgs[i];
+        cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_string,
+                sizeof(addr_string));
+
+        INFO("initializing multicast heartbeat socket: %s:%d", addr_string,
+                sock_cfg->port);
+
+        int bind_interface_mtu =
+                !cf_ip_addr_is_any(&sock_cfg->if_addr) ?
+                        cf_inter_mtu(&sock_cfg->if_addr) : cf_inter_min_mtu();
+
+        if (min_mtu == -1 || min_mtu > bind_interface_mtu) {
+            min_mtu = bind_interface_mtu;
+        }
+    }
+
+    if (cf_socket_mcast_init((cf_mserv_cfg*)mserv_cfg,
+            &g_hb.mode_state.multicast_state.listening_sockets) != 0) {
+        CRASH("couldn't initialize multicast heartbeat socket: %s",
+                cf_strerror(errno));
+    }
+
+    for (uint32_t i = 0;
+            i < g_hb.mode_state.multicast_state.listening_sockets.n_socks;
+            ++i) {
+        DEBUG("opened multicast socket %d",
+                CSFD(&g_hb.mode_state.multicast_state.listening_sockets.socks[i]));
+    }
+
+    if (min_mtu == -1) {
+        WARNING("error getting the min MTU - using the default %d",
+                DEFAULT_MIN_MTU);
+        min_mtu = DEFAULT_MIN_MTU;
+    }
+
+    g_hb.mode_state.multicast_state.min_mtu = min_mtu;
+
+    INFO("mtu of the network is %d", min_mtu);
+    MULTICAST_UNLOCK();
+}
+
+/**
+ * Start multicast module.
+ */
+static void
+multicast_start()
+{
+    MULTICAST_LOCK();
+    multicast_listening_sockets_open();
+    channel_multicast_listening_socks_register(
+            &g_hb.mode_state.multicast_state.listening_sockets);
+    MULTICAST_UNLOCK();
+}
+
+/**
+ * Close multicast listening sockets.
+ */
+static void
+multicast_listening_sockets_close()
+{
+    MULTICAST_LOCK();
+    INFO("closing multicast heartbeat sockets");
+    cf_sockets_close(&g_hb.mode_state.multicast_state.listening_sockets);
+    DEBUG("closed multicast heartbeat sockets");
+    MULTICAST_UNLOCK();
+}
+
+/**
+ * Stop multicast.
+ */
+static void
+multicast_stop()
+{
+    MULTICAST_LOCK();
+    channel_multicast_listening_socks_deregister(
+            &g_hb.mode_state.multicast_state.listening_sockets);
+    multicast_listening_sockets_close();
+
+    MULTICAST_UNLOCK();
+}
+
+/**
+ * Dump multicast state to logs.
+ * @param verbose enables / disables verbose logging.
+ */
+static void
+multicast_dump(bool verbose)
+{
+    if (hb_is_mesh()) {
+        return;
+    }
+
+    // Mode is multicast.
+    INFO("HB Multicast TTL: %d", config_multicast_ttl_get());
+}
+
+/**
+ * Find the maximum cluster size based on the MTU of the network.
+ *
+ * num_nodes is computed so that
+ *
+ *   MTU = compression_factor * (fixed_size + num_nodes * per_node_size)
+ *
+ * where,
+ *
+ *   fixed_size = udp_header_size + msg_header_size
+ *           + sigma(per_plugin_fixed_size)
+ *   per_node_size = sigma(per_plugin_per_node_size).
+ */
+static int
+multicast_supported_cluster_size_get()
+{
+    // Calculate the fixed size for a UDP packet and the message header.
+    size_t msg_fixed_size = msg_get_template_fixed_sz(g_hb_msg_template,
+            sizeof(g_hb_msg_template) / sizeof(msg_template));
+
+    size_t msg_plugin_per_node_size = 0;
+
+    for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
+        // Add the plugin specific fixed size.
+        msg_fixed_size += g_hb.plugins[i].wire_size_fixed;
+        // Add the plugin specific per node size.
+        msg_plugin_per_node_size += g_hb.plugins[i].wire_size_per_node;
+    }
+
+    // TODO: Compute the max cluster size using max storage per node in cluster
+    // and the min mtu.
+    int supported_cluster_size = MAX(1,
+            (((hb_mtu() - UDP_HEADER_SIZE_MAX) * MSG_COMPRESSION_RATIO)
+                    - msg_fixed_size) / msg_plugin_per_node_size);
+
+    return supported_cluster_size;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Heartbeat main sub module.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Is Main module initialized.
+ */
+static bool
+hb_is_initialized()
+{
+    HB_LOCK();
+    bool retval = (g_hb.status != AS_HB_STATUS_UNINITIALIZED) ?
true : false; + HB_UNLOCK(); + return retval; +} + +/** + * Is Main module running. + */ +static bool +hb_is_running() +{ + HB_LOCK(); + bool retval = (g_hb.status == AS_HB_STATUS_RUNNING) ? true : false; + HB_UNLOCK(); + return retval; +} + +/** + * Is Main module stopped. + */ +static bool +hb_is_stopped() +{ + HB_LOCK(); + bool retval = (g_hb.status == AS_HB_STATUS_STOPPED) ? true : false; + HB_UNLOCK(); + return retval; +} + +/** + * Initialize the mode specific data structures. + */ +static void +hb_mode_init() +{ + if (hb_is_mesh()) { + mesh_init(); + } + else { + multicast_init(); + } +} + +/** + * Start mode specific threads.. + */ +static void +hb_mode_start() +{ + if (hb_is_mesh()) { + mesh_start(); + } + else { + multicast_start(); + } +} + +/** + * The MTU for underlying network. + */ +static int +hb_mtu() +{ + int __mtu = config_override_mtu_get(); + if (!__mtu) { + __mtu = hb_is_mesh() ? + g_hb.mode_state.mesh_state.min_mtu : + g_hb.mode_state.multicast_state.min_mtu; + __mtu = __mtu > 0 ? __mtu : DEFAULT_MIN_MTU; + } + return __mtu; +} + +/** + * Initialize the template to be used for heartbeat messages. + */ +static void +hb_msg_init() +{ + // Register fabric heartbeat msg type with no processing function: + // This permits getting / putting heartbeat msgs to be moderated via an idle + // msg queue. + as_fabric_register_msg_fn(M_TYPE_HEARTBEAT, g_hb_msg_template, + sizeof(g_hb_msg_template), + AS_HB_MSG_SCRATCH_SIZE, 0, 0); +} + +/** + * Get hold of current heartbeat protocol version + */ +static uint32_t +hb_protocol_identifier_get() +{ + return HB_PROTOCOL_V3_IDENTIFIER; +} + +/** + * Node depart event time estimate. Assumes node departed timeout milliseconds + * before the detection. + */ +static cf_clock +hb_node_depart_time(cf_clock detect_time) +{ + return (detect_time - HB_NODE_TIMEOUT()); +} + +/** + * Indicates if mode is mesh. + */ +static bool +hb_is_mesh() +{ + return (config_mode_get() == AS_HB_MODE_MESH); +} + +/** + * Publish an event to subsystems listening to heart beat events. + */ +static void +hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes, + int node_count) +{ + // Lock-less because the queue is thread safe and we do not use heartbeat + // state here. + for (int i = 0; i < node_count; i++) { + as_hb_event_node event; + event.nodeid = nodes[i]; + event.event_detected_time = cf_getms(); + + switch (event_type) { + case AS_HB_INTERNAL_NODE_ARRIVE: + event.evt = AS_HB_NODE_ARRIVE; + event.event_time = event.event_detected_time; + break; + case AS_HB_INTERNAL_NODE_DEPART: + event.evt = AS_HB_NODE_DEPART; + event.event_time = hb_node_depart_time(event.event_detected_time); + break; + case AS_HB_INTERNAL_NODE_EVICT: + event.evt = AS_HB_NODE_DEPART; + event.event_time = event.event_detected_time; + break; + case AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED: + event.evt = AS_HB_NODE_ADJACENCY_CHANGED; + event.event_time = event.event_detected_time; + break; + } + + DEBUG("queuing event of type %d for node %" PRIx64, event.evt, + event.nodeid); + cf_queue_push(&g_hb_event_listeners.external_events_queue, &event); + } +} + +/** + * Publish all pending events. Should be invoked outside hb locks. + */ +static void +hb_event_publish_pending() +{ + EXTERNAL_EVENT_PUBLISH_LOCK(); + int num_events = cf_queue_sz(&g_hb_event_listeners.external_events_queue); + if (num_events <= 0) { + // Events need not be published. 
+        goto Exit;
+    }
+
+    as_hb_event_node events[AS_HB_CLUSTER_MAX_SIZE_SOFT];
+    int published_count = 0;
+    while (published_count < AS_HB_CLUSTER_MAX_SIZE_SOFT
+            && cf_queue_pop(&g_hb_event_listeners.external_events_queue,
+                    &events[published_count], 0) == CF_QUEUE_OK) {
+        published_count++;
+    }
+
+    if (published_count) {
+        // Assuming that event listeners are not registered after system init,
+        // no locks here.
+        DEBUG("publishing %d heartbeat events", published_count);
+        for (int i = 0; i < g_hb_event_listeners.event_listener_count; i++) {
+            (g_hb_event_listeners.event_listeners[i].event_callback)(
+                    published_count, events,
+                    g_hb_event_listeners.event_listeners[i].udata);
+        }
+    }
+
+Exit:
+    EXTERNAL_EVENT_PUBLISH_UNLOCK();
+}
+
+/**
+ * Delete the heap allocated data while iterating through the hash and deleting
+ * entries.
+ */
+static int
+hb_adjacency_free_data_reduce(const void* key, void* data, void* udata)
+{
+    as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
+
+    const cf_node* nodeid = (const cf_node*)key;
+
+    hb_adjacent_node_destroy(adjacent_node);
+
+    // Queue a depart event for this node.
+    hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, nodeid, 1);
+
+    return CF_SHASH_REDUCE_DELETE;
+}
+
+/**
+ * Clear the heartbeat data structures.
+ */
+static void
+hb_clear()
+{
+    if (!hb_is_stopped()) {
+        WARNING("attempted to clear heartbeat module without stopping it");
+        return;
+    }
+
+    HB_LOCK();
+
+    // Free the plugin data and delete adjacent nodes.
+    cf_shash_reduce(g_hb.adjacency, hb_adjacency_free_data_reduce, NULL);
+    cf_shash_reduce(g_hb.on_probation, hb_adjacency_free_data_reduce, NULL);
+    hb_adjacent_node_destroy(&g_hb.self_node);
+    memset(&g_hb.self_node, 0, sizeof(g_hb.self_node));
+
+    HB_UNLOCK();
+
+    // Publish node departed events for the removed nodes.
+    hb_event_publish_pending();
+
+    // Clear the mode module.
+    if (hb_is_mesh()) {
+        mesh_clear();
+    }
+    else {
+        multicast_clear();
+    }
+
+    channel_clear();
+}
+
+/**
+ * Reduce function to get hold of the current adjacency list.
+ */
+static int
+hb_adjacency_iterate_reduce(const void* key, void* data, void* udata)
+{
+    const cf_node* nodeid = (const cf_node*)key;
+    as_hb_adjacency_reduce_udata* adjacency_reduce_udata =
+            (as_hb_adjacency_reduce_udata*)udata;
+
+    adjacency_reduce_udata->adj_list[adjacency_reduce_udata->adj_count] =
+            *nodeid;
+    adjacency_reduce_udata->adj_count++;
+
+    return CF_SHASH_OK;
+}
+
+/**
+ * Plugin function to set the heartbeat adjacency list into a pulse message.
+ */
+static void
+hb_plugin_set_fn(msg* msg)
+{
+    HB_LOCK();
+
+    cf_node adj_list[cf_shash_get_size(g_hb.adjacency)];
+    as_hb_adjacency_reduce_udata adjacency_reduce_udata = { adj_list, 0 };
+
+    cf_shash_reduce(g_hb.adjacency, hb_adjacency_iterate_reduce,
+            &adjacency_reduce_udata);
+
+    HB_UNLOCK();
+
+    // Populate the adjacency list.
+    msg_adjacency_set(msg, adj_list, adjacency_reduce_udata.adj_count);
+
+    // Set the cluster name.
+    char cluster_name[AS_CLUSTER_NAME_SZ];
+    as_config_cluster_name_get(cluster_name);
+
+    if (cluster_name[0] != '\0'
+            && msg_set_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name,
+                    MSG_SET_COPY) != 0) {
+        CRASH("error setting cluster name on msg");
+    }
+}
+
+/**
+ * Plugin function that parses the adjacency list out of a heartbeat pulse
+ * message.
+ */
+static void
+hb_plugin_parse_data_fn(msg* msg, cf_node source,
+        as_hb_plugin_node_data* plugin_data)
+{
+    size_t adj_length = 0;
+    cf_node* adj_list = NULL;
+
+    if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) {
+        // Store a zero length adjacency list. Should not have happened.
+        WARNING("received heartbeat without adjacency list %" PRIx64, source);
+        adj_length = 0;
+    }
+
+    // The guess can be larger for older protocols which also include the self
+    // node in the adjacency list.
+    int guessed_data_size = (adj_length * sizeof(cf_node));
+
+    if (guessed_data_size > plugin_data->data_capacity) {
+        // Round up to the nearest multiple of the block size to prevent very
+        // frequent reallocation.
+        size_t data_capacity = ((guessed_data_size + HB_PLUGIN_DATA_BLOCK_SIZE
+                - 1) / HB_PLUGIN_DATA_BLOCK_SIZE) * HB_PLUGIN_DATA_BLOCK_SIZE;
+
+        // Reallocate since we have outgrown existing capacity.
+        plugin_data->data = cf_realloc(plugin_data->data, data_capacity);
+        plugin_data->data_capacity = data_capacity;
+    }
+
+    cf_node* dest_list = (cf_node*)(plugin_data->data);
+
+    size_t final_list_length = 0;
+    for (size_t i = 0; i < adj_length; i++) {
+        if (adj_list[i] == source) {
+            // Skip the source node.
+            continue;
+        }
+        dest_list[final_list_length++] = adj_list[i];
+    }
+
+    plugin_data->data_size = (final_list_length * sizeof(cf_node));
+}
+
+/**
+ * Get a msg buffer from the pool based on the protocol under use.
+ * @return the msg buffer.
+ */
+static msg*
+hb_msg_get()
+{
+    return as_fabric_msg_get(M_TYPE_HEARTBEAT);
+}
+
+/**
+ * Return the message buffer back to the pool.
+ */
+static void
+hb_msg_return(msg* msg)
+{
+    as_fabric_msg_put(msg);
+}
+
+/**
+ * Fill the outgoing pulse message with plugin specific data.
+ *
+ * Note: The set functions will acquire their own locks. This function should
+ * never directly use nor have a call stack under HB_LOCK.
+ *
+ * @param msg the outgoing pulse message.
+ */
+static void
+hb_plugin_msg_fill(msg* msg)
+{
+    for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
+        if (g_hb.plugins[i].set_fn) {
+            (g_hb.plugins[i].set_fn)(msg);
+        }
+    }
+}
+
+/**
+ * Parse fields from the message into plugin specific data.
+ * @param msg the incoming pulse message.
+ * @param adjacent_node the node from which this message was received.
+ * @param plugins the array of registered plugins.
+ * @param plugin_data_changed (output) array whose ith entry is set to true if
+ * the ith plugin's data changed, false otherwise. Should be large enough to
+ * hold flags for all plugins.
+ */
+static void
+hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node,
+        as_hb_plugin* plugins, bool plugin_data_changed[])
+{
+    cf_node source;
+    adjacent_node->plugin_data_cycler++;
+
+    msg_nodeid_get(msg, &source);
+    for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
+        plugin_data_changed[i] = false;
+        if (plugins[i].parse_fn) {
+            as_hb_plugin_node_data* curr_data =
+                    &adjacent_node->plugin_data[i][adjacent_node->plugin_data_cycler
+                            % 2];
+
+            as_hb_plugin_node_data* prev_data =
+                    &adjacent_node->plugin_data[i][(adjacent_node->plugin_data_cycler
+                            + 1) % 2];
+
+            // Ensure there is a preallocated data pointer.
+            if (curr_data->data == NULL) {
+                curr_data->data = cf_malloc(HB_PLUGIN_DATA_DEFAULT_SIZE);
+                curr_data->data_capacity = HB_PLUGIN_DATA_DEFAULT_SIZE;
+                curr_data->data_size = 0;
+            }
+
+            // Parse message data into current data.
+            (plugins[i]).parse_fn(msg, source, curr_data);
+
+            if (!plugins[i].change_listener) {
+                // No change listener configured. Skip detecting change.
+                continue;
+            }
+
+            size_t curr_data_size = curr_data->data_size;
+            void* curr_data_blob = curr_data_size ? curr_data->data : NULL;
+
+            size_t prev_data_size = prev_data->data_size;
+            void* prev_data_blob = prev_data_size ?
prev_data->data : NULL; + + if (prev_data_blob == curr_data_blob) { + // Old and new data both NULL or both point to the same memory + // location. + plugin_data_changed[i] = false; + continue; + } + + if (prev_data_size + != curr_data_size|| prev_data_blob == NULL || curr_data_blob == NULL) { + // Plugin data definitely changed, as the data sizes differ or + // exactly one of old or new data pointers is NULL. + plugin_data_changed[i] = true; + continue; + } + + // The data sizes match at this point and neither values are NULL. + plugin_data_changed[i] = memcmp(prev_data_blob, curr_data_blob, + curr_data_size) != 0; + } + } +} + +/** + * Adjacency list for an adjacent node changed. + */ +static void +hb_plugin_data_change_listener(cf_node changed_node_id) +{ + hb_event_queue(AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED, &changed_node_id, 1); +} + +/** + * Initialize the plugin specific data structures. + */ +static void +hb_plugin_init() +{ + memset(&g_hb.plugins, 0, sizeof(g_hb.plugins)); + + // Be cute. Register self as a plugin. + as_hb_plugin self_plugin; + memset(&self_plugin, 0, sizeof(self_plugin)); + self_plugin.id = AS_HB_PLUGIN_HB; + self_plugin.wire_size_fixed = 0; + self_plugin.wire_size_per_node = sizeof(cf_node); + self_plugin.set_fn = hb_plugin_set_fn; + self_plugin.parse_fn = hb_plugin_parse_data_fn; + self_plugin.change_listener = hb_plugin_data_change_listener; + hb_plugin_register(&self_plugin); +} + +/** + * Transmits heartbeats at fixed intervals. + */ +void* +hb_transmitter(void* arg) +{ + DETAIL("heartbeat transmitter started"); + + cf_clock last_time = 0; + + while (hb_is_running()) { + cf_clock curr_time = cf_getms(); + + if ((curr_time - last_time) < PULSE_TRANSMIT_INTERVAL()) { + // Interval has not been reached for sending heartbeats + usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_time + + PULSE_TRANSMIT_INTERVAL()) - curr_time) * 1000); + continue; + } + + last_time = curr_time; + + // Construct the pulse message. + msg* msg = hb_msg_get(); + + msg_src_fields_fill(msg); + msg_type_set(msg, AS_HB_MSG_TYPE_PULSE); + + // Have plugins fill their data into the heartbeat pulse message. + hb_plugin_msg_fill(msg); + + // Broadcast the heartbeat to all known recipients. + channel_msg_broadcast(msg); + + // Return the msg back to the fabric. + hb_msg_return(msg); + + DETAIL("done sending pulse message"); + } + + DETAIL("heartbeat transmitter stopped"); + return NULL; +} + +/** + * Get hold of adjacent node information given its nodeid. + * @param nodeid the nodeid. + * @param adjacent_node the output node information. + * @return 0 on success, -1 on failure. + */ +static int +hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node) +{ + int rv = -1; + HB_LOCK(); + + if (cf_shash_get(g_hb.adjacency, &nodeid, adjacent_node) == CF_SHASH_OK) { + rv = 0; + } + + HB_UNLOCK(); + return rv; +} + +/** + * Get hold of an on-probation node information given its nodeid. + * @param nodeid the nodeid. + * @param adjacent_node the output node information. + * @return 0 on success, -1 on failure. + */ +static int +hb_on_probation_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node) +{ + int rv = -1; + HB_LOCK(); + + if (cf_shash_get(g_hb.on_probation, &nodeid, adjacent_node) + == CF_SHASH_OK) { + rv = 0; + } + + HB_UNLOCK(); + return rv; +} + +/** + * Read the plugin data from an adjacent node. + * @param adjacent_node the adjacent node. + * @param plugin_data (output) will be null if this node has no plugin data. + * Else will point to the plugin data. 
+ * @param plugin_data_size (output) the size of the plugin data. + */ +static void +hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node, + as_hb_plugin_id plugin_id, void** plugin_data, size_t* plugin_data_size) +{ + *plugin_data_size = + adjacent_node->plugin_data[plugin_id][adjacent_node->plugin_data_cycler + % 2].data_size; + + *plugin_data = + *plugin_data_size ? + (cf_node*)(adjacent_node->plugin_data[plugin_id][adjacent_node->plugin_data_cycler + % 2].data) : NULL; +} + +/** + * Get adjacency list for an adjacent node. + */ +static void +hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node, + cf_node** adjacency_list, size_t* adjacency_length) +{ + hb_adjacent_node_plugin_data_get(adjacent_node, AS_HB_PLUGIN_HB, + (void**)adjacency_list, adjacency_length); + (*adjacency_length) /= sizeof(cf_node); +} + +/** + * Indicates if a give node has expired and should be removed from the adjacency + * list. + */ +static bool +hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node) +{ + if (nodeid == config_self_nodeid_get()) { + return false; + } + + HB_LOCK(); + + cf_clock now = cf_getms(); + + bool expired = adjacent_node->last_updated_monotonic_ts + HB_NODE_TIMEOUT() + < now; + + HB_UNLOCK(); + return expired; +} + +/** + * Indicates if self node has duplicate ids. + */ +static bool +hb_self_is_duplicate(){ + HB_LOCK(); + bool self_is_duplicate = g_hb.self_is_duplicate; + HB_UNLOCK(); + return self_is_duplicate; +} + +/** + * Updates the self is duplicate flag. + */ +static void +hb_self_duplicate_update() +{ + cf_clock now = cf_getms(); + HB_LOCK(); + if (g_hb.self_is_duplicate) { + uint32_t duplicate_block_interval = + config_endpoint_track_intervals_get() + * config_tx_interval_get(); + if (g_hb.self_duplicate_detected_ts + duplicate_block_interval <= now) { + // We have not seen duplicates for the endpoint change tracking + // interval. Mark ourself as non-duplicate. + g_hb.self_is_duplicate = false; + } + } + HB_UNLOCK(); +} + +/** + * Free up space occupied by plugin data from adjacent node. + */ +static void +hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node) +{ + HB_LOCK(); + for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { + as_hb_plugin_node_data* curr_plugin_data = adjacent_node->plugin_data[i]; + for (int j = 0; j < 2; j++) { + if (curr_plugin_data[j].data) { + cf_free(curr_plugin_data[j].data); + curr_plugin_data[j].data = NULL; + } + + curr_plugin_data[j].data_capacity = 0; + curr_plugin_data[j].data_size = 0; + } + } + + if (adjacent_node->endpoint_list) { + // Free the endpoint list. + cf_free(adjacent_node->endpoint_list); + adjacent_node->endpoint_list = NULL; + } + + HB_UNLOCK(); +} + +/** + * Tend reduce function that removes expired nodes from adjacency list. + */ +static int +hb_adjacency_tend_reduce(const void* key, void* data, void* udata) +{ + cf_node nodeid = *(const cf_node*)key; + as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; + as_hb_adjacency_tender_udata* adjacency_tender_udata = + (as_hb_adjacency_tender_udata*)udata; + + int rv = CF_SHASH_OK; + bool cluster_name_mismatch = adjacent_node->cluster_name_mismatch_count + > CLUSTER_NAME_MISMATCH_MAX; + if (hb_node_has_expired(nodeid, adjacent_node) || cluster_name_mismatch) { + INFO("node expired %" PRIx64" %s", nodeid, cluster_name_mismatch ? 
"(cluster name mismatch)" : ""); + if (cluster_name_mismatch) { + adjacency_tender_udata->evicted_nodes[adjacency_tender_udata->evicted_node_count++] = + nodeid; + } + else { + adjacency_tender_udata->dead_nodes[adjacency_tender_udata->dead_node_count++] = + nodeid; + } + + // Free plugin data as well. + hb_adjacent_node_destroy(adjacent_node); + + rv = CF_SHASH_REDUCE_DELETE; + } + + return rv; +} + +/** + * Tend reduce function that removes expired nodes from the probationary list. + */ +static int +hb_on_probation_tend_reduce(const void* key, void* data, void* udata) +{ + cf_node nodeid = *(const cf_node*)key; + as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; + + int rv = CF_SHASH_OK; + if (hb_node_has_expired(nodeid, adjacent_node)) { + DEBUG("on-probation node %" PRIx64 " expired", nodeid); + // Free plugin data as well. + hb_adjacent_node_destroy(adjacent_node); + rv = CF_SHASH_REDUCE_DELETE; + } + return rv; +} + +/** + * Tends the adjacency list. Removes nodes that expire. + */ +void* +hb_adjacency_tender(void* arg) +{ + DETAIL("adjacency tender started"); + + cf_clock last_time = 0; + cf_clock last_depart_time = 0; + + while (hb_is_running()) { + cf_clock curr_time = cf_getms(); + uint32_t adjacency_tend_interval = ADJACENCY_TEND_INTERVAL; + // Interval after node depart where we tend faster to detect additional + // node departures. + uint32_t fast_check_interval = 2 * config_tx_interval_get(); + if (last_depart_time + fast_check_interval > curr_time) { + adjacency_tend_interval = ADJACENCY_FAST_TEND_INTERVAL; + } + + hb_self_duplicate_update(); + + if ((curr_time - last_time) < adjacency_tend_interval) { + // Publish any pendng events. + hb_event_publish_pending(); + + // Interval has not been reached for sending heartbeats + usleep( + MIN(AS_HB_TX_INTERVAL_MS_MIN, + (last_time + adjacency_tend_interval) - curr_time) + * 1000); + continue; + } + + last_time = curr_time; + + DETAIL("tending adjacency list"); + + HB_LOCK(); + cf_node dead_nodes[cf_shash_get_size(g_hb.adjacency)]; + cf_node evicted_nodes[cf_shash_get_size(g_hb.adjacency)]; + as_hb_adjacency_tender_udata adjacency_tender_udata; + adjacency_tender_udata.dead_nodes = dead_nodes; + adjacency_tender_udata.dead_node_count = 0; + adjacency_tender_udata.evicted_nodes = evicted_nodes; + adjacency_tender_udata.evicted_node_count = 0; + + cf_shash_reduce(g_hb.adjacency, hb_adjacency_tend_reduce, + &adjacency_tender_udata); + + if (adjacency_tender_udata.dead_node_count > 0) { + last_depart_time = curr_time; + // Queue events for dead nodes. + hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, dead_nodes, + adjacency_tender_udata.dead_node_count); + } + + if (adjacency_tender_udata.evicted_node_count > 0) { + last_depart_time = curr_time; + // Queue events for evicted nodes. + hb_event_queue(AS_HB_INTERNAL_NODE_EVICT, evicted_nodes, + adjacency_tender_udata.evicted_node_count); + } + + // Expire nodes from the on-probation list. + cf_shash_reduce(g_hb.on_probation, hb_on_probation_tend_reduce, NULL); + HB_UNLOCK(); + + // See if we have pending events to publish. + hb_event_publish_pending(); + + DETAIL("done tending adjacency list"); + } + + DETAIL("adjacency tender shut down"); + return NULL; +} + +/** + * Start the transmitter thread. + */ +static void +hb_tx_start() +{ + // Start the transmitter thread. + if (pthread_create(&g_hb.transmitter_tid, 0, hb_transmitter, &g_hb) != 0) { + CRASH("could not create heartbeat transmitter thread: %s", + cf_strerror(errno)); + } +} + +/** + * Stop the transmitter thread. 
+ */
+static void
+hb_tx_stop()
+{
+    DETAIL("waiting for the transmitter thread to stop");
+    // Wait for the transmitter thread to stop.
+    pthread_join(g_hb.transmitter_tid, NULL);
+}
+
+/**
+ * Start the adjacency tender thread.
+ */
+static void
+hb_adjacency_tender_start()
+{
+    // Start the adjacency tender thread.
+    if (pthread_create(&g_hb.adjacency_tender_tid, 0, hb_adjacency_tender,
+            &g_hb) != 0) {
+        CRASH("could not create heartbeat adjacency tender thread: %s",
+                cf_strerror(errno));
+    }
+}
+
+/**
+ * Stop the adjacency tender thread.
+ */
+static void
+hb_adjacency_tender_stop()
+{
+    // Wait for the adjacency tender thread to stop.
+    pthread_join(g_hb.adjacency_tender_tid, NULL);
+}
+
+/**
+ * Initialize the heartbeat subsystem.
+ */
+static void
+hb_init()
+{
+    if (hb_is_initialized()) {
+        WARNING("heartbeat main module is already initialized");
+        return;
+    }
+
+    // Operate under a lock. Let's be paranoid everywhere.
+    HB_LOCK();
+
+    // Initialize the heartbeat data structure.
+    memset(&g_hb, 0, sizeof(g_hb));
+
+    // Initialize the adjacency hash.
+    g_hb.adjacency = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node),
+            sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);
+
+    // Initialize the on_probation hash.
+    g_hb.on_probation = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node),
+            sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);
+
+    // Initialize the temporary hash to map nodeid to index.
+    g_hb.nodeid_to_index = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node),
+            sizeof(int), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);
+
+    // Initialize the unpublished event queue.
+    cf_queue_init(&g_hb_event_listeners.external_events_queue,
+            sizeof(as_hb_event_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, true);
+
+    // Initialize the mode specific state.
+    hb_mode_init();
+
+    // Initialize the plugin functions.
+    hb_plugin_init();
+
+    // Initialize the IO channel subsystem.
+    channel_init();
+
+    g_hb.status = AS_HB_STATUS_STOPPED;
+
+    HB_UNLOCK();
+}
+
+/**
+ * Start the heartbeat subsystem.
+ */
+static void
+hb_start()
+{
+    // Operate under a lock. Let's be paranoid everywhere.
+    HB_LOCK();
+
+    if (hb_is_running()) {
+        // Shut down the heartbeat subsystem.
+        hb_stop();
+    }
+
+    g_hb.status = AS_HB_STATUS_RUNNING;
+
+    // Initialize the heartbeat message templates. Called from here because
+    // fabric needs to be initialized for this call to succeed. Fabric init
+    // happens after heartbeat init.
+    hb_msg_init();
+
+    // Start the channel sub module.
+    channel_start();
+
+    // Start the mode sub module.
+    hb_mode_start();
+
+    // Start the heartbeat transmitter.
+    hb_tx_start();
+
+    // Start the heartbeat adjacency tender.
+    hb_adjacency_tender_start();
+
+    HB_UNLOCK();
+}
+
+/**
+ * Shut down the heartbeat subsystem.
+ */
+static void
+hb_stop()
+{
+    if (!hb_is_running()) {
+        WARNING("heartbeat is already stopped");
+        return;
+    }
+
+    HB_LOCK();
+    g_hb.status = AS_HB_STATUS_SHUTTING_DOWN;
+    HB_UNLOCK();
+
+    // Publish pending events. Should not delay any events.
+    hb_event_publish_pending();
+
+    // Shut down the mode sub module.
+    if (hb_is_mesh()) {
+        mesh_stop();
+    }
+    else {
+        multicast_stop();
+    }
+
+    // Wait for the threads to shut down.
+    hb_tx_stop();
+
+    hb_adjacency_tender_stop();
+
+    // Stop the channels.
+    channel_stop();
+
+    g_hb.status = AS_HB_STATUS_STOPPED;
+}
+
+/**
+ * Register a plugin with the heartbeat system.
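+ *
+ * A minimal usage sketch, mirroring the self registration done in
+ * hb_plugin_init() (the my_* callbacks are hypothetical):
+ *
+ *   as_hb_plugin plugin;
+ *   memset(&plugin, 0, sizeof(plugin));
+ *   plugin.id = AS_HB_PLUGIN_FABRIC;        // the caller's plugin slot
+ *   plugin.wire_size_fixed = 0;             // fixed bytes added per pulse
+ *   plugin.wire_size_per_node = 0;          // bytes added per adjacent node
+ *   plugin.set_fn = my_set_fn;              // fills outgoing pulses
+ *   plugin.parse_fn = my_parse_fn;          // parses incoming pulses
+ *   plugin.change_listener = my_change_fn;  // optional change hook
+ *   hb_plugin_register(&plugin);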
+ */ +static void +hb_plugin_register(as_hb_plugin* plugin) +{ + HB_LOCK(); + memcpy(&g_hb.plugins[plugin->id], plugin, sizeof(as_hb_plugin)); + HB_UNLOCK(); +} + +/** + * Check if the heartbeat recieved is duplicate or stale. + */ +static bool +hb_msg_is_obsolete(as_hb_channel_event* event, as_hlc_timestamp last_send_ts) +{ + if (as_hlc_timestamp_order_get(event->msg_hlc_ts.send_ts, last_send_ts) + == AS_HLC_HAPPENS_BEFORE) { + // Received a delayed heartbeat send before the current heartbeat. + return true; + } + return false; +} + +/** + * Update the tracker with endpoint change status. + */ +static void +hb_endpoint_change_tracker_update(uint64_t* tracker, bool endpoint_changed) +{ + *tracker = *tracker << 1; + if (endpoint_changed) { + (*tracker)++; + } +} + +/** + * Indicates if endpoint changes for this node are normal. + */ +static bool +hb_endpoint_change_tracker_is_normal(uint64_t tracker) +{ + if (tracker == 0) { + // Normal and healthy case. + return true; + } + + uint32_t num_intervals_to_track = MIN(64, + config_endpoint_track_intervals_get()); + uint64_t mask = ~(~(uint64_t)0 << num_intervals_to_track); + + // Ignore older history. + tracker &= mask; + + int flip_count = 0; + static int nibblebits[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + for (; tracker != 0; tracker >>= 4) { + flip_count += nibblebits[tracker & 0x0f]; + } + + return flip_count <= config_endpoint_changes_allowed_get(); +} + + +/** + * Indicates if the change tracker just changed. + */ +static bool +hb_endpoint_change_tracker_has_changed(uint64_t tracker) +{ + return tracker % 2; +} + +/** + * Update adjacent node data on receiving a valid pulse message. + */ +static void +hb_adjacent_node_update(as_hb_channel_event* msg_event, + as_hb_adjacent_node* adjacent_node, bool plugin_data_changed[]) +{ + msg* msg = msg_event->msg; + cf_node source = 0; + // Channel has validated the source. Don't bother checking here. + msg_nodeid_get(msg, &source); + + // Update all fields irrespective of whether this is a new node. + msg_id_get(msg, &adjacent_node->protocol_version); + + // Get the ip address. + as_endpoint_list* msg_endpoint_list; + if (msg_endpoint_list_get(msg, &msg_endpoint_list) == 0 + && !as_endpoint_lists_are_equal(adjacent_node->endpoint_list, + msg_endpoint_list)) { + // Update the endpoints. + endpoint_list_copy(&adjacent_node->endpoint_list, msg_endpoint_list); + } + + // Populate plugin data. + hb_plugin_msg_parse(msg, adjacent_node, g_hb.plugins, plugin_data_changed); + + // Update the last updated time. + adjacent_node->last_updated_monotonic_ts = cf_getms(); + memcpy(&adjacent_node->last_msg_hlc_ts, &msg_event->msg_hlc_ts, + sizeof(adjacent_node->last_msg_hlc_ts)); + + // Update the latency. + int64_t latency = as_hlc_timestamp_diff_ms(msg_event->msg_hlc_ts.send_ts, + msg_event->msg_hlc_ts.recv_ts); + latency = latency < 0 ? -latency : latency; + adjacent_node->avg_latency = ALPHA * latency + + (1 - ALPHA) * adjacent_node->avg_latency; + + // Reset the cluster-name mismatch counter to zero. + adjacent_node->cluster_name_mismatch_count = 0; + + // Check if fabric endpoints have changed. 
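+    // (Each plugin keeps two data copies per adjacent node; the cycler's
+    // parity selects the copy just parsed from the current message, while
+    // the other parity still holds the previous message's copy - see
+    // hb_plugin_msg_parse().)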
+    as_hb_plugin_node_data* curr_data =
+            &adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][adjacent_node->plugin_data_cycler
+                    % 2];
+
+    as_hb_plugin_node_data* prev_data =
+            &adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][(adjacent_node->plugin_data_cycler
+                    + 1) % 2];
+
+    as_endpoint_list* curr_fabric_endpoints =
+            as_fabric_hb_plugin_get_endpoint_list(curr_data);
+    as_endpoint_list* prev_fabric_endpoints =
+            as_fabric_hb_plugin_get_endpoint_list(prev_data);
+
+    // Endpoints changed if there is previous data (i.e. this is not the first
+    // update) and the endpoint lists do not match.
+    bool endpoints_changed = prev_fabric_endpoints != NULL
+            && !as_endpoint_lists_are_equal(curr_fabric_endpoints,
+                    prev_fabric_endpoints);
+
+    if (endpoints_changed) {
+        char curr_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE];
+        char prev_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE];
+
+        as_endpoint_list_to_string(curr_fabric_endpoints,
+                curr_fabric_endpoints_str, sizeof(curr_fabric_endpoints_str));
+        as_endpoint_list_to_string(prev_fabric_endpoints,
+                prev_fabric_endpoints_str, sizeof(prev_fabric_endpoints_str));
+
+        TICKER_WARNING("node: %"PRIx64" fabric endpoints changed from {%s} to {%s}",
+                source, prev_fabric_endpoints_str, curr_fabric_endpoints_str);
+    }
+
+    hb_endpoint_change_tracker_update(&adjacent_node->endpoint_change_tracker,
+            endpoints_changed);
+}
+
+/**
+ * Indicates if a node can be considered adjacent, based on accumulated
+ * statistics.
+ */
+static bool
+hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node)
+{
+    return hb_endpoint_change_tracker_is_normal(
+            adjacent_node->endpoint_change_tracker);
+}
+
+/**
+ * Process a pulse from a source node having our own node-id.
+ */
+static void
+hb_channel_on_self_pulse(as_hb_channel_event* msg_event)
+{
+    bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 };
+
+    HB_LOCK();
+    hb_adjacent_node_update(msg_event, &g_hb.self_node, plugin_data_changed);
+
+    as_hb_plugin_node_data* curr_data =
+            &g_hb.self_node.plugin_data[AS_HB_PLUGIN_FABRIC][g_hb.self_node.plugin_data_cycler
+                    % 2];
+    as_endpoint_list* curr_fabric_endpoints =
+            as_fabric_hb_plugin_get_endpoint_list(curr_data);
+
+    if (!as_fabric_is_published_endpoint_list(curr_fabric_endpoints)) {
+        // Mark self as having a duplicate node-id.
+        g_hb.self_is_duplicate = true;
+        g_hb.self_duplicate_detected_ts = cf_getms();
+
+        // Found another node with a duplicate node-id.
+        char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+        as_endpoint_list_to_string(curr_fabric_endpoints, endpoint_list_str,
+                sizeof(endpoint_list_str));
+        TICKER_WARNING("duplicate node-id: %" PRIx64 " with fabric endpoints {%s}",
+                config_self_nodeid_get(), endpoint_list_str);
+    }
+    else {
+        cf_atomic_int_incr(&g_stats.heartbeat_received_self);
+    }
+
+    HB_UNLOCK();
+}
+
+/**
+ * Process an incoming pulse message.
+ */
+static void
+hb_channel_on_pulse(as_hb_channel_event* msg_event)
+{
+    msg* msg = msg_event->msg;
+    cf_node source;
+
+    // Print cluster breach only once per second.
+    static cf_clock last_cluster_breach_print = 0;
+
+    // Channel has validated the source. Don't bother checking here.
+    msg_nodeid_get(msg, &source);
+
+    if (source == config_self_nodeid_get()) {
+        hb_channel_on_self_pulse(msg_event);
+        // Ignore self heartbeats.
+ return; + } + + HB_LOCK(); + + as_hb_adjacent_node adjacent_node = { 0 }; + + bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 }; + bool is_in_adjacency = (hb_adjacent_node_get(source, &adjacent_node) == 0); + bool should_be_on_probation = false; + + if (!is_in_adjacency) { + hb_on_probation_node_get(source, &adjacent_node); + } + + // Update the adjacent node with contents of the message. If the msg is + // obsolete + hb_adjacent_node_update(msg_event, &adjacent_node, plugin_data_changed); + as_hlc_timestamp send_ts = adjacent_node.last_msg_hlc_ts.send_ts; + + // Check if this node needs to be on probation. + should_be_on_probation = !hb_node_can_consider_adjacent(&adjacent_node); + + if (hb_endpoint_change_tracker_has_changed( + adjacent_node.endpoint_change_tracker)) { + // Allow a little more slack for obsolete checking because the two nodes + // might not have matching send timestamps. + send_ts = as_hlc_timestamp_subtract_ms(send_ts, config_tx_interval_get()); + } + + if (hb_msg_is_obsolete(msg_event, send_ts)) { + WARNING("ignoring delayed heartbeat - expected timestamp less than %" PRIu64" but was %" PRIu64 " from node: %" PRIx64, + send_ts, + msg_event->msg_hlc_ts.send_ts, source); + goto Exit; + } + + cf_atomic_int_incr(&g_stats.heartbeat_received_foreign); + + bool is_new = !should_be_on_probation && !is_in_adjacency; + + if (is_new) { + int mcsize = config_mcsize(); + // Note: adjacency list does not contain self node hence + // (mcsize - 1) in the check. + if (cf_shash_get_size(g_hb.adjacency) >= (mcsize - 1)) { + if (last_cluster_breach_print != (cf_getms() / 1000L)) { + WARNING("ignoring node: %" PRIx64" - exceeding maximum supported cluster size %d", + source, mcsize); + last_cluster_breach_print = cf_getms() / 1000L; + } + goto Exit; + } + } + + // Update plugin data, update times, etc. + cf_shash_put(should_be_on_probation ? g_hb.on_probation : g_hb.adjacency, + &source, &adjacent_node); + + // Maintain mutual exclusion between adjacency and on_probation hashes. + cf_shash_delete(should_be_on_probation ? g_hb.adjacency : g_hb.on_probation, + &source); + + if (is_new) { + // Publish event if this is a new node. + INFO("node arrived %" PRIx64, source); + hb_event_queue(AS_HB_INTERNAL_NODE_ARRIVE, &source, 1); + } + else if (should_be_on_probation && is_in_adjacency) { + // This node needs to be on probation, most likely due to duplicate + // node-ids. + WARNING("node expired %" PRIx64" - potentially duplicate node-id", source); + hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, &source, 1); + } + +Exit: + HB_UNLOCK(); + + // Publish any pending node arrival events. + hb_event_publish_pending(); + + if (!should_be_on_probation) { + // Call plugin change listeners outside of a lock to prevent deadlocks. + for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { + if (plugin_data_changed[i] && g_hb.plugins[i].change_listener) { + // Notify that data for this plugin for the source node has + // changed. + DETAIL("plugin data for node %" PRIx64" changed for plugin %d", + source, i); + (g_hb.plugins[i]).change_listener(source); + } + } + } +} + +/** + * Process an incoming heartbeat message. + */ +static void +hb_channel_on_msg_rcvd(as_hb_channel_event* event) +{ + msg* msg = event->msg; + as_hb_msg_type type; + msg_type_get(msg, &type); + + switch (type) { + case AS_HB_MSG_TYPE_PULSE: // A pulse message. Update the adjacent node data. + hb_channel_on_pulse(event); + break; + default: // Ignore other messages. 
+/**
+ * Increase the cluster-name mismatch counter for the node.
+ */
+static void
+hb_handle_cluster_name_mismatch(as_hb_channel_event* event)
+{
+	HB_LOCK();
+
+	as_hb_adjacent_node adjacent_node;
+	memset(&adjacent_node, 0, sizeof(adjacent_node));
+
+	if (hb_adjacent_node_get(event->nodeid, &adjacent_node) != 0) {
+		// Node does not exist in the adjacency list.
+		goto Exit;
+	}
+
+	if (hb_msg_is_obsolete(event, adjacent_node.last_msg_hlc_ts.send_ts)) {
+		WARNING("ignoring delayed heartbeat - expected timestamp less than %" PRIu64" but was %" PRIu64 " from node: %" PRIx64,
+				adjacent_node.last_msg_hlc_ts.send_ts,
+				event->msg_hlc_ts.send_ts, event->nodeid);
+		goto Exit;
+	}
+
+	// Update the cluster_name_mismatch counter.
+	adjacent_node.cluster_name_mismatch_count++;
+	cf_shash_put(g_hb.adjacency, &event->nodeid, &adjacent_node);
+Exit:
+	HB_UNLOCK();
+}
+
+/**
+ * Process channel events.
+ */
+static void
+hb_channel_event_process(as_hb_channel_event* event)
+{
+	// Deal with pulse messages here.
+	switch (event->type) {
+	case AS_HB_CHANNEL_MSG_RECEIVED:
+		hb_channel_on_msg_rcvd(event);
+		break;
+	case AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH:
+		hb_handle_cluster_name_mismatch(event);
+		break;
+	default:
+		// Ignore channel active and inactive events. Rather, rely on the
+		// adjacency tender to expire nodes.
+		break;
+	}
+}
+
+/**
+ * Dump hb mode state to logs.
+ * @param verbose enables / disables verbose logging.
+ */
+static void
+hb_mode_dump(bool verbose)
+{
+	if (hb_is_mesh()) {
+		mesh_dump(verbose);
+	}
+	else {
+		multicast_dump(verbose);
+	}
+}
+
+/**
+ * Reduce function to dump hb node info to the log file.
+ */
+static int
+hb_dump_reduce(const void* key, void* data, void* udata)
+{
+	const cf_node* nodeid = (const cf_node*)key;
+	as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
+
+	char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+	as_endpoint_list_to_string(adjacent_node->endpoint_list, endpoint_list_str,
+			sizeof(endpoint_list_str));
+
+	INFO("\tHB %s Node: node-id %" PRIx64" protocol %" PRIu32" endpoints {%s} last-updated %" PRIu64 " latency-ms %" PRIu64,
+			(char*)udata,
+			*nodeid, adjacent_node->protocol_version, endpoint_list_str,
+			adjacent_node->last_updated_monotonic_ts, adjacent_node->avg_latency);
+
+	return CF_SHASH_OK;
+}
+
+/**
+ * Dump hb state to logs.
+ * @param verbose enables / disables verbose logging.
+ */
+static void
+hb_dump(bool verbose)
+{
+	HB_LOCK();
+
+	INFO("HB Adjacency Size: %d", cf_shash_get_size(g_hb.adjacency));
+
+	if (verbose) {
+		cf_shash_reduce(g_hb.adjacency, hb_dump_reduce, "Adjacent");
+	}
+
+	if (cf_shash_get_size(g_hb.on_probation)) {
+		INFO("HB On-probation Size: %d", cf_shash_get_size(g_hb.on_probation));
+
+		if (verbose) {
+			cf_shash_reduce(g_hb.on_probation, hb_dump_reduce, "On-probation");
+		}
+	}
+
+	HB_UNLOCK();
+}
+
+/**
+ * Compute a complement / inverted adjacency graph for the input nodes, such
+ * that
+ *
+ * inverted_graph[i][j] = 0 iff node[i] and node[j] are in each other's
+ * adjacency lists. That is, they have a bidirectional network link active
+ * between them,
+ *
+ * else
+ *
+ * inverted_graph[i][j] > 0 iff there is no link or only a unidirectional link
+ * between them.
+ *
+ * @param nodes the input vector of nodes.
+ * @param inverted_graph (output) a (num_nodes x num_nodes) 2D byte array.
+ */
+static void
+hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph)
+{
+	HB_LOCK();
+	int num_nodes = cf_vector_size(nodes);
+
+	for (int i = 0; i < num_nodes; i++) {
+		for (int j = 0; j < num_nodes; j++) {
+			inverted_graph[i][j] = 2;
+		}
+		cf_node nodeid = 0;
+		cf_vector_get(nodes, i, &nodeid);
+		cf_shash_put(g_hb.nodeid_to_index, &nodeid, &i);
+	}
+
+	cf_node self_nodeid = config_self_nodeid_get();
+	int self_node_index = -1;
+	cf_shash_get(g_hb.nodeid_to_index, &self_nodeid, &self_node_index);
+
+	for (int i = 0; i < num_nodes; i++) {
+		// Mark the node as connected to itself, i.e., disconnected in the
+		// inverted graph.
+		inverted_graph[i][i] = 0;
+
+		cf_node node = *(cf_node*)cf_vector_getp(nodes, i);
+		as_hb_adjacent_node node_info;
+
+		if (hb_adjacent_node_get(node, &node_info) == 0) {
+			if (self_node_index >= 0) {
+				// Self node will not have plugin data. But the fact that this
+				// node has an adjacent node entry indicates that it is in our
+				// adjacency list. Adjust the graph.
+				inverted_graph[i][self_node_index]--;
+				inverted_graph[self_node_index][i]--;
+			}
+
+			cf_node* adjacency_list = NULL;
+			size_t adjacency_length = 0;
+			hb_adjacent_node_adjacency_get(&node_info, &adjacency_list,
+					&adjacency_length);
+
+			for (int j = 0; j < adjacency_length; j++) {
+				int other_node_index = -1;
+				cf_shash_get(g_hb.nodeid_to_index, &adjacency_list[j],
+						&other_node_index);
+				if (other_node_index < 0) {
+					// This node is not in the input set of nodes.
+					continue;
+				}
+
+				if (i != other_node_index) {
+					inverted_graph[i][other_node_index]--;
+					inverted_graph[other_node_index][i]--;
+				}
+			}
+		}
+	}
+
+	// Cleanup the temporary hash.
+	cf_shash_delete_all(g_hb.nodeid_to_index);
+
+	HB_UNLOCK();
+}
+
+/**
+ * Compute the nodes to evict from the input nodes so that the remaining nodes
+ * form a clique, based on adjacency lists, using a minimal vertex cover.
+ *
+ * The minimal vertex cover on the inverted graph is the set of nodes that
+ * should be removed to result in a clique on the remaining nodes. This
+ * implementation is an approximation of the minimal vertex cover. The notion
+ * is to keep removing vertices having the highest degree until there are no
+ * more edges remaining. The heuristic gets rid of the more problematic nodes
+ * first.
+ *
+ * @param nodes input cf_node vector.
+ * @param nodes_to_evict output vector of nodes to evict; should be
+ * initialized by the caller.
+ */
+static void
+hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict)
+{
+	int num_nodes = cf_vector_size(nodes);
+
+	if (num_nodes == 0) {
+		// Nothing to do.
+		return;
+	}
+
+	int graph_alloc_size = sizeof(uint8_t) * num_nodes * num_nodes;
+	void* graph_data = MSG_BUFF_ALLOC(graph_alloc_size);
+
+	if (!graph_data) {
+		CRASH("error allocating space for clique finding data structure");
+	}
+
+	uint8_t* inverted_graph[num_nodes];
+	inverted_graph[0] = graph_data;
+	for (int i = 1; i < num_nodes; i++) {
+		inverted_graph[i] = inverted_graph[0] + num_nodes * i;
+	}
+
+	hb_adjacency_graph_invert(nodes, inverted_graph);
+
+	// Count the number of edges in the inverted graph. These edges are the
+	// ones that need to be removed so that the remaining nodes form a clique
+	// in the adjacency graph. Also, for performance, get hold of the self
+	// node index in the nodes vector.
+	int edge_count = 0;
+	int self_node_index = -1;
+	for (int i = 0; i < num_nodes; i++) {
+		cf_node node = 0;
+		cf_vector_get(nodes, i, &node);
+		if (node == config_self_nodeid_get()) {
+			self_node_index = i;
+		}
+
+		for (int j = 0; j < num_nodes; j++) {
+			if (inverted_graph[i][j]) {
+				edge_count++;
+			}
+		}
+	}
+
+	cf_vector_delete_range(nodes_to_evict, 0,
+			cf_vector_size(nodes_to_evict) - 1);
+
+	// Since we always decide to retain the self node, first get rid of all
+	// nodes having missing links to the self node.
+	if (self_node_index >= 0) {
+		for (int i = 0; i < num_nodes; i++) {
+			if (inverted_graph[self_node_index][i]
+					|| inverted_graph[i][self_node_index]) {
+				cf_node to_evict = 0;
+				cf_vector_get(nodes, i, &to_evict);
+				DEBUG("marking node %" PRIx64" for clique-based eviction",
+						to_evict);
+
+				cf_vector_append(nodes_to_evict, &to_evict);
+
+				// Remove all edges attached to the removed node.
+				for (int j = 0; j < num_nodes; j++) {
+					if (inverted_graph[i][j]) {
+						inverted_graph[i][j] = 0;
+						edge_count--;
+					}
+					if (inverted_graph[j][i]) {
+						inverted_graph[j][i] = 0;
+						edge_count--;
+					}
+				}
+			}
+		}
+	}
+
+	while (edge_count > 0) {
+		// Find the vertex with the highest degree.
+		cf_node max_degree_node = 0;
+		int max_degree_node_idx = -1;
+		int max_degree = 0;
+
+		for (int i = 0; i < num_nodes; i++) {
+			cf_node to_evict = 0;
+			cf_vector_get(nodes, i, &to_evict);
+
+			if (vector_find(nodes_to_evict, &to_evict) >= 0) {
+				// We have already decided to evict this node.
+				continue;
+			}
+
+			if (to_evict == config_self_nodeid_get()) {
+				// Do not evict self.
+				continue;
+			}
+
+			// Get the degree of this node.
+			int degree = 0;
+			for (int j = 0; j < num_nodes; j++) {
+				if (inverted_graph[i][j]) {
+					degree++;
+				}
+			}
+
+			DETAIL("inverted degree for node %" PRIx64" is %d",
+					to_evict, degree);
+
+			// See if this node has a higher degree. On ties, choose the node
+			// with the smaller nodeid.
+			if (degree > max_degree
+					|| (degree == max_degree && max_degree_node > to_evict)) {
+				max_degree = degree;
+				max_degree_node = to_evict;
+				max_degree_node_idx = i;
+			}
+		}
+
+		if (max_degree_node_idx < 0) {
+			// We are done - no node to evict.
+			break;
+		}
+
+		DEBUG("marking node %" PRIx64" with degree %d for clique-based eviction",
+				max_degree_node, max_degree);
+
+		cf_vector_append(nodes_to_evict, &max_degree_node);
+
+		// Remove all edges attached to the removed node.
+		for (int i = 0; i < num_nodes; i++) {
+			if (inverted_graph[max_degree_node_idx][i]) {
+				inverted_graph[max_degree_node_idx][i] = 0;
+				edge_count--;
+			}
+			if (inverted_graph[i][max_degree_node_idx]) {
+				inverted_graph[i][max_degree_node_idx] = 0;
+				edge_count--;
+			}
+		}
+	}
+
+	MSG_BUFF_FREE(graph_data, graph_alloc_size);
+}
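A toy version of the two steps above - graph inversion followed by greedy maximum-degree eviction - for a fixed three-node membership. It omits the locking, the self-node preference, and the evict-broken-self-links-first pass, and all names are hypothetical:

#include <stdbool.h>
#include <stdio.h>

#define N 3

int
main(void)
{
	// adj[i][j]: node i hears node j. Node 2 has lost its link to node 1.
	bool adj[N][N] = {
		{ true, true, true },
		{ true, true, false },
		{ true, false, true },
	};

	// Invert: start at 2, subtract one per live direction. 0 means a
	// bidirectional link exists.
	int inv[N][N];
	int edges = 0;

	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++) {
			inv[i][j] = (i == j) ?
					0 : 2 - (adj[i][j] ? 1 : 0) - (adj[j][i] ? 1 : 0);
			edges += inv[i][j] ? 1 : 0;
		}
	}

	// Greedy approximate vertex cover: evict the highest-degree vertex of
	// the inverted graph until no edges remain.
	bool evicted[N] = { false };

	while (edges > 0) {
		int best = -1;
		int best_degree = 0;

		for (int i = 0; i < N; i++) {
			if (evicted[i]) {
				continue;
			}
			int degree = 0;
			for (int j = 0; j < N; j++) {
				degree += inv[i][j] ? 1 : 0;
			}
			if (degree > best_degree) {
				best_degree = degree;
				best = i;
			}
		}

		if (best < 0) {
			break;
		}

		evicted[best] = true;
		printf("evicting node %d (inverted degree %d)\n", best, best_degree);

		// Remove all edges attached to the evicted node.
		for (int j = 0; j < N; j++) {
			if (inv[best][j]) { inv[best][j] = 0; edges--; }
			if (inv[j][best]) { inv[j][best] = 0; edges--; }
		}
	}
	return 0;	// evicts exactly node 1; nodes 0 and 2 remain a clique
}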
+/**
+ * Reduce function to iterate over plugin data for all adjacent nodes.
+ */
+static int
+hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata)
+{
+	const cf_node* nodeid = (const cf_node*)key;
+	as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
+	as_hb_adjacecny_iterate_reduce_udata* reduce_udata =
+			(as_hb_adjacecny_iterate_reduce_udata*)udata;
+
+	size_t plugin_data_size =
+			adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler
+					% 2].data_size;
+	void* plugin_data = plugin_data_size ?
+			adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler
+					% 2].data : NULL;
+
+	reduce_udata->iterate_fn(*nodeid, plugin_data, plugin_data_size,
+			adjacent_node->last_updated_monotonic_ts,
+			&adjacent_node->last_msg_hlc_ts, reduce_udata->udata);
+
+	return CF_SHASH_OK;
+}
+
+/**
+ * Call the iterate method on all nodes in the current adjacency list. Note
+ * plugin data can still be NULL if the plugin failed to parse the plugin
+ * data.
+ *
+ * @param pluginid the plugin identifier.
+ * @param iterate_fn the iterate function invoked with plugin data for every
+ * node.
+ * @param udata passed as is to the iterate function. Useful for getting
+ * results out of the iteration.
+ */
+static void
+hb_plugin_data_iterate_all(as_hb_plugin_id pluginid,
+		as_hb_plugin_data_iterate_fn iterate_fn, void* udata)
+{
+	HB_LOCK();
+
+	as_hb_adjacecny_iterate_reduce_udata reduce_udata;
+	reduce_udata.pluginid = pluginid;
+	reduce_udata.iterate_fn = iterate_fn;
+	reduce_udata.udata = udata;
+	cf_shash_reduce(g_hb.adjacency, hb_plugin_data_iterate_reduce,
+			&reduce_udata);
+
+	HB_UNLOCK();
+}
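From a consumer's point of view, a subsystem hands hb_plugin_data_iterate_all() a callback and a udata pointer and is called back under the heartbeat lock once per adjacent node. A hedged sketch of such a caller - the callback's parameter order mirrors the invocation in hb_plugin_data_iterate_reduce() above, and count_with_data_fn / fabric_plugin_data_count are hypothetical names, not part of this patch:

// Hypothetical callback: count adjacent nodes whose fabric plugin data
// parsed successfully (plugin_data is NULL when parsing failed).
static void
count_with_data_fn(cf_node nodeid, void* plugin_data, size_t plugin_data_size,
		cf_clock recv_monotonic_ts, as_hlc_msg_timestamp* msg_hlc_ts,
		void* udata)
{
	if (plugin_data != NULL) {
		(*(uint32_t*)udata)++;
	}
}

static uint32_t
fabric_plugin_data_count()
{
	uint32_t n_with_data = 0;
	hb_plugin_data_iterate_all(AS_HB_PLUGIN_FABRIC, count_with_data_fn,
			&n_with_data);
	return n_with_data;
}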
diff --git a/as/src/fabric/hlc.c b/as/src/fabric/hlc.c
new file mode 100644
index 00000000..5b7fc01c
--- /dev/null
+++ b/as/src/fabric/hlc.c
@@ -0,0 +1,557 @@
+/*
+ * hlc.c
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "fabric/hlc.h"
+
+#include <stdbool.h>
+#include <sys/param.h> // For MAX() and MIN().
+
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_atomic.h"
+
+#include "fault.h"
+
+#include "base/cfg.h"
+
+/*
+ * Overview
+ * ========
+ * Hybrid logical clock as described in
+ * "Logical Physical Clocks and Consistent Snapshots in Globally Distributed
+ * Databases" available at http://www.cse.buffalo.edu/tech-reports/2014-04.pdf.
+ *
+ * Relies on a global 64 bit variable that holds the logical time.
+ * The 48 MSBs include the physical component of the timestamp and the least
+ * significant 16 bits include the logical component. 48 bits for milliseconds
+ * since epoch gives us (8925 - years elapsed since epoch today) years before
+ * wrap around.
+ *
+ * The notion of HLC is to bound the skew between the logical clock and the
+ * physical clock. This requires rejecting updates to the clock from nodes
+ * with large clock skews. We DO NOT do that yet and print a warning instead.
+ * The current envisioned usage is a global monotonically increasing
+ * timestamp. Should be fixed if we are to use it as a surrogate for the wall
+ * clock.
+ *
+ * Guarantees
+ * ==========
+ * 1. Monotonically increasing. (Wraps around after ~8900 years). Service
+ * restarts might break the monotonicity, however the new clock will leapfrog
+ * the hlc value before the restart eventually.
+ * 2. An as_hlc_timestamp_update call after every message receipt will ensure
+ * (message send hlc ts) < (message receive hlc ts).
+ * 3. A fixed local timestamp will eventually be marked as happened before a
+ * remote message. This is an important requirement. For example, in paxos the
+ * local cluster change timestamp should have happened before some incoming
+ * heartbeat. The ordering system should not always return
+ * AS_HLC_ORDER_INDETERMINATE for a fixed local timestamp and a new message
+ * received.
+ *
+ * Not guaranteed (requires hlc persistence across service restarts)
+ * ==============
+ * 1. On service restart the HLC clock will not start where it left off,
+ * however it will eventually leapfrog the older value. Fixing this requires
+ * persistence, which is not implemented. Eventually leapfrogging is alright
+ * for all current requirements.
+ * 2. If an as_hlc_msg_timestamp is persisted and compared with a currently
+ * running value, the result may not be correct.
+ *
+ * Requirements
+ * ============
+ * Subsystems that rely on hlc should have their network messages timestamped
+ * with hlc timestamps and should invoke as_hlc_timestamp_update on receipt of
+ * every message. This will ensure the hlc clocks are in sync across the
+ * cluster and that (message send hlc ts) < (message receive hlc ts).
+ */
+
+/*
+ * ----------------------------------------------------------------------------
+ * Globals.
+ * ----------------------------------------------------------------------------
+ */
+/**
+ * Global timestamp with current hlc value.
+ */
+static as_hlc_timestamp g_now;
+
+/**
+ * Previous value of the physical component.
+ */
+static cf_atomic64 g_prev_physical_component;
+
+/**
+ * Previous value of the wall clock, when the physical component changed.
+ */
+static cf_atomic64 g_prev_wall_clock;
+
+/**
+ * Mask for the physical component of a hlc timestamp.
+ */
+#define PHYSICAL_TS_MASK 0xffffffffffff0000
+
+/**
+ * Mask for the logical component of a hlc timestamp.
+ */
+#define LOGICAL_TS_MASK 0x000000000000ffff
+
+/**
+ * Print the skew warning once every five seconds.
+ */
+#define SKEW_WARNING_INTERVAL_MS() (5000)
+
+/**
+ * Logging macros.
+ */
+#define CRASH(format, ...) cf_crash(AS_HLC, format, ##__VA_ARGS__)
+#define WARNING(format, ...) cf_warning(AS_HLC, format, ##__VA_ARGS__)
+#define INFO(format, ...) cf_info(AS_HLC, format, ##__VA_ARGS__)
+#define DEBUG(format, ...) cf_debug(AS_HLC, format, ##__VA_ARGS__)
+#define DETAIL(format, ...) cf_detail(AS_HLC, format, ##__VA_ARGS__)
+#define ASSERT(expression, message, ...) \
+if (!(expression)) {WARNING(message, __VA_ARGS__);}
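The bit layout behind these masks can be exercised in isolation. A minimal self-contained sketch of the 48-bit-physical / 16-bit-logical packing (plain C, no server dependencies):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PHYSICAL_TS_MASK 0xffffffffffff0000
#define LOGICAL_TS_MASK 0x000000000000ffff

int
main(void)
{
	uint64_t physical_ms = 1520000000000;	// milliseconds since epoch, fits in 48 bits
	uint16_t logical = 7;

	// Pack - mirrors hlc_physical_ts_set() followed by hlc_logical_ts_set().
	uint64_t hlc = (physical_ms << 16) | logical;

	// Unpack - mirrors hlc_physical_ts_get() / hlc_logical_ts_get().
	printf("physical %" PRIu64 " logical %u\n",
			(hlc & PHYSICAL_TS_MASK) >> 16,
			(unsigned)(hlc & LOGICAL_TS_MASK));
	return 0;
}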
+/*
+ * ----------------------------------------------------------------------------
+ * Forward declarations.
+ * ----------------------------------------------------------------------------
+ */
+static cf_clock
+hlc_wall_clock_get();
+static as_hlc_timestamp
+hlc_ts_get();
+static bool
+hlc_ts_set(as_hlc_timestamp old_value, as_hlc_timestamp new_value);
+static cf_clock
+hlc_physical_ts_get(as_hlc_timestamp hlc_ts);
+static uint16_t
+hlc_logical_ts_get(as_hlc_timestamp hlc_ts);
+static void
+hlc_physical_ts_set(as_hlc_timestamp* hlc_ts, cf_clock physical_ts);
+static void
+hlc_physical_ts_on_set(cf_clock physical_ts, cf_clock wall_clock_now);
+static void
+hlc_logical_ts_set(as_hlc_timestamp* hlc_ts, uint16_t logical_ts);
+static void
+hlc_logical_ts_incr(uint16_t* logical_ts, cf_clock* physical_ts,
+		cf_clock wall_clock_now);
+
+/*
+ * ----------------------------------------------------------------------------
+ * Public API.
+ * ----------------------------------------------------------------------------
+ */
+/**
+ * Initialize the hybrid logical clock.
+ */
+void
+as_hlc_init()
+{
+	g_now = 0;
+	g_prev_physical_component = 0;
+	g_prev_wall_clock = 0;
+}
+
+/**
+ * Return the physical component of a hlc timestamp.
+ * @param hlc_ts the hybrid logical clock timestamp.
+ */
+cf_clock
+as_hlc_physical_ts_get(as_hlc_timestamp hlc_ts)
+{
+	return hlc_physical_ts_get(hlc_ts);
+}
+
+/**
+ * Return a hlc timestamp representing the hlc time "now". The notion is to
+ * make the minimum necessary increment to the hlc timestamp.
+ */
+as_hlc_timestamp
+as_hlc_timestamp_now()
+{
+	// Keep trying till an atomic operation succeeds. Looks like a tight loop,
+	// but even with reasonable contention it should not take more than a few
+	// iterations to succeed.
+	while (true) {
+		as_hlc_timestamp current_hlc_ts = hlc_ts_get();
+
+		// Initialize the new physical and logical values to current values.
+		cf_clock new_hlc_physical_ts = hlc_physical_ts_get(current_hlc_ts);
+		uint16_t new_hlc_logical_ts = hlc_logical_ts_get(current_hlc_ts);
+
+		cf_clock wall_clock_physical_ts = hlc_wall_clock_get();
+
+		if (new_hlc_physical_ts >= wall_clock_physical_ts) {
+			// The HLC physical component is greater than the physical wall
+			// time. Advance the logical timestamp.
+			hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts,
+					wall_clock_physical_ts);
+		}
+		else {
+			// The wall clock is greater, use this as the physical component
+			// and reset the logical timestamp.
+			new_hlc_physical_ts = wall_clock_physical_ts;
+			new_hlc_logical_ts = 0;
+		}
+
+		as_hlc_timestamp new_hlc_ts = 0;
+
+		hlc_physical_ts_set(&new_hlc_ts, new_hlc_physical_ts);
+		hlc_logical_ts_set(&new_hlc_ts, new_hlc_logical_ts);
+
+		if (hlc_ts_set(current_hlc_ts, new_hlc_ts)) {
+			hlc_physical_ts_on_set(new_hlc_physical_ts, wall_clock_physical_ts);
+			DETAIL("changed HLC value from %" PRIu64 " to %" PRIu64,
+					current_hlc_ts, new_hlc_ts);
+			return new_hlc_ts;
+		}
+	}
+}
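The lock-free retry pattern in as_hlc_timestamp_now() - load, compute a candidate, compare-and-swap, retry on failure - reduces to the following shape. This sketch substitutes GCC/Clang __atomic builtins for the concurrency-kit ck_pr_* calls the file uses, and clock_advance is a hypothetical simplification, not the file's exact update rule:

#include <stdbool.h>
#include <stdint.h>

static uint64_t g_clock;	// a packed clock value, like g_now

// Advance the clock to at least floor_value, retrying on CAS failure.
// Returns the value this caller installed.
static uint64_t
clock_advance(uint64_t floor_value)
{
	while (true) {
		uint64_t cur = __atomic_load_n(&g_clock, __ATOMIC_ACQUIRE);
		uint64_t next = cur >= floor_value ? cur + 1 : floor_value;

		// Compare-and-swap, like hlc_ts_set(); on failure another thread
		// won the race - reload and recompute.
		if (__atomic_compare_exchange_n(&g_clock, &cur, next, false,
				__ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
			return next;
		}
	}
}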
+/**
+ * Update the HLC on receipt of a remote message. The notion is to adjust this
+ * node's hlc to ensure that (message receive hlc ts) > (message send hlc ts).
+ *
+ * @param source the source node, for debugging and tracking only.
+ * @param send_ts the hlc timestamp when this message was sent.
+ * @param msg_ts (output) the message timestamp pair - remote send and local
+ * receive - which will be populated. Can be NULL, in which case it is
+ * ignored.
+ */
+void
+as_hlc_timestamp_update(cf_node source, as_hlc_timestamp send_ts,
+		as_hlc_msg_timestamp* msg_ts)
+{
+	cf_clock send_ts_physical_ts = hlc_physical_ts_get(send_ts);
+	uint16_t send_ts_logical_ts = hlc_logical_ts_get(send_ts);
+
+	// Keep trying till an atomic operation succeeds. Looks like a tight loop,
+	// but even with reasonable contention it should not take more than a few
+	// iterations to succeed.
+	while (true) {
+		as_hlc_timestamp current_hlc_ts = hlc_ts_get();
+
+		cf_clock current_hlc_physical_ts = hlc_physical_ts_get(current_hlc_ts);
+		uint16_t current_hlc_logical_ts = hlc_logical_ts_get(current_hlc_ts);
+
+		cf_clock wall_clock_physical_ts = hlc_wall_clock_get();
+
+		cf_clock new_hlc_physical_ts = MAX(
+				MAX(current_hlc_physical_ts, send_ts_physical_ts),
+				wall_clock_physical_ts);
+		uint16_t new_hlc_logical_ts = 0;
+
+		if (new_hlc_physical_ts == current_hlc_physical_ts
+				&& new_hlc_physical_ts == send_ts_physical_ts) {
+			// There is no change in the physical components of peer and local
+			// hlc clocks. Set logical component to max of the two values and
+			// increment.
+			new_hlc_logical_ts = MAX(current_hlc_logical_ts,
+					send_ts_logical_ts);
+			hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts,
+					wall_clock_physical_ts);
+		}
+		else if (new_hlc_physical_ts == current_hlc_physical_ts) {
+			// The physical component of the send timestamp is smaller than
+			// our current physical component. We just need to increment the
+			// logical component.
+			new_hlc_logical_ts = current_hlc_logical_ts;
+			hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts,
+					wall_clock_physical_ts);
+		}
+		else if (new_hlc_physical_ts == send_ts_physical_ts) {
+			// Current physical component is less than the incoming physical
+			// component. We need to ensure that the updated logical component
+			// is greater than the send logical component.
+			new_hlc_logical_ts = send_ts_logical_ts;
+			hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts,
+					wall_clock_physical_ts);
+		}
+		else {
+			// Our physical clock is greater than the current physical
+			// component and the send physical component. We can reset the
+			// logical clock to zero and still maintain the send and receive
+			// ordering.
+			new_hlc_logical_ts = 0;
+		}
+
+		as_hlc_timestamp new_hlc_ts = 0;
+
+		hlc_physical_ts_set(&new_hlc_ts, new_hlc_physical_ts);
+		hlc_logical_ts_set(&new_hlc_ts, new_hlc_logical_ts);
+
+		if (hlc_ts_set(current_hlc_ts, new_hlc_ts)) {
+			hlc_physical_ts_on_set(new_hlc_physical_ts, wall_clock_physical_ts);
+			DETAIL("message received from node %" PRIx64 " with HLC %" PRIu64 " - changed HLC value from %" PRIu64 " to %" PRIu64,
+					source, send_ts, current_hlc_ts, new_hlc_ts);
+			if (msg_ts) {
+				msg_ts->send_ts = send_ts;
+				msg_ts->recv_ts = new_hlc_ts;
+			}
+			return;
+		}
+	}
+}
+
+/**
+ * Return the difference in milliseconds between two hlc timestamps. Note this
+ * difference may be greater than or equal to (but never less than) the
+ * physical wall clock difference, because HLC can have non-linear jumps
+ * whenever the clock is adjusted. The difference should be used as an
+ * estimate rather than an absolute difference. For example, use the
+ * difference to check that the real time difference is at most some number of
+ * milliseconds. However, do not use this for interval statistics or to check
+ * if the difference in time is at least some number of milliseconds.
+ *
+ * @param ts1 the first timestamp.
+ * @param ts2 the second timestamp.
+ * @return ts1 - ts2 in milliseconds. If ts1 < ts2 the result is negative,
+ * else it is positive or zero.
+ */
+int64_t
+as_hlc_timestamp_diff_ms(as_hlc_timestamp ts1, as_hlc_timestamp ts2)
+{
+	int64_t diff = 0;
+	if (ts1 >= ts2) {
+		diff = hlc_physical_ts_get(ts1) - hlc_physical_ts_get(ts2);
+	}
+	else {
+		diff = -(hlc_physical_ts_get(ts2) - hlc_physical_ts_get(ts1));
+	}
+
+	return diff;
+}
+
+/**
+ * Orders a local timestamp and a remote message send timestamp.
+ *
+ * @param local_ts the local timestamp.
+ * @param msg_ts message receive timestamp containing the remote send and the
+ * local receive timestamp.
+ * @return the order between the local and the message timestamp.
+ */
+as_hlc_timestamp_order
+as_hlc_send_timestamp_order(as_hlc_timestamp local_ts,
+		as_hlc_msg_timestamp* msg_ts)
+{
+	if (local_ts > msg_ts->recv_ts) {
+		// The local event happened after the local message received timestamp
+		// and therefore after the remote send as well.
+		return AS_HLC_HAPPENS_AFTER;
+	}
+
+	// Compute the uncertainty window around the local receive timestamp. By
+	// guarantee (2), recv_ts > send_ts, so the plain difference is the window
+	// width.
+	uint64_t offset = msg_ts->recv_ts - msg_ts->send_ts;
+
+	if (local_ts > (msg_ts->recv_ts - offset)) {
+		// Local timestamp is in the uncertainty window. We cannot tell the
+		// order.
+		return AS_HLC_ORDER_INDETERMINATE;
+	}
+
+	cf_clock local_physical_ts = hlc_physical_ts_get(local_ts);
+	cf_clock recv_physical_ts = hlc_physical_ts_get(msg_ts->recv_ts);
+
+	if ((recv_physical_ts - local_physical_ts)
+			< g_config.fabric_latency_max_ms) {
+		// Consider the max network delay worth of time to also be part of the
+		// uncertainty window.
+		return AS_HLC_ORDER_INDETERMINATE;
+	}
+
+	return AS_HLC_HAPPENS_BEFORE;
+}
+
+/**
+ * Orders two timestamps generated by the same node / process.
+ *
+ * @param ts1 the first timestamp.
+ * @param ts2 the second timestamp.
+ * @return AS_HLC_HAPPENS_BEFORE if ts1 happens before ts2,
+ * AS_HLC_HAPPENS_AFTER if ts1 happens after ts2, else
+ * AS_HLC_ORDER_INDETERMINATE.
+ */
+as_hlc_timestamp_order
+as_hlc_timestamp_order_get(as_hlc_timestamp ts1, as_hlc_timestamp ts2)
+{
+	if (ts1 < ts2) {
+		return AS_HLC_HAPPENS_BEFORE;
+	}
+	else if (ts1 > ts2) {
+		return AS_HLC_HAPPENS_AFTER;
+	}
+
+	return AS_HLC_ORDER_INDETERMINATE;
+}
+
+/**
+ * Subtract milliseconds worth of time from the timestamp.
+ * @param timestamp the input timestamp.
+ * @param ms the number of milliseconds to subtract.
+ */
+as_hlc_timestamp
+as_hlc_timestamp_subtract_ms(as_hlc_timestamp timestamp, int ms)
+{
+	cf_clock physical_ts = hlc_physical_ts_get(timestamp);
+	uint16_t logical_ts = hlc_logical_ts_get(timestamp);
+
+	physical_ts -= ms;
+
+	as_hlc_timestamp new_hlc_ts = 0;
+
+	hlc_physical_ts_set(&new_hlc_ts, physical_ts);
+	hlc_logical_ts_set(&new_hlc_ts, logical_ts);
+	return new_hlc_ts;
+}
+
+/**
+ * Dump some debugging information to the logs.
+ */
+void
+as_hlc_dump(bool verbose)
+{
+	as_hlc_timestamp now = as_hlc_timestamp_now();
+	cf_clock current_hlc_physical_ts = hlc_physical_ts_get(now);
+	uint16_t current_hlc_logical_ts = hlc_logical_ts_get(now);
+
+	INFO("HLC Ts:%" PRIu64 " HLC Physical Ts:%" PRIu64 " HLC Logical Ts:%d Wall Clock:%" PRIu64,
+			now, current_hlc_physical_ts, current_hlc_logical_ts,
+			hlc_wall_clock_get());
+}
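Plugging numbers into the ordering logic of as_hlc_send_timestamp_order(): if a message stamped send_ts = 5000 was adjusted to recv_ts = 5300 on receipt, the uncertainty window is 300 units wide, plus a network-delay allowance. A self-contained sketch - max_delay stands in for g_config.fabric_latency_max_ms, and the window arithmetic is simplified to plain integers rather than extracted physical components:

#include <stdint.h>
#include <stdio.h>

typedef enum { HAPPENS_BEFORE, HAPPENS_AFTER, INDETERMINATE } order_t;

static order_t
order(uint64_t local_ts, uint64_t send_ts, uint64_t recv_ts,
		uint64_t max_delay)
{
	if (local_ts > recv_ts) {
		return HAPPENS_AFTER;
	}

	// Uncertainty window: how far recv_ts drifted past send_ts, widened by
	// the network delay allowance.
	uint64_t offset = recv_ts - send_ts;

	if (local_ts > recv_ts - offset - max_delay) {
		return INDETERMINATE;
	}

	return HAPPENS_BEFORE;
}

int
main(void)
{
	// recv 5300, send 5000, delay 100 -> window (4900, 5300].
	printf("%d\n", order(5400, 5000, 5300, 100));	// HAPPENS_AFTER (1)
	printf("%d\n", order(5000, 5000, 5300, 100));	// INDETERMINATE (2)
	printf("%d\n", order(4500, 5000, 5300, 100));	// HAPPENS_BEFORE (0)
	return 0;
}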
+/*
+ * ----------------------------------------------------------------------------
+ * Private functions.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Return this node's wall clock.
+ */
+static cf_clock
+hlc_wall_clock_get()
+{
+	// Unix timestamps will be 48 bits for a reasonable future. We will use
+	// only 48 bits.
+	return cf_clock_getabsolute();
+}
+
+/**
+ * Return the physical component of a hlc timestamp.
+ * @param hlc_ts the hybrid logical clock timestamp.
+ */
+static cf_clock
+hlc_physical_ts_get(as_hlc_timestamp hlc_ts)
+{
+	return hlc_ts >> 16;
+}
+
+/**
+ * Return the logical component of a hlc timestamp.
+ * @param hlc_ts the hybrid logical clock timestamp.
+ */
+static uint16_t
+hlc_logical_ts_get(as_hlc_timestamp hlc_ts)
+{
+	return (uint16_t)(hlc_ts & LOGICAL_TS_MASK);
+}
+
+/**
+ * Set the physical component of a hlc timestamp. The 16 LSBs of the input
+ * physical timestamp will be ignored.
+ * @param hlc_ts the timestamp.
+ * @param physical_ts the physical timestamp whose value should be set into
+ * the hlc timestamp.
+ */
+static void
+hlc_physical_ts_set(as_hlc_timestamp* hlc_ts, cf_clock physical_ts)
+{
+	*hlc_ts = (*hlc_ts & LOGICAL_TS_MASK) | (physical_ts << 16);
+}
+
+/**
+ * Handle updating the physical component of the hlc timestamp.
+ */
+static void
+hlc_physical_ts_on_set(cf_clock physical_ts, cf_clock wall_clock_now)
+{
+	if (g_prev_physical_component != physical_ts) {
+		g_prev_physical_component = physical_ts;
+		g_prev_wall_clock = wall_clock_now;
+	}
+}
+
+/**
+ * Increment the logical timestamp and deal with a wrap-around by incrementing
+ * the physical timestamp. Also ensure the physical component moves at least
+ * at the rate of the wall clock, so that hlc can be used as a crude measure
+ * of time intervals.
+ */
+static void
+hlc_logical_ts_incr(uint16_t* logical_ts, cf_clock* physical_ts,
+		cf_clock wall_clock_now)
+{
+	(*logical_ts)++;
+
+	if (*logical_ts == 0) {
+		// The 16-bit logical component wrapped around - carry into the
+		// physical component.
+		(*physical_ts)++;
+	}
+
+	cf_clock physical_component_diff = *physical_ts - g_prev_physical_component;
+	cf_clock wall_clock_diff =
+			(wall_clock_now > g_prev_wall_clock) ?
+					wall_clock_now - g_prev_wall_clock : 0;
+
+	if (physical_component_diff < wall_clock_diff) {
+		*physical_ts += wall_clock_diff - physical_component_diff;
+	}
+}
+
+/**
+ * Set the logical component of a hlc timestamp.
+ * @param hlc_ts the timestamp.
+ * @param logical_ts the logical timestamp whose value should be set into the
+ * hlc timestamp.
+ */
+static void
+hlc_logical_ts_set(as_hlc_timestamp* hlc_ts, uint16_t logical_ts)
+{
+	*hlc_ts = (*hlc_ts & PHYSICAL_TS_MASK) | ((uint64_t)logical_ts);
+}
+
+/**
+ * Get the current value of the global timestamp atomically.
+ *
+ * @return the current hlc value.
+ */
+static as_hlc_timestamp
+hlc_ts_get()
+{
+	return ck_pr_load_64(&g_now);
+}
+
+/**
+ * Set a new value for the global timestamp atomically.
+ *
+ * @param old_value the expected current value of the global timestamp.
+ * @param new_value the new value for the global timestamp.
+ * @return true on successful set, false on failure to do an atomic set.
+ */
+static bool
+hlc_ts_set(as_hlc_timestamp old_value, as_hlc_timestamp new_value)
+{
+	// Default to ck atomic compare and set.
+	return ck_pr_cas_64(&g_now, old_value, new_value);
+}
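The wrap-around carry in hlc_logical_ts_incr() can be seen with a logical component at its 16-bit maximum. A standalone rendering, without the wall-clock pacing step:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void
logical_incr(uint16_t* logical_ts, uint64_t* physical_ts)
{
	(*logical_ts)++;

	// Carry into the physical component on 16-bit wrap-around.
	if (*logical_ts == 0) {
		(*physical_ts)++;
	}
}

int
main(void)
{
	uint16_t logical = 0xffff;
	uint64_t physical = 1520000000000;

	logical_incr(&logical, &physical);

	// Prints - logical 0 physical 1520000000001
	printf("logical %u physical %" PRIu64 "\n", logical, physical);
	return 0;
}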
diff --git a/as/src/fabric/meta_batch_ce.c b/as/src/fabric/meta_batch_ce.c
new file mode 100644
index 00000000..97e799cd
--- /dev/null
+++ b/as/src/fabric/meta_batch_ce.c
@@ -0,0 +1,65 @@
+/*
+ * meta_batch.c
+ *
+ * Copyright (C) 2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "fabric/meta_batch.h"
+
+#include <stddef.h>
+
+
+//==========================================================
+// Public API.
+//
+
+struct meta_in_q_s *
+meta_in_q_create()
+{
+	return NULL;
+}
+
+
+void
+meta_in_q_destroy(struct meta_in_q_s *iq)
+{
+}
+
+
+void
+meta_in_q_rejected(struct meta_in_q_s *iq)
+{
+}
+
+
+struct meta_out_q_s *
+meta_out_q_create()
+{
+	return NULL;
+}
+
+
+void
+meta_out_q_destroy(struct meta_out_q_s *oq)
+{
+}
diff --git a/as/src/fabric/migrate.c b/as/src/fabric/migrate.c
new file mode 100644
index 00000000..4382e41b
--- /dev/null
+++ b/as/src/fabric/migrate.c
@@ -0,0 +1,1758 @@
+/*
+ * migrate.c
+ *
+ * Copyright (C) 2008-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+// migrate.c
+// Moves a partition from one machine to another using the fabric messaging
+// system.
+
+
+//==========================================================
+// Includes.
+//
+
+#include "fabric/migrate.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+#include "citrusleaf/cf_rchash.h"
+
+#include "fault.h"
+#include "msg.h"
+#include "node.h"
+#include "shash.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/rec_props.h"
+#include "fabric/exchange.h"
+#include "fabric/fabric.h"
+#include "fabric/meta_batch.h"
+#include "fabric/partition.h"
+#include "fabric/partition_balance.h"
+#include "storage/storage.h"
+
+
+//==========================================================
+// Typedefs & constants.
+// + +const msg_template migrate_mt[] = { + { MIG_FIELD_OP, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_1, M_FT_UINT32 }, + { MIG_FIELD_EMIG_ID, M_FT_UINT32 }, + { MIG_FIELD_NAMESPACE, M_FT_BUF }, + { MIG_FIELD_PARTITION, M_FT_UINT32 }, + { MIG_FIELD_DIGEST, M_FT_BUF }, + { MIG_FIELD_GENERATION, M_FT_UINT32 }, + { MIG_FIELD_RECORD, M_FT_BUF }, + { MIG_FIELD_CLUSTER_KEY, M_FT_UINT64 }, + { MIG_FIELD_UNUSED_9, M_FT_BUF }, + { MIG_FIELD_VOID_TIME, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_11, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_12, M_FT_BUF }, + { MIG_FIELD_INFO, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_14, M_FT_UINT64 }, + { MIG_FIELD_UNUSED_15, M_FT_BUF }, + { MIG_FIELD_UNUSED_16, M_FT_BUF }, + { MIG_FIELD_UNUSED_17, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_18, M_FT_UINT32 }, + { MIG_FIELD_LAST_UPDATE_TIME, M_FT_UINT64 }, + { MIG_FIELD_FEATURES, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_21, M_FT_UINT32 }, + { MIG_FIELD_META_RECORDS, M_FT_BUF }, + { MIG_FIELD_META_SEQUENCE, M_FT_UINT32 }, + { MIG_FIELD_META_SEQUENCE_FINAL, M_FT_UINT32 }, + { MIG_FIELD_PARTITION_SIZE, M_FT_UINT64 }, + { MIG_FIELD_SET_NAME, M_FT_BUF }, + { MIG_FIELD_KEY, M_FT_BUF }, + { MIG_FIELD_UNUSED_28, M_FT_UINT32 }, + { MIG_FIELD_EMIG_INSERT_ID, M_FT_UINT64 } +}; + +COMPILER_ASSERT(sizeof(migrate_mt) / sizeof(msg_template) == NUM_MIG_FIELDS); + +#define MIG_MSG_SCRATCH_SIZE 192 + +#define EMIGRATION_SLOW_Q_WAIT_MS 1000 // 1 second +#define MIGRATE_RETRANSMIT_STARTDONE_MS 1000 // for now, not configurable +#define MIGRATE_RETRANSMIT_SIGNAL_MS 1000 // for now, not configurable +#define MAX_BYTES_EMIGRATING (16 * 1024 * 1024) + +#define IMMIGRATION_DEBOUNCE_MS (60 * 1000) // 1 minute + +typedef struct pickled_record_s { + cf_digest keyd; + uint32_t generation; + uint32_t void_time; + uint64_t last_update_time; + uint8_t *record_buf; // pickled! + size_t record_len; +} pickled_record; + +typedef enum { + EMIG_START_RESULT_OK, + EMIG_START_RESULT_ERROR, + EMIG_START_RESULT_EAGAIN +} emigration_start_result; + +typedef enum { + // Order matters - we use an atomic set-max that relies on it. + EMIG_STATE_ACTIVE, + EMIG_STATE_FINISHED, + EMIG_STATE_ABORTED +} emigration_state; + +typedef struct emigration_pop_info_s { + uint32_t order; + uint64_t dest_score; + uint64_t n_elements; + + uint64_t avoid_dest; +} emigration_pop_info; + +typedef struct emigration_reinsert_ctrl_s { + uint64_t xmit_ms; // time of last xmit - 0 when done + emigration *emig; + msg *m; +} emigration_reinsert_ctrl; + + +//========================================================== +// Globals. +// + +cf_rchash *g_emigration_hash = NULL; +cf_rchash *g_immigration_hash = NULL; + +static uint64_t g_avoid_dest = 0; +static cf_atomic32 g_emigration_id = 0; +static cf_queue g_emigration_q; +static cf_queue g_emigration_slow_q; + + +//========================================================== +// Forward declarations. +// + +// Various initializers and destructors. +void emigration_init(emigration *emig); +void emigration_destroy(void *parm); +int emigration_reinsert_destroy_reduce_fn(const void *key, void *data, void *udata); +void immigration_destroy(void *parm); +void pickled_record_destroy(pickled_record *pr); + +// Emigration. 
+void *run_emigration(void *arg); +void *run_emigration_slow(void *arg); +void emigration_pop(emigration **emigp); +int emigration_pop_reduce_fn(void *buf, void *udata); +void emigration_hash_insert(emigration *emig); +void emigration_hash_delete(emigration *emig); +bool emigrate_transfer(emigration *emig); +void emigrate_signal(emigration *emig); +emigration_start_result emigration_send_start(emigration *emig); +bool emigrate_tree(emigration *emig); +bool emigration_send_done(emigration *emig); +void *run_emigration_reinserter(void *arg); +void emigrate_tree_reduce_fn(as_index_ref *r_ref, void *udata); +int emigration_reinsert_reduce_fn(const void *key, void *data, void *udata); +void emigrate_record(emigration *emig, msg *m); + +// Immigration. +uint32_t immigration_hashfn(const void *value, uint32_t value_len); +void *run_immigration_reaper(void *arg); +int immigration_reaper_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata); + +// Migrate fabric message handling. +int migrate_receive_msg_cb(cf_node src, msg *m, void *udata); +void immigration_handle_start_request(cf_node src, msg *m); +void immigration_ack_start_request(cf_node src, msg *m, uint32_t op); +void immigration_handle_insert_request(cf_node src, msg *m); +void immigration_handle_done_request(cf_node src, msg *m); +void immigration_handle_all_done_request(cf_node src, msg *m); +void emigration_handle_insert_ack(cf_node src, msg *m); +void emigration_handle_ctrl_ack(cf_node src, msg *m, uint32_t op); + +// Info API helpers. +int emigration_dump_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata); +int immigration_dump_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata); + + +//========================================================== +// Public API. +// + +void +as_migrate_init() +{ + g_avoid_dest = (uint64_t)g_config.self_node; + + cf_queue_init(&g_emigration_q, sizeof(emigration*), 4096, true); + cf_queue_init(&g_emigration_slow_q, sizeof(emigration*), 4096, true); + + cf_rchash_create(&g_emigration_hash, cf_rchash_fn_u32, emigration_destroy, + sizeof(uint32_t), 64, CF_RCHASH_MANY_LOCK); + + cf_rchash_create(&g_immigration_hash, immigration_hashfn, + immigration_destroy, sizeof(immigration_hkey), 64, + CF_RCHASH_BIG_LOCK); + + // Looks like an as_priority_thread_pool, but the reduce-pop is different. + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + for (uint32_t i = 0; i < g_config.n_migrate_threads; i++) { + if (pthread_create(&thread, &attrs, run_emigration, NULL) != 0) { + cf_crash(AS_MIGRATE, "failed to create emigration thread"); + } + } + + if (pthread_create(&thread, &attrs, run_emigration_slow, NULL) != 0) { + cf_crash(AS_MIGRATE, "failed to create emigration slow thread"); + } + + if (pthread_create(&thread, &attrs, run_immigration_reaper, NULL) != 0) { + cf_crash(AS_MIGRATE, "failed to create immigration reaper thread"); + } + + as_fabric_register_msg_fn(M_TYPE_MIGRATE, migrate_mt, sizeof(migrate_mt), + MIG_MSG_SCRATCH_SIZE, migrate_receive_msg_cb, NULL); +} + + +// Kicks off an emigration. 
+void +as_migrate_emigrate(const pb_task *task) +{ + emigration *emig = cf_rc_alloc(sizeof(emigration)); + + emig->dest = task->dest; + emig->cluster_key = task->cluster_key; + emig->id = cf_atomic32_incr(&g_emigration_id); + emig->type = task->type; + emig->tx_flags = task->tx_flags; + emig->state = EMIG_STATE_ACTIVE; + emig->aborted = false; + + // Create these later only when we need them - we'll get lots at once. + emig->bytes_emigrating = 0; + emig->reinsert_hash = NULL; + emig->insert_id = 0; + emig->ctrl_q = NULL; + emig->meta_q = NULL; + + as_partition_reserve(task->ns, task->pid, &emig->rsv); + + emig->from_replica = is_self_replica(emig->rsv.p); + + cf_atomic_int_incr(&emig->rsv.ns->migrate_tx_instance_count); + + cf_queue_push(&g_emigration_q, &emig); +} + + +// Called via info command. Caller has sanity-checked n_threads. +void +as_migrate_set_num_xmit_threads(uint32_t n_threads) +{ + if (g_config.n_migrate_threads > n_threads) { + // Decrease the number of migrate transmit threads to n_threads. + while (g_config.n_migrate_threads > n_threads) { + void *death_msg = NULL; + + // Send terminator (NULL message). + cf_queue_push(&g_emigration_q, &death_msg); + g_config.n_migrate_threads--; + } + } + else { + // Increase the number of migrate transmit threads to n_threads. + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + while (g_config.n_migrate_threads < n_threads) { + if (pthread_create(&thread, &attrs, run_emigration, NULL) != 0) { + cf_warning(AS_MIGRATE, "failed to create emigration thread"); + return; + } + + g_config.n_migrate_threads++; + } + } +} + + +// Called via info command - print information about migration to the log. +void +as_migrate_dump(bool verbose) +{ + cf_info(AS_MIGRATE, "migration info:"); + cf_info(AS_MIGRATE, "---------------"); + cf_info(AS_MIGRATE, "number of emigrations in g_emigration_hash: %d", + cf_rchash_get_size(g_emigration_hash)); + cf_info(AS_MIGRATE, "number of requested emigrations waiting in g_emigration_q : %d", + cf_queue_sz(&g_emigration_q)); + cf_info(AS_MIGRATE, "number of requested emigrations waiting in g_emigration_slow_q : %d", + cf_queue_sz(&g_emigration_slow_q)); + cf_info(AS_MIGRATE, "number of immigrations in g_immigration_hash: %d", + cf_rchash_get_size(g_immigration_hash)); + cf_info(AS_MIGRATE, "current emigration id: %d", g_emigration_id); + + if (verbose) { + int item_num = 0; + + if (cf_rchash_get_size(g_emigration_hash) > 0) { + cf_info(AS_MIGRATE, "contents of g_emigration_hash:"); + cf_info(AS_MIGRATE, "------------------------------"); + + cf_rchash_reduce(g_emigration_hash, emigration_dump_reduce_fn, + &item_num); + } + + if (cf_rchash_get_size(g_immigration_hash) > 0) { + item_num = 0; + + cf_info(AS_MIGRATE, "contents of g_immigration_hash:"); + cf_info(AS_MIGRATE, "-------------------------------"); + + cf_rchash_reduce(g_immigration_hash, immigration_dump_reduce_fn, + &item_num); + } + } +} + + +//========================================================== +// Local helpers - various initializers and destructors. +// + +void +emigration_init(emigration *emig) +{ + emig->reinsert_hash = cf_shash_create(cf_shash_fn_u32, sizeof(uint64_t), + sizeof(emigration_reinsert_ctrl), 16 * 1024, CF_SHASH_MANY_LOCK); + emig->ctrl_q = cf_queue_create(sizeof(int), true); + emig->meta_q = meta_in_q_create(); +} + + +// Destructor handed to rchash. 
+void +emigration_destroy(void *parm) +{ + emigration *emig = (emigration *)parm; + + if (emig->reinsert_hash) { + cf_shash_reduce(emig->reinsert_hash, + emigration_reinsert_destroy_reduce_fn, NULL); + cf_shash_destroy(emig->reinsert_hash); + } + + if (emig->ctrl_q) { + cf_queue_destroy(emig->ctrl_q); + } + + if (emig->meta_q) { + meta_in_q_destroy(emig->meta_q); + } + + as_partition_release(&emig->rsv); + + cf_atomic_int_decr(&emig->rsv.ns->migrate_tx_instance_count); +} + + +int +emigration_reinsert_destroy_reduce_fn(const void *key, void *data, void *udata) +{ + emigration_reinsert_ctrl *ri_ctrl = (emigration_reinsert_ctrl *)data; + + as_fabric_msg_put(ri_ctrl->m); + + return CF_SHASH_REDUCE_DELETE; +} + + +void +emigration_release(emigration *emig) +{ + if (cf_rc_release(emig) == 0) { + emigration_destroy((void *)emig); + cf_rc_free(emig); + } +} + + +// Destructor handed to rchash. +void +immigration_destroy(void *parm) +{ + immigration *immig = (immigration *)parm; + + if (immig->rsv.p) { + as_partition_release(&immig->rsv); + } + + if (immig->meta_q) { + meta_out_q_destroy(immig->meta_q); + } + + cf_atomic_int_decr(&immig->ns->migrate_rx_instance_count); +} + + +void +immigration_release(immigration *immig) +{ + if (cf_rc_release(immig) == 0) { + immigration_destroy((void *)immig); + cf_rc_free(immig); + } +} + + +void +pickled_record_destroy(pickled_record *pr) +{ + cf_free(pr->record_buf); +} + + +//========================================================== +// Local helpers - emigration. +// + +void * +run_emigration(void *arg) +{ + while (true) { + emigration *emig; + + emigration_pop(&emig); + + // This is the case for intentionally stopping the migrate thread. + if (! emig) { + break; // signal of death + } + + as_partition_balance_emigration_yield(); + + if (emig->cluster_key != as_exchange_cluster_key()) { + emigration_hash_delete(emig); + continue; + } + + as_namespace *ns = emig->rsv.ns; + bool requeued = false; + + // Add the emigration to the global hash so acks can find it. + emigration_hash_insert(emig); + + switch (emig->type) { + case PB_TASK_EMIG_TRANSFER: + cf_atomic_int_incr(&ns->migrate_tx_partitions_active); + requeued = emigrate_transfer(emig); + cf_atomic_int_decr(&ns->migrate_tx_partitions_active); + break; + case PB_TASK_EMIG_SIGNAL_ALL_DONE: + cf_atomic_int_incr(&ns->migrate_signals_active); + emigrate_signal(emig); + cf_atomic_int_decr(&ns->migrate_signals_active); + break; + default: + cf_crash(AS_MIGRATE, "bad emig type %u", emig->type); + break; + } + + if (! 
requeued) { + emigration_hash_delete(emig); + } + } + + return NULL; +} + + +void * +run_emigration_slow(void *arg) +{ + while (true) { + emigration *emig; + + if (cf_queue_pop(&g_emigration_slow_q, (void *)&emig, + CF_QUEUE_FOREVER) != CF_QUEUE_OK) { + cf_crash(AS_MIGRATE, "emigration slow queue pop failed"); + } + + uint64_t now_ms = cf_getms(); + + if (emig->wait_until_ms > now_ms) { + usleep(1000 * (emig->wait_until_ms - now_ms)); + } + + cf_queue_push(&g_emigration_q, &emig); + } + + return NULL; +} + + +void +emigration_pop(emigration **emigp) +{ + emigration_pop_info best; + + best.order = 0xFFFFffff; + best.dest_score = 0; + best.n_elements = 0xFFFFffffFFFFffff; + + best.avoid_dest = 0; + + if (cf_queue_reduce_pop(&g_emigration_q, (void *)emigp, CF_QUEUE_FOREVER, + emigration_pop_reduce_fn, &best) != CF_QUEUE_OK) { + cf_crash(AS_MIGRATE, "emigration queue reduce pop failed"); + } +} + + +int +emigration_pop_reduce_fn(void *buf, void *udata) +{ + emigration_pop_info *best = (emigration_pop_info *)udata; + emigration *emig = *(emigration **)buf; + + if (! emig || // null emig terminates thread + emig->cluster_key != as_exchange_cluster_key()) { + return -1; // process immediately + } + + if (emig->ctrl_q && cf_queue_sz(emig->ctrl_q) > 0) { + // This emig was requeued after its start command got an ACK_EAGAIN, + // likely because dest hit 'migrate-max-num-incoming'. A new ack has + // arrived - if it's ACK_OK, don't leave remote node hanging. + + return -1; // process immediately + } + + if (emig->type == PB_TASK_EMIG_SIGNAL_ALL_DONE) { + return -1; // process immediately + } + + if (best->avoid_dest == 0) { + best->avoid_dest = g_avoid_dest; + } + + uint32_t order = emig->rsv.ns->migrate_order; + uint64_t dest_score = (uint64_t)emig->dest - best->avoid_dest; + uint64_t n_elements = as_index_tree_size(emig->rsv.tree); + + if (order < best->order || + (order == best->order && + (dest_score > best->dest_score || + (dest_score == best->dest_score && + n_elements < best->n_elements)))) { + best->order = order; + best->dest_score = dest_score; + best->n_elements = n_elements; + + g_avoid_dest = (uint64_t)emig->dest; + + return -2; // candidate + } + + return 0; // not interested +} + + +void +emigration_hash_insert(emigration *emig) +{ + if (! emig->ctrl_q) { + emigration_init(emig); // creates emig->ctrl_q etc. + + cf_rchash_put(g_emigration_hash, (void *)&emig->id, sizeof(emig->id), + (void *)emig); + } +} + + +void +emigration_hash_delete(emigration *emig) +{ + if (emig->ctrl_q) { + cf_rchash_delete(g_emigration_hash, (void *)&emig->id, + sizeof(emig->id)); + } + else { + emigration_release(emig); + } +} + + +bool +emigrate_transfer(emigration *emig) +{ + //-------------------------------------------- + // Send START request. + // + + emigration_start_result result = emigration_send_start(emig); + + if (result == EMIG_START_RESULT_EAGAIN) { + // Remote node refused migration, requeue and fetch another. + emig->wait_until_ms = cf_getms() + EMIGRATION_SLOW_Q_WAIT_MS; + + cf_queue_push(&g_emigration_slow_q, &emig); + + return true; // requeued + } + + if (result != EMIG_START_RESULT_OK) { + return false; // did not requeue + } + + //-------------------------------------------- + // Send whole tree - may block a while. + // + + if (! emigrate_tree(emig)) { + return false; // did not requeue + } + + //-------------------------------------------- + // Send DONE request. 
+ // + + if (emigration_send_done(emig)) { + as_partition_emigrate_done(emig->rsv.ns, emig->rsv.p->id, + emig->cluster_key, emig->tx_flags); + } + + return false; // did not requeue +} + + +void +emigrate_signal(emigration *emig) +{ + as_namespace *ns = emig->rsv.ns; + msg *m = as_fabric_msg_get(M_TYPE_MIGRATE); + + switch (emig->type) { + case PB_TASK_EMIG_SIGNAL_ALL_DONE: + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_ALL_DONE); + break; + default: + cf_crash(AS_MIGRATE, "signal: bad emig type %u", emig->type); + break; + } + + msg_set_uint32(m, MIG_FIELD_EMIG_ID, emig->id); + msg_set_uint64(m, MIG_FIELD_CLUSTER_KEY, emig->cluster_key); + msg_set_buf(m, MIG_FIELD_NAMESPACE, (const uint8_t *)ns->name, + strlen(ns->name), MSG_SET_COPY); + msg_set_uint32(m, MIG_FIELD_PARTITION, emig->rsv.p->id); + + uint64_t signal_xmit_ms = 0; + + while (true) { + if (emig->cluster_key != as_exchange_cluster_key()) { + as_fabric_msg_put(m); + return; + } + + uint64_t now = cf_getms(); + + if (signal_xmit_ms + MIGRATE_RETRANSMIT_SIGNAL_MS < now) { + msg_incr_ref(m); + + if (as_fabric_send(emig->dest, m, AS_FABRIC_CHANNEL_CTRL) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + + signal_xmit_ms = now; + } + + int op; + + if (cf_queue_pop(emig->ctrl_q, &op, MIGRATE_RETRANSMIT_SIGNAL_MS) == + CF_QUEUE_OK) { + switch (op) { + case OPERATION_ALL_DONE_ACK: + cf_atomic_int_decr(&ns->migrate_signals_remaining); + as_fabric_msg_put(m); + return; + default: + cf_warning(AS_MIGRATE, "signal: unexpected ctrl op %d", op); + break; + } + } + } +} + + +emigration_start_result +emigration_send_start(emigration *emig) +{ + as_namespace *ns = emig->rsv.ns; + msg *m = as_fabric_msg_get(M_TYPE_MIGRATE); + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_START); + msg_set_uint32(m, MIG_FIELD_FEATURES, MY_MIG_FEATURES); + msg_set_uint64(m, MIG_FIELD_PARTITION_SIZE, + as_index_tree_size(emig->rsv.tree)); + msg_set_uint32(m, MIG_FIELD_EMIG_ID, emig->id); + msg_set_uint64(m, MIG_FIELD_CLUSTER_KEY, emig->cluster_key); + msg_set_buf(m, MIG_FIELD_NAMESPACE, (const uint8_t *)ns->name, + strlen(ns->name), MSG_SET_COPY); + msg_set_uint32(m, MIG_FIELD_PARTITION, emig->rsv.p->id); + + uint64_t start_xmit_ms = 0; + + while (true) { + if (emig->cluster_key != as_exchange_cluster_key()) { + as_fabric_msg_put(m); + return EMIG_START_RESULT_ERROR; + } + + uint64_t now = cf_getms(); + + if (cf_queue_sz(emig->ctrl_q) == 0 && + start_xmit_ms + MIGRATE_RETRANSMIT_STARTDONE_MS < now) { + msg_incr_ref(m); + + if (as_fabric_send(emig->dest, m, AS_FABRIC_CHANNEL_CTRL) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + + start_xmit_ms = now; + } + + int op; + + if (cf_queue_pop(emig->ctrl_q, &op, MIGRATE_RETRANSMIT_STARTDONE_MS) == + CF_QUEUE_OK) { + switch (op) { + case OPERATION_START_ACK_OK: + as_fabric_msg_put(m); + return EMIG_START_RESULT_OK; + case OPERATION_START_ACK_EAGAIN: + as_fabric_msg_put(m); + return EMIG_START_RESULT_EAGAIN; + case OPERATION_START_ACK_FAIL: + cf_warning(AS_MIGRATE, "imbalance: dest refused migrate with ACK_FAIL"); + cf_atomic_int_incr(&ns->migrate_tx_partitions_imbalance); + as_fabric_msg_put(m); + return EMIG_START_RESULT_ERROR; + default: + cf_warning(AS_MIGRATE, "unexpected ctrl op %d", op); + break; + } + } + } + + // Should never get here. 
+ cf_crash(AS_MIGRATE, "unexpected - exited infinite while loop"); + + return EMIG_START_RESULT_ERROR; +} + + +bool +emigrate_tree(emigration *emig) +{ + if (as_index_tree_size(emig->rsv.tree) == 0) { + return true; + } + + cf_atomic32_set(&emig->state, EMIG_STATE_ACTIVE); + + pthread_t thread; + + if (pthread_create(&thread, NULL, run_emigration_reinserter, emig) != 0) { + cf_crash(AS_MIGRATE, "could not start reinserter thread"); + } + + as_index_reduce(emig->rsv.tree, emigrate_tree_reduce_fn, emig); + + // Sets EMIG_STATE_FINISHED only if not already EMIG_STATE_ABORTED. + cf_atomic32_setmax(&emig->state, EMIG_STATE_FINISHED); + + pthread_join(thread, NULL); + + return emig->state != EMIG_STATE_ABORTED; +} + + +bool +emigration_send_done(emigration *emig) +{ + as_namespace *ns = emig->rsv.ns; + + if (! as_partition_pre_emigrate_done(ns, emig->rsv.p->id, emig->cluster_key, + emig->tx_flags)) { + return false; + } + + msg *m = as_fabric_msg_get(M_TYPE_MIGRATE); + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_DONE); + msg_set_uint32(m, MIG_FIELD_EMIG_ID, emig->id); + + uint64_t done_xmit_ms = 0; + + while (true) { + if (emig->cluster_key != as_exchange_cluster_key()) { + as_fabric_msg_put(m); + return false; + } + + uint64_t now = cf_getms(); + + if (done_xmit_ms + MIGRATE_RETRANSMIT_STARTDONE_MS < now) { + msg_incr_ref(m); + + if (as_fabric_send(emig->dest, m, AS_FABRIC_CHANNEL_CTRL) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + + done_xmit_ms = now; + } + + int op; + + if (cf_queue_pop(emig->ctrl_q, &op, MIGRATE_RETRANSMIT_STARTDONE_MS) == + CF_QUEUE_OK) { + if (op == OPERATION_DONE_ACK) { + as_fabric_msg_put(m); + return true; + } + } + } + + // Should never get here. + cf_crash(AS_MIGRATE, "unexpected - exited infinite while loop"); + + return false; +} + + +void * +run_emigration_reinserter(void *arg) +{ + emigration *emig = (emigration *)arg; + emigration_state emig_state; + + // Reduce over the reinsert hash until finished. + while ((emig_state = cf_atomic32_get(emig->state)) != EMIG_STATE_ABORTED) { + if (emig->cluster_key != as_exchange_cluster_key()) { + cf_atomic32_set(&emig->state, EMIG_STATE_ABORTED); + return NULL; + } + + usleep(1000); + + if (cf_shash_get_size(emig->reinsert_hash) == 0) { + if (emig_state == EMIG_STATE_FINISHED) { + return NULL; + } + + continue; + } + + cf_shash_reduce(emig->reinsert_hash, emigration_reinsert_reduce_fn, + (void *)cf_getms()); + } + + return NULL; +} + + +void +emigrate_tree_reduce_fn(as_index_ref *r_ref, void *udata) +{ + emigration *emig = (emigration *)udata; + as_namespace *ns = emig->rsv.ns; + + if (emig->aborted) { + as_record_done(r_ref, ns); + return; // no point continuing to reduce this tree + } + + if (emig->cluster_key != as_exchange_cluster_key()) { + as_record_done(r_ref, ns); + emig->aborted = true; + cf_atomic32_set(&emig->state, EMIG_STATE_ABORTED); + return; // no point continuing to reduce this tree + } + + if (! should_emigrate_record(emig, r_ref)) { + as_record_done(r_ref, ns); + return; + } + + //-------------------------------------------- + // Read the record and pickle it. + // + + as_record *r = r_ref->r; + as_storage_rd rd; + + as_storage_record_open(ns, r, &rd); + + as_storage_rd_load_n_bins(&rd); // TODO - handle error returned + + as_bin stack_bins[ns->storage_data_in_memory ? 
0 : rd.n_bins]; + + as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned + + pickled_record pr; + + pr.keyd = r->keyd; + pr.generation = r->generation; + pr.void_time = r->void_time; + pr.last_update_time = r->last_update_time; + pr.record_buf = as_record_pickle(&rd, &pr.record_len); + + as_storage_record_get_key(&rd); + + const char *set_name = as_index_get_set_name(r, ns); + uint32_t key_size = rd.key_size; + uint8_t key[key_size]; + + if (key_size != 0) { + memcpy(key, rd.key, key_size); + } + + uint32_t info = emigration_pack_info(emig, r); + + as_storage_record_close(&rd); + as_record_done(r_ref, ns); + + //-------------------------------------------- + // Fill and send the fabric message. + // + + msg *m = as_fabric_msg_get(M_TYPE_MIGRATE); + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_INSERT); + msg_set_uint32(m, MIG_FIELD_EMIG_ID, emig->id); + msg_set_buf(m, MIG_FIELD_DIGEST, (const uint8_t *)&pr.keyd, + sizeof(cf_digest), MSG_SET_COPY); + msg_set_uint32(m, MIG_FIELD_GENERATION, pr.generation); + msg_set_uint64(m, MIG_FIELD_LAST_UPDATE_TIME, pr.last_update_time); + + if (pr.void_time != 0) { + msg_set_uint32(m, MIG_FIELD_VOID_TIME, pr.void_time); + } + + if (info != 0) { + msg_set_uint32(m, MIG_FIELD_INFO, info); + } + + // Note - after MSG_SET_HANDOFF_MALLOCs, no need to destroy pickled_record. + + if (set_name) { + msg_set_buf(m, MIG_FIELD_SET_NAME, (const uint8_t *)set_name, + strlen(set_name), MSG_SET_COPY); + } + + if (key_size != 0) { + msg_set_buf(m, MIG_FIELD_KEY, key, key_size, MSG_SET_COPY); + } + + msg_set_buf(m, MIG_FIELD_RECORD, pr.record_buf, pr.record_len, + MSG_SET_HANDOFF_MALLOC); + + // This might block if the queues are backed up. + emigrate_record(emig, m); + + cf_atomic_int_incr(&ns->migrate_records_transmitted); + + if (ns->migrate_sleep != 0) { + usleep(ns->migrate_sleep); + } + + uint32_t waits = 0; + + while (cf_atomic32_get(emig->bytes_emigrating) > MAX_BYTES_EMIGRATING && + emig->cluster_key == as_exchange_cluster_key()) { + usleep(1000); + + // Temporary paranoia to inform us old nodes aren't acking properly. + if (++waits % (ns->migrate_retransmit_ms * 4) == 0) { + cf_warning(AS_MIGRATE, "missing acks from node %lx", emig->dest); + } + } +} + + +int +emigration_reinsert_reduce_fn(const void *key, void *data, void *udata) +{ + emigration_reinsert_ctrl *ri_ctrl = (emigration_reinsert_ctrl *)data; + as_namespace *ns = ri_ctrl->emig->rsv.ns; + uint64_t now = (uint64_t)udata; + + if (ri_ctrl->xmit_ms + ns->migrate_retransmit_ms < now) { + msg_incr_ref(ri_ctrl->m); + + if (as_fabric_send(ri_ctrl->emig->dest, ri_ctrl->m, + AS_FABRIC_CHANNEL_BULK) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(ri_ctrl->m); + return -1; // this will stop the reduce + } + + ri_ctrl->xmit_ms = now; + cf_atomic_int_incr(&ns->migrate_record_retransmits); + } + + return 0; +} + + +void +emigrate_record(emigration *emig, msg *m) +{ + uint64_t insert_id = emig->insert_id++; + + msg_set_uint64(m, MIG_FIELD_EMIG_INSERT_ID, insert_id); + + emigration_reinsert_ctrl ri_ctrl; + + msg_incr_ref(m); // the reference in the hash + ri_ctrl.m = m; + ri_ctrl.emig = emig; + ri_ctrl.xmit_ms = cf_getms(); + + cf_shash_put(emig->reinsert_hash, &insert_id, &ri_ctrl); + + cf_atomic32_add(&emig->bytes_emigrating, (int32_t)msg_get_wire_size(m)); + + if (as_fabric_send(emig->dest, m, AS_FABRIC_CHANNEL_BULK) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +//========================================================== +// Local helpers - immigration. 
+// + +uint32_t +immigration_hashfn(const void *value, uint32_t value_len) +{ + return ((const immigration_hkey *)value)->emig_id; +} + + +void * +run_immigration_reaper(void *arg) +{ + while (true) { + cf_rchash_reduce(g_immigration_hash, immigration_reaper_reduce_fn, + NULL); + sleep(1); + } + + return NULL; +} + + +int +immigration_reaper_reduce_fn(const void *key, uint32_t keylen, void *object, + void *udata) +{ + immigration *immig = (immigration *)object; + + if (immig->start_recv_ms == 0) { + // If the start time isn't set, immigration is still being processed. + return CF_RCHASH_OK; + } + + if (immig->cluster_key != as_exchange_cluster_key() || + (immig->done_recv_ms != 0 && cf_getms() > immig->done_recv_ms + + IMMIGRATION_DEBOUNCE_MS)) { + if (immig->start_result == AS_MIGRATE_OK && + // If we started ok, must be a cluster key change - make sure + // DONE handler doesn't also decrement active counter. + cf_atomic32_incr(&immig->done_recv) == 1) { + as_namespace *ns = immig->rsv.ns; + + if (cf_atomic_int_decr(&ns->migrate_rx_partitions_active) < 0) { + cf_warning(AS_MIGRATE, "migrate_rx_partitions_active < 0"); + cf_atomic_int_incr(&ns->migrate_rx_partitions_active); + } + } + + return CF_RCHASH_REDUCE_DELETE; + } + + return CF_RCHASH_OK; +} + + +//========================================================== +// Local helpers - migrate fabric message handling. +// + +int +migrate_receive_msg_cb(cf_node src, msg *m, void *udata) +{ + uint32_t op; + + if (msg_get_uint32(m, MIG_FIELD_OP, &op) != 0) { + cf_warning(AS_MIGRATE, "received message with no op"); + as_fabric_msg_put(m); + return 0; + } + + switch (op) { + //-------------------------------------------- + // Emigration - handle requests: + // + case OPERATION_MERGE_META: + emigration_handle_meta_batch_request(src, m); + break; + + //-------------------------------------------- + // Immigration - handle requests: + // + case OPERATION_START: + immigration_handle_start_request(src, m); + break; + case OPERATION_INSERT: + immigration_handle_insert_request(src, m); + break; + case OPERATION_DONE: + immigration_handle_done_request(src, m); + break; + case OPERATION_ALL_DONE: + immigration_handle_all_done_request(src, m); + break; + + //-------------------------------------------- + // Emigration - handle acknowledgments: + // + case OPERATION_INSERT_ACK: + emigration_handle_insert_ack(src, m); + break; + case OPERATION_START_ACK_OK: + case OPERATION_START_ACK_EAGAIN: + case OPERATION_START_ACK_FAIL: + case OPERATION_DONE_ACK: + case OPERATION_ALL_DONE_ACK: + emigration_handle_ctrl_ack(src, m, op); + break; + + //-------------------------------------------- + // Immigration - handle acknowledgments: + // + case OPERATION_MERGE_META_ACK: + immigration_handle_meta_batch_ack(src, m); + break; + + default: + cf_detail(AS_MIGRATE, "received unexpected message op %u", op); + as_fabric_msg_put(m); + break; + } + + return 0; +} + + +//---------------------------------------------------------- +// Immigration - request message handling. 
+// + +void +immigration_handle_start_request(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "handle start: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + uint64_t cluster_key; + + if (msg_get_uint64(m, MIG_FIELD_CLUSTER_KEY, &cluster_key) != 0) { + cf_warning(AS_MIGRATE, "handle start: msg get for cluster key failed"); + as_fabric_msg_put(m); + return; + } + + uint8_t *ns_name; + size_t ns_name_len; + + if (msg_get_buf(m, MIG_FIELD_NAMESPACE, &ns_name, &ns_name_len, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_MIGRATE, "handle start: msg get for namespace failed"); + as_fabric_msg_put(m); + return; + } + + as_namespace *ns = as_namespace_get_bybuf(ns_name, ns_name_len); + + if (! ns) { + cf_warning(AS_MIGRATE, "handle start: bad namespace"); + as_fabric_msg_put(m); + return; + } + + uint32_t pid; + + if (msg_get_uint32(m, MIG_FIELD_PARTITION, &pid) != 0) { + cf_warning(AS_MIGRATE, "handle start: msg get for pid failed"); + as_fabric_msg_put(m); + return; + } + + uint32_t emig_features = 0; + + msg_get_uint32(m, MIG_FIELD_FEATURES, &emig_features); + + uint64_t emig_n_recs = 0; + + msg_get_uint64(m, MIG_FIELD_PARTITION_SIZE, &emig_n_recs); + + msg_preserve_fields(m, 1, MIG_FIELD_EMIG_ID); + + immigration *immig = cf_rc_alloc(sizeof(immigration)); + + cf_atomic_int_incr(&ns->migrate_rx_instance_count); + + immig->src = src; + immig->cluster_key = cluster_key; + immig->pid = pid; + immig->start_recv_ms = 0; + immig->done_recv = 0; + immig->done_recv_ms = 0; + immig->emig_id = emig_id; + immig->meta_q = meta_out_q_create(); + immig->features = MY_MIG_FEATURES; + immig->ns = ns; + immig->rsv.p = NULL; + + immigration_hkey hkey; + + hkey.src = src; + hkey.emig_id = emig_id; + + while (true) { + if (cf_rchash_put_unique(g_immigration_hash, (void *)&hkey, + sizeof(hkey), (void *)immig) == CF_RCHASH_OK) { + cf_rc_reserve(immig); // so either put or get yields ref-count 2 + + // First start request (not a retransmit) for this pid this round, + // or we had ack'd previous start request with 'EAGAIN'. + immig->start_result = as_partition_immigrate_start(ns, pid, + cluster_key, src); + break; + } + + immigration *immig0; + + if (cf_rchash_get(g_immigration_hash, (void *)&hkey, sizeof(hkey), + (void *)&immig0) == CF_RCHASH_OK) { + immigration_release(immig); // free just-alloc'd immig ... + + if (immig0->start_recv_ms == 0) { + immigration_release(immig0); + return; // allow previous thread to respond + } + + if (immig0->cluster_key != cluster_key) { + immigration_release(immig0); + return; // other node reused an immig_id, allow reaper to reap + } + + immig = immig0; // ... and use original + break; + } + } + + switch (immig->start_result) { + case AS_MIGRATE_OK: + break; + case AS_MIGRATE_FAIL: + immig->start_recv_ms = cf_getms(); // permits reaping + immig->done_recv_ms = immig->start_recv_ms; // permits reaping + immigration_release(immig); + immigration_ack_start_request(src, m, OPERATION_START_ACK_FAIL); + return; + case AS_MIGRATE_AGAIN: + // Remove from hash so that the immig can be tried again. 
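+		// A minimal sketch (simplified, not verbatim) of how the emigration
+		// side reacts to this EAGAIN ack - patterned on the retransmit loop
+		// visible in emigration_send_done() above:
+		//
+		//	while (true) {
+		//		// ... re-send START every MIGRATE_RETRANSMIT_STARTDONE_MS ...
+		//		if (cf_queue_pop(emig->ctrl_q, &op,
+		//				MIGRATE_RETRANSMIT_STARTDONE_MS) == CF_QUEUE_OK) {
+		//			if (op == OPERATION_START_ACK_OK) break; // proceed
+		//			if (op == OPERATION_START_ACK_FAIL) return; // abort
+		//			// OPERATION_START_ACK_EAGAIN - loop and re-send START
+		//		}
+		//	}
+		//
+		// Deleting the hash entry here is what lets that re-sent START be
+		// handled as a fresh request rather than as a retransmit.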
+ cf_rchash_delete(g_immigration_hash, (void *)&hkey, sizeof(hkey)); + immigration_release(immig); + immigration_ack_start_request(src, m, OPERATION_START_ACK_EAGAIN); + return; + default: + cf_crash(AS_MIGRATE, "unexpected as_partition_immigrate_start result"); + break; + } + + if (immig->start_recv_ms == 0) { + as_partition_reserve(ns, pid, &immig->rsv); + cf_atomic_int_incr(&immig->rsv.ns->migrate_rx_partitions_active); + + if (! immigration_start_meta_sender(immig, emig_features, + emig_n_recs)) { + immig->features &= ~MIG_FEATURE_MERGE; + } + + immig->start_recv_ms = cf_getms(); // permits reaping + } + + msg_set_uint32(m, MIG_FIELD_FEATURES, immig->features); + + immigration_release(immig); + immigration_ack_start_request(src, m, OPERATION_START_ACK_OK); +} + + +void +immigration_ack_start_request(cf_node src, msg *m, uint32_t op) +{ + msg_set_uint32(m, MIG_FIELD_OP, op); + + if (as_fabric_send(src, m, AS_FABRIC_CHANNEL_CTRL) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +void +immigration_handle_insert_request(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "handle insert: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + immigration_hkey hkey; + + hkey.src = src; + hkey.emig_id = emig_id; + + immigration *immig; + + if (cf_rchash_get(g_immigration_hash, (void *)&hkey, sizeof(hkey), + (void **)&immig) != CF_RCHASH_OK) { + // The immig no longer exists, likely the cluster key advanced and this + // record immigration is from prior round. Do not ack this request. + as_fabric_msg_put(m); + return; + } + + if (immig->start_result != AS_MIGRATE_OK || immig->start_recv_ms == 0) { + // If this immigration didn't start and reserve a partition, it's + // likely in the hash on a retransmit and this insert is for the + // original - ignore, and let this immigration proceed. 
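+		// (Not acking here is safe - the emigrator keeps the record in its
+		// reinsert hash, and emigration_reinsert_reduce_fn() above re-sends
+		// once ns->migrate_retransmit_ms elapses without an ack:
+		//
+		//	if (ri_ctrl->xmit_ms + ns->migrate_retransmit_ms < now) {
+		//		// ... as_fabric_send(ri_ctrl->emig->dest, ri_ctrl->m, ...) ...
+		//	}
+		//
+		// so a dropped insert costs one retransmit interval, not the record.)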
+ immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + cf_atomic_int_incr(&immig->rsv.ns->migrate_record_receives); + + if (immig->cluster_key != as_exchange_cluster_key()) { + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + as_remote_record rr = { .src = src, .rsv = &immig->rsv }; + + if (msg_get_buf(m, MIG_FIELD_DIGEST, (uint8_t **)&rr.keyd, NULL, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_MIGRATE, "handle insert: got no digest"); + as_fabric_msg_put(m); + return; + } + + if (msg_get_buf(m, MIG_FIELD_RECORD, (uint8_t **)&rr.record_buf, + &rr.record_buf_sz, MSG_GET_DIRECT) != 0 || rr.record_buf_sz < 2) { + cf_warning(AS_MIGRATE, "handle insert: got no or bad record"); + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + if (msg_get_uint32(m, MIG_FIELD_GENERATION, &rr.generation) != 0 || + rr.generation == 0) { + cf_warning(AS_MIGRATE, "handle insert: got no or bad generation"); + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + if (msg_get_uint64(m, MIG_FIELD_LAST_UPDATE_TIME, + &rr.last_update_time) != 0) { + cf_warning(AS_MIGRATE, "handle insert: got no last-update-time"); + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + msg_get_uint32(m, MIG_FIELD_VOID_TIME, &rr.void_time); + + msg_get_buf(m, MIG_FIELD_SET_NAME, (uint8_t **)&rr.set_name, + &rr.set_name_len, MSG_GET_DIRECT); + + msg_get_buf(m, MIG_FIELD_KEY, (uint8_t **)&rr.key, &rr.key_size, + MSG_GET_DIRECT); + + uint32_t info = 0; + + msg_get_uint32(m, MIG_FIELD_INFO, &info); + + if (immigration_ignore_pickle(rr.record_buf, info)) { + cf_warning_digest(AS_MIGRATE, rr.keyd, "handle insert: binless pickle "); + } + else { + immigration_init_repl_state(&rr, info); + + int rv = as_record_replace_if_better(&rr, false, false, false); + + // If replace failed, don't ack - it will be retransmitted. + if (! (rv == AS_PROTO_RESULT_OK || + // Migrations just treat these errors as successful no-ops: + rv == AS_PROTO_RESULT_FAIL_RECORD_EXISTS || + rv == AS_PROTO_RESULT_FAIL_GENERATION)) { + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + } + + immigration_release(immig); + + msg_preserve_fields(m, 2, MIG_FIELD_EMIG_INSERT_ID, MIG_FIELD_EMIG_ID); + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_INSERT_ACK); + + if (as_fabric_send(src, m, AS_FABRIC_CHANNEL_BULK) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +void +immigration_handle_done_request(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "handle done: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + msg_preserve_fields(m, 1, MIG_FIELD_EMIG_ID); + + // See if this migration already exists & has been notified. + immigration_hkey hkey; + + hkey.src = src; + hkey.emig_id = emig_id; + + immigration *immig; + + if (cf_rchash_get(g_immigration_hash, (void *)&hkey, sizeof(hkey), + (void **)&immig) == CF_RCHASH_OK) { + if (immig->start_result != AS_MIGRATE_OK || immig->start_recv_ms == 0) { + // If this immigration didn't start and reserve a partition, it's + // likely in the hash on a retransmit and this DONE is for the + // original - ignore, and let this immigration proceed. + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + if (cf_atomic32_incr(&immig->done_recv) == 1) { + // Record the time of the first DONE received. 
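+			// This timestamp also arms the reaper - in
+			// immigration_reaper_reduce_fn() above, the immig becomes
+			// reap-eligible only once
+			//
+			//	cf_getms() > immig->done_recv_ms + IMMIGRATION_DEBOUNCE_MS
+			//
+			// so DONE retransmits arriving within the debounce window still
+			// find the immig in the hash and get re-acked below.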
+ immig->done_recv_ms = cf_getms(); + + as_namespace *ns = immig->rsv.ns; + + if (cf_atomic_int_decr(&ns->migrate_rx_partitions_active) < 0) { + cf_warning(AS_MIGRATE, "migrate_rx_partitions_active < 0"); + cf_atomic_int_incr(&ns->migrate_rx_partitions_active); + } + + as_partition_immigrate_done(ns, immig->rsv.p->id, + immig->cluster_key, immig->src); + } + // else - was likely a retransmitted done message. + + immigration_release(immig); + } + // else - garbage, or super-stale retransmitted done message. + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_DONE_ACK); + + if (as_fabric_send(src, m, AS_FABRIC_CHANNEL_CTRL) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + return; + } +} + + +void +immigration_handle_all_done_request(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "handle all done: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + uint64_t cluster_key; + + if (msg_get_uint64(m, MIG_FIELD_CLUSTER_KEY, &cluster_key) != 0) { + cf_warning(AS_MIGRATE, "handle all done: msg get for cluster key failed"); + as_fabric_msg_put(m); + return; + } + + uint8_t *ns_name; + size_t ns_name_len; + + if (msg_get_buf(m, MIG_FIELD_NAMESPACE, &ns_name, &ns_name_len, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_MIGRATE, "handle all done: msg get for namespace failed"); + as_fabric_msg_put(m); + return; + } + + as_namespace *ns = as_namespace_get_bybuf(ns_name, ns_name_len); + + if (! ns) { + cf_warning(AS_MIGRATE, "handle all done: bad namespace"); + as_fabric_msg_put(m); + return; + } + + uint32_t pid; + + if (msg_get_uint32(m, MIG_FIELD_PARTITION, &pid) != 0) { + cf_warning(AS_MIGRATE, "handle all done: msg get for pid failed"); + as_fabric_msg_put(m); + return; + } + + msg_preserve_fields(m, 1, MIG_FIELD_EMIG_ID); + + // TODO - optionally, for replicas we might use this to remove immig objects + // from hash and deprecate timer... + + if (as_partition_migrations_all_done(ns, pid, cluster_key) != + AS_MIGRATE_OK) { + as_fabric_msg_put(m); + return; + } + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_ALL_DONE_ACK); + + if (as_fabric_send(src, m, AS_FABRIC_CHANNEL_CTRL) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + return; + } +} + + +//---------------------------------------------------------- +// Emigration - acknowledgment message handling. +// + +void +emigration_handle_insert_ack(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "insert ack: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + emigration *emig; + + if (cf_rchash_get(g_emigration_hash, (void *)&emig_id, sizeof(emig_id), + (void **)&emig) != CF_RCHASH_OK) { + // Probably came from a migration prior to the latest rebalance. + as_fabric_msg_put(m); + return; + } + + uint64_t insert_id; + + if (msg_get_uint64(m, MIG_FIELD_EMIG_INSERT_ID, &insert_id) != 0) { + cf_warning(AS_MIGRATE, "insert ack: msg get for emig insert id failed"); + emigration_release(emig); + as_fabric_msg_put(m); + return; + } + + emigration_reinsert_ctrl *ri_ctrl = NULL; + pthread_mutex_t *vlock; + + if (cf_shash_get_vlock(emig->reinsert_hash, &insert_id, (void **)&ri_ctrl, + &vlock) == CF_SHASH_OK) { + if (src == emig->dest) { + if (cf_atomic32_sub(&emig->bytes_emigrating, + (int32_t)msg_get_wire_size(ri_ctrl->m)) < 0) { + cf_warning(AS_MIGRATE, "bytes_emigrating less than zero"); + } + + as_fabric_msg_put(ri_ctrl->m); + // At this point, the rt is *GONE*. 
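+			// (bytes_emigrating is the flow-control budget - emigrate_record()
+			// adds msg_get_wire_size(m) per in-flight record and this ack path
+			// subtracts it, so emigrate_tree_reduce_fn() stalls while
+			//
+			//	cf_atomic32_get(emig->bytes_emigrating) > MAX_BYTES_EMIGRATING
+			//
+			// keeping no more than roughly MAX_BYTES_EMIGRATING of unacked
+			// record data outstanding per emigration.)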
+ cf_shash_delete_lockfree(emig->reinsert_hash, &insert_id); + ri_ctrl = NULL; + } + else { + cf_warning(AS_MIGRATE, "insert ack: unexpected source %lx", src); + } + + pthread_mutex_unlock(vlock); + } + + emigration_release(emig); + as_fabric_msg_put(m); +} + + +void +emigration_handle_ctrl_ack(cf_node src, msg *m, uint32_t op) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "ctrl ack: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + uint32_t immig_features = 0; + + msg_get_uint32(m, MIG_FIELD_FEATURES, &immig_features); + + as_fabric_msg_put(m); + + emigration *emig; + + if (cf_rchash_get(g_emigration_hash, (void *)&emig_id, sizeof(emig_id), + (void **)&emig) == CF_RCHASH_OK) { + if (emig->dest == src) { + if ((immig_features & MIG_FEATURE_MERGE) == 0) { + // TODO - rethink where this should go after further refactor. + if (op == OPERATION_START_ACK_OK && emig->meta_q) { + meta_in_q_rejected(emig->meta_q); + } + } + + cf_queue_push(emig->ctrl_q, &op); + } + else { + cf_warning(AS_MIGRATE, "ctrl ack (%d): unexpected source %lx", op, + src); + } + + emigration_release(emig); + } + else { + cf_detail(AS_MIGRATE, "ctrl ack (%d): can't find emig id %u", op, + emig_id); + } +} + + +//========================================================== +// Local helpers - info API helpers. +// + +int +emigration_dump_reduce_fn(const void *key, uint32_t keylen, void *object, + void *udata) +{ + uint32_t emig_id = *(const uint32_t *)key; + emigration *emig = (emigration *)object; + int *item_num = (int *)udata; + + cf_info(AS_MIGRATE, "[%d]: mig_id %u : id %u ; ck %lx", *item_num, emig_id, + emig->id, emig->cluster_key); + + *item_num += 1; + + return 0; +} + + +int +immigration_dump_reduce_fn(const void *key, uint32_t keylen, void *object, + void *udata) +{ + const immigration_hkey *hkey = (const immigration_hkey *)key; + immigration *immig = (immigration *)object; + int *item_num = (int *)udata; + + cf_info(AS_MIGRATE, "[%d]: src %016lx ; id %u : src %016lx ; done recv %u ; start recv ms %lu ; done recv ms %lu ; ck %lx", + *item_num, hkey->src, hkey->emig_id, immig->src, immig->done_recv, + immig->start_recv_ms, immig->done_recv_ms, immig->cluster_key); + + *item_num += 1; + + return 0; +} diff --git a/as/src/fabric/migrate_ce.c b/as/src/fabric/migrate_ce.c new file mode 100644 index 00000000..a3e98919 --- /dev/null +++ b/as/src/fabric/migrate_ce.c @@ -0,0 +1,94 @@ +/* migrate_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + +//========================================================== +// Includes. 
+//
+
+#include "fabric/migrate.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "fault.h"
+#include "msg.h"
+#include "node.h"
+
+#include "base/datamodel.h"
+#include "fabric/fabric.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+const uint32_t MY_MIG_FEATURES = 0;
+
+
+//==========================================================
+// Community Edition API.
+//
+
+bool
+should_emigrate_record(emigration *emig, as_index_ref *r_ref)
+{
+	return true;
+}
+
+uint32_t
+emigration_pack_info(const emigration *emig, const as_record *r)
+{
+	return 0;
+}
+
+void
+emigration_handle_meta_batch_request(cf_node src, msg *m)
+{
+	cf_warning(AS_MIGRATE, "CE node received meta-batch request - unexpected");
+	as_fabric_msg_put(m);
+}
+
+bool
+immigration_ignore_pickle(const uint8_t *buf, uint32_t info)
+{
+	return as_record_pickle_is_binless(buf);
+}
+
+void
+immigration_init_repl_state(as_remote_record* rr, uint32_t info)
+{
+}
+
+void
+immigration_handle_meta_batch_ack(cf_node src, msg *m)
+{
+	cf_warning(AS_MIGRATE, "CE node received meta-batch ack - unexpected");
+	as_fabric_msg_put(m);
+}
+
+bool
+immigration_start_meta_sender(immigration *immig, uint32_t emig_features,
+		uint64_t emig_partition_sz)
+{
+	return false;
+}
diff --git a/as/src/fabric/partition.c b/as/src/fabric/partition.c
new file mode 100644
index 00000000..5b8f599f
--- /dev/null
+++ b/as/src/fabric/partition.c
@@ -0,0 +1,809 @@
+/*
+ * partition.c
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "fabric/partition.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_b64.h"
+
+#include "fault.h"
+#include "node.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "fabric/partition_balance.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+cf_node find_best_node(const as_partition* p, bool is_read);
+void accumulate_replica_stats(const as_partition* p, uint64_t* p_n_objects, uint64_t* p_n_tombstones);
+void partition_reserve_lockfree(as_partition* p, as_namespace* ns, as_partition_reservation* rsv);
+cf_node partition_getreplica_prole(as_namespace* ns, uint32_t pid);
+char partition_descriptor(const as_partition* p);
+int partition_get_replica_self_lockfree(const as_namespace* ns, uint32_t pid);
+
+
+//==========================================================
+// Public API.
+// + +void +as_partition_init(as_namespace* ns, uint32_t pid) +{ + as_partition* p = &ns->partitions[pid]; + + // Note - as_partition has been zeroed since it's a member of as_namespace. + // Set non-zero members. + + pthread_mutex_init(&p->lock, NULL); + + p->id = pid; + + if (ns->cold_start) { + p->vp = as_index_tree_create(&ns->tree_shared, ns->arena); + } + else { + p->vp = as_index_tree_resume(&ns->tree_shared, ns->arena, + &ns->xmem_roots[pid * ns->tree_shared.n_sprigs]); + } +} + + +void +as_partition_shutdown(as_namespace* ns, uint32_t pid) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + as_index_tree_shutdown(p->vp, + &ns->xmem_roots[pid * ns->tree_shared.n_sprigs]); +} + + +void +as_partition_freeze(as_partition* p) +{ + // TODO - rearrange as_partition so we can call memset() here? + p->n_nodes = 0; + p->n_replicas = 0; + memset(p->replicas, 0, sizeof(p->replicas)); + + p->pending_emigrations = 0; + p->pending_immigrations = 0; + memset(p->immigrators, 0, sizeof(p->immigrators)); + + p->working_master = (cf_node)0; + + p->n_dupl = 0; + memset(p->dupls, 0, sizeof(p->dupls)); + + p->n_witnesses = 0; + memset(p->witnesses, 0, sizeof(p->witnesses)); +} + + +// Get a list of all nodes (excluding self) that are replicas for a specified +// partition: place the list in *nv and return the number of nodes found. +uint32_t +as_partition_get_other_replicas(as_partition* p, cf_node* nv) +{ + uint32_t n_other_replicas = 0; + + pthread_mutex_lock(&p->lock); + + for (uint32_t repl_ix = 0; repl_ix < p->n_replicas; repl_ix++) { + // Don't ever include yourself. + if (p->replicas[repl_ix] == g_config.self_node) { + continue; + } + + // Copy the node ID into the user-supplied vector. + nv[n_other_replicas++] = p->replicas[repl_ix]; + } + + pthread_mutex_unlock(&p->lock); + + return n_other_replicas; +} + + +cf_node +as_partition_writable_node(as_namespace* ns, uint32_t pid) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (p->n_replicas == 0) { + // This partition is unavailable. + pthread_mutex_unlock(&p->lock); + return (cf_node)0; + } + + cf_node best_node = find_best_node(p, false); + + pthread_mutex_unlock(&p->lock); + + return best_node; +} + + +// If this node is an eventual master, return the acting master, else return 0. +cf_node +as_partition_proxyee_redirect(as_namespace* ns, uint32_t pid) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + cf_node node = (cf_node)0; + + if (g_config.self_node == p->replicas[0] && + g_config.self_node != p->working_master) { + node = p->working_master; + } + + pthread_mutex_unlock(&p->lock); + + return node; +} + + +// TODO - deprecate in "six months". 
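+// A worked example of the bit layout used below, where each partition maps to
+// prole_bitmap[pid >> 3] |= 0x80 >> (pid & 7) - e.g. for pid 11:
+//
+//	11 >> 3 = 1              -> byte index 1
+//	0x80 >> (11 & 7) = 0x10  -> bit mask 0x10
+//
+// i.e. partition 0 occupies the high bit of byte 0, and the whole bitmap is
+// then base64-encoded for the info response.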
+void +as_partition_get_replicas_prole_str(cf_dyn_buf* db) +{ + uint8_t prole_bitmap[CLIENT_BITMAP_BYTES]; + char b64_bitmap[CLIENT_B64MAP_BYTES]; + + size_t db_sz = db->used_sz; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + memset(prole_bitmap, 0, sizeof(uint8_t) * CLIENT_BITMAP_BYTES); + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + if (g_config.self_node == partition_getreplica_prole(ns, pid) ) { + prole_bitmap[pid >> 3] |= (0x80 >> (pid & 7)); + } + } + + cf_b64_encode(prole_bitmap, CLIENT_BITMAP_BYTES, b64_bitmap); + cf_dyn_buf_append_buf(db, (uint8_t*)b64_bitmap, CLIENT_B64MAP_BYTES); + cf_dyn_buf_append_char(db, ';'); + } + + if (db_sz != db->used_sz) { + cf_dyn_buf_chomp(db); + } +} + + +void +as_partition_get_replicas_master_str(cf_dyn_buf* db) +{ + size_t db_sz = db->used_sz; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_buf(db, (uint8_t*)ns->replica_maps[0].b64map, + sizeof(ns->replica_maps[0].b64map)); + cf_dyn_buf_append_char(db, ';'); + } + + if (db_sz != db->used_sz) { + cf_dyn_buf_chomp(db); + } +} + + +void +as_partition_get_replicas_all_str(cf_dyn_buf* db, bool include_regime) +{ + size_t db_sz = db->used_sz; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + + if (include_regime) { + cf_dyn_buf_append_uint32(db, ns->rebalance_regime); + cf_dyn_buf_append_char(db, ','); + } + + uint32_t repl_factor = ns->replication_factor; + + // If we haven't rebalanced yet, report 1 column with no ownership. + if (repl_factor == 0) { + repl_factor = 1; + } + + cf_dyn_buf_append_uint32(db, repl_factor); + + for (uint32_t repl_ix = 0; repl_ix < repl_factor; repl_ix++) { + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_buf(db, + (uint8_t*)&ns->replica_maps[repl_ix].b64map, + sizeof(ns->replica_maps[repl_ix].b64map)); + } + + cf_dyn_buf_append_char(db, ';'); + } + + if (db_sz != db->used_sz) { + cf_dyn_buf_chomp(db); + } +} + + +void +as_partition_get_replica_stats(as_namespace* ns, repl_stats* p_stats) +{ + memset(p_stats, 0, sizeof(repl_stats)); + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + int self_n = find_self_in_replicas(p); // -1 if not + + if (g_config.self_node == p->working_master) { + accumulate_replica_stats(p, + &p_stats->n_master_objects, + &p_stats->n_master_tombstones); + } + else if (self_n >= 0) { + accumulate_replica_stats(p, + &p_stats->n_prole_objects, + &p_stats->n_prole_tombstones); + } + else { + accumulate_replica_stats(p, + &p_stats->n_non_replica_objects, + &p_stats->n_non_replica_tombstones); + } + + pthread_mutex_unlock(&p->lock); + } +} + + +// TODO - what if partition is unavailable? +void +as_partition_reserve(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); +} + + +// TODO - what if partition is unavailable? 
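+// A minimal caller-side sketch (hypothetical caller, for illustration only) -
+// every successful reserve must be paired with as_partition_release(), since
+// partition_reserve_lockfree() takes a ref on the partition's tree:
+//
+//	as_partition_reservation rsv;
+//
+//	if (as_partition_reserve_timeout(ns, pid, &rsv, 10) != 0) {
+//		return; // couldn't get the partition lock within 10 ms
+//	}
+//
+//	// ... operate on rsv.tree ...
+//
+//	as_partition_release(&rsv); // drops the tree ref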
+int +as_partition_reserve_timeout(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv, int timeout_ms) +{ + as_partition* p = &ns->partitions[pid]; + + struct timespec tp; + cf_set_wait_timespec(timeout_ms, &tp); + + if (pthread_mutex_timedlock(&p->lock, &tp) != 0) { + return -1; + } + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); + + return 0; +} + + +int +as_partition_reserve_replica(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! is_self_replica(p)) { + pthread_mutex_unlock(&p->lock); + return AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH; + } + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); + + return AS_PROTO_RESULT_OK; +} + + +// Returns: +// 0 - reserved - node parameter returns self node +// -1 - not reserved - node parameter returns other "better" node +// -2 - not reserved - node parameter not filled - partition is unavailable +int +as_partition_reserve_write(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv, cf_node* node) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + // If this partition is frozen, return. + if (p->n_replicas == 0) { + if (node) { + *node = (cf_node)0; + } + + pthread_mutex_unlock(&p->lock); + return -2; + } + + cf_node best_node = find_best_node(p, false); + + if (node) { + *node = best_node; + } + + // If this node is not the appropriate one, return. + if (best_node != g_config.self_node) { + pthread_mutex_unlock(&p->lock); + return -1; + } + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); + + return 0; +} + + +// Returns: +// 0 - reserved - node parameter returns self node +// -1 - not reserved - node parameter returns other "better" node +// -2 - not reserved - node parameter not filled - partition is unavailable +int +as_partition_reserve_read(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv, bool would_dup_res, cf_node* node) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + // If this partition is unavailable, return. + if (p->n_replicas == 0) { + if (node) { + *node = (cf_node)0; + } + + pthread_mutex_unlock(&p->lock); + return -2; + } + + cf_node best_node = find_best_node(p, + ! partition_reserve_promote(ns, p, would_dup_res)); + + if (node) { + *node = best_node; + } + + // If this node is not the appropriate one, return. + if (best_node != g_config.self_node) { + pthread_mutex_unlock(&p->lock); + return -1; + } + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); + + return 0; +} + + +// Reserves all query-able partitions. +// Returns the number of partitions reserved. +int +as_partition_prereserve_query(as_namespace* ns, bool can_partition_query[], + as_partition_reservation rsv[]) +{ + int reserved = 0; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + if (as_partition_reserve_query(ns, pid, &rsv[pid])) { + can_partition_query[pid] = false; + } + else { + can_partition_query[pid] = true; + reserved++; + } + } + + return reserved; +} + + +// Reserve a partition for query. +// Return value 0 means the reservation was taken, -1 means not. +int +as_partition_reserve_query(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv) +{ + return as_partition_reserve_write(ns, pid, rsv, NULL); +} + + +// Obtain a partition reservation for XDR reads. Succeeds, if we are sync or +// zombie for the partition. 
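+// ("Sync or zombie" corresponds to the as_partition_version_has_data() check
+// in the body below - compare partition_descriptor(), where a non-replica
+// whose version still has data is reported as 'Z' for zombie.)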
+// TODO - what if partition is unavailable? +int +as_partition_reserve_xdr_read(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + int res = -1; + + if (as_partition_version_has_data(&p->version)) { + partition_reserve_lockfree(p, ns, rsv); + res = 0; + } + + pthread_mutex_unlock(&p->lock); + + return res; +} + + +void +as_partition_reservation_copy(as_partition_reservation* dst, + as_partition_reservation* src) +{ + dst->ns = src->ns; + dst->p = src->p; + dst->tree = src->tree; + dst->regime = src->regime; + dst->n_dupl = src->n_dupl; + + if (dst->n_dupl != 0) { + memcpy(dst->dupl_nodes, src->dupl_nodes, sizeof(cf_node) * dst->n_dupl); + } +} + + +void +as_partition_release(as_partition_reservation* rsv) +{ + as_index_tree_release(rsv->tree); +} + + +void +as_partition_getinfo_str(cf_dyn_buf* db) +{ + size_t db_sz = db->used_sz; + + cf_dyn_buf_append_string(db, "namespace:partition:state:n_replicas:replica:" + "n_dupl:working_master:emigrates:immigrates:records:tombstones:" + "regime:version:final_version;"); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, pid); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_char(db, partition_descriptor(p)); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, p->n_replicas); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_int(db, find_self_in_replicas(p)); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, p->n_dupl); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint64_x(db, p->working_master); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_int(db, p->pending_emigrations); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_int(db, p->pending_immigrations); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, as_index_tree_size(p->vp)); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint64(db, p->n_tombstones); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, p->regime); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, VERSION_AS_STRING(&p->version)); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, VERSION_AS_STRING(&p->final_version)); + + cf_dyn_buf_append_char(db, ';'); + + pthread_mutex_unlock(&p->lock); + } + } + + if (db_sz != db->used_sz) { + cf_dyn_buf_chomp(db); // take back the final ';' + } +} + + +//========================================================== +// Public API - client view replica maps. 
+// + +void +client_replica_maps_create(as_namespace* ns) +{ + uint32_t size = sizeof(client_replica_map) * ns->cfg_replication_factor; + + ns->replica_maps = cf_malloc(size); + memset(ns->replica_maps, 0, size); + + for (uint32_t repl_ix = 0; repl_ix < ns->cfg_replication_factor; + repl_ix++) { + client_replica_map* repl_map = &ns->replica_maps[repl_ix]; + + pthread_mutex_init(&repl_map->write_lock, NULL); + + cf_b64_encode((uint8_t*)repl_map->bitmap, + (uint32_t)sizeof(repl_map->bitmap), (char*)repl_map->b64map); + } +} + + +void +client_replica_maps_clear(as_namespace* ns) +{ + memset(ns->replica_maps, 0, + sizeof(client_replica_map) * ns->cfg_replication_factor); + + for (uint32_t repl_ix = 0; repl_ix < ns->cfg_replication_factor; + repl_ix++) { + client_replica_map* repl_map = &ns->replica_maps[repl_ix]; + + cf_b64_encode((uint8_t*)repl_map->bitmap, + (uint32_t)sizeof(repl_map->bitmap), (char*)repl_map->b64map); + } +} + + +bool +client_replica_maps_update(as_namespace* ns, uint32_t pid) +{ + uint32_t byte_i = pid >> 3; + uint32_t byte_chunk = (byte_i / 3); + uint32_t chunk_bitmap_offset = byte_chunk * 3; + uint32_t chunk_b64map_offset = byte_chunk << 2; + + uint32_t bytes_from_end = CLIENT_BITMAP_BYTES - chunk_bitmap_offset; + uint32_t input_size = bytes_from_end > 3 ? 3 : bytes_from_end; + + int replica = partition_get_replica_self_lockfree(ns, pid); // -1 if not + uint8_t set_mask = 0x80 >> (pid & 0x7); + bool changed = false; + + for (int repl_ix = 0; repl_ix < (int)ns->cfg_replication_factor; + repl_ix++) { + client_replica_map* repl_map = &ns->replica_maps[repl_ix]; + + volatile uint8_t* mbyte = repl_map->bitmap + byte_i; + bool owned = replica == repl_ix; + bool is_set = (*mbyte & set_mask) != 0; + bool needs_update = (owned && ! is_set) || (! owned && is_set); + + if (! needs_update) { + continue; + } + + volatile uint8_t* bitmap_chunk = repl_map->bitmap + chunk_bitmap_offset; + volatile char* b64map_chunk = repl_map->b64map + chunk_b64map_offset; + + pthread_mutex_lock(&repl_map->write_lock); + + *mbyte ^= set_mask; + cf_b64_encode((uint8_t*)bitmap_chunk, input_size, (char*)b64map_chunk); + + pthread_mutex_unlock(&repl_map->write_lock); + + changed = true; + } + + return changed; +} + + +bool +client_replica_maps_is_partition_queryable(const as_namespace* ns, uint32_t pid) +{ + uint32_t byte_i = pid >> 3; + + const client_replica_map* repl_map = ns->replica_maps; + const volatile uint8_t* mbyte = repl_map->bitmap + byte_i; + + uint8_t set_mask = 0x80 >> (pid & 0x7); + + return (*mbyte & set_mask) != 0; +} + + +//========================================================== +// Local helpers. +// + +// Find best node to handle read/write. Called within partition lock. +cf_node +find_best_node(const as_partition* p, bool is_read) +{ + // Working master (final or acting) returns self, eventual master returns + // acting master. Others don't have p->working_master set. + if (p->working_master != (cf_node)0) { + return p->working_master; + } + + if (is_read && p->pending_immigrations == 0 && + find_self_in_replicas(p) > 0) { + return g_config.self_node; // may read from prole that's got everything + } + + return p->replicas[0]; // final master as a last resort +} + + +void +accumulate_replica_stats(const as_partition* p, uint64_t* p_n_objects, + uint64_t* p_n_tombstones) +{ + int64_t n_tombstones = (int64_t)p->n_tombstones; + int64_t n_objects = (int64_t)as_index_tree_size(p->vp) - n_tombstones; + + *p_n_objects += n_objects > 0 ? 
+			(uint64_t)n_objects : 0;
+	*p_n_tombstones += (uint64_t)n_tombstones;
+}
+
+
+void
+partition_reserve_lockfree(as_partition* p, as_namespace* ns,
+		as_partition_reservation* rsv)
+{
+	cf_rc_reserve(p->vp);
+
+	rsv->ns = ns;
+	rsv->p = p;
+	rsv->tree = p->vp;
+	rsv->regime = p->regime;
+	rsv->n_dupl = p->n_dupl;
+
+	if (rsv->n_dupl != 0) {
+		memcpy(rsv->dupl_nodes, p->dupls, sizeof(cf_node) * rsv->n_dupl);
+	}
+}
+
+
+// TODO - deprecate in "six months".
+cf_node
+partition_getreplica_prole(as_namespace* ns, uint32_t pid)
+{
+	as_partition* p = &ns->partitions[pid];
+
+	pthread_mutex_lock(&p->lock);
+
+	// Check if this is a master node.
+	cf_node best_node = find_best_node(p, false);
+
+	if (best_node == g_config.self_node) {
+		// It's a master, return 0.
+		best_node = (cf_node)0;
+	}
+	else {
+		// Not a master, see if it's a prole.
+		best_node = find_best_node(p, true);
+	}
+
+	pthread_mutex_unlock(&p->lock);
+
+	return best_node;
+}
+
+
+char
+partition_descriptor(const as_partition* p)
+{
+	int self_n = find_self_in_replicas(p); // -1 if not
+
+	if (self_n >= 0) {
+		return p->pending_immigrations == 0 ? 'S' : 'D';
+	}
+
+	if (as_partition_version_is_null(&p->version)) {
+		return 'A';
+	}
+
+	return as_partition_version_has_data(&p->version) ? 'Z' : 'X';
+}
+
+
+int
+partition_get_replica_self_lockfree(const as_namespace* ns, uint32_t pid)
+{
+	const as_partition* p = &ns->partitions[pid];
+
+	int self_n = find_self_in_replicas(p); // -1 if not
+
+	if (g_config.self_node == p->working_master) {
+		return 0;
+	}
+
+	if (self_n > 0 && p->pending_immigrations == 0 &&
+			// Check self_n < n_repl only because n_repl could be out-of-sync
+			// with (less than) partition's replica list count.
+			self_n < (int)ns->replication_factor) {
+		return self_n;
+	}
+
+	return -1; // not a replica
+}
diff --git a/as/src/fabric/partition_balance.c b/as/src/fabric/partition_balance.c
new file mode 100644
index 00000000..1c6d1634
--- /dev/null
+++ b/as/src/fabric/partition_balance.c
@@ -0,0 +1,1456 @@
+/*
+ * partition_balance.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "fabric/partition_balance.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_hash_math.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "compare.h"
+#include "fault.h"
+#include "node.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "fabric/exchange.h"
+#include "fabric/hb.h"
+#include "fabric/migrate.h"
+#include "fabric/partition.h"
+#include "storage/storage.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+const as_partition_version ZERO_VERSION = { 0 };
+
+
+//==========================================================
+// Globals.
+//
+
+cf_atomic32 g_partition_generation = (uint32_t)-1;
+
+// Using int for 4-byte size, but maintaining bool semantics.
+// TODO - ok as non-volatile, but should selectively load/store in the future.
+static int g_init_balance_done = false;
+
+static cf_atomic32 g_migrate_num_incoming = 0;
+
+// Using int for 4-byte size, but maintaining bool semantics.
+volatile int g_allow_migrations = false;
+
+uint64_t g_hashed_pids[AS_PARTITIONS];
+
+// Shortcuts to values set by as_exchange, for use in partition balance only.
+uint32_t g_cluster_size = 0;
+cf_node* g_succession = NULL;
+
+cf_node g_full_node_seq_table[AS_CLUSTER_SZ * AS_PARTITIONS];
+sl_ix_t g_full_sl_ix_table[AS_CLUSTER_SZ * AS_PARTITIONS];
+
+
+//==========================================================
+// Forward declarations.
+//
+
+// Only partition_balance hooks into exchange.
+extern cf_node* as_exchange_succession_unsafe();
+
+// Helpers - balance partitions.
+void fill_global_tables();
+void apply_single_replica_limit_ap(as_namespace* ns);
+uint32_t rack_count(const as_namespace* ns);
+int find_working_master_ap(const as_partition* p, const sl_ix_t* ns_sl_ix, const as_namespace* ns);
+uint32_t find_duplicates_ap(const as_partition* p, const cf_node* ns_node_seq, const sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, uint32_t working_master_n, cf_node dupls[]);
+void advance_version_ap(as_partition* p, const sl_ix_t* ns_sl_ix, as_namespace* ns, uint32_t self_n, uint32_t working_master_n, uint32_t n_dupl, const cf_node dupls[]);
+uint32_t fill_family_versions(const as_partition* p, const sl_ix_t* ns_sl_ix, const as_namespace* ns, uint32_t working_master_n, uint32_t n_dupl, const cf_node dupls[], as_partition_version family_versions[]);
+bool has_replica_parent(const as_partition* p, const sl_ix_t* ns_sl_ix, const as_namespace* ns, const as_partition_version* subset_version, uint32_t subset_n);
+uint32_t find_family(const as_partition_version* self_version, uint32_t n_families, const as_partition_version family_versions[]);
+
+// Helpers - migration-related.
+bool partition_immigration_is_valid(const as_partition* p, cf_node source_node, const as_namespace* ns, const char* tag);
+
+
+//==========================================================
+// Inlines & macros.
+//
+
+static inline bool
+is_self_final_master(const as_partition* p)
+{
+	return p->replicas[0] == g_config.self_node;
+}
+
+
+//==========================================================
+// Public API - regulate migrations.
+// + +void +as_partition_balance_disallow_migrations() +{ + cf_detail(AS_PARTITION, "disallow migrations"); + + g_allow_migrations = false; +} + + +bool +as_partition_balance_are_migrations_allowed() +{ + return g_allow_migrations; +} + + +void +as_partition_balance_synchronize_migrations() +{ + // Acquire and release each partition lock to ensure threads acquiring a + // partition lock after this will be forced to check the latest cluster key. + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + pthread_mutex_unlock(&p->lock); + } + } + + // Prior-round migrations won't decrement g_migrate_num_incoming due to + // cluster key check. + cf_atomic32_set(&g_migrate_num_incoming, 0); +} + + +//========================================================== +// Public API - balance partitions. +// + +void +as_partition_balance_init() +{ + // Cache hashed pids for all future rebalances. + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + g_hashed_pids[pid] = cf_hash_fnv64((const uint8_t*)&pid, + sizeof(uint32_t)); + } + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + uint32_t n_stored = 0; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + as_storage_info_get(ns, p); + + if (as_partition_version_has_data(&p->version)) { + as_partition_isolate_version(ns, p); + n_stored++; + } + } + + cf_info(AS_PARTITION, "{%s} %u partitions: found %u absent, %u stored", + ns->name, AS_PARTITIONS, AS_PARTITIONS - n_stored, n_stored); + } + + partition_balance_init(); +} + + +// Has the node resolved as operating either in a multi-node cluster or as a +// single-node cluster? +bool +as_partition_balance_is_init_resolved() +{ + return g_init_balance_done; +} + + +void +as_partition_balance_revert_to_orphan() +{ + g_init_balance_done = false; + g_allow_migrations = false; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + client_replica_maps_clear(ns); + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + as_partition_freeze(p); + as_partition_isolate_version(ns, p); + + pthread_mutex_unlock(&p->lock); + } + + ns->n_unavailable_partitions = AS_PARTITIONS; + } + + cf_atomic32_incr(&g_partition_generation); +} + + +void +as_partition_balance() +{ + // Temporary paranoia. + static uint64_t last_cluster_key = 0; + + if (last_cluster_key == as_exchange_cluster_key()) { + cf_warning(AS_PARTITION, "as_partition_balance: cluster key %lx same as last time", + last_cluster_key); + return; + } + + last_cluster_key = as_exchange_cluster_key(); + // End - temporary paranoia. + + // These shortcuts must only be used within the scope of this function. + g_cluster_size = as_exchange_cluster_size(); + g_succession = as_exchange_succession_unsafe(); + + // Each partition separately shuffles the node succession list to generate + // its own node sequence. 
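+	// (The shuffle is a hash sort - per pid, roughly:
+	//
+	//	key[n] = cf_hash_jen64(fnv64(pid), fnv64(node[n])), with n overlaid
+	//	on the low byte; sort key[] descending; map low bytes back to nodes
+	//
+	// - see the worked 4-node example above fill_global_tables() below.)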
+ fill_global_tables(); + + cf_queue mq; + + cf_queue_init(&mq, sizeof(pb_task), g_config.n_namespaces * AS_PARTITIONS, + false); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + balance_namespace(g_config.namespaces[ns_ix], &mq); + } + + prepare_for_appeals(); + + // All partitions now have replicas assigned, ok to allow transactions. + g_init_balance_done = true; + cf_atomic32_incr(&g_partition_generation); + + g_allow_migrations = true; + cf_detail(AS_PARTITION, "allow migrations"); + + process_pb_tasks(&mq); + cf_queue_destroy(&mq); +} + + +uint64_t +as_partition_balance_remaining_migrations() +{ + uint64_t remaining_migrations = 0; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + remaining_migrations += ns->migrate_tx_partitions_remaining; + remaining_migrations += ns->migrate_rx_partitions_remaining; + } + + return remaining_migrations; +} + + +//========================================================== +// Public API - migration-related as_partition methods. +// + +// Currently used only for enterprise build. +bool +as_partition_pending_migrations(as_partition* p) +{ + pthread_mutex_lock(&p->lock); + + bool pending = p->pending_immigrations + p->pending_emigrations > 0; + + pthread_mutex_unlock(&p->lock); + + return pending; +} + + +void +as_partition_emigrate_done(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key, uint32_t tx_flags) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! g_allow_migrations || orig_cluster_key != as_exchange_cluster_key()) { + cf_debug(AS_PARTITION, "{%s:%u} emigrate_done - cluster key mismatch", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return; + } + + if (p->pending_emigrations == 0) { + cf_warning(AS_PARTITION, "{%s:%u} emigrate_done - no pending emigrations", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return; + } + + p->pending_emigrations--; + + int64_t migrates_tx_remaining = + cf_atomic_int_decr(&ns->migrate_tx_partitions_remaining); + + if (migrates_tx_remaining < 0){ + cf_warning(AS_PARTITION, "{%s:%u} (%d,%ld) emigrate_done - counter went negative", + ns->name, pid, p->pending_emigrations, migrates_tx_remaining); + } + + if (! is_self_final_master(p)) { + emigrate_done_advance_non_master_version(ns, p, tx_flags); + } + + if (client_replica_maps_update(ns, pid)) { + cf_atomic32_incr(&g_partition_generation); + } + + cf_queue mq; + pb_task task; + int w_ix = -1; + + if (is_self_final_master(p) && + p->pending_emigrations == 0 && p->pending_immigrations == 0) { + cf_queue_init(&mq, sizeof(pb_task), p->n_witnesses, false); + + for (w_ix = 0; w_ix < (int)p->n_witnesses; w_ix++) { + pb_task_init(&task, p->witnesses[w_ix], ns, pid, orig_cluster_key, + PB_TASK_EMIG_SIGNAL_ALL_DONE, TX_FLAGS_NONE); + cf_queue_push(&mq, &task); + } + } + + pthread_mutex_unlock(&p->lock); + + if (w_ix >= 0) { + while (cf_queue_pop(&mq, &task, CF_QUEUE_NOWAIT) == CF_QUEUE_OK) { + as_migrate_emigrate(&task); + } + + cf_queue_destroy(&mq); + } +} + + +as_migrate_result +as_partition_immigrate_start(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key, cf_node source_node) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! 
g_allow_migrations || orig_cluster_key != as_exchange_cluster_key() || + immigrate_yield()) { + cf_debug(AS_PARTITION, "{%s:%u} immigrate_start - cluster key mismatch", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_AGAIN; + } + + uint32_t num_incoming = (uint32_t)cf_atomic32_incr(&g_migrate_num_incoming); + + if (num_incoming > g_config.migrate_max_num_incoming) { + cf_debug(AS_PARTITION, "{%s:%u} immigrate_start - exceeded max_num_incoming", + ns->name, pid); + cf_atomic32_decr(&g_migrate_num_incoming); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_AGAIN; + } + + if (! partition_immigration_is_valid(p, source_node, ns, "start")) { + cf_atomic32_decr(&g_migrate_num_incoming); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_FAIL; + } + + if (! is_self_final_master(p)) { + immigrate_start_advance_non_master_version(ns, p); + as_storage_info_set(ns, p, true); + } + + pthread_mutex_unlock(&p->lock); + + return AS_MIGRATE_OK; +} + + +as_migrate_result +as_partition_immigrate_done(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key, cf_node source_node) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! g_allow_migrations || orig_cluster_key != as_exchange_cluster_key()) { + cf_debug(AS_PARTITION, "{%s:%u} immigrate_done - cluster key mismatch", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_FAIL; + } + + cf_atomic32_decr(&g_migrate_num_incoming); + + if (! partition_immigration_is_valid(p, source_node, ns, "done")) { + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_FAIL; + } + + p->pending_immigrations--; + + int64_t migrates_rx_remaining = + cf_atomic_int_decr(&ns->migrate_rx_partitions_remaining); + + // Sanity-check only. + if (migrates_rx_remaining < 0) { + cf_warning(AS_PARTITION, "{%s:%u} (%d,%ld) immigrate_done - counter went negative", + ns->name, pid, p->pending_immigrations, migrates_rx_remaining); + } + + if (p->pending_immigrations == 0 && + ! as_partition_version_same(&p->version, &p->final_version)) { + p->version = p->final_version; + as_storage_info_set(ns, p, true); + } + + if (! is_self_final_master(p)) { + if (client_replica_maps_update(ns, pid)) { + cf_atomic32_incr(&g_partition_generation); + } + + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_OK; + } + + // Final master finished an immigration, adjust duplicates. + + if (source_node == p->working_master) { + p->working_master = g_config.self_node; + + immigrate_done_advance_final_master_version(ns, p); + } + else { + p->n_dupl = remove_node(p->dupls, p->n_dupl, source_node); + } + + if (client_replica_maps_update(ns, pid)) { + cf_atomic32_incr(&g_partition_generation); + } + + if (p->pending_immigrations != 0) { + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_OK; + } + + // Final master finished all immigration. 
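+	// Two cases follow - if emigrations are still pending, kick off the
+	// deferred transfers to replicas flagged as immigrators; otherwise signal
+	// all witnesses that the round is over. Either way the tasks are queued
+	// under the partition lock but executed only after it's released,
+	// presumably so as_migrate_emigrate() is never entered with p->lock held.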
+ + cf_queue mq; + pb_task task; + + if (p->pending_emigrations != 0) { + cf_queue_init(&mq, sizeof(pb_task), p->n_replicas - 1, false); + + for (uint32_t repl_ix = 1; repl_ix < p->n_replicas; repl_ix++) { + if (p->immigrators[repl_ix]) { + pb_task_init(&task, p->replicas[repl_ix], ns, pid, + orig_cluster_key, PB_TASK_EMIG_TRANSFER, TX_FLAGS_NONE); + cf_queue_push(&mq, &task); + } + } + } + else { + cf_queue_init(&mq, sizeof(pb_task), p->n_witnesses, false); + + for (uint32_t w_ix = 0; w_ix < p->n_witnesses; w_ix++) { + pb_task_init(&task, p->witnesses[w_ix], ns, pid, orig_cluster_key, + PB_TASK_EMIG_SIGNAL_ALL_DONE, TX_FLAGS_NONE); + cf_queue_push(&mq, &task); + } + } + + pthread_mutex_unlock(&p->lock); + + while (cf_queue_pop(&mq, &task, 0) == CF_QUEUE_OK) { + as_migrate_emigrate(&task); + } + + cf_queue_destroy(&mq); + + return AS_MIGRATE_OK; +} + + +as_migrate_result +as_partition_migrations_all_done(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! g_allow_migrations || orig_cluster_key != as_exchange_cluster_key()) { + cf_debug(AS_PARTITION, "{%s:%u} all_done - cluster key mismatch", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_FAIL; + } + + if (p->pending_emigrations != 0) { + cf_debug(AS_PARTITION, "{%s:%u} all_done - eagain", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_AGAIN; + } + + // Not a replica - drop partition. + if (! is_self_replica(p)) { + p->version = ZERO_VERSION; + as_storage_info_set(ns, p, true); + drop_trees(p, ns); + } + + pthread_mutex_unlock(&p->lock); + + return AS_MIGRATE_OK; +} + + +//========================================================== +// Local helpers - generic. +// + +void +pb_task_init(pb_task* task, cf_node dest, as_namespace* ns, + uint32_t pid, uint64_t cluster_key, pb_task_type type, + uint32_t tx_flags) +{ + task->dest = dest; + task->ns = ns; + task->pid = pid; + task->type = type; + task->tx_flags = tx_flags; + task->cluster_key = cluster_key; +} + + +void +drop_trees(as_partition* p, as_namespace* ns) +{ + as_index_tree* temp = p->vp; + + p->vp = as_index_tree_create(&ns->tree_shared, ns->arena); + as_index_tree_release(temp); + + // TODO - consider p->n_tombstones? + cf_atomic64_set(&p->max_void_time, 0); +} + + +//========================================================== +// Local helpers - balance partitions. +// + +// fill_global_tables() +// +// Succession list - all nodes in cluster +// +---------------+ +// | A | B | C | D | +// +---------------+ +// +// Succession list index (sl_ix) - used as version table and rack-id index +// +---------------+ +// | 0 | 1 | 2 | 3 | +// +---------------+ +// +// Every partition shuffles the succession list independently, e.g. 
for pid 0: +// Hash the node names with the pid: +// H(A,0) = Y, H(B,0) = X, H(C,0) = W, H(D,0) = Z +// Store sl_ix in last byte of hash results so it doesn't affect sort: +// +-----------------------+ +// | Y_0 | X_1 | W_2 | Z_3 | +// +-----------------------+ +// This sorts to: +// +-----------------------+ +// | W_2 | X_1 | Y_0 | Z_3 | +// +-----------------------+ +// Replace original node names, and keep sl_ix order, resulting in: +// +---------------+ +---------------+ +// | C | B | A | D | | 2 | 1 | 0 | 3 | +// +---------------+ +---------------+ +// +// Node sequence table Succession list index table +// pid pid +// +===+---------------+ +===+---------------+ +// | 0 | C | B | A | D | | 0 | 2 | 1 | 0 | 3 | +// +===+---------------+ +===+---------------+ +// | 1 | A | D | C | B | | 1 | 0 | 3 | 2 | 1 | +// +===+---------------+ +===+---------------+ +// | 2 | D | C | B | A | | 2 | 3 | 2 | 1 | 0 | +// +===+---------------+ +===+---------------+ +// | 3 | B | A | D | C | | 3 | 1 | 0 | 3 | 2 | +// +===+---------------+ +===+---------------+ +// | 4 | D | B | C | A | | 4 | 3 | 1 | 2 | 0 | +// +===+---------------+ +===+---------------+ +// ... to pid 4095. +// +// We keep the succession list index table so we can refer back to namespaces' +// partition version tables and rack-id lists, where nodes are in the original +// succession list order. +void +fill_global_tables() +{ + uint64_t hashed_nodes[g_cluster_size]; + + for (uint32_t n = 0; n < g_cluster_size; n++) { + hashed_nodes[n] = cf_hash_fnv64((const uint8_t*)&g_succession[n], + sizeof(cf_node)); + } + + // Build the node sequence table. + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + inter_hash h; + + h.hashed_pid = g_hashed_pids[pid]; + + for (uint32_t n = 0; n < g_cluster_size; n++) { + h.hashed_node = hashed_nodes[n]; + + cf_node* node_p = &FULL_NODE_SEQ(pid, n); + + *node_p = cf_hash_jen64((const uint8_t*)&h, sizeof(h)); + + // Overlay index onto last byte. + *node_p &= AS_CLUSTER_SZ_MASKP; + *node_p += n; + } + + // Sort the hashed node values. + qsort(&FULL_NODE_SEQ(pid, 0), g_cluster_size, sizeof(cf_node), + cf_node_compare_desc); + + // Overwrite the sorted hash values with the original node IDs. + for (uint32_t n = 0; n < g_cluster_size; n++) { + cf_node* node_p = &FULL_NODE_SEQ(pid, n); + sl_ix_t sl_ix = (sl_ix_t)(*node_p & AS_CLUSTER_SZ_MASKN); + + *node_p = g_succession[sl_ix]; + + // Saved to refer back to partition version table and rack-id list. + FULL_SL_IX(pid, n) = sl_ix; + } + } +} + + +void +balance_namespace_ap(as_namespace* ns, cf_queue* mq) +{ + bool ns_less_than_global = ns->cluster_size != g_cluster_size; + + if (ns_less_than_global) { + cf_info(AS_PARTITION, "{%s} is on %u of %u nodes", ns->name, + ns->cluster_size, g_cluster_size); + } + + // Figure out effective replication factor in the face of node failures. + apply_single_replica_limit_ap(ns); + + uint32_t n_racks = rack_count(ns); + + // If a namespace is not on all nodes or is rack aware, it can't use the + // global node sequence and index tables. + bool ns_not_equal_global = ns_less_than_global || n_racks != 1; + + // The translation array is used to convert global table rows to namespace + // rows, if necessary. + int translation[ns_less_than_global ? 
g_cluster_size : 0]; + + if (ns_less_than_global) { + fill_translation(translation, ns); + } + + uint32_t ns_pending_immigrations = 0; + uint32_t ns_pending_emigrations = 0; + uint32_t ns_pending_signals = 0; + + uint32_t ns_fresh_partitions = 0; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + cf_node* full_node_seq = &FULL_NODE_SEQ(pid, 0); + sl_ix_t* full_sl_ix = &FULL_SL_IX(pid, 0); + + // Usually a namespace can simply use the global tables... + cf_node* ns_node_seq = full_node_seq; + sl_ix_t* ns_sl_ix = full_sl_ix; + + cf_node stack_node_seq[ns_not_equal_global ? ns->cluster_size : 0]; + sl_ix_t stack_sl_ix[ns_not_equal_global ? ns->cluster_size : 0]; + + // ... but sometimes a namespace is different. + if (ns_not_equal_global) { + ns_node_seq = stack_node_seq; + ns_sl_ix = stack_sl_ix; + + fill_namespace_rows(full_node_seq, full_sl_ix, ns_node_seq, + ns_sl_ix, ns, translation); + + if (n_racks != 1) { + rack_aware_adjust_row(ns_node_seq, ns_sl_ix, + ns->replication_factor, ns->rack_ids, ns->cluster_size, + n_racks, 1); + } + } + + pthread_mutex_lock(&p->lock); + + p->n_replicas = ns->replication_factor; + memset(p->replicas, 0, sizeof(p->replicas)); + memcpy(p->replicas, ns_node_seq, p->n_replicas * sizeof(cf_node)); + + p->pending_emigrations = 0; + p->pending_immigrations = 0; + memset(p->immigrators, 0, sizeof(p->immigrators)); + + p->working_master = (cf_node)0; + + p->n_dupl = 0; + memset(p->dupls, 0, sizeof(p->dupls)); + + p->n_witnesses = 0; + memset(p->witnesses, 0, sizeof(p->witnesses)); + + uint32_t self_n = find_self(ns_node_seq, ns); + + as_partition_version final_version = { + .ckey = as_exchange_cluster_key() + }; + + p->final_version = final_version; + p->final_version.master = self_n == 0 ? 1 : 0; + + int working_master_n = find_working_master_ap(p, ns_sl_ix, ns); + + uint32_t n_dupl = 0; + cf_node dupls[ns->cluster_size]; + + memset(dupls, 0, sizeof(dupls)); + + // TEMPORARY debugging. + uint32_t debug_n_immigrators = 0; + as_partition_version debug_orig = ZERO_VERSION; + + if (working_master_n == -1) { + // No existing versions - assign fresh version to replicas. + working_master_n = 0; + + if (self_n < p->n_replicas) { + p->version = p->final_version; + } + + ns_fresh_partitions++; + } + else { + n_dupl = find_duplicates_ap(p, ns_node_seq, ns_sl_ix, ns, + (uint32_t)working_master_n, dupls); + + uint32_t n_immigrators = fill_immigrators(p, ns_sl_ix, ns, + (uint32_t)working_master_n, n_dupl); + + // TEMPORARY debugging. + debug_n_immigrators = n_immigrators; + debug_orig = p->version; + + if (n_immigrators != 0) { + // Migrations required - advance versions for next rebalance, + // queue migrations for this rebalance. + + advance_version_ap(p, ns_sl_ix, ns, self_n, + (uint32_t)working_master_n, n_dupl, dupls); + + queue_namespace_migrations(p, ns, self_n, + ns_node_seq[working_master_n], n_dupl, dupls, mq); + + if (self_n == 0) { + fill_witnesses(p, ns_node_seq, ns_sl_ix, ns); + ns_pending_signals += p->n_witnesses; + } + } + else if (self_n < p->n_replicas) { + // No migrations required - refresh replicas' versions (only + // truly necessary if replication factor decreased). + p->version = p->final_version; + } + else { + // No migrations required - drop superfluous non-replica + // partitions immediately. 
+ p->version = ZERO_VERSION; + as_storage_info_set(ns, p, false); + drop_trees(p, ns); + } + } + + if (self_n == 0 || self_n == working_master_n) { + p->working_master = ns_node_seq[working_master_n]; + } + + if (! as_partition_version_is_null(&p->version)) { + as_storage_info_set(ns, p, false); + } + + ns_pending_immigrations += (uint32_t)p->pending_immigrations; + ns_pending_emigrations += (uint32_t)p->pending_emigrations; + + // TEMPORARY debugging. + if (pid < 20) { + cf_debug(AS_PARTITION, "ck%012lX %02u (%d %d) %s -> %s - self_n %u wm_n %d repls %u dupls %u immigrators %u", + as_exchange_cluster_key(), pid, p->pending_emigrations, + p->pending_immigrations, VERSION_AS_STRING(&debug_orig), + VERSION_AS_STRING(&p->version), self_n, working_master_n, + p->n_replicas, n_dupl, debug_n_immigrators); + } + + client_replica_maps_update(ns, pid); + + pthread_mutex_unlock(&p->lock); + } + + // Commit partition versions to device. + // TODO - always flush each partition's version on storage format change. + as_storage_info_flush(ns); + + cf_info(AS_PARTITION, "{%s} rebalanced: expected-migrations (%u,%u) expected-signals %u fresh-partitions %u", + ns->name, ns_pending_emigrations, ns_pending_immigrations, + ns_pending_signals, ns_fresh_partitions); + + ns->n_unavailable_partitions = 0; + + ns->migrate_tx_partitions_initial = ns_pending_emigrations; + ns->migrate_tx_partitions_remaining = ns_pending_emigrations; + + ns->migrate_rx_partitions_initial = ns_pending_immigrations; + ns->migrate_rx_partitions_remaining = ns_pending_immigrations; + + ns->migrate_signals_remaining = ns_pending_signals; +} + + +void +apply_single_replica_limit_ap(as_namespace* ns) +{ + // Replication factor can't be bigger than observed cluster. + uint32_t repl_factor = ns->cluster_size < ns->cfg_replication_factor ? + ns->cluster_size : ns->cfg_replication_factor; + + // Reduce the replication factor to 1 if the cluster size is less than or + // equal to the specified limit. + ns->replication_factor = + ns->cluster_size <= g_config.paxos_single_replica_limit ? + 1 : repl_factor; + + cf_info(AS_PARTITION, "{%s} replication factor is %u", ns->name, + ns->replication_factor); +} + + +uint32_t +rack_count(const as_namespace* ns) +{ + uint32_t ids[ns->cluster_size]; + + memcpy(ids, ns->rack_ids, sizeof(ids)); + qsort(ids, ns->cluster_size, sizeof(uint32_t), cf_compare_uint32_desc); + + if (ids[0] == ids[ns->cluster_size - 1]) { + return 1; // common path - not rack-aware + } + + uint32_t n_racks = 1; + uint32_t cur_id = ids[0]; + + for (uint32_t i = 1; i < ns->cluster_size; i++) { + if (ids[i] != cur_id) { + cur_id = ids[i]; + n_racks++; + } + } + + return n_racks; +} + + +void +fill_translation(int translation[], const as_namespace* ns) +{ + int ns_n = 0; + + for (uint32_t full_n = 0; full_n < g_cluster_size; full_n++) { + translation[full_n] = ns_n < ns->cluster_size && + g_succession[full_n] == ns->succession[ns_n] ? ns_n++ : -1; + } +} + + +void +fill_namespace_rows(const cf_node* full_node_seq, const sl_ix_t* full_sl_ix, + cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, const as_namespace* ns, + const int translation[]) +{ + if (ns->cluster_size == g_cluster_size) { + // Rack-aware but namespace is on all nodes - just copy. Rack-aware will + // rearrange the copies - we can't rearrange the global originals. 
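/*
 * A minimal standalone sketch (illustrative only, not part of this patch) of
 * the clamping rule in apply_single_replica_limit_ap() above, restated as a
 * pure function. The function name and the sample values in the comments are
 * hypothetical.
 */
#include <stdint.h>

static uint32_t
effective_repl_factor(uint32_t cluster_size, uint32_t cfg_repl_factor,
        uint32_t single_replica_limit)
{
    // Replication factor can't be bigger than the observed cluster.
    uint32_t repl_factor = cluster_size < cfg_repl_factor ?
            cluster_size : cfg_repl_factor;

    // At or below the single-replica limit, force replication factor 1.
    return cluster_size <= single_replica_limit ? 1 : repl_factor;
}
// effective_repl_factor(5, 2, 1) == 2 - normal case
// effective_repl_factor(3, 4, 1) == 3 - clamped by observed cluster size
// effective_repl_factor(1, 2, 1) == 1 - single-replica limit applies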
+ memcpy(ns_node_seq, full_node_seq, g_cluster_size * sizeof(cf_node)); + memcpy(ns_sl_ix, full_sl_ix, g_cluster_size * sizeof(sl_ix_t)); + + return; + } + + // Fill namespace sequences from global table rows using translation array. + uint32_t n = 0; + + for (uint32_t full_n = 0; full_n < g_cluster_size; full_n++) { + int ns_n = translation[full_sl_ix[full_n]]; + + if (ns_n != -1) { + ns_node_seq[n] = ns->succession[ns_n]; + ns_sl_ix[n] = (sl_ix_t)ns_n; + n++; + } + } +} + + +uint32_t +find_self(const cf_node* ns_node_seq, const as_namespace* ns) +{ + int n = index_of_node(ns_node_seq, ns->cluster_size, g_config.self_node); + + cf_assert(n != -1, AS_PARTITION, "{%s} self node not in succession list", + ns->name); + + return (uint32_t)n; +} + + +// Preference: Vm > V > Ve > Vs > Vse > absent. +int +find_working_master_ap(const as_partition* p, const sl_ix_t* ns_sl_ix, + const as_namespace* ns) +{ + int best_n = -1; + int best_score = -1; + + for (int n = 0; n < (int)ns->cluster_size; n++) { + const as_partition_version* version = INPUT_VERSION(n); + + // Skip versions with no data. + if (! as_partition_version_has_data(version)) { + continue; + } + + // If previous working master exists, use it. (There can be more than + // one after split brains. Also, the flag is only to prevent superfluous + // master swaps on rebalance when rack-aware.) + if (version->master == 1) { + return n; + } + // else - keep going but remember the best so far. + + // V = 3 > Ve = 2 > Vs = 1 > Vse = 0. + int score = (version->evade == 1 ? 0 : 1) + + (version->subset == 1 ? 0 : 2); + + if (score > best_score) { + best_score = score; + best_n = n; + } + } + + return best_n; +} + + +uint32_t +find_duplicates_ap(const as_partition* p, const cf_node* ns_node_seq, + const sl_ix_t* ns_sl_ix, const as_namespace* ns, + uint32_t working_master_n, cf_node dupls[]) +{ + uint32_t n_dupl = 0; + as_partition_version parent_dupl_versions[ns->cluster_size]; + + memset(parent_dupl_versions, 0, sizeof(parent_dupl_versions)); + + for (uint32_t n = 0; n < ns->cluster_size; n++) { + const as_partition_version* version = INPUT_VERSION(n); + + // Skip versions without data, and postpone subsets to next pass. + if (! as_partition_version_has_data(version) || version->subset == 1) { + continue; + } + + // Every unique version is a duplicate. + if (version->family == VERSION_FAMILY_UNIQUE) { + dupls[n_dupl++] = ns_node_seq[n]; + continue; + } + + // Add parent versions as duplicates, unless they are already in. + + uint32_t d; + + for (d = 0; d < n_dupl; d++) { + if (is_family_same(&parent_dupl_versions[d], version)) { + break; + } + } + + if (d == n_dupl) { + // Not in dupls. + parent_dupl_versions[n_dupl] = *version; + dupls[n_dupl++] = ns_node_seq[n]; + } + } + + // Second pass to deal with subsets. + for (uint32_t n = 0; n < ns->cluster_size; n++) { + const as_partition_version* version = INPUT_VERSION(n); + + if (version->subset == 0) { + continue; + } + + uint32_t d; + + for (d = 0; d < n_dupl; d++) { + if (is_family_same(&parent_dupl_versions[d], version)) { + break; + } + } + + if (d == n_dupl) { + // Not in dupls. + // Leave 0 in parent_dupl_versions array. + dupls[n_dupl++] = ns_node_seq[n]; + } + } + + // Remove working master from 'variants' to leave duplicates. 
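/*
 * A minimal sketch (illustrative only) of the two-bit score computed by
 * find_working_master_ap() above; it encodes the stated preference
 * V > Ve > Vs > Vse when no previous working master is flagged.
 */
static int
version_score(int evade, int subset)
{
    return (evade == 1 ? 0 : 1) + (subset == 1 ? 0 : 2);
}
// version_score(0, 0) == 3 - V   (full version)
// version_score(1, 0) == 2 - Ve  (evade flag set)
// version_score(0, 1) == 1 - Vs  (subset)
// version_score(1, 1) == 0 - Vse (subset + evade)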
+ return remove_node(dupls, n_dupl, ns_node_seq[working_master_n]); +} + + +uint32_t +fill_immigrators(as_partition* p, const sl_ix_t* ns_sl_ix, as_namespace* ns, + uint32_t working_master_n, uint32_t n_dupl) +{ + uint32_t n_immigrators = 0; + + for (uint32_t repl_ix = 0; repl_ix < p->n_replicas; repl_ix++) { + const as_partition_version* version = INPUT_VERSION(repl_ix); + + if (n_dupl != 0 || (repl_ix != working_master_n && + (! as_partition_version_has_data(version) || + version->subset == 1))) { + p->immigrators[repl_ix] = true; + n_immigrators++; + } + } + + return n_immigrators; +} + + +void +advance_version_ap(as_partition* p, const sl_ix_t* ns_sl_ix, as_namespace* ns, + uint32_t self_n, uint32_t working_master_n, uint32_t n_dupl, + const cf_node dupls[]) +{ + // Advance working master. + if (self_n == working_master_n) { + p->version.ckey = p->final_version.ckey; + p->version.family = (self_n == 0 || n_dupl == 0) ? 0 : 1; + p->version.master = 1; + p->version.subset = 0; + p->version.evade = 0; + + return; + } + + p->version.master = 0; + + bool self_is_versionless = ! as_partition_version_has_data(&p->version); + + // Advance eventual master. + if (self_n == 0) { + bool was_subset = p->version.subset == 1; + + p->version.ckey = p->final_version.ckey; + p->version.family = 0; + p->version.subset = n_dupl == 0 ? 1 : 0; + + if (self_is_versionless || (was_subset && p->version.subset == 0)) { + p->version.evade = 1; + } + // else - don't change evade flag. + + return; + } + + // Advance version-less proles and non-replicas (common case). + if (self_is_versionless) { + if (self_n < p->n_replicas) { + p->version.ckey = p->final_version.ckey; + p->version.family = 0; + p->version.subset = 1; + p->version.evade = 1; + } + // else - non-replicas remain version-less. + + return; + } + + // Fill family versions. + + uint32_t max_n_families = p->n_replicas + 1; + + if (max_n_families > AS_PARTITION_N_FAMILIES) { + max_n_families = AS_PARTITION_N_FAMILIES; + } + + as_partition_version family_versions[max_n_families]; + uint32_t n_families = fill_family_versions(p, ns_sl_ix, ns, + working_master_n, n_dupl, dupls, family_versions); + + uint32_t family = find_family(&p->version, n_families, family_versions); + + // Advance non-masters with prior versions ... + + // ... proles ... + if (self_n < p->n_replicas) { + p->version.ckey = p->final_version.ckey; + p->version.family = family; + + if (n_dupl != 0 && p->version.family == 0) { + p->version.subset = 1; + } + // else - don't change either subset or evade flag. + + return; + } + + // ... or non-replicas. + if (family != VERSION_FAMILY_UNIQUE && + family_versions[family].subset == 0) { + p->version.ckey = p->final_version.ckey; + p->version.family = family; + p->version.subset = 1; + } + // else - leave version as-is. 
+} + + +uint32_t +fill_family_versions(const as_partition* p, const sl_ix_t* ns_sl_ix, + const as_namespace* ns, uint32_t working_master_n, uint32_t n_dupl, + const cf_node dupls[], as_partition_version family_versions[]) +{ + uint32_t n_families = 1; + const as_partition_version* final_master_version = INPUT_VERSION(0); + + family_versions[0] = *final_master_version; + + if (working_master_n != 0) { + const as_partition_version* working_master_version = + INPUT_VERSION(working_master_n); + + if (n_dupl == 0) { + family_versions[0] = *working_master_version; + } + else { + family_versions[0] = p->final_version; // not matchable + family_versions[1] = *working_master_version; + n_families = 2; + } + } + + for (uint32_t repl_ix = 1; + repl_ix < p->n_replicas && n_families < AS_PARTITION_N_FAMILIES; + repl_ix++) { + if (repl_ix == working_master_n) { + continue; + } + + const as_partition_version* version = INPUT_VERSION(repl_ix); + + if (contains_node(dupls, n_dupl, p->replicas[repl_ix])) { + family_versions[n_families++] = *version; + } + else if (version->subset == 1 && + ! has_replica_parent(p, ns_sl_ix, ns, version, repl_ix)) { + family_versions[n_families++] = *version; + } + } + + return n_families; +} + + +bool +has_replica_parent(const as_partition* p, const sl_ix_t* ns_sl_ix, + const as_namespace* ns, const as_partition_version* subset_version, + uint32_t subset_n) +{ + for (uint32_t repl_ix = 1; repl_ix < p->n_replicas; repl_ix++) { + if (repl_ix == subset_n) { + continue; + } + + const as_partition_version* version = INPUT_VERSION(repl_ix); + + if (version->subset == 0 && is_family_same(version, subset_version)) { + return true; + } + } + + return false; +} + + +uint32_t +find_family(const as_partition_version* self_version, uint32_t n_families, + const as_partition_version family_versions[]) +{ + for (uint32_t n = 0; n < n_families; n++) { + if (is_family_same(self_version, &family_versions[n])) { + return n; + } + } + + return VERSION_FAMILY_UNIQUE; +} + + +void +queue_namespace_migrations(as_partition* p, as_namespace* ns, uint32_t self_n, + cf_node working_master, uint32_t n_dupl, cf_node dupls[], cf_queue* mq) +{ + pb_task task; + + if (self_n == 0) { + // <><><><><><> Final Master <><><><><><> + + if (g_config.self_node == working_master) { + p->pending_immigrations = (int)n_dupl; + } + else { + // Remove self from duplicates. + n_dupl = remove_node(dupls, n_dupl, g_config.self_node); + + p->pending_immigrations = (int)n_dupl + 1; + } + + if (n_dupl != 0) { + p->n_dupl = n_dupl; + memcpy(p->dupls, dupls, n_dupl * sizeof(cf_node)); + } + + if (p->pending_immigrations != 0) { + for (uint32_t repl_ix = 1; repl_ix < p->n_replicas; repl_ix++) { + if (p->immigrators[repl_ix]) { + p->pending_emigrations++; + } + } + + // Emigrate later, after all immigration is complete. + return; + } + + // Emigrate now, no immigrations to wait for. 
+ for (uint32_t repl_ix = 1; repl_ix < p->n_replicas; repl_ix++) { + if (p->immigrators[repl_ix]) { + p->pending_emigrations++; + pb_task_init(&task, p->replicas[repl_ix], ns, p->id, + as_exchange_cluster_key(), PB_TASK_EMIG_TRANSFER, + TX_FLAGS_NONE); + cf_queue_push(mq, &task); + } + } + + return; + } + // else - <><><><><><> Not Final Master <><><><><><> + + if (g_config.self_node == working_master) { + if (n_dupl != 0) { + p->n_dupl = n_dupl; + memcpy(p->dupls, dupls, n_dupl * sizeof(cf_node)); + } + + p->pending_emigrations = 1; + pb_task_init(&task, p->replicas[0], ns, p->id, + as_exchange_cluster_key(), PB_TASK_EMIG_TRANSFER, + TX_FLAGS_ACTING_MASTER); + cf_queue_push(mq, &task); + } + else if (contains_self(dupls, n_dupl)) { + p->pending_emigrations = 1; + pb_task_init(&task, p->replicas[0], ns, p->id, + as_exchange_cluster_key(), PB_TASK_EMIG_TRANSFER, + TX_FLAGS_NONE); + cf_queue_push(mq, &task); + } + + if (self_n < p->n_replicas && p->immigrators[self_n]) { + p->pending_immigrations = 1; + } +} + + +void +fill_witnesses(as_partition* p, const cf_node* ns_node_seq, + const sl_ix_t* ns_sl_ix, as_namespace* ns) +{ + for (uint32_t n = 1; n < ns->cluster_size; n++) { + const as_partition_version* version = INPUT_VERSION(n); + + // Note - 0e versions (CP) are witnesses. + if (n < p->n_replicas || ! as_partition_version_is_null(version)) { + p->witnesses[p->n_witnesses++] = ns_node_seq[n]; + } + } +} + + +//========================================================== +// Local helpers - migration-related as_partition methods. +// + +// Sanity checks for immigrations commands. +bool +partition_immigration_is_valid(const as_partition* p, cf_node source_node, + const as_namespace* ns, const char* tag) +{ + char* failure_reason = NULL; + + if (p->pending_immigrations == 0) { + failure_reason = "no immigrations expected"; + } + else if (is_self_final_master(p)) { + if (source_node != p->working_master && + ! contains_node(p->dupls, p->n_dupl, source_node)) { + failure_reason = "final master's source not acting master or duplicate"; + } + } + else if (source_node != p->replicas[0]) { + failure_reason = "prole's source not final working master"; + } + + if (failure_reason) { + cf_warning(AS_PARTITION, "{%s:%u} immigrate_%s - source %lx working-master %lx pending-immigrations %d - %s", + ns->name, p->id, tag, source_node, p->working_master, + p->pending_immigrations, failure_reason); + + return false; + } + + return true; +} + + +void +emigrate_done_advance_non_master_version_ap(as_namespace* ns, as_partition* p, + uint32_t tx_flags) +{ + if ((tx_flags & TX_FLAGS_ACTING_MASTER) != 0) { + p->working_master = (cf_node)0; + p->n_dupl = 0; + p->version.master = 0; + } + + p->version.ckey = p->final_version.ckey; + p->version.family = 0; + + if (p->pending_immigrations != 0 || ! is_self_replica(p)) { + p->version.subset = 1; + } + // else - must already be a parent. + + as_storage_info_set(ns, p, true); +} + + +void +immigrate_start_advance_non_master_version_ap(as_partition* p) +{ + // Become subset of final version if not already such. + if (! (p->version.ckey == p->final_version.ckey && + p->version.family == 0 && p->version.subset == 1)) { + p->version.ckey = p->final_version.ckey; + p->version.family = 0; + p->version.master = 0; // racing emigrate done if we were acting master + p->version.subset = 1; + // Leave evade flag as-is. + } +} + + +void +immigrate_done_advance_final_master_version_ap(as_namespace* ns, + as_partition* p) +{ + if (! 
as_partition_version_same(&p->version, &p->final_version)) { + p->version = p->final_version; + as_storage_info_set(ns, p, true); + } +} diff --git a/as/src/fabric/partition_balance_ce.c b/as/src/fabric/partition_balance_ce.c new file mode 100644 index 00000000..0f59de87 --- /dev/null +++ b/as/src/fabric/partition_balance_ce.c @@ -0,0 +1,126 @@ +/* + * partition_balance_ce.c + * + * Copyright (C) 2017-2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "fabric/partition_balance.h" + +#include +#include + +#include "citrusleaf/cf_queue.h" + +#include "node.h" + +#include "base/datamodel.h" +#include "fabric/partition.h" +#include "fabric/migrate.h" + + +//========================================================== +// Public API. +// + +void +as_partition_balance_emigration_yield() +{ +} + +bool +as_partition_balance_revive(as_namespace* ns) +{ + cf_warning(AS_PARTITION, "revive is an enterprise feature"); + return true; +} + +bool +as_partition_pre_emigrate_done(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key, uint32_t tx_flags) +{ + return true; +} + + +//========================================================== +// Private API - for enterprise separation only. +// + +void +partition_balance_init() +{ +} + +void +balance_namespace(as_namespace* ns, cf_queue* mq) +{ + balance_namespace_ap(ns, mq); +} + +void +prepare_for_appeals() +{ +} + +void +process_pb_tasks(cf_queue* tq) +{ + pb_task task; + + while (cf_queue_pop(tq, &task, CF_QUEUE_NOWAIT) == CF_QUEUE_OK) { + as_migrate_emigrate(&task); + } +} + +void +rack_aware_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, + uint32_t replication_factor, const uint32_t* rack_ids, uint32_t n_ids, + uint32_t n_racks, uint32_t start_n) +{ + cf_crash(AS_PARTITION, "CE code called rack_aware_adjust_row()"); +} + +void +emigrate_done_advance_non_master_version(as_namespace* ns, as_partition* p, + uint32_t tx_flags) +{ + emigrate_done_advance_non_master_version_ap(ns, p, tx_flags); +} + +void +immigrate_start_advance_non_master_version(as_namespace* ns, as_partition* p) +{ + immigrate_start_advance_non_master_version_ap(p); +} + +void +immigrate_done_advance_final_master_version(as_namespace* ns, as_partition* p) +{ + immigrate_done_advance_final_master_version_ap(ns, p); +} + +bool +immigrate_yield() +{ + return false; +} diff --git a/as/src/fabric/partition_ce.c b/as/src/fabric/partition_ce.c new file mode 100644 index 00000000..86520f73 --- /dev/null +++ b/as/src/fabric/partition_ce.c @@ -0,0 +1,67 @@ +/* + * partition_ce.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "fabric/partition.h" + +#include + +#include "node.h" + +#include "base/datamodel.h" +#include "base/proto.h" + + +//========================================================== +// Public API. +// + +void +as_partition_isolate_version(const as_namespace* ns, as_partition* p) +{ + if (as_partition_version_has_data(&p->version)) { + p->version.master = 0; + p->version.subset = 1; + } +} + +int +as_partition_check_source(const as_namespace* ns, as_partition* p, cf_node src, + bool* from_replica) +{ + return AS_PROTO_RESULT_OK; +} + + +//========================================================== +// Private API - for enterprise separation only. +// + +bool +partition_reserve_promote(const as_namespace* ns, const as_partition* p, + bool would_dup_res) +{ + return p->n_dupl != 0 && would_dup_res; +} diff --git a/as/src/fabric/roster_ce.c b/as/src/fabric/roster_ce.c new file mode 100644 index 00000000..f86be963 --- /dev/null +++ b/as/src/fabric/roster_ce.c @@ -0,0 +1,50 @@ +/* + * roster_ce.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "fabric/roster.h" + +#include + +#include "fault.h" + + +//========================================================== +// Public API. +// + +void +as_roster_init_smd() +{ + // CE Code doesn't invoke roster SMD module. + // TODO - how to handle with future static SMD module initialization? +} + +bool +as_roster_set_nodes_cmd(const char* ns_name, const char* nodes) +{ + cf_warning(AS_ROSTER, "roster is an enterprise feature"); + return false; +} diff --git a/as/src/fabric/skew_monitor.c b/as/src/fabric/skew_monitor.c new file mode 100644 index 00000000..44ea0339 --- /dev/null +++ b/as/src/fabric/skew_monitor.c @@ -0,0 +1,611 @@ +/* + * skew_monitor.c + * + * Copyright (C) 2012-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "fabric/skew_monitor.h"
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+
+#include "citrusleaf/alloc.h"
+
+#include "msg.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "fabric/clustering.h"
+#include "fabric/exchange.h"
+#include "fabric/hb.h"
+
+/*
+ * Overview
+ * ========
+ * Monitors skew across nodes in a cluster to allow other modules to handle skew
+ * beyond tolerances. For example, CP namespaces block transactions on skew
+ * beyond tolerable limits.
+ *
+ * Principle of skew monitoring
+ * ============================
+ * The hlc clock forms a pretty close upper bound on the physical clocks for
+ * adjacent nodes within the bounds of network trip time.
+ *
+ * Let's call the difference between a node's physical component of hlc time and
+ * physical time at the same instant its hlc_delta.
+ * The premise is that the difference between the min hlc_delta and max
+ * hlc_delta observed for adjacent nodes closely follows the maximum clock skew
+ * in the cluster.
+ *
+ * The clock skew monitor adds a physical timestamp field to each heartbeat
+ * pulse message.
+ * For a peer node, on receipt of a heartbeat pulse, hlc_delta is computed as
+ * hlc_delta = physical-component(pulse-hlc) - pulse-timestamp
+ *
+ * We maintain an exponential moving average of the hlc_delta to buffer against
+ * small fluctuations
+ * avg_hlc_delta = (ALPHA)(hlc_delta) + (1-ALPHA)(avg_hlc_delta)
+ *
+ * where ALPHA is set to weigh current values more heavily than older values.
+ *
+ * Cluster-wide clock skew is updated at periodic intervals. A low water mark
+ * breach of the skew generates warnings and a high water mark breach causes
+ * (TODO: ????).
+ *
+ * Design
+ * ======
+ * The monitor ticks on heartbeat message sends without requiring an
+ * additional thread. This is alright as heartbeat pulse messages are the
+ * vehicle used for skew detection. The amount of computation amortized across
+ * sent heartbeat pulse messages is minimal and should be kept so.
+ */
+
+/*
+ * ----------------------------------------------------------------------------
+ * Constants
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Weight of the current clock delta relative to the current moving average.
+ * For now weigh recent values heavily over older values.
+ */
+#define ALPHA (0.65)
+
+/*
+ * ----------------------------------------------------------------------------
+ * Logging
+ * ----------------------------------------------------------------------------
+ */
+#define CRASH(format, ...) cf_crash(AS_SKEW, format, ##__VA_ARGS__)
+#define WARNING(format, ...) cf_warning(AS_SKEW, format, ##__VA_ARGS__)
+#define INFO(format, ...) cf_info(AS_SKEW, format, ##__VA_ARGS__)
+#define DEBUG(format, ...) cf_debug(AS_SKEW, format, ##__VA_ARGS__)
+#define DETAIL(format, ...) cf_detail(AS_SKEW, format, ##__VA_ARGS__)
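/*
 * A minimal standalone sketch (illustrative only, not part of this patch) of
 * the ALPHA-weighted moving average described in the overview above, with
 * hypothetical hlc deltas in milliseconds, showing how a one-off spike
 * registers quickly and then decays.
 */
#include <stdio.h>

int
main(void)
{
    double alpha = 0.65; // same weighting as the ALPHA constant above
    double avg = 0.0;
    double samples[] = { 10.0, 10.0, 100.0, 10.0 }; // one-off spike

    for (int i = 0; i < 4; i++) {
        avg = alpha * samples[i] + (1.0 - alpha) * avg;
        printf("sample %.0f -> avg %.1f\n", samples[i], avg);
    }
    // Prints avgs 6.5, 8.8, 68.1, 30.3 - recent values dominate, so the
    // spike shows up immediately and is damped within a few samples.

    return 0;
}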
+
+/*
+ * ----------------------------------------------------------------------------
+ * Skew monitor data structures
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * A struct to hold a node and its skew-related information.
+ */
+typedef struct as_skew_monitor_node_skew_data_s
+{
+    cf_node nodeid;
+    int64_t delta;
+} as_skew_monitor_node_skew_data;
+
+/**
+ * HB plugin data iterate udata, to collect node hlc deltas.
+ */
+typedef struct as_skew_monitor_hlc_delta_udata_s
+{
+    int num_nodes;
+    as_skew_monitor_node_skew_data skew_data[AS_CLUSTER_SZ];
+} as_skew_monitor_hlc_delta_udata;
+
+/*
+ * ----------------------------------------------------------------------------
+ * External protected API for skew monitor
+ * ----------------------------------------------------------------------------
+ */
+extern int
+as_hb_msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts);
+
+/*
+ * ----------------------------------------------------------------------------
+ * Globals
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Last time skew was checked.
+ */
+cf_atomic64 g_last_skew_check_time = 0;
+
+/**
+ * Current value of clock skew.
+ */
+cf_atomic64 g_skew = 0;
+
+/**
+ * Moving average of the clock skew for self node.
+ */
+volatile int64_t g_self_skew_avg = 0;
+
+/*
+ * ----------------------------------------------------------------------------
+ * Skew intervals and limits
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Interval at which skew checks should be made.
+ */
+static uint64_t
+skew_check_interval()
+{
+    return MIN(2000, as_clustering_quantum_interval() / 2);
+}
+
+/**
+ * Threshold for outlier detection. Skew values less than this threshold will
+ * not invoke outlier detection.
+ */
+static uint64_t
+skew_monitor_outlier_detection_threshold()
+{
+    return as_clustering_quantum_interval();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * HLC delta related
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Find min and max skew using the difference between the physical clock and
+ * the hlc.
+ */
+static void
+skew_monitor_delta_collect_iterate(cf_node nodeid, void* plugin_data,
+        size_t plugin_data_size, cf_clock recv_monotonic_ts,
+        as_hlc_msg_timestamp* msg_hlc_ts, void* udata)
+{
+    int64_t delta = 0;
+    as_skew_monitor_hlc_delta_udata* deltas =
+            (as_skew_monitor_hlc_delta_udata*)udata;
+
+    if (!plugin_data || plugin_data_size < sizeof(uint64_t)) {
+        // Assume missing nodes share the same delta as self.
+        // Note: self node will not be in the adjacency list and hence will
+        // also follow the same code path.
+        delta = g_self_skew_avg;
+    }
+    else {
+        delta = *(int64_t*)plugin_data;
+    }
+
+    int index = deltas->num_nodes;
+    deltas->skew_data[index].delta = delta;
+    deltas->skew_data[index].nodeid = nodeid;
+    deltas->num_nodes++;
+}
+
+/**
+ * Compute the skew across the cluster.
+ */
+static uint64_t
+skew_monitor_compute_skew()
+{
+    uint64_t skew = 0;
+    uint8_t buffer[AS_CLUSTER_SZ * sizeof(cf_node)];
+    cf_vector succession = { 0 };
+
+    cf_vector_init_smalloc(&succession, sizeof(cf_node), buffer, sizeof(buffer),
+            VECTOR_FLAG_INITZERO);
+    as_exchange_succession(&succession);
+
+    if (cf_vector_size(&succession) <= 1) {
+        // Self node is an orphan or single node cluster. No cluster-wide skew.
+        skew = 0;
+        goto Cleanup;
+    }
+
+    as_skew_monitor_hlc_delta_udata udata = { 0 };
+    as_hb_plugin_data_iterate(&succession, AS_HB_PLUGIN_SKEW_MONITOR,
+            skew_monitor_delta_collect_iterate, &udata);
+
+    int64_t min = INT64_MAX;
+    int64_t max = INT64_MIN;
+
+    for (int i = 0; i < udata.num_nodes; i++) {
+        int64_t delta = udata.skew_data[i].delta;
+        if (delta < min) {
+            min = delta;
+        }
+
+        if (delta > max) {
+            max = delta;
+        }
+    }
+    skew = max - min;
+
+Cleanup:
+    cf_vector_destroy(&succession);
+    return skew;
+}
+
+/**
+ * Update clock skew and fire skew events.
+ */
+static void
+skew_monitor_update()
+{
+    cf_clock now = cf_getms();
+    cf_atomic64_set(&g_last_skew_check_time, now);
+
+    uint64_t skew = skew_monitor_compute_skew();
+    uint64_t avg_skew = cf_atomic64_get(g_skew);
+    avg_skew = ALPHA * skew + (1 - ALPHA) * avg_skew;
+    cf_atomic64_set(&g_skew, avg_skew);
+
+    for (int i = 0; i < g_config.n_namespaces; i++) {
+        as_namespace* ns = g_config.namespaces[i];
+        handle_clock_skew(ns, avg_skew);
+    }
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Outlier detection
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Comparator for deltas.
+ */
+static int
+skew_monitor_hlc_delta_compare(const void* o1, const void* o2)
+{
+    int64_t delta1 = ((as_skew_monitor_node_skew_data*)o1)->delta;
+    int64_t delta2 = ((as_skew_monitor_node_skew_data*)o2)->delta;
+
+    return delta1 > delta2 ? 1 : (delta1 == delta2 ? 0 : -1);
+}
+
+/**
+ * Compute the index of the median element of a sorted range.
+ * @param from the start index (inclusive)
+ * @param to the end index (inclusive)
+ * @return the index of the median element
+ */
+static int
+skew_monitor_median_index(int from, int to)
+{
+    int num_elements = to - from + 1;
+    if (num_elements < 0) {
+        return from;
+    }
+    return (to + from) / 2;
+}
+
+/**
+ * Return the currently estimated outliers from our cluster.
+ * Outliers should have space to hold at least AS_CLUSTER_SZ nodes.
+ */
+static uint32_t
+skew_monitor_outliers_from_skew_data(cf_vector* outliers,
+        as_skew_monitor_hlc_delta_udata* udata)
+{
+    // Use inter-quartile distance to detect outliers.
+    // Sort the deltas in ascending order.
+    qsort(udata->skew_data, udata->num_nodes,
+            sizeof(as_skew_monitor_node_skew_data),
+            skew_monitor_hlc_delta_compare);
+    int q2_index = skew_monitor_median_index(0, udata->num_nodes - 1);
+    int q3_index = skew_monitor_median_index(q2_index, udata->num_nodes - 1);
+    int q1_index = skew_monitor_median_index(0, q2_index);
+    int64_t q3 = udata->skew_data[q3_index].delta;
+    int64_t q1 = udata->skew_data[q1_index].delta;
+
+    // Compute the inter-quartile range. Lower-bound the iqr by the max
+    // network latency to allow some fuzziness with tight clock grouping.
+    int64_t iqr = MAX(q3 - q1, g_config.fabric_latency_max_ms);
+    double lower_bound = q1 - 1.5 * iqr;
+    double upper_bound = q3 + 1.5 * iqr;
+
+    uint32_t num_outliers = 0;
+
+    // Isolate outliers.
+    for (int i = 0; i < udata->num_nodes; i++) {
+        if (udata->skew_data[i].delta < lower_bound
+                || udata->skew_data[i].delta > upper_bound) {
+            if (outliers) {
+                cf_vector_append(outliers, &udata->skew_data[i].nodeid);
+            }
+
+            num_outliers++;
+        }
+    }
+
+    return num_outliers;
+}
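/*
 * A minimal worked instance (illustrative only, not part of this patch) of
 * the quartile rule in skew_monitor_outliers_from_skew_data() above, with
 * hypothetical sorted deltas and the fabric-latency floor on the iqr ignored.
 * The index arithmetic mirrors skew_monitor_median_index().
 */
#include <stdint.h>

static int
iqr_is_outlier(const int64_t* sorted, int n, int i)
{
    int q2_ix = (n - 1) / 2;         // median of the whole range
    int q3_ix = (q2_ix + n - 1) / 2; // median of the upper half
    int q1_ix = q2_ix / 2;           // median of the lower half
    double iqr = (double)(sorted[q3_ix] - sorted[q1_ix]);

    return sorted[i] < sorted[q1_ix] - 1.5 * iqr
            || sorted[i] > sorted[q3_ix] + 1.5 * iqr;
}
// For sorted deltas { -2, -1, 0, 1, 90 }: q1 = -1, q3 = 1, iqr = 2, so the
// acceptance band is [-4, 4] and the node with delta 90 is an outlier.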
+
+/**
+ * Return the currently estimated outliers from our cluster.
+ * Outliers should have space to hold at least AS_CLUSTER_SZ nodes.
+ */
+static uint32_t
+skew_monitor_outliers(cf_vector* outliers)
+{
+    if (as_skew_monitor_skew() < skew_monitor_outlier_detection_threshold()) {
+        // Skew is not significant. Skip printing outliers.
+        return 0;
+    }
+
+    uint8_t buffer[AS_CLUSTER_SZ * sizeof(cf_node)];
+    cf_vector succession;
+    cf_vector_init_smalloc(&succession, sizeof(cf_node), buffer, sizeof(buffer),
+            VECTOR_FLAG_INITZERO);
+    as_exchange_succession(&succession);
+
+    uint32_t num_outliers = 0;
+
+    uint32_t cluster_size = cf_vector_size(&succession);
+    if (cluster_size <= 1) {
+        // Self node is an orphan or single node cluster. No cluster-wide skew.
+        goto Cleanup;
+    }
+
+    as_skew_monitor_hlc_delta_udata udata = { 0 };
+    as_hb_plugin_data_iterate(&succession, AS_HB_PLUGIN_SKEW_MONITOR,
+            skew_monitor_delta_collect_iterate, &udata);
+
+    num_outliers = skew_monitor_outliers_from_skew_data(outliers, &udata);
+
+Cleanup:
+    cf_vector_destroy(&succession);
+
+    return num_outliers;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * HB plugin functions
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Push current timestamp for self node into the heartbeat pulse message.
+ */
+static void
+skew_monitor_hb_plugin_set_fn(msg* msg)
+{
+    cf_clock send_ts = cf_clock_getabsolute();
+    if (msg_set_uint64(msg, AS_HB_MSG_SKEW_MONITOR_DATA, send_ts) != 0) {
+        CRASH("error setting current timestamp on msg");
+    }
+
+    // Update self skew.
+    as_hlc_timestamp send_hlc_ts = as_hlc_timestamp_now();
+    int64_t clock_delta = as_hlc_physical_ts_get(send_hlc_ts) - send_ts;
+
+    // Update the average delta for self.
+    g_self_skew_avg = clock_delta * ALPHA + (1 - ALPHA) * (g_self_skew_avg);
+
+    cf_clock now = cf_getms();
+    if (cf_atomic64_get(g_last_skew_check_time) + skew_check_interval() < now) {
+        skew_monitor_update();
+    }
+}
+
+/**
+ * Compare the HLC timestamp and the physical clock and store the difference as
+ * plugin data for the source node to enable skew detection.
+ */
+static void
+skew_monitor_hb_plugin_parse_data_fn(msg* msg, cf_node source,
+        as_hb_plugin_node_data* plugin_data)
+{
+    cf_clock send_ts = 0;
+    as_hlc_timestamp send_hlc_ts = 0;
+    if (msg_get_uint64(msg, AS_HB_MSG_SKEW_MONITOR_DATA, &send_ts) != 0
+            || as_hb_msg_send_hlc_ts_get(msg, &send_hlc_ts) != 0) {
+        // Pre-CP-mode node. For now, assume it shares the same delta with the
+        // hlc as we do.
+        send_hlc_ts = as_hlc_timestamp_now();
+        send_ts = cf_clock_getabsolute();
+    }
+
+    size_t required_capacity = sizeof(int64_t);
+    if (required_capacity > plugin_data->data_capacity) {
+        plugin_data->data = cf_realloc(plugin_data->data, required_capacity);
+
+        if (plugin_data->data == NULL) {
+            CRASH(
+                    "error allocating space for storing the hlc delta for node %"PRIx64,
+                    source);
+        }
+        plugin_data->data_capacity = required_capacity;
+        memset(plugin_data->data, 0, required_capacity);
+    }
+
+    int64_t clock_delta = as_hlc_physical_ts_get(send_hlc_ts) - send_ts;
+    int64_t* average_clock_delta = (int64_t*)plugin_data->data;
+
+    if (plugin_data->data_size == 0) {
+        // This is the first data point.
+        *average_clock_delta = clock_delta;
+    }
+
+    plugin_data->data_size = required_capacity;
+
+    // Update the average.
+    *average_clock_delta = clock_delta * ALPHA
+            + (1 - ALPHA) * (*average_clock_delta);
+
+    DETAIL("node %"PRIx64" hlc:%lu clock:%lu delta:%ld moving-average:%ld", source, send_hlc_ts, send_ts, clock_delta, *average_clock_delta);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Protected API meant only for clustering.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Update clock skew and fire skew events.
+ */
+void
+as_skew_monitor_update()
+{
+    skew_monitor_update();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Public API
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Initialize skew monitor.
+ */
+void
+as_skew_monitor_init()
+{
+    as_hb_plugin skew_monitor_plugin = { 0 };
+
+    skew_monitor_plugin.id = AS_HB_PLUGIN_SKEW_MONITOR;
+    skew_monitor_plugin.wire_size_fixed = sizeof(int64_t);
+    // Size of the node in succession list.
+    skew_monitor_plugin.wire_size_per_node = 0;
+    skew_monitor_plugin.set_fn = skew_monitor_hb_plugin_set_fn;
+    skew_monitor_plugin.parse_fn = skew_monitor_hb_plugin_parse_data_fn;
+    as_hb_plugin_register(&skew_monitor_plugin);
+
+    DETAIL("skew monitor initialized");
+}
+
+/**
+ * Return the current estimate of the clock skew in the cluster.
+ */
+uint64_t
+as_skew_monitor_skew()
+{
+    return cf_atomic64_get(g_skew);
+}
+
+/**
+ * Return the currently estimated outliers from our cluster.
+ * Outliers should have space to hold at least AS_CLUSTER_SZ nodes.
+ */
+uint32_t
+as_skew_monitor_outliers(cf_vector* outliers)
+{
+    return skew_monitor_outliers(outliers);
+}
+
+/**
+ * Print skew outliers to a dynamic buffer.
+ */
+uint32_t
+as_skew_monitor_outliers_append(cf_dyn_buf* db)
+{
+    uint8_t buffer[AS_CLUSTER_SZ * sizeof(cf_node)];
+    cf_vector outliers;
+    cf_vector_init_smalloc(&outliers, sizeof(cf_node), buffer, sizeof(buffer),
+            VECTOR_FLAG_INITZERO);
+    uint32_t num_outliers = skew_monitor_outliers(&outliers);
+
+    for (uint32_t i = 0; i < num_outliers; i++) {
+        cf_node outlier_id;
+        cf_vector_get(&outliers, i, &outlier_id);
+        cf_dyn_buf_append_uint64_x(db, outlier_id);
+        cf_dyn_buf_append_char(db, ',');
+    }
+
+    if (num_outliers) {
+        cf_dyn_buf_chomp(db);
+    }
+
+    cf_vector_destroy(&outliers);
+
+    return num_outliers;
+}
+
+/**
+ * Print skew monitor info to a dynamic buffer.
+ */
+void
+as_skew_monitor_info(cf_dyn_buf* db)
+{
+    cf_dyn_buf_append_string(db, "cluster_clock_skew_outliers=");
+    uint32_t num_outliers = as_skew_monitor_outliers_append(db);
+    if (num_outliers == 0) {
+        cf_dyn_buf_append_string(db, "null");
+    }
+    cf_dyn_buf_append_char(db, ';');
+}
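/*
 * Illustrative sample (hypothetical node IDs, not from this patch) of the
 * info item built by as_skew_monitor_info() above - hex node IDs joined by
 * commas, or "null" when no outliers are detected:
 *
 *   cluster_clock_skew_outliers=bb9040011ac4202,bb9040011ac4203;
 *   cluster_clock_skew_outliers=null;
 */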
+
+/**
+ * Dump some debugging information to the logs.
+ */
+void
+as_skew_monitor_dump()
+{
+    uint8_t buffer[AS_CLUSTER_SZ * sizeof(cf_node)];
+    cf_vector node_vector;
+    cf_vector_init_smalloc(&node_vector, sizeof(cf_node), buffer,
+            sizeof(buffer), VECTOR_FLAG_INITZERO);
+    as_exchange_succession(&node_vector);
+
+    INFO("CSM: cluster-clock-skew:%ld", as_skew_monitor_skew());
+    if (cf_vector_size(&node_vector) <= 1) {
+        // Self node is an orphan or single node cluster. No cluster-wide skew.
+        goto Cleanup;
+    }
+
+    as_skew_monitor_hlc_delta_udata udata = { 0 };
+    as_hb_plugin_data_iterate(&node_vector, AS_HB_PLUGIN_SKEW_MONITOR,
+            skew_monitor_delta_collect_iterate, &udata);
+
+    for (int i = 0; i < udata.num_nodes; i++) {
+        INFO("CSM: node:%"PRIx64" hlc-delta:%ld", udata.skew_data[i].nodeid, udata.skew_data[i].delta);
+    }
+
+    // Log the outliers.
+    cf_vector_clear(&node_vector);
+    skew_monitor_outliers(&node_vector);
+    if (cf_vector_size(&node_vector)) {
+        as_clustering_log_cf_node_vector(AS_INFO, AS_SKEW,
+                "CSM: Estimated clock outliers", &node_vector);
+    }
+
+Cleanup:
+    cf_vector_destroy(&node_vector);
+}
diff --git a/as/src/geospatial/geojson.cc b/as/src/geospatial/geojson.cc
new file mode 100644
index 00000000..2c5cc384
--- /dev/null
+++ b/as/src/geospatial/geojson.cc
@@ -0,0 +1,344 @@
+/*
+ * Copyright 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more
+ * contributor license agreements.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You
+ * may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "geospatial/scoped.h"
+#include "geospatial/throwstream.h"
+#include "geospatial/geojson.h"
+
+using namespace std;
+
+namespace {
+
+S2Point
+traverse_point(json_t * coord)
+{
+    if (! coord) {
+        throwstream(runtime_error, "missing coordinates");
+    }
+
+    if (! json_is_array(coord)) {
+        throwstream(runtime_error, "coordinates are not array");
+    }
+
+    if (json_array_size(coord) != 2) {
+        throwstream(runtime_error, "expected 2 coordinates, saw "
+                << json_array_size(coord));
+    }
+
+    double lngval;
+    json_t * lng = json_array_get(coord, 0);
+    if (json_is_real(lng)) {
+        lngval = json_real_value(lng);
+    }
+    else if (json_is_integer(lng)) {
+        lngval = double(json_integer_value(lng));
+    }
+    else {
+        throwstream(runtime_error, "longitude not numeric value");
+    }
+
+    double latval;
+    json_t * lat = json_array_get(coord, 1);
+    if (json_is_real(lat)) {
+        latval = json_real_value(lat);
+    }
+    else if (json_is_integer(lat)) {
+        latval = double(json_integer_value(lat));
+    }
+    else {
+        throwstream(runtime_error, "latitude not numeric value");
+    }
+
+    // cout << setprecision(15) << latval << ", " << lngval << endl;
+
+    S2LatLng latlng = S2LatLng::FromDegrees(latval, lngval).Normalized();
+    if (! latlng.is_valid()) {
+        throwstream(runtime_error, "invalid latitude-longitude");
+    }
+    return latlng.ToPoint();
+}
+
+S2Loop *
+traverse_loop(json_t * vertices)
+{
+    if (! vertices) {
+        throwstream(runtime_error, "missing vertices");
+    }
+
+    if (! json_is_array(vertices)) {
+        throwstream(runtime_error, "vertices are not array");
+    }
+
+    vector<S2Point> points;
+
+    for (size_t ii = 0; ii < json_array_size(vertices); ++ii) {
+        points.push_back(traverse_point(json_array_get(vertices, ii)));
+    }
+
+    // Remove duplicate points.
+    for (size_t ii = 1; ii < points.size(); ++ii) {
+        if (points[ii - 1] == points[ii]) {
+            points.erase(points.begin() + ii);
+            --ii;
+        }
+    }
+
+    if (points.size() < 4) {
+        throwstream(runtime_error, "loop contains less than 4 points");
+    }
+    if (points[0] != points[points.size()-1]) {
+        throwstream(runtime_error, "loop not closed");
+    }
+    points.pop_back();
+
+    auto_ptr<S2Loop> loop(new S2Loop(points));
+    loop->Normalize();
+    return loop.release();
+}
+
+S2Polygon *
+traverse_polygon(json_t * loops)
+{
+    if (! loops) {
+        throwstream(runtime_error, "missing polygon body");
+    }
+
+    if (! json_is_array(loops)) {
+        throwstream(runtime_error, "polygon body is not array");
+    }
+
+    vector<S2Loop *> loopv;
+    try
+    {
+        for (size_t ii = 0; ii < json_array_size(loops); ++ii) {
+            loopv.push_back(traverse_loop(json_array_get(loops, ii)));
+        }
+
+        return new S2Polygon(&loopv);
+    }
+    catch (...)
+    {
+        for (size_t ii = 0; ii < loopv.size(); ++ii) {
+            delete loopv[ii];
+        }
+        throw;
+    }
+}
+
+void process_point(GeoJSON::GeometryHandler & geohand, json_t * coord)
+{
+    geohand.handle_point(S2CellId::FromPoint(traverse_point(coord)));
+}
+
+void
+process_polygon(GeoJSON::GeometryHandler & geohand, json_t * coord)
+{
+    if (! coord) {
+        throwstream(runtime_error, "missing coordinates");
+    }
+
+    if (! json_is_array(coord)) {
+        throwstream(runtime_error, "coordinates are not array");
+    }
+
+    S2Polygon * poly = traverse_polygon(coord);
+    if (geohand.handle_region(poly)) {
+        delete poly;
+    }
+}
+
+void
+process_multipolygon(GeoJSON::GeometryHandler & geohand, json_t * coord)
+{
+    if (! coord) {
+        throwstream(runtime_error, "missing coordinates");
+    }
+
+    if (! json_is_array(coord)) {
+        throwstream(runtime_error, "coordinates are not array");
+    }
+
+    auto_ptr<S2RegionUnion> regionsp(new S2RegionUnion);
+
+    for (size_t ii = 0; ii < json_array_size(coord); ++ii) {
+        regionsp->Add(traverse_polygon(json_array_get(coord, ii)));
+    }
+
+    if (! geohand.handle_region(regionsp.get())) {
+        // Handler took ownership.
+        regionsp.release();
+    }
+}
+
+void
+process_circle(GeoJSON::GeometryHandler & geohand, json_t * coord)
+{
+    // {
+    //     "type": "AeroCircle",
+    //     "coordinates": [[-122.097837, 37.421363], 1000.0]
+    // }
+
+    if (! coord) {
+        throwstream(runtime_error, "missing coordinates");
+    }
+
+    if (! json_is_array(coord)) {
+        throwstream(runtime_error, "coordinates are not array");
+    }
+
+    if (json_array_size(coord) != 2) {
+        throwstream(runtime_error, "malformed circle coordinate array");
+    }
+
+    S2Point center = traverse_point(json_array_get(coord, 0));
+
+    double radius;
+    json_t * radiusobj = json_array_get(coord, 1);
+    if (json_is_real(radiusobj)) {
+        radius = json_real_value(radiusobj);
+    }
+    else if (json_is_integer(radiusobj)) {
+        radius = double(json_integer_value(radiusobj));
+    }
+    else {
+        throwstream(runtime_error, "radius not numeric value");
+    }
+
+    S1Angle angle = S1Angle::Radians(radius / geohand.earth_radius_meters());
+
+    auto_ptr<S2Cap> capp(S2Cap::FromAxisAngle(center, angle).Clone());
+
+    if (! geohand.handle_region(capp.get())) {
+        // Handler took ownership.
+        capp.release();
+    }
+}
+
+void traverse_geometry(GeoJSON::GeometryHandler & geohand, json_t * geom)
+{
+    if (! geom) {
+        throwstream(runtime_error, "missing geometry element");
+    }
+
+    if (! json_is_object(geom)) {
+        throwstream(runtime_error, "geometry is not object");
+    }
+
+    json_t * type = json_object_get(geom, "type");
+    if (! type) {
+        throwstream(runtime_error, "missing geometry type");
+    }
+
+    if (! json_is_string(type)) {
+        throwstream(runtime_error, "geometry type is not string");
+    }
+
+    string typestr(json_string_value(type));
+    if (typestr == "Point") {
+        process_point(geohand, json_object_get(geom, "coordinates"));
+    }
+    else if (typestr == "Polygon") {
+        process_polygon(geohand, json_object_get(geom, "coordinates"));
+    }
+    else if (typestr == "MultiPolygon") {
+        process_multipolygon(geohand, json_object_get(geom, "coordinates"));
+    }
+    else if (typestr == "AeroCircle") {
+        process_circle(geohand, json_object_get(geom, "coordinates"));
+    }
+    else {
+        throwstream(runtime_error, "unknown geometry type: " << typestr);
+    }
+}
+
+} // end namespace
+
+namespace GeoJSON {
+
+void GeometryHandler::handle_point(S2CellId const & i_cellid)
+{
+    // nothing by default
+}
+
+bool GeometryHandler::handle_region(S2Region * i_regionp)
+{
+    // By default, caller should delete the region.
+    return true;
+}
+
+void parse(GeometryHandler & geohand, string const & geostr)
+{
+    json_error_t err;
+    Scoped<json_t *> geojson(json_loadb(geostr.data(), geostr.size(), 0, &err),
+            NULL, json_decref);
+    if (! geojson) {
+        throwstream(runtime_error, "failed to parse geojson: "
+                << err.line << ": " << err.text);
+    }
+
+    geohand.set_json(geojson);
+
+    if (! json_is_object(geojson)) {
+        throwstream(runtime_error, "top level geojson element not object");
+    }
+
+    json_t * type = json_object_get(geojson, "type");
+    if (! type) {
+        throwstream(runtime_error, "missing top-level type in geojson element");
+    }
+
+    if (! json_is_string(type)) {
+        throwstream(runtime_error, "top-level type is not string");
+    }
+
+    string typestr(json_string_value(type));
+    if (typestr == "Feature") {
+        traverse_geometry(geohand, json_object_get(geojson, "geometry"));
+    }
+    else if (typestr == "Point") {
+        process_point(geohand, json_object_get(geojson, "coordinates"));
+    }
+    else if (typestr == "Polygon") {
+        process_polygon(geohand, json_object_get(geojson, "coordinates"));
+    }
+    else if (typestr == "MultiPolygon") {
+        process_multipolygon(geohand, json_object_get(geojson, "coordinates"));
+    }
+    else if (typestr == "AeroCircle") {
+        process_circle(geohand, json_object_get(geojson, "coordinates"));
+    }
+    else {
+        throwstream(runtime_error, "unknown top-level type: " << typestr);
+    }
+}
+
+} // end namespace GeoJSON
diff --git a/as/src/geospatial/geospatial.cc b/as/src/geospatial/geospatial.cc
new file mode 100644
index 00000000..17825c23
--- /dev/null
+++ b/as/src/geospatial/geospatial.cc
@@ -0,0 +1,228 @@
+/*
+ * geospatial.cpp
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include
+#include
+#include
+
+#include
+
+#include
+
+extern "C" {
+#include "fault.h"
+#include "base/datamodel.h"
+} // end extern "C"
+
+#include "geospatial/geospatial.h"
+#include "geospatial/geojson.h"
+
+using namespace std;
+
+class PointRegionHandler: public GeoJSON::GeometryHandler
+{
+public:
+    PointRegionHandler(as_namespace * ns)
+        : m_cellid(0)
+        , m_regionp(NULL)
+    {
+        m_earth_radius_meters =
+                ns ? double(ns->geo2dsphere_within_earth_radius_meters) : 6371000;
+    }
+
+    virtual void handle_point(S2CellId const & cellid) {
+        m_cellid = cellid;
+    }
+
+    virtual bool handle_region(S2Region * regionp) {
+        m_regionp = regionp;
+        return false; // Don't delete this region, please.
+    }
+
+    virtual double earth_radius_meters() {
+        return m_earth_radius_meters;
+    }
+
+    double m_earth_radius_meters;
+    S2CellId m_cellid;
+    S2Region * m_regionp;
+};
+
+bool
+geo_parse(as_namespace * ns,
+        const char * buf,
+        size_t bufsz,
+        uint64_t * cellidp,
+        geo_region_t * regionp)
+{
+    try
+    {
+        PointRegionHandler prhandler(ns);
+        GeoJSON::parse(prhandler, string(buf, bufsz));
+        *cellidp = prhandler.m_cellid.id();
+        *regionp = (geo_region_t) prhandler.m_regionp;
+        return true;
+    }
+    catch (exception const & ex)
+    {
+        cf_warning(AS_GEO, (char *) "failed to parse point: %s", ex.what());
+        return false;
+    }
+}
+
+bool
+geo_region_cover(as_namespace * ns,
+        geo_region_t region,
+        int maxnumcells,
+        uint64_t * cellctrp,
+        uint64_t * cellminp,
+        uint64_t * cellmaxp,
+        int * numcellsp)
+{
+    try
+    {
+        S2Region * regionp = (S2Region *) region;
+
+        S2RegionCoverer coverer;
+        if (ns) {
+            coverer.set_min_level(ns->geo2dsphere_within_min_level);
+            coverer.set_max_level(ns->geo2dsphere_within_max_level);
+            coverer.set_max_cells(ns->geo2dsphere_within_max_cells);
+            coverer.set_level_mod(ns->geo2dsphere_within_level_mod);
+        }
+        else {
+            // FIXME - we really don't want to hardcode these values, but
+            // some callers can't provide the namespace context ...
+            coverer.set_min_level(1);
+            coverer.set_max_level(30);
+            coverer.set_max_cells(12);
+            coverer.set_level_mod(1);
+        }
+        vector<S2CellId> covering;
+        coverer.GetCovering(*regionp, &covering);
+
+        // The coverer can always return 6 cells, even when max_cells is
+        // smaller (regions which intersect all cube faces). If we get more
+        // than we asked for and it's greater than 6, something is wrong.
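/*
 * A worked instance (illustrative only, not part of this patch) of the size
 * check below: a region touching all six faces of the S2 cube needs at least
 * one cell per face, so the allowed covering size is max(6, max_cells).
 * With max_cells = 4, a covering of size 6 still passes; with max_cells = 12,
 * a covering of size 13 indicates an error.
 */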
+ if (covering.size() > max(size_t(6), size_t(coverer.max_cells()))) { + return false; + } + + for (size_t ii = 0; ii < covering.size(); ++ii) + { + if (ii == (size_t) maxnumcells) + { + cf_warning(AS_GEO, (char *) "region covered with %zu cells, " + "only %d allowed", covering.size(), maxnumcells); + return false; + } + + if (cellctrp) { + cellctrp[ii] = covering[ii].id(); + } + if (cellminp) { + cellminp[ii] = covering[ii].range_min().id(); + } + if (cellmaxp) { + cellmaxp[ii] = covering[ii].range_max().id(); + } + + if (cellctrp) { + cf_detail(AS_GEO, (char *) "cell[%zu]: 0x%lx", + ii, cellctrp[ii]); + } + + if (cellminp && cellmaxp) { + cf_detail(AS_GEO, (char *) "cell[%zu]: [0x%lx, 0x%lx]", + ii, cellminp[ii], cellmaxp[ii]); + } + } + + *numcellsp = covering.size(); + return true; + } + catch (exception const & ex) + { + cf_warning(AS_GEO, (char *) "geo_region_cover failed: %s", ex.what()); + return false; + } +} + +bool +geo_point_centers(as_namespace * ns, + uint64_t cellidval, + int maxnumcenters, + uint64_t * center, + int * numcentersp) +{ + try + { + S2CellId incellid(cellidval); + + *numcentersp = 0; + + for (S2CellId cellid = incellid; + cellid.level() > 0; + cellid = cellid.parent()) + { + // Make sure we don't overwrite the output array. + if (*numcentersp == maxnumcenters) { + break; + } + center[*numcentersp] = cellid.id(); + *numcentersp += 1; + } + return true; + } + catch (exception const & ex) + { + cf_warning(AS_GEO, (char *) "geo_point_centers failed: %s", ex.what()); + return false; + } +} + +bool +geo_point_within(uint64_t cellidval, geo_region_t region) +{ + try + { + S2Region * regionp = (S2Region *) region; + S2CellId cellid(cellidval); + bool iswithin = regionp->VirtualContainsPoint(cellid.ToPoint()); + return iswithin; + } + catch (exception const & ex) + { + cf_warning(AS_GEO, (char *) "exception in geo_point_within: %s", + ex.what()); + return false; + } +} + +void +geo_region_destroy(geo_region_t region) +{ + S2Region * regionp = (S2Region *) region; + if (regionp) { + delete regionp; + } +} diff --git a/as/src/storage/drv_memory.c b/as/src/storage/drv_memory.c new file mode 100644 index 00000000..913033ed --- /dev/null +++ b/as/src/storage/drv_memory.c @@ -0,0 +1,78 @@ +/* + * drv_memory.c + * + * Copyright (C) 2009-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * in-memory storage engine driver + * + */ + +#include +#include + +#include "citrusleaf/cf_queue.h" + +#include "base/datamodel.h" +#include "base/truncate.h" +#include "storage/storage.h" + + +/* SYNOPSIS + * In-memory storage driver + * + * This code almost entirely performs no-ops, because all the in-memory state + * is correct already. + * Note that this code is mostly for the NON-PERSISTENT main memory namespace. 
+ * The File-backed (persistent) main memory namespace is NOT type 1 (MM) for + * some calls, but is instead treated as type 2 (SSD); hence in some cases + * the SSD functions, like as_storage_bin_can_fit(), are applied with an SSD + * context rather than a transient main memory context. (tjl) + */ + +int +as_storage_namespace_init_memory(as_namespace *ns, cf_queue *complete_q, void *udata) +{ + as_truncate_done_startup(ns); + + void *_t = NULL; + + cf_queue_push(complete_q, &_t); + + return 0; +} + +int +as_storage_namespace_destroy_memory(as_namespace *ns) +{ + return(0); +} + +int +as_storage_stats_memory(as_namespace *ns, int *available_pct, uint64_t *used_disk_bytes) +{ + if (available_pct) { + *available_pct = 100; + } + if (used_disk_bytes) { + *used_disk_bytes = 0; + } + return(0); +} diff --git a/as/src/storage/drv_memory_ce.c b/as/src/storage/drv_memory_ce.c new file mode 100644 index 00000000..26121924 --- /dev/null +++ b/as/src/storage/drv_memory_ce.c @@ -0,0 +1,44 @@ +/* + * drv_memory_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "base/datamodel.h" +#include "fabric/partition.h" +#include "storage/storage.h" + + +void +as_storage_start_tomb_raider_memory(as_namespace* ns) +{ + // Tomb raider is for enterprise version only. +} + + +int +as_storage_record_write_memory(as_storage_rd* rd) +{ + return 0; +} + +void +as_storage_info_get_memory(as_namespace *ns, as_partition *p) +{ +} diff --git a/as/src/storage/drv_ssd.c b/as/src/storage/drv_ssd.c new file mode 100644 index 00000000..4d3c99c6 --- /dev/null +++ b/as/src/storage/drv_ssd.c @@ -0,0 +1,4312 @@ +/* + * drv_ssd.c + * + * Copyright (C) 2009-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* SYNOPSIS + * "file" based storage driver, which applies to both SSD namespaces and, in + * some cases, to file-backed main-memory namespaces. 
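+ * Layout vocabulary used throughout: each device is carved into fixed-size
+ * write blocks ("wblocks"), within which records occupy runs of smaller
+ * rblock-aligned units; defrag reclaims wblocks whose live data drops below
+ * the configured low-water mark.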
+ */ + +#include "storage/drv_ssd.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for BLKGETSIZE64 +#include +#include // for MAX() + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" +#include "citrusleaf/cf_random.h" + +#include "cf_mutex.h" +#include "fault.h" +#include "hist.h" +#include "vmapx.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/rec_props.h" +#include "base/secondary_index.h" +#include "base/truncate.h" +#include "fabric/partition.h" +#include "storage/storage.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Forward declarations. +// + +// Defined in thr_nsup.c, for historical reasons. +extern bool as_cold_start_evict_if_needed(as_namespace* ns); + + +//========================================================== +// Constants. +// + +#define DEFRAG_STARTUP_RESERVE 4 +#define DEFRAG_RUNTIME_RESERVE 4 + + +//========================================================== +// Miscellaneous utility functions. +// + +// Get an open file descriptor from the pool, or a fresh one if necessary. +int +ssd_fd_get(drv_ssd *ssd) +{ + int fd = -1; + int rv = cf_queue_pop(ssd->fd_q, (void*)&fd, CF_QUEUE_NOWAIT); + + if (rv != CF_QUEUE_OK) { + fd = open(ssd->name, ssd->open_flag, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED open: errno %d (%s)", + ssd->name, errno, cf_strerror(errno)); + } + } + + return fd; +} + + +int +ssd_shadow_fd_get(drv_ssd *ssd) +{ + int fd = -1; + int rv = cf_queue_pop(ssd->shadow_fd_q, (void*)&fd, CF_QUEUE_NOWAIT); + + if (rv != CF_QUEUE_OK) { + fd = open(ssd->shadow_name, ssd->open_flag, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED open: errno %d (%s)", + ssd->shadow_name, errno, cf_strerror(errno)); + } + } + + return fd; +} + + +// Save an open file descriptor in the pool +static inline void +ssd_fd_put(drv_ssd *ssd, int fd) +{ + cf_queue_push(ssd->fd_q, (void*)&fd); +} + + +static inline void +ssd_shadow_fd_put(drv_ssd *ssd, int fd) +{ + cf_queue_push(ssd->shadow_fd_q, (void*)&fd); +} + + +// Decide which device a record belongs on. +static inline uint32_t +ssd_get_file_id(drv_ssds *ssds, cf_digest *keyd) +{ + return *(uint32_t*)&keyd->digest[DIGEST_STORAGE_BASE_BYTE] % ssds->n_ssds; +} + + +// Put a wblock on the free queue for reuse. +void +push_wblock_to_free_q(drv_ssd *ssd, uint32_t wblock_id, e_free_to free_to) +{ + if (! ssd->free_wblock_q) { // null until devices are loaded at startup + return; + } + + // temp debugging: + if (wblock_id >= ssd->alloc_table->n_wblocks) { + cf_warning(AS_DRV_SSD, "pushing invalid wblock_id %d to free_wblock_q", + (int32_t)wblock_id); + return; + } + + if (free_to == FREE_TO_HEAD) { + cf_queue_push_head(ssd->free_wblock_q, &wblock_id); + } + else { + cf_queue_push(ssd->free_wblock_q, &wblock_id); + } +} + + +// Put a wblock on the defrag queue. +static inline void +push_wblock_to_defrag_q(drv_ssd *ssd, uint32_t wblock_id) +{ + if (ssd->defrag_wblock_q) { // null until devices are loaded at startup + ssd->alloc_table->wblock_state[wblock_id].state = WBLOCK_STATE_DEFRAG; + cf_queue_push(ssd->defrag_wblock_q, &wblock_id); + cf_atomic64_incr(&ssd->n_defrag_wblock_reads); + } +} + + +// Available contiguous size. 
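+// (Worked example, with assumed numbers: 1,000 wblocks on the free queue at
+// a 128 KiB write-block-size reports 1,000 * 131,072 = 131,072,000 bytes.)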
+static inline uint64_t +available_size(drv_ssd *ssd) +{ + return ssd->free_wblock_q ? // null until devices are loaded at startup + (uint64_t)cf_queue_sz(ssd->free_wblock_q) * ssd->write_block_size : + ssd->file_size; + + // Note - returns 100% available during cold start, to make it irrelevant in + // cold start eviction threshold check. +} + + +// Since UDF writes can't yet unwind on failure, we ensure that they'll succeed +// by checking before writing on all threads that there's at least one wblock +// per thread. TODO - deprecate this methodology when everything can unwind. +static inline int +min_free_wblocks(as_namespace *ns) +{ + // Data-in-memory namespaces process transactions in service threads. + int n_service_threads = ns->storage_data_in_memory ? + (int)g_config.n_service_threads : 0; + + int n_transaction_threads = (int) + (g_config.n_transaction_queues * g_config.n_transaction_threads_per_queue); + + return n_service_threads + // client writes + n_transaction_threads + // client writes + g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW] + // prole writes + g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK] + // migration writes + 1 + // always 1 defrag thread + DEFRAG_RUNTIME_RESERVE + // reserve for defrag at runtime + DEFRAG_STARTUP_RESERVE; // reserve for defrag at startup +} + + +void +ssd_release_vacated_wblock(drv_ssd *ssd, uint32_t wblock_id, + ssd_wblock_state* p_wblock_state) +{ + // Sanity checks. + cf_assert(! p_wblock_state->swb, AS_DRV_SSD, + "device %s: wblock-id %u swb not null while defragging", + ssd->name, wblock_id); + cf_assert(p_wblock_state->state == WBLOCK_STATE_DEFRAG, AS_DRV_SSD, + "device %s: wblock-id %u state not DEFRAG while defragging", + ssd->name, wblock_id); + + int32_t n_vac_dests = cf_atomic32_decr(&p_wblock_state->n_vac_dests); + + if (n_vac_dests > 0) { + return; + } + // else - all wblocks we defragged into have been flushed. + + cf_assert(n_vac_dests == 0, AS_DRV_SSD, + "device %s: wblock-id %u vacation destinations underflow", + ssd->name, wblock_id); + + cf_mutex_lock(&p_wblock_state->LOCK); + + p_wblock_state->state = WBLOCK_STATE_NONE; + + // Free the wblock if it's empty. + if (cf_atomic32_get(p_wblock_state->inuse_sz) == 0 && + // TODO - given assertions above, this condition is superfluous: + ! p_wblock_state->swb) { + push_wblock_to_free_q(ssd, wblock_id, FREE_TO_HEAD); + } + + cf_mutex_unlock(&p_wblock_state->LOCK); +} + + +//------------------------------------------------ +// ssd_write_buf "swb" methods. 
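+// Lifecycle sketch, per the code below: swb_get() pairs a pooled buffer with
+// a wblock popped off the free queue; writers fill it while holding
+// references; swb_release() drops a reference and recycles the buffer at
+// zero; swb_dereference_and_release() also detaches it from its wblock and
+// re-queues that wblock for freeing or defrag as its in-use size dictates.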
+// + +#define VACATED_CAPACITY_STEP 128 // allocate in 1K chunks + +static inline ssd_write_buf* +swb_create(drv_ssd *ssd) +{ + ssd_write_buf *swb = (ssd_write_buf*)cf_malloc(sizeof(ssd_write_buf)); + + swb->buf = cf_valloc(ssd->write_block_size); + + swb->n_vacated = 0; + swb->vacated_capacity = VACATED_CAPACITY_STEP; + swb->vacated_wblocks = + cf_malloc(sizeof(vacated_wblock) * swb->vacated_capacity); + + return swb; +} + +static inline void +swb_destroy(ssd_write_buf *swb) +{ + cf_free(swb->vacated_wblocks); + cf_free(swb->buf); + cf_free(swb); +} + +static inline void +swb_reset(ssd_write_buf *swb) +{ + swb->skip_post_write_q = false; + swb->wblock_id = STORAGE_INVALID_WBLOCK; + swb->pos = 0; +} + +#define swb_reserve(_swb) cf_atomic32_incr(&(_swb)->rc) + +static inline void +swb_check_and_reserve(ssd_wblock_state *wblock_state, ssd_write_buf **p_swb) +{ + cf_mutex_lock(&wblock_state->LOCK); + + if (wblock_state->swb) { + *p_swb = wblock_state->swb; + swb_reserve(*p_swb); + } + + cf_mutex_unlock(&wblock_state->LOCK); +} + +static inline void +swb_release(ssd_write_buf *swb) +{ + if (0 == cf_atomic32_decr(&swb->rc)) { + swb_reset(swb); + + // Put the swb back on the free queue for reuse. + cf_queue_push(swb->ssd->swb_free_q, &swb); + } +} + +static inline void +swb_dereference_and_release(drv_ssd *ssd, uint32_t wblock_id, + ssd_write_buf *swb) +{ + ssd_wblock_state *wblock_state = &ssd->alloc_table->wblock_state[wblock_id]; + + cf_mutex_lock(&wblock_state->LOCK); + + if (swb != wblock_state->swb) { + cf_warning(AS_DRV_SSD, "releasing wrong swb! %p (%d) != %p (%d), thread %lu", + swb, (int32_t)swb->wblock_id, wblock_state->swb, + (int32_t)wblock_state->swb->wblock_id, pthread_self()); + } + + swb_release(wblock_state->swb); + wblock_state->swb = 0; + + if (wblock_state->state != WBLOCK_STATE_DEFRAG) { + uint32_t inuse_sz = cf_atomic32_get(wblock_state->inuse_sz); + + // Free wblock if all three gating conditions hold. + if (inuse_sz == 0) { + push_wblock_to_free_q(ssd, wblock_id, FREE_TO_HEAD); + } + // Queue wblock for defrag if applicable. + else if (inuse_sz < ssd->ns->defrag_lwm_size) { + push_wblock_to_defrag_q(ssd, wblock_id); + } + } + else { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u state is DEFRAG on swb release", + ssd->name, wblock_id); + } + + cf_mutex_unlock(&wblock_state->LOCK); +} + +ssd_write_buf * +swb_get(drv_ssd *ssd) +{ + ssd_write_buf *swb; + + if (CF_QUEUE_OK != cf_queue_pop(ssd->swb_free_q, &swb, CF_QUEUE_NOWAIT)) { + swb = swb_create(ssd); + swb->rc = 0; + swb->n_writers = 0; + swb->skip_post_write_q = false; + swb->ssd = ssd; + swb->wblock_id = STORAGE_INVALID_WBLOCK; + swb->pos = 0; + } + + // Find a device block to write to. + if (CF_QUEUE_OK != cf_queue_pop(ssd->free_wblock_q, &swb->wblock_id, + CF_QUEUE_NOWAIT)) { + cf_queue_push(ssd->swb_free_q, &swb); + return NULL; + } + + ssd_wblock_state* p_wblock_state = + &ssd->alloc_table->wblock_state[swb->wblock_id]; + + // Sanity checks. 
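+	// A wblock just popped off the free queue must have zero in-use bytes,
+	// no attached swb, and state NONE - violations are logged but tolerated.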
+ if (cf_atomic32_get(p_wblock_state->inuse_sz) != 0) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u inuse-size %u off free-q", + ssd->name, swb->wblock_id, + cf_atomic32_get(p_wblock_state->inuse_sz)); + } + if (p_wblock_state->swb) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u swb not null off free-q", + ssd->name, swb->wblock_id); + } + if (p_wblock_state->state != WBLOCK_STATE_NONE) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u state not NONE off free-q", + ssd->name, swb->wblock_id); + } + + cf_mutex_lock(&p_wblock_state->LOCK); + + swb_reserve(swb); + p_wblock_state->swb = swb; + + cf_mutex_unlock(&p_wblock_state->LOCK); + + return swb; +} + +bool +swb_add_unique_vacated_wblock(ssd_write_buf* swb, uint32_t src_file_id, + uint32_t src_wblock_id) +{ + for (uint32_t i = 0; i < swb->n_vacated; i++) { + vacated_wblock *vw = &swb->vacated_wblocks[i]; + + if (vw->wblock_id == src_wblock_id && vw->file_id == src_file_id) { + return false; // already present + } + } + + if (swb->n_vacated == swb->vacated_capacity) { + swb->vacated_capacity += VACATED_CAPACITY_STEP; + swb->vacated_wblocks = cf_realloc(swb->vacated_wblocks, + sizeof(vacated_wblock) * swb->vacated_capacity); + } + + swb->vacated_wblocks[swb->n_vacated].file_id = src_file_id; + swb->vacated_wblocks[swb->n_vacated].wblock_id = src_wblock_id; + swb->n_vacated++; + + return true; // added to list +} + +void +swb_release_all_vacated_wblocks(ssd_write_buf* swb) +{ + drv_ssds *ssds = (drv_ssds *)swb->ssd->ns->storage_private; + + for (uint32_t i = 0; i < swb->n_vacated; i++) { + vacated_wblock *vw = &swb->vacated_wblocks[i]; + + drv_ssd *src_ssd = &ssds->ssds[vw->file_id]; + ssd_alloc_table* at = src_ssd->alloc_table; + ssd_wblock_state* p_wblock_state = &at->wblock_state[vw->wblock_id]; + + ssd_release_vacated_wblock(src_ssd, vw->wblock_id, p_wblock_state); + } + + swb->n_vacated = 0; +} + +// +// END - ssd_write_buf "swb" methods. +//------------------------------------------------ + + +// Reduce wblock's used size, if result is 0 put it in the "free" pool, if it's +// below the defrag threshold put it in the defrag queue. +void +ssd_block_free(drv_ssd *ssd, uint64_t rblock_id, uint64_t n_rblocks, char *msg) +{ + if (n_rblocks == 0) { + cf_warning(AS_DRV_SSD, "%s: %s: freeing 0 rblocks, rblock_id %lu", + ssd->name, msg, rblock_id); + return; + } + + // Determine which wblock we're reducing used size in. + uint64_t start_byte = RBLOCKS_TO_BYTES(rblock_id); + uint64_t size = RBLOCKS_TO_BYTES(n_rblocks); + uint32_t wblock_id = BYTES_TO_WBLOCK_ID(ssd, start_byte); + uint32_t end_wblock_id = BYTES_TO_WBLOCK_ID(ssd, start_byte + size - 1); + ssd_alloc_table *at = ssd->alloc_table; + + // Sanity-checks. + if (! (start_byte >= SSD_HEADER_SIZE && wblock_id < at->n_wblocks && + wblock_id == end_wblock_id)) { + cf_warning(AS_DRV_SSD, "%s: %s: invalid range to free, rblock_id %lu, n_rblocks %lu", + ssd->name, msg, rblock_id, n_rblocks); + return; + } + + cf_atomic64_sub(&ssd->inuse_size, size); + + ssd_wblock_state *p_wblock_state = &at->wblock_state[wblock_id]; + + cf_mutex_lock(&p_wblock_state->LOCK); + + int64_t resulting_inuse_sz = cf_atomic32_sub(&p_wblock_state->inuse_sz, + (int32_t)size); + + if (resulting_inuse_sz < 0 || + resulting_inuse_sz >= (int64_t)ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "%s: %s: wblock %d %s, subtracted %d now %ld", + ssd->name, msg, wblock_id, + resulting_inuse_sz < 0 ? "over-freed" : "has crazy inuse_sz", + (int32_t)size, resulting_inuse_sz); + + // TODO - really? 
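+		// Pinning inuse_sz at write_block_size quarantines the wblock: it
+		// can never again look empty or defrag-eligible, so the broken
+		// accounting is contained rather than fed to the free/defrag queues.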
+ cf_atomic32_set(&p_wblock_state->inuse_sz, ssd->write_block_size); + } + else if (! p_wblock_state->swb && + p_wblock_state->state != WBLOCK_STATE_DEFRAG) { + // Free wblock if all three gating conditions hold. + if (resulting_inuse_sz == 0) { + push_wblock_to_free_q(ssd, wblock_id, FREE_TO_HEAD); + } + // Queue wblock for defrag if appropriate. + else if (resulting_inuse_sz < ssd->ns->defrag_lwm_size) { + push_wblock_to_defrag_q(ssd, wblock_id); + } + } + + cf_mutex_unlock(&p_wblock_state->LOCK); +} + + +static void +log_bad_record(const char* ns_name, uint32_t n_bins, uint32_t block_bins, + const drv_ssd_bin* ssd_bin, const char* tag) +{ + cf_info(AS_DRV_SSD, "untrustworthy data from disk [%s]", tag); + cf_info(AS_DRV_SSD, " ns->name = %s", ns_name); + cf_info(AS_DRV_SSD, " bin %u [of %u]", (block_bins - n_bins) + 1, block_bins); + + if (ssd_bin) { + cf_info(AS_DRV_SSD, " ssd_bin->offset = %u", ssd_bin->offset); + cf_info(AS_DRV_SSD, " ssd_bin->len = %u", ssd_bin->len); + cf_info(AS_DRV_SSD, " ssd_bin->next = %u", ssd_bin->next); + } +} + + +// TODO - sanity-check rec-props? +bool +is_valid_record(const drv_ssd_block* block, const char* ns_name) +{ + uint8_t* block_head = (uint8_t*)block; + uint64_t size = (uint64_t)(block->length + LENGTH_BASE); + drv_ssd_bin* ssd_bin_end = (drv_ssd_bin*)(block_head + size - sizeof(drv_ssd_bin)); + drv_ssd_bin* ssd_bin = (drv_ssd_bin*)(block->data + block->bins_offset); + uint32_t n_bins = block->n_bins; + + if (! ssd_cold_start_is_valid_n_bins(n_bins)) { + log_bad_record(ns_name, n_bins, n_bins, NULL, "bins"); + return false; + } + + while (n_bins > 0) { + if (ssd_bin > ssd_bin_end) { + log_bad_record(ns_name, n_bins, block->n_bins, NULL, "bin ptr"); + return false; + } + + uint64_t data_offset = (uint64_t)((uint8_t*)(ssd_bin + 1) - block_head); + + if ((uint64_t)ssd_bin->offset != data_offset) { + log_bad_record(ns_name, n_bins, block->n_bins, ssd_bin, "offset"); + return false; + } + + uint64_t bin_end_offset = data_offset + (uint64_t)ssd_bin->len; + + if (bin_end_offset > size) { + log_bad_record(ns_name, n_bins, block->n_bins, ssd_bin, "length"); + return false; + } + + if (n_bins > 1) { + if ((uint64_t)ssd_bin->next != bin_end_offset) { + log_bad_record(ns_name, n_bins, block->n_bins, ssd_bin, "next ptr"); + return false; + } + + ssd_bin = (drv_ssd_bin*)(block_head + ssd_bin->next); + } + + n_bins--; + } + + return true; +} + + +void +defrag_move_record(drv_ssd *src_ssd, uint32_t src_wblock_id, + drv_ssd_block *block, as_index *r) +{ + uint64_t old_rblock_id = r->rblock_id; + uint16_t old_n_rblocks = r->n_rblocks; + + drv_ssds *ssds = (drv_ssds*)src_ssd->ns->storage_private; + + // Figure out which device to write to. When replacing an old record, it's + // possible this is different from the old device (e.g. if we've added a + // fresh device), so derive it from the digest each time. + drv_ssd *ssd = &ssds->ssds[ssd_get_file_id(ssds, &block->keyd)]; + + if (! ssd) { + cf_warning(AS_DRV_SSD, "{%s} defrag_move_record: no drv_ssd for file_id %u", + ssds->ns->name, ssd->file_id); + return; + } + + uint32_t write_size = block->length + LENGTH_BASE; + + pthread_mutex_lock(&ssd->defrag_lock); + + ssd_write_buf *swb = ssd->defrag_swb; + + if (! swb) { + swb = swb_get(ssd); + ssd->defrag_swb = swb; + + if (! 
swb) { + cf_warning(AS_DRV_SSD, "defrag_move_record: couldn't get swb"); + pthread_mutex_unlock(&ssd->defrag_lock); + return; + } + } + + // Check if there's enough space in defrag buffer - if not, free and zero + // any remaining unused space, enqueue it to be flushed to device, and grab + // a new buffer. + if (write_size > ssd->write_block_size - swb->pos) { + if (ssd->write_block_size != swb->pos) { + // Clean the end of the buffer before pushing to write queue. + memset(swb->buf + swb->pos, 0, ssd->write_block_size - swb->pos); + } + + // Enqueue the buffer, to be flushed to device. + swb->skip_post_write_q = true; + cf_queue_push(ssd->swb_write_q, &swb); + cf_atomic64_incr(&ssd->n_defrag_wblock_writes); + + // Get the new buffer. + swb = swb_get(ssd); + ssd->defrag_swb = swb; + + if (! swb) { + cf_warning(AS_DRV_SSD, "defrag_move_record: couldn't get swb"); + pthread_mutex_unlock(&ssd->defrag_lock); + return; + } + } + + memcpy(swb->buf + swb->pos, (const uint8_t*)block, write_size); + + uint64_t write_offset = WBLOCK_ID_TO_BYTES(ssd, swb->wblock_id) + swb->pos; + + ssd_encrypt(ssd, write_offset, (drv_ssd_block *)(swb->buf + swb->pos)); + + r->file_id = ssd->file_id; + r->rblock_id = BYTES_TO_RBLOCKS(write_offset); + r->n_rblocks = BYTES_TO_RBLOCKS(write_size); + + swb->pos += write_size; + + cf_atomic64_add(&ssd->inuse_size, (int64_t)write_size); + cf_atomic32_add(&ssd->alloc_table->wblock_state[swb->wblock_id].inuse_sz, (int32_t)write_size); + + // If we just defragged into a new destination swb, count it. + if (swb_add_unique_vacated_wblock(swb, src_ssd->file_id, src_wblock_id)) { + ssd_wblock_state* p_wblock_state = + &src_ssd->alloc_table->wblock_state[src_wblock_id]; + + cf_atomic32_incr(&p_wblock_state->n_vac_dests); + } + + pthread_mutex_unlock(&ssd->defrag_lock); + + ssd_block_free(src_ssd, old_rblock_id, old_n_rblocks, "defrag-write"); +} + + +int +ssd_record_defrag(drv_ssd *ssd, uint32_t wblock_id, drv_ssd_block *block, + uint64_t rblock_id, uint32_t n_rblocks) +{ + as_namespace *ns = ssd->ns; + as_partition_reservation rsv; + uint32_t pid = as_partition_getid(&block->keyd); + + as_partition_reserve(ns, pid, &rsv); + + int rv; + as_index_ref r_ref; + r_ref.skip_lock = false; + + bool found = 0 == as_record_get(rsv.tree, &block->keyd, &r_ref); + + if (found) { + as_index *r = r_ref.r; + + if (r->file_id == ssd->file_id && r->rblock_id == rblock_id) { + if (r->generation != block->generation) { + cf_warning_digest(AS_DRV_SSD, &r->keyd, "device %s defrag: rblock_id %lu generation mismatch (%u:%u) ", + ssd->name, rblock_id, r->generation, block->generation); + } + + if (r->n_rblocks != n_rblocks) { + cf_warning_digest(AS_DRV_SSD, &r->keyd, "device %s defrag: rblock_id %lu n_blocks mismatch (%u:%u) ", + ssd->name, rblock_id, r->n_rblocks, n_rblocks); + } + + defrag_move_record(ssd, wblock_id, block, r); + + rv = 0; // record was in index tree and current - moved it + } + else { + rv = -1; // record was in index tree - presumably was overwritten + } + + as_record_done(&r_ref, ns); + } + else { + rv = -2; // record was not in index tree - presumably was deleted + } + + as_partition_release(&rsv); + + return rv; +} + + +bool +ssd_is_full(drv_ssd *ssd, uint32_t wblock_id) +{ + if (cf_queue_sz(ssd->free_wblock_q) > DEFRAG_STARTUP_RESERVE) { + return false; + } + + ssd_wblock_state* p_wblock_state = &ssd->alloc_table->wblock_state[wblock_id]; + + cf_mutex_lock(&p_wblock_state->LOCK); + + if (cf_atomic32_get(p_wblock_state->inuse_sz) == 0) { + // Lucky - wblock is empty, let 
ssd_defrag_wblock() free it. + cf_mutex_unlock(&p_wblock_state->LOCK); + + return false; + } + + cf_warning(AS_DRV_SSD, "{%s}: defrag: drive %s totally full, re-queuing wblock %u", + ssd->ns->name, ssd->name, wblock_id); + + // Not using push_wblock_to_defrag_q() - state is already DEFRAG, we + // definitely have a queue, and it's better to push back to head. + cf_queue_push_head(ssd->defrag_wblock_q, &wblock_id); + + cf_mutex_unlock(&p_wblock_state->LOCK); + + // If we got here, we used all our runtime reserve wblocks, but the wblocks + // we defragged must still have non-zero inuse_sz. Must wait for those to + // become free. Sleep prevents retries from overwhelming the log. + sleep(1); + + return true; +} + + +int +ssd_defrag_wblock(drv_ssd *ssd, uint32_t wblock_id, uint8_t *read_buf) +{ + if (ssd_is_full(ssd, wblock_id)) { + return 0; + } + + int record_count = 0; + int num_old_records = 0; + int num_deleted_records = 0; + + ssd_wblock_state* p_wblock_state = &ssd->alloc_table->wblock_state[wblock_id]; + + cf_assert(p_wblock_state->n_vac_dests == 0, AS_DRV_SSD, + "n-vacations not 0 beginning defrag wblock"); + + // Make sure this can't decrement to 0 while defragging this wblock. + cf_atomic32_set(&p_wblock_state->n_vac_dests, 1); + + if (cf_atomic32_get(p_wblock_state->inuse_sz) == 0) { + goto Finished; + } + + int fd = ssd_fd_get(ssd); + uint64_t file_offset = WBLOCK_ID_TO_BYTES(ssd, wblock_id); + + uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? cf_getns() : 0; + + if (lseek(fd, (off_t)file_offset, SEEK_SET) != (off_t)file_offset) { + cf_warning(AS_DRV_SSD, "%s: seek failed: offset %lu: errno %d (%s)", + ssd->name, file_offset, errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Finished; + } + + ssize_t rlen = read(fd, read_buf, ssd->write_block_size); + + if (rlen != (ssize_t)ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): errno %d (%s)", + ssd->name, rlen, errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Finished; + } + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_large_block_read, start_ns); + } + + ssd_fd_put(ssd, fd); + + size_t wblock_offset = 0; // current offset within the wblock, in bytes + + while (wblock_offset < ssd->write_block_size && + cf_atomic32_get(p_wblock_state->inuse_sz) != 0) { + drv_ssd_block *block = (drv_ssd_block*)&read_buf[wblock_offset]; + + ssd_decrypt(ssd, file_offset + wblock_offset, block); + + if (block->magic != SSD_BLOCK_MAGIC) { + // First block must have magic. + if (wblock_offset == 0) { + cf_warning(AS_DRV_SSD, "BLOCK CORRUPTED: device %s has bad data on wblock %d", + ssd->name, wblock_id); + break; + } + + // Later blocks may have no magic, just skip to next block. + wblock_offset += RBLOCK_SIZE; + continue; + } + + // Note - if block->length is sane, we don't need to round up to a + // multiple of RBLOCK_SIZE, but let's do it anyway just to be safe. + size_t next_wblock_offset = wblock_offset + + BYTES_TO_RBLOCK_BYTES(block->length + LENGTH_BASE); + + if (next_wblock_offset > ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "error: block extends over read size: foff %lu boff %lu blen %lu", + file_offset, wblock_offset, (uint64_t)block->length); + break; + } + + // Found a good record, move it if it's current. 
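+		// (Return convention: 0 - index entry still points here, record
+		// moved; -1 - overwritten since written; -2 - deleted. Tallied
+		// below for the defrag log line.)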
+		int rv = ssd_record_defrag(ssd, wblock_id, block,
+				BYTES_TO_RBLOCKS(file_offset + wblock_offset),
+				(uint32_t)BYTES_TO_RBLOCKS(next_wblock_offset - wblock_offset));
+
+		if (rv == 0) {
+			record_count++;
+		}
+		else if (rv == -1) {
+			num_old_records++;
+		}
+		else if (rv == -2) {
+			num_deleted_records++;
+		}
+
+		wblock_offset = next_wblock_offset;
+	}
+
+Finished:
+
+	// Note - usually wblock's inuse_sz is 0 here, but may legitimately be non-0
+	// e.g. if a dropped partition's tree is not done purging. In this case, we
+	// may have found deleted records in the wblock whose used-size contribution
+	// has not yet been subtracted.
+
+	cf_detail(AS_DRV_SSD, "device %s: wblock-id %u defragged, final in-use-sz %d records (%d:%d:%d)",
+			ssd->name, wblock_id, cf_atomic32_get(p_wblock_state->inuse_sz),
+			record_count, num_old_records, num_deleted_records);
+
+	ssd_release_vacated_wblock(ssd, wblock_id, p_wblock_state);
+
+	return record_count;
+}
+
+
+// Thread "run" function to service a device's defrag queue.
+void*
+run_defrag(void *pv_data)
+{
+	drv_ssd *ssd = (drv_ssd*)pv_data;
+	uint32_t wblock_id;
+	uint8_t *read_buf = cf_valloc(ssd->write_block_size);
+
+	while (true) {
+		uint32_t q_min = ssd->ns->storage_defrag_queue_min;
+
+		if (q_min != 0) {
+			if (cf_queue_sz(ssd->defrag_wblock_q) > q_min) {
+				if (CF_QUEUE_OK !=
+						cf_queue_pop(ssd->defrag_wblock_q, &wblock_id,
+								CF_QUEUE_NOWAIT)) {
+					// Should never get here!
+					break;
+				}
+			}
+			else {
+				usleep(1000 * 50);
+				continue;
+			}
+		}
+		else {
+			if (CF_QUEUE_OK !=
+					cf_queue_pop(ssd->defrag_wblock_q, &wblock_id,
+							CF_QUEUE_FOREVER)) {
+				// Should never get here!
+				break;
+			}
+		}
+
+		ssd_defrag_wblock(ssd, wblock_id, read_buf);
+
+		uint32_t sleep_us = ssd->ns->storage_defrag_sleep;
+
+		if (sleep_us != 0) {
+			usleep(sleep_us);
+		}
+	}
+
+	// Although we never expect to get here...
+	cf_free(read_buf);
+	cf_warning(AS_DRV_SSD, "device %s: quit defrag - queue error", ssd->name);
+
+	return NULL;
+}
+
+
+void
+ssd_start_defrag_threads(drv_ssds *ssds)
+{
+	cf_info(AS_DRV_SSD, "{%s} starting defrag threads", ssds->ns->name);
+
+	for (int i = 0; i < ssds->n_ssds; i++) {
+		drv_ssd *ssd = &ssds->ssds[i];
+
+		if (pthread_create(&ssd->defrag_thread, NULL, run_defrag,
+				(void*)ssd) != 0) {
+			cf_crash(AS_DRV_SSD, "%s defrag thread failed", ssd->name);
+		}
+	}
+}
+
+
+//------------------------------------------------
+// defrag_pen class.
+//
+
+#define DEFRAG_PEN_INIT_CAPACITY (8 * 1024)
+
+typedef struct defrag_pen_s {
+	uint32_t n_ids;
+	uint32_t capacity;
+	uint32_t *ids;
+	uint32_t stack_ids[DEFRAG_PEN_INIT_CAPACITY];
+} defrag_pen;
+
+static void
+defrag_pen_init(defrag_pen *pen)
+{
+	pen->n_ids = 0;
+	pen->capacity = DEFRAG_PEN_INIT_CAPACITY;
+	pen->ids = pen->stack_ids;
+}
+
+static void
+defrag_pen_destroy(defrag_pen *pen)
+{
+	if (pen->ids != pen->stack_ids) {
+		cf_free(pen->ids);
+	}
+}
+
+static void
+defrag_pen_add(defrag_pen *pen, uint32_t wblock_id)
+{
+	if (pen->n_ids == pen->capacity) {
+		if (pen->capacity == DEFRAG_PEN_INIT_CAPACITY) {
+			pen->capacity <<= 2;
+			pen->ids = cf_malloc(pen->capacity * sizeof(uint32_t));
+			memcpy(pen->ids, pen->stack_ids, sizeof(pen->stack_ids));
+		}
+		else {
+			pen->capacity <<= 1;
+			pen->ids = cf_realloc(pen->ids, pen->capacity * sizeof(uint32_t));
+		}
+	}
+
+	pen->ids[pen->n_ids++] = wblock_id;
+}
+
+static void
+defrag_pen_transfer(defrag_pen *pen, drv_ssd *ssd)
+{
+	// For speed, "customize" instead of using push_wblock_to_defrag_q()...
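+	// Here "customize" means skipping the null-queue check (the caller just
+	// created the queues) and the per-push n_defrag_wblock_reads increment -
+	// run_load_queues() sets that counter in bulk when loading is done.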
+ for (uint32_t i = 0; i < pen->n_ids; i++) { + uint32_t wblock_id = pen->ids[i]; + + ssd->alloc_table->wblock_state[wblock_id].state = WBLOCK_STATE_DEFRAG; + cf_queue_push(ssd->defrag_wblock_q, &wblock_id); + } +} + +static void +defrag_pens_dump(defrag_pen pens[], uint32_t n_pens, const char* ssd_name) +{ + char buf[2048]; + uint32_t n = 0; + int pos = sprintf(buf, "%u", pens[n++].n_ids); + + while (n < n_pens) { + pos += sprintf(buf + pos, ",%u", pens[n++].n_ids); + } + + cf_info(AS_DRV_SSD, "%s init defrag profile: %s", ssd_name, buf); +} + +// +// END - defrag_pen class. +//------------------------------------------------ + + +// Thread "run" function to create and load a device's (wblock) free & defrag +// queues at startup. Sorts defrag-eligible wblocks so the most depleted ones +// are at the head of the defrag queue. +void* +run_load_queues(void *pv_data) +{ + drv_ssd *ssd = (drv_ssd*)pv_data; + + // TODO - would be nice to have a queue create of specified capacity. + ssd->free_wblock_q = cf_queue_create(sizeof(uint32_t), true); + ssd->defrag_wblock_q = cf_queue_create(sizeof(uint32_t), true); + + as_namespace *ns = ssd->ns; + uint32_t lwm_pct = ns->storage_defrag_lwm_pct; + uint32_t lwm_size = ns->defrag_lwm_size; + defrag_pen pens[lwm_pct]; + + for (uint32_t n = 0; n < lwm_pct; n++) { + defrag_pen_init(&pens[n]); + } + + ssd_alloc_table* at = ssd->alloc_table; + uint32_t first_id = BYTES_TO_WBLOCK_ID(ssd, SSD_HEADER_SIZE); + uint32_t last_id = at->n_wblocks; + + for (uint32_t wblock_id = first_id; wblock_id < last_id; wblock_id++) { + uint32_t inuse_sz = at->wblock_state[wblock_id].inuse_sz; + + if (inuse_sz == 0) { + // Faster than using push_wblock_to_free_q() here... + cf_queue_push(ssd->free_wblock_q, &wblock_id); + } + else if (inuse_sz < lwm_size) { + defrag_pen_add(&pens[(inuse_sz * lwm_pct) / lwm_size], wblock_id); + } + } + + defrag_pens_dump(pens, lwm_pct, ssd->name); + + for (uint32_t n = 0; n < lwm_pct; n++) { + defrag_pen_transfer(&pens[n], ssd); + defrag_pen_destroy(&pens[n]); + } + + ssd->n_defrag_wblock_reads = (uint64_t)cf_queue_sz(ssd->defrag_wblock_q); + + return NULL; +} + + +void +ssd_load_wblock_queues(drv_ssds *ssds) +{ + cf_info(AS_DRV_SSD, "{%s} loading free & defrag queues", ssds->ns->name); + + // Split this task across multiple threads. + pthread_t q_load_threads[ssds->n_ssds]; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + if (pthread_create(&q_load_threads[i], NULL, run_load_queues, + (void*)ssd) != 0) { + cf_crash(AS_DRV_SSD, "%s load queues thread failed", ssd->name); + } + } + + for (int i = 0; i < ssds->n_ssds; i++) { + pthread_join(q_load_threads[i], NULL); + } + // Now we're single-threaded again. + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + cf_info(AS_DRV_SSD, "%s init wblock free-q %d, defrag-q %d", ssd->name, + cf_queue_sz(ssd->free_wblock_q), + cf_queue_sz(ssd->defrag_wblock_q)); + } +} + + +void +ssd_wblock_init(drv_ssd *ssd) +{ + uint32_t n_wblocks = (uint32_t)(ssd->file_size / ssd->write_block_size); + + cf_info(AS_DRV_SSD, "%s has %u wblocks of size %u", ssd->name, n_wblocks, + ssd->write_block_size); + + ssd_alloc_table *at = cf_malloc(sizeof(ssd_alloc_table) + (n_wblocks * sizeof(ssd_wblock_state))); + + at->n_wblocks = n_wblocks; + + // Device header wblocks' inuse_sz will (also) be 0 but that doesn't matter. 
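+	// (Sizing example, with assumed numbers: a 100 GiB device at a 128 KiB
+	// write-block-size yields 819,200 wblock_state entries in this table.)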
+ for (uint32_t i = 0; i < n_wblocks; i++) { + ssd_wblock_state * p_wblock_state = &at->wblock_state[i]; + + cf_atomic32_set(&p_wblock_state->inuse_sz, 0); + cf_mutex_init(&p_wblock_state->LOCK); + p_wblock_state->swb = NULL; + p_wblock_state->state = WBLOCK_STATE_NONE; + p_wblock_state->n_vac_dests = 0; + } + + ssd->alloc_table = at; +} + + +//========================================================== +// Record reading utilities. +// + +int +ssd_read_record(as_storage_rd *rd) +{ + as_namespace *ns = rd->ns; + as_record *r = rd->r; + + if (STORAGE_RBLOCK_IS_INVALID(r->rblock_id)) { + cf_warning_digest(AS_DRV_SSD, &r->keyd, "{%s} read_ssd: invalid rblock_id ", + ns->name); + return -1; + } + + uint64_t record_offset = RBLOCKS_TO_BYTES(r->rblock_id); + uint64_t record_size = RBLOCKS_TO_BYTES(r->n_rblocks); + + uint8_t *read_buf = NULL; + drv_ssd_block *block = NULL; + + drv_ssd *ssd = rd->ssd; + ssd_write_buf *swb = 0; + uint32_t wblock = RBLOCK_ID_TO_WBLOCK_ID(ssd, r->rblock_id); + + swb_check_and_reserve(&ssd->alloc_table->wblock_state[wblock], &swb); + + if (swb) { + // Data is in write buffer, so read it from there. + cf_atomic32_incr(&ns->n_reads_from_cache); + + read_buf = cf_malloc(record_size); + block = (drv_ssd_block*)read_buf; + + int swb_offset = record_offset - WBLOCK_ID_TO_BYTES(ssd, wblock); + memcpy(read_buf, swb->buf + swb_offset, record_size); + swb_release(swb); + + ssd_decrypt(ssd, record_offset, block); + } + else { + // Normal case - data is read from device. + cf_atomic32_incr(&ns->n_reads_from_device); + + uint64_t record_end_offset = record_offset + record_size; + uint64_t read_offset = BYTES_DOWN_TO_IO_MIN(ssd, record_offset); + uint64_t read_end_offset = BYTES_UP_TO_IO_MIN(ssd, record_end_offset); + size_t read_size = read_end_offset - read_offset; + uint64_t record_buf_indent = record_offset - read_offset; + + read_buf = cf_valloc(read_size); + + int fd = ssd_fd_get(ssd); + + uint64_t start_ns = ns->storage_benchmarks_enabled ? cf_getns() : 0; + + if (lseek(fd, (off_t)read_offset, SEEK_SET) != (off_t)read_offset) { + cf_warning(AS_DRV_SSD, "%s: seek failed: offset %lu: errno %d (%s)", + ssd->name, read_offset, errno, cf_strerror(errno)); + cf_free(read_buf); + close(fd); + return -1; + } + + ssize_t rv = read(fd, read_buf, read_size); + + if (rv != (ssize_t)read_size) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): size %lu: errno %d (%s)", + ssd->name, rv, read_size, errno, cf_strerror(errno)); + cf_free(read_buf); + close(fd); + return -1; + } + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_read, start_ns); + } + + ssd_fd_put(ssd, fd); + + block = (drv_ssd_block*)(read_buf + record_buf_indent); + ssd_decrypt(ssd, record_offset, block); + + // Sanity checks. 
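+		// Validate everything read off the device before trusting it: block
+		// magic, block length vs. read size, digest match, bin count, and
+		// rec-props offset - any failure abandons the read.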
+ + if (block->magic != SSD_BLOCK_MAGIC) { + cf_warning(AS_DRV_SSD, "read: bad block magic offset %lu", + read_offset); + cf_free(read_buf); + return -1; + } + + if (block->length + LENGTH_BASE > read_size) { + cf_warning(AS_DRV_SSD, "read: bad block length %u", block->length); + cf_free(read_buf); + return -1; + } + + if (0 != cf_digest_compare(&block->keyd, &r->keyd)) { + cf_warning(AS_DRV_SSD, "read: read wrong key: expecting %lx got %lx", + *(uint64_t*)&r->keyd, *(uint64_t*)&block->keyd); + cf_free(read_buf); + return -1; + } + + if (block->n_bins > BIN_NAMES_QUOTA) { + cf_warning(AS_DRV_SSD, "read: bad block n_bins %u", block->n_bins); + cf_free(read_buf); + return -1; + } + + if (block->bins_offset + offsetof(drv_ssd_block, data) > read_size) { + cf_warning(AS_DRV_SSD, "read: bad block bins_offset %u", block->bins_offset); + cf_free(read_buf); + return -1; + } + + if (ns->storage_benchmarks_enabled) { + histogram_insert_raw(ns->device_read_size_hist, read_size); + } + } + + rd->block = block; + rd->must_free_block = read_buf; + + return 0; +} + + +//========================================================== +// Storage API implementation: reading records. +// + +int +as_storage_record_load_n_bins_ssd(as_storage_rd *rd) +{ + if (! as_record_is_live(rd->r)) { + rd->n_bins = 0; + return 0; // no need to read device + } + + // If the record hasn't been read, read it. + if (! rd->block && ssd_read_record(rd) != 0) { + cf_warning(AS_DRV_SSD, "load_n_bins: failed ssd_read_record()"); + return -1; + } + + rd->n_bins = rd->block->n_bins; + return 0; +} + + +int +as_storage_record_load_bins_ssd(as_storage_rd *rd) +{ + if (! as_record_is_live(rd->r)) { + return 0; // no need to read device + } + + // If the record hasn't been read, read it. + if (! rd->block && ssd_read_record(rd) != 0) { + cf_warning(AS_DRV_SSD, "load_bins: failed ssd_read_record()"); + return -1; + } + + drv_ssd_block *block = rd->block; + uint8_t *block_head = (uint8_t*)rd->block; + + drv_ssd_bin *ssd_bin = (drv_ssd_bin*)(block->data + block->bins_offset); + + for (uint16_t i = 0; i < block->n_bins; i++) { + as_bin_set_id_from_name(rd->ns, &rd->bins[i], ssd_bin->name); + + int rv = as_bin_particle_cast_from_flat(&rd->bins[i], + block_head + ssd_bin->offset, ssd_bin->len); + + if (0 != rv) { + return rv; + } + + ssd_bin = (drv_ssd_bin*)(block_head + ssd_bin->next); + } + + return 0; +} + + +bool +as_storage_record_get_key_ssd(as_storage_rd *rd) +{ + // If the record hasn't been read, read it. + if (! rd->block && ssd_read_record(rd) != 0) { + cf_warning(AS_DRV_SSD, "get_key: failed ssd_read_record()"); + return false; + } + + drv_ssd_block *block = rd->block; + as_rec_props props; + + props.size = block->bins_offset; + + if (props.size == 0) { + return false; + } + + props.p_data = block->data; + + return as_rec_props_get_value(&props, CL_REC_PROPS_FIELD_KEY, + &rd->key_size, &rd->key) == 0; +} + + +//========================================================== +// Record writing utilities. +// + +void +ssd_flush_swb(drv_ssd *ssd, ssd_write_buf *swb) +{ + // Wait for all writers to finish. + while (cf_atomic32_get(swb->n_writers) != 0) { + ; + } + + int fd = ssd_fd_get(ssd); + off_t write_offset = (off_t)WBLOCK_ID_TO_BYTES(ssd, swb->wblock_id); + + uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? 
cf_getns() : 0; + + if (lseek(fd, write_offset, SEEK_SET) != write_offset) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED seek: offset %ld: errno %d (%s)", + ssd->name, write_offset, errno, cf_strerror(errno)); + } + + ssize_t rv_s = write(fd, swb->buf, ssd->write_block_size); + + if (rv_s != (ssize_t)ssd->write_block_size) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)", + ssd->name, errno, cf_strerror(errno)); + } + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_write, start_ns); + } + + ssd_fd_put(ssd, fd); +} + + +void +ssd_shadow_flush_swb(drv_ssd *ssd, ssd_write_buf *swb) +{ + int fd = ssd_shadow_fd_get(ssd); + off_t write_offset = (off_t)WBLOCK_ID_TO_BYTES(ssd, swb->wblock_id); + + uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? cf_getns() : 0; + + if (lseek(fd, write_offset, SEEK_SET) != write_offset) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED seek: offset %ld: errno %d (%s)", + ssd->shadow_name, write_offset, errno, cf_strerror(errno)); + } + + ssize_t rv_s = write(fd, swb->buf, ssd->write_block_size); + + if (rv_s != (ssize_t)ssd->write_block_size) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)", + ssd->shadow_name, errno, cf_strerror(errno)); + } + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_shadow_write, start_ns); + } + + ssd_shadow_fd_put(ssd, fd); +} + + +void +ssd_write_sanity_checks(drv_ssd *ssd, ssd_write_buf *swb) +{ + ssd_wblock_state* p_wblock_state = + &ssd->alloc_table->wblock_state[swb->wblock_id]; + + if (p_wblock_state->swb != swb) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u swb not consistent while writing", + ssd->name, swb->wblock_id); + } + + if (p_wblock_state->state != WBLOCK_STATE_NONE) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u state not NONE while writing", + ssd->name, swb->wblock_id); + } +} + + +void +ssd_post_write(drv_ssd *ssd, ssd_write_buf *swb) +{ + if (cf_atomic32_get(ssd->ns->storage_post_write_queue) == 0 || + swb->skip_post_write_q) { + swb_dereference_and_release(ssd, swb->wblock_id, swb); + } + else { + // Transfer swb to post-write queue. + cf_queue_push(ssd->post_write_q, &swb); + } + + if (ssd->post_write_q) { + // Release post-write queue swbs if we're over the limit. + while ((uint32_t)cf_queue_sz(ssd->post_write_q) > + cf_atomic32_get(ssd->ns->storage_post_write_queue)) { + ssd_write_buf* cached_swb; + + if (CF_QUEUE_OK != cf_queue_pop(ssd->post_write_q, &cached_swb, + CF_QUEUE_NOWAIT)) { + // Should never happen. + cf_warning(AS_DRV_SSD, "device %s: post-write queue pop failed", + ssd->name); + break; + } + + swb_dereference_and_release(ssd, cached_swb->wblock_id, + cached_swb); + } + } +} + + +// Thread "run" function that flushes write buffers to device. +void * +ssd_write_worker(void *arg) +{ + drv_ssd *ssd = (drv_ssd*)arg; + + while (ssd->running) { + ssd_write_buf *swb; + + if (CF_QUEUE_OK != cf_queue_pop(ssd->swb_write_q, &swb, 100)) { + continue; + } + + // Sanity checks (optional). + ssd_write_sanity_checks(ssd, swb); + + // Flush to the device. + ssd_flush_swb(ssd, swb); + + if (ssd->shadow_name) { + // Queue for shadow device write. + cf_queue_push(ssd->swb_shadow_q, &swb); + } + else { + // If this swb was a defrag destination, release the sources. + swb_release_all_vacated_wblocks(swb); + + // Transfer to post-write queue, or release swb, as appropriate. 
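+			// (While parked on the post-write queue, the swb stays attached
+			// to its wblock, so recent writes can be read back from memory -
+			// see swb_check_and_reserve() in ssd_read_record().)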
+ ssd_post_write(ssd, swb); + } + } // infinite event loop waiting for block to write + + return NULL; +} + + +// Thread "run" function that flushes write buffers to shadow device. +void * +ssd_shadow_worker(void *arg) +{ + drv_ssd *ssd = (drv_ssd*)arg; + + while (ssd->running) { + ssd_write_buf *swb; + + if (CF_QUEUE_OK != cf_queue_pop(ssd->swb_shadow_q, &swb, 100)) { + continue; + } + + // Sanity checks (optional). + ssd_write_sanity_checks(ssd, swb); + + // Flush to the shadow device. + ssd_shadow_flush_swb(ssd, swb); + + // If this swb was a defrag destination, release the sources. + swb_release_all_vacated_wblocks(swb); + + // Transfer to post-write queue, or release swb, as appropriate. + ssd_post_write(ssd, swb); + } + + return NULL; +} + + +void +ssd_start_write_worker_threads(drv_ssds *ssds) +{ + if (ssds->ns->storage_write_threads > MAX_SSD_THREADS) { + cf_warning(AS_DRV_SSD, "configured number of write threads %u greater than max, using %d instead", + ssds->ns->storage_write_threads, MAX_SSD_THREADS); + ssds->ns->storage_write_threads = MAX_SSD_THREADS; + } + + cf_info(AS_DRV_SSD, "{%s} starting write worker threads", ssds->ns->name); + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + for (uint32_t j = 0; j < ssds->ns->storage_write_threads; j++) { + pthread_create(&ssd->write_worker_thread[j], 0, ssd_write_worker, + (void*)ssd); + } + + if (ssd->shadow_name) { + pthread_create(&ssd->shadow_worker_thread, 0, ssd_shadow_worker, + (void*)ssd); + } + } +} + + +static inline uint32_t +ssd_record_overhead_size(as_storage_rd *rd) +{ + // Start with size of record header struct. + size_t size = sizeof(drv_ssd_block); + + // Add size of any record properties. + if (rd->rec_props.p_data) { + size += rd->rec_props.size; + } + + return (uint32_t)size; +} + + +uint32_t +ssd_record_size(as_storage_rd *rd) +{ + // Start with the record storage overhead, including vinfo and rec-props. + uint32_t write_size = ssd_record_overhead_size(rd); + + // Add the bins' sizes, including bin overhead. + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *bin = &rd->bins[i]; + + if (! as_bin_inuse(bin)) { + break; + } + + // TODO: could factor out sizeof(drv_ssd_bin) and multiply by i, but + // for now let's favor the low bin-count case and leave it this way. + write_size += sizeof(drv_ssd_bin) + as_bin_particle_flat_size(bin); + } + + return write_size; +} + + +int +ssd_buffer_bins(as_storage_rd *rd) +{ + as_namespace *ns = rd->ns; + as_record *r = rd->r; + drv_ssd *ssd = rd->ssd; + + // Note - this is the only place where rounding size (up to a multiple of + // RBLOCK_SIZE) is really necessary. + uint32_t write_size = BYTES_TO_RBLOCK_BYTES(ssd_record_size(rd)); + + if (write_size > ssd->write_block_size) { + cf_detail_digest(AS_DRV_SSD, &r->keyd, "write: size %u - rejecting ", + write_size); + return -AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG; + } + + // Reserve the portion of the current swb where this record will be written. + pthread_mutex_lock(&ssd->write_lock); + + ssd_write_buf *swb = ssd->current_swb; + + if (! swb) { + swb = swb_get(ssd); + ssd->current_swb = swb; + + if (! swb) { + cf_warning(AS_DRV_SSD, "write bins: couldn't get swb"); + pthread_mutex_unlock(&ssd->write_lock); + return -AS_PROTO_RESULT_FAIL_OUT_OF_SPACE; + } + } + + // Check if there's enough space in current buffer - if not, free and zero + // any remaining unused space, enqueue it to be flushed to device, and grab + // a new buffer. 
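+	// Note the two-phase scheme: space in the swb is reserved under
+	// write_lock, but the record is flattened into that space after the lock
+	// is dropped, so multiple writers can fill one swb concurrently (tracked
+	// by n_writers, which the flush path waits on).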
+ if (write_size > ssd->write_block_size - swb->pos) { + if (ssd->write_block_size != swb->pos) { + // Clean the end of the buffer before pushing to write queue. + memset(&swb->buf[swb->pos], 0, ssd->write_block_size - swb->pos); + } + + // Enqueue the buffer, to be flushed to device. + cf_queue_push(ssd->swb_write_q, &swb); + cf_atomic64_incr(&ssd->n_wblock_writes); + + // Get the new buffer. + swb = swb_get(ssd); + ssd->current_swb = swb; + + if (! swb) { + cf_warning(AS_DRV_SSD, "write bins: couldn't get swb"); + pthread_mutex_unlock(&ssd->write_lock); + return -AS_PROTO_RESULT_FAIL_OUT_OF_SPACE; + } + } + + // There's enough space - save the position where this record will be + // written, and advance swb->pos for the next writer. + uint32_t swb_pos = swb->pos; + + swb->pos += write_size; + cf_atomic32_incr(&swb->n_writers); + + pthread_mutex_unlock(&ssd->write_lock); + // May now write this record concurrently with others in this swb. + + // Flatten data into the block. + + uint8_t *buf = &swb->buf[swb_pos]; + uint8_t *buf_start = buf; + + drv_ssd_block *block = (drv_ssd_block*)buf; + + buf += sizeof(drv_ssd_block); + + // Properties list goes just before bins. + if (rd->rec_props.p_data) { + memcpy(buf, rd->rec_props.p_data, rd->rec_props.size); + buf += rd->rec_props.size; + } + + uint16_t n_bins_written; + + for (n_bins_written = 0; n_bins_written < rd->n_bins; n_bins_written++) { + as_bin *bin = &rd->bins[n_bins_written]; + + if (! as_bin_inuse(bin)) { + break; + } + + drv_ssd_bin *ssd_bin = (drv_ssd_bin*)buf; + + buf += sizeof(drv_ssd_bin); + + ssd_bin->version = 0; + + if (ns->single_bin) { + ssd_bin->name[0] = 0; + } + else { + strcpy(ssd_bin->name, as_bin_get_name_from_id(ns, bin->id)); + } + + ssd_bin->offset = buf - buf_start; + + uint32_t particle_flat_size = as_bin_particle_to_flat(bin, buf); + + buf += particle_flat_size; + ssd_bin->len = particle_flat_size; + ssd_bin->next = buf - buf_start; + } + + block->sig = 0; // deprecated + block->length = write_size - LENGTH_BASE; + block->magic = SSD_BLOCK_MAGIC; + block->keyd = r->keyd; + block->generation = r->generation; + block->void_time = r->void_time; + block->bins_offset = rd->rec_props.p_data ? rd->rec_props.size : 0; + block->n_bins = n_bins_written; + block->last_update_time = r->last_update_time; + + uint64_t write_offset = WBLOCK_ID_TO_BYTES(ssd, swb->wblock_id) + swb_pos; + + ssd_encrypt(ssd, write_offset, block); + + r->file_id = ssd->file_id; + r->rblock_id = BYTES_TO_RBLOCKS(write_offset); + r->n_rblocks = BYTES_TO_RBLOCKS(write_size); + + cf_atomic64_add(&ssd->inuse_size, (int64_t)write_size); + cf_atomic32_add(&ssd->alloc_table->wblock_state[swb->wblock_id].inuse_sz, (int32_t)write_size); + + // We are finished writing to the buffer. + cf_atomic32_decr(&swb->n_writers); + + if (ns->storage_benchmarks_enabled) { + histogram_insert_raw(ns->device_write_size_hist, write_size); + } + + return 0; +} + + +int +ssd_write(as_storage_rd *rd) +{ + as_record *r = rd->r; + + drv_ssd *old_ssd = NULL; + uint64_t old_rblock_id = 0; + uint16_t old_n_rblocks = 0; + + if (STORAGE_RBLOCK_IS_VALID(r->rblock_id)) { + // Replacing an old record. + old_ssd = rd->ssd; + old_rblock_id = r->rblock_id; + old_n_rblocks = r->n_rblocks; + } + + drv_ssds *ssds = (drv_ssds*)rd->ns->storage_private; + + // Figure out which device to write to. When replacing an old record, it's + // possible this is different from the old device (e.g. if we've added a + // fresh device), so derive it from the digest each time. 
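+	// (ssd_get_file_id() reads 4 bytes of the digest at
+	// DIGEST_STORAGE_BASE_BYTE and takes them modulo n_ssds, so a record's
+	// home device is stable as long as the device count is unchanged.)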
+ rd->ssd = &ssds->ssds[ssd_get_file_id(ssds, &r->keyd)]; + + drv_ssd *ssd = rd->ssd; + + if (! ssd) { + cf_warning(AS_DRV_SSD, "{%s} ssd_write: no drv_ssd for file_id %u", + rd->ns->name, ssd_get_file_id(ssds, &r->keyd)); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + int rv = ssd_write_bins(rd); + + if (rv == 0 && old_ssd) { + ssd_block_free(old_ssd, old_rblock_id, old_n_rblocks, "ssd-write"); + } + + return rv; +} + + +//========================================================== +// Storage statistics utilities. +// + +void +as_storage_show_wblock_stats(as_namespace *ns) +{ + if (AS_STORAGE_ENGINE_SSD != ns->storage_type) { + cf_info(AS_DRV_SSD, "Storage engine type must be SSD (%d), not %d.", + AS_STORAGE_ENGINE_SSD, ns->storage_type); + return; + } + + if (ns->storage_private) { + drv_ssds *ssds = ns->storage_private; + + for (int d = 0; d < ssds->n_ssds; d++) { + int num_free_blocks = 0; + int num_full_blocks = 0; + int num_full_swb = 0; + int num_above_wm = 0; + int num_defraggable = 0; + + drv_ssd *ssd = &ssds->ssds[d]; + ssd_alloc_table *at = ssd->alloc_table; + uint32_t lwm_size = ns->defrag_lwm_size; + + for (uint32_t i = 0; i < at->n_wblocks; i++) { + ssd_wblock_state *wblock_state = &at->wblock_state[i]; + uint32_t inuse_sz = cf_atomic32_get(wblock_state->inuse_sz); + + if (inuse_sz == 0) { + num_free_blocks++; + } + else if (inuse_sz == ssd->write_block_size) { + if (wblock_state->swb) { + num_full_swb++; + } + else { + num_full_blocks++; + } + } + else { + if (inuse_sz > ssd->write_block_size || inuse_sz < lwm_size) { + cf_info(AS_DRV_SSD, "dev %d, wblock %u, inuse_sz %u, %s swb", + d, i, inuse_sz, wblock_state->swb ? "has" : "no"); + + num_defraggable++; + } + else { + num_above_wm++; + } + } + } + + cf_info(AS_DRV_SSD, "device %s free %d full %d fullswb %d pfull %d defrag %d freeq %d", + ssd->name, num_free_blocks, num_full_blocks, num_full_swb, + num_above_wm, num_defraggable, cf_queue_sz(ssd->free_wblock_q)); + } + } + else { + cf_info(AS_DRV_SSD, "no devices"); + } +} + + +void +as_storage_summarize_wblock_stats(as_namespace *ns) +{ + if (AS_STORAGE_ENGINE_SSD != ns->storage_type) { + cf_info(AS_DRV_SSD, "Storage engine type must be SSD (%d), not %d.", + AS_STORAGE_ENGINE_SSD, ns->storage_type); + return; + } + + if (! ns->storage_private) { + cf_info(AS_DRV_SSD, "no devices"); + return; + } + + drv_ssds *ssds = ns->storage_private; + uint32_t total_num_defraggable = 0; + uint32_t total_num_above_wm = 0; + uint64_t defraggable_sz = 0; + uint64_t non_defraggable_sz = 0; + + // Note: This is a sparse array that could be more efficiently stored. + // (In addition, ranges of block sizes could be binned together to + // compress the histogram, rather than using one bin per block size.) 
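+	// (At one uint32_t counter per possible in-use byte count, the histogram
+	// costs MAX_WRITE_BLOCK_SIZE * 4 bytes of stack.)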
+ uint32_t wb_hist[MAX_WRITE_BLOCK_SIZE] = { 0 }; + + for (uint32_t d = 0; d < ssds->n_ssds; d++) { + drv_ssd *ssd = &ssds->ssds[d]; + ssd_alloc_table *at = ssd->alloc_table; + uint32_t num_free_blocks = 0; + uint32_t num_full_swb = 0; + uint32_t num_full_blocks = 0; + uint32_t lwm_size = ns->defrag_lwm_size; + uint32_t num_defraggable = 0; + uint32_t num_above_wm = 0; + + for (uint32_t i = 0; i < at->n_wblocks; i++) { + ssd_wblock_state *wblock_state = &at->wblock_state[i]; + uint32_t inuse_sz = cf_atomic32_get(wblock_state->inuse_sz); + + if (inuse_sz > ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "wblock size (%d > %d) too large ~~ not counting in histogram", + inuse_sz, ssd->write_block_size); + } + else { + wb_hist[inuse_sz]++; + } + + if (inuse_sz == 0) { + num_free_blocks++; + } + else if (inuse_sz == ssd->write_block_size) { + if (wblock_state->swb) { + num_full_swb++; + } + else { + num_full_blocks++; + } + } + else if (inuse_sz < lwm_size) { + defraggable_sz += inuse_sz; + num_defraggable++; + } + else { + non_defraggable_sz += inuse_sz; + num_above_wm++; + } + } + + total_num_defraggable += num_defraggable; + total_num_above_wm += num_above_wm; + + cf_info(AS_DRV_SSD, "device %s free %u full %u fullswb %u pfull %u defrag %u freeq %u", + ssd->name, num_free_blocks, num_full_blocks, num_full_swb, + num_above_wm, num_defraggable, cf_queue_sz(ssd->free_wblock_q)); + } + + cf_info(AS_DRV_SSD, "WBH: Storage histogram for namespace \"%s\":", + ns->name); + cf_info(AS_DRV_SSD, "WBH: Average wblock size of: defraggable blocks: %lu bytes; nondefraggable blocks: %lu bytes; all blocks: %lu bytes", + defraggable_sz / MAX(1, total_num_defraggable), + non_defraggable_sz / MAX(1, total_num_above_wm), + (defraggable_sz + non_defraggable_sz) / + MAX(1, (total_num_defraggable + total_num_above_wm))); + + for (uint32_t i = 0; i < MAX_WRITE_BLOCK_SIZE; i++) { + if (wb_hist[i] > 0) { + cf_info(AS_DRV_SSD, "WBH: %u block%s of size %u bytes", + wb_hist[i], (wb_hist[i] != 1 ? "s" : ""), i); + } + } +} + + +// TODO - do something more useful with this info command. +int +as_storage_analyze_wblock(as_namespace* ns, int device_index, + uint32_t wblock_id) +{ + if (AS_STORAGE_ENGINE_SSD != ns->storage_type) { + cf_info(AS_DRV_SSD, "Storage engine type must be SSD (%d), not %d.", + AS_STORAGE_ENGINE_SSD, ns->storage_type); + return -1; + } + + cf_info(AS_DRV_SSD, "analyze wblock: {%s}, device-index %d, wblock-id %u", + ns->name, device_index, wblock_id); + + drv_ssds* ssds = (drv_ssds*)ns->storage_private; + + if (! 
ssds) { + cf_warning(AS_DRV_SSD, "analyze wblock ERROR: no devices"); + return -1; + } + + if (device_index < 0 || device_index >= ssds->n_ssds) { + cf_warning(AS_DRV_SSD, "analyze wblock ERROR: bad device-index"); + return -1; + } + + drv_ssd* ssd = &ssds->ssds[device_index]; + uint8_t* read_buf = cf_valloc(ssd->write_block_size); + + int fd = ssd_fd_get(ssd); + uint64_t file_offset = WBLOCK_ID_TO_BYTES(ssd, wblock_id); + + if (lseek(fd, (off_t)file_offset, SEEK_SET) != (off_t)file_offset) { + cf_warning(AS_DRV_SSD, "%s: seek failed: offset %lu: errno %d (%s)", + ssd->name, file_offset, errno, cf_strerror(errno)); + cf_free(read_buf); + close(fd); + return -1; + } + + ssize_t rlen = read(fd, read_buf, ssd->write_block_size); + + if (rlen != (ssize_t)ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): errno %d (%s)", + ssd->name, rlen, errno, cf_strerror(errno)); + cf_free(read_buf); + close(fd); + return -1; + } + + ssd_fd_put(ssd, fd); + + uint32_t living_populations[AS_PARTITIONS]; + uint32_t zombie_populations[AS_PARTITIONS]; + + memset(living_populations, 0, sizeof(living_populations)); + memset(zombie_populations, 0, sizeof(zombie_populations)); + + uint32_t inuse_sz_start = + cf_atomic32_get(ssd->alloc_table->wblock_state[wblock_id].inuse_sz); + uint32_t offset = 0; + + while (offset < ssd->write_block_size) { + drv_ssd_block* p_block = (drv_ssd_block*)&read_buf[offset]; + + ssd_decrypt(ssd, file_offset + offset, p_block); + + if (p_block->magic != SSD_BLOCK_MAGIC) { + if (offset == 0) { + // First block must have magic. + cf_warning(AS_DRV_SSD, "analyze wblock ERROR: 1st block has no magic"); + cf_free(read_buf); + return -1; + } + + // Later blocks may have no magic, just skip to next block. + offset += RBLOCK_SIZE; + continue; + } + + // Note - if block->length is sane, we don't need to round up to a + // multiple of RBLOCK_SIZE, but let's do it anyway just to be safe. + uint32_t next_offset = offset + + BYTES_TO_RBLOCK_BYTES(p_block->length + LENGTH_BASE); + + if (next_offset > ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "analyze wblock ERROR: record overflows wblock"); + cf_free(read_buf); + return -1; + } + + uint64_t rblock_id = BYTES_TO_RBLOCKS(file_offset + offset); + uint32_t n_rblocks = (uint32_t)BYTES_TO_RBLOCKS(next_offset - offset); + + bool living = false; + uint32_t pid = as_partition_getid(&p_block->keyd); + as_partition_reservation rsv; + + as_partition_reserve(ns, pid, &rsv); + + as_index_ref r_ref; + r_ref.skip_lock = false; + + if (0 == as_record_get(rsv.tree, &p_block->keyd, &r_ref)) { + as_index* r = r_ref.r; + + if (r->rblock_id == rblock_id && r->n_rblocks == n_rblocks) { + living = true; + } + + as_record_done(&r_ref, ns); + } + // else it was deleted (?) so call it a zombie... + + as_partition_release(&rsv); + + if (living) { + living_populations[pid]++; + } + else { + zombie_populations[pid]++; + } + + offset = next_offset; + } + + cf_free(read_buf); + + uint32_t inuse_sz_end = + cf_atomic32_get(ssd->alloc_table->wblock_state[wblock_id].inuse_sz); + + cf_info(AS_DRV_SSD, "analyze wblock: inuse_sz %u (before) -> %u (after)", + inuse_sz_start, inuse_sz_end); + + for (int i = 0; i < AS_PARTITIONS; i++) { + if (living_populations[i] > 0 || zombie_populations[i] > 0) { + cf_info(AS_DRV_SSD, "analyze wblock: pid %4d - live %u, dead %u", + i, living_populations[i], zombie_populations[i]); + } + } + + return 0; +} + + +//========================================================== +// Per-device background jobs. 
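+// One maintenance thread per device multiplexes these jobs on intervals:
+// logging stats, trimming the swb free pool, flushing a quiet current swb,
+// fsync-ing, and - only after a manual defrag-lwm-pct increase - a defrag
+// sweep.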
+// + +#define LOG_STATS_INTERVAL_sec 20 + +void +ssd_log_stats(drv_ssd *ssd, uint64_t *p_prev_n_total_writes, + uint64_t *p_prev_n_defrag_reads, uint64_t *p_prev_n_defrag_writes, + uint64_t *p_prev_n_tomb_raider_reads) +{ + uint64_t n_defrag_reads = cf_atomic64_get(ssd->n_defrag_wblock_reads); + uint64_t n_defrag_writes = cf_atomic64_get(ssd->n_defrag_wblock_writes); + uint64_t n_total_writes = cf_atomic64_get(ssd->n_wblock_writes) + + n_defrag_writes; + + float total_write_rate = (float)(n_total_writes - *p_prev_n_total_writes) / + (float)LOG_STATS_INTERVAL_sec; + float defrag_read_rate = (float)(n_defrag_reads - *p_prev_n_defrag_reads) / + (float)LOG_STATS_INTERVAL_sec; + float defrag_write_rate = (float)(n_defrag_writes - *p_prev_n_defrag_writes) / + (float)LOG_STATS_INTERVAL_sec; + + uint64_t n_tomb_raider_reads = ssd->n_tomb_raider_reads; + char tomb_raider_str[64]; + + *tomb_raider_str = 0; + + if (n_tomb_raider_reads != 0) { + if (*p_prev_n_tomb_raider_reads > n_tomb_raider_reads) { + *p_prev_n_tomb_raider_reads = 0; + } + + float tomb_raider_read_rate = + (float)(n_tomb_raider_reads - *p_prev_n_tomb_raider_reads) / + (float)LOG_STATS_INTERVAL_sec; + + sprintf(tomb_raider_str, " tomb-raider-read (%lu,%.1f)", + n_tomb_raider_reads, tomb_raider_read_rate); + } + + char shadow_str[64]; + + *shadow_str = 0; + + if (ssd->shadow_name) { + sprintf(shadow_str, " shadow-write-q %d", + cf_queue_sz(ssd->swb_shadow_q)); + } + + cf_info(AS_DRV_SSD, "{%s} %s: used-bytes %lu free-wblocks %d write-q %d write (%lu,%.1f) defrag-q %d defrag-read (%lu,%.1f) defrag-write (%lu,%.1f)%s%s", + ssd->ns->name, ssd->name, + ssd->inuse_size, cf_queue_sz(ssd->free_wblock_q), + cf_queue_sz(ssd->swb_write_q), + n_total_writes, total_write_rate, + cf_queue_sz(ssd->defrag_wblock_q), n_defrag_reads, defrag_read_rate, + n_defrag_writes, defrag_write_rate, + shadow_str, tomb_raider_str); + + *p_prev_n_total_writes = n_total_writes; + *p_prev_n_defrag_reads = n_defrag_reads; + *p_prev_n_defrag_writes = n_defrag_writes; + *p_prev_n_tomb_raider_reads = n_tomb_raider_reads; + + if (cf_queue_sz(ssd->free_wblock_q) == 0) { + cf_warning(AS_DRV_SSD, "device %s: out of storage space", ssd->name); + } +} + + +void +ssd_free_swbs(drv_ssd *ssd) +{ + // Try to recover swbs, 16 at a time, down to 16. + for (int i = 0; i < 16 && cf_queue_sz(ssd->swb_free_q) > 16; i++) { + ssd_write_buf* swb; + + if (CF_QUEUE_OK != + cf_queue_pop(ssd->swb_free_q, &swb, CF_QUEUE_NOWAIT)) { + break; + } + + swb_destroy(swb); + } +} + + +void +ssd_flush_current_swb(drv_ssd *ssd, uint64_t *p_prev_n_writes, + uint32_t *p_prev_size) +{ + uint64_t n_writes = cf_atomic64_get(ssd->n_wblock_writes); + + // If there's an active write load, we don't need to flush. + if (n_writes != *p_prev_n_writes) { + *p_prev_n_writes = n_writes; + *p_prev_size = 0; + return; + } + + pthread_mutex_lock(&ssd->write_lock); + + n_writes = cf_atomic64_get(ssd->n_wblock_writes); + + // Must check under the lock, could be racing a current swb just queued. + if (n_writes != *p_prev_n_writes) { + + pthread_mutex_unlock(&ssd->write_lock); + + *p_prev_n_writes = n_writes; + *p_prev_size = 0; + return; + } + + // Flush the current swb if it isn't empty, and has been written to since + // last flushed. + + ssd_write_buf *swb = ssd->current_swb; + + if (swb && swb->pos != *p_prev_size) { + *p_prev_size = swb->pos; + + // Clean the end of the buffer before flushing. 
+ if (ssd->write_block_size != swb->pos) { + memset(&swb->buf[swb->pos], 0, ssd->write_block_size - swb->pos); + } + + // Flush it. + ssd_flush_swb(ssd, swb); + + if (ssd->shadow_name) { + ssd_shadow_flush_swb(ssd, swb); + } + } + + pthread_mutex_unlock(&ssd->write_lock); +} + + +void +ssd_fsync(drv_ssd *ssd) +{ + int fd = ssd_fd_get(ssd); + + uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? cf_getns() : 0; + + fsync(fd); + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_fsync, start_ns); + } + + ssd_fd_put(ssd, fd); +} + + +// Check all wblocks to load a device's defrag queue at runtime. Triggered only +// when defrag-lwm-pct is increased by manual intervention. +void +ssd_defrag_sweep(drv_ssd *ssd) +{ + ssd_alloc_table* at = ssd->alloc_table; + uint32_t first_id = BYTES_TO_WBLOCK_ID(ssd, SSD_HEADER_SIZE); + uint32_t last_id = at->n_wblocks; + uint32_t n_queued = 0; + + for (uint32_t wblock_id = first_id; wblock_id < last_id; wblock_id++) { + ssd_wblock_state *p_wblock_state = &at->wblock_state[wblock_id]; + + cf_mutex_lock(&p_wblock_state->LOCK); + + uint32_t inuse_sz = cf_atomic32_get(p_wblock_state->inuse_sz); + + if (! p_wblock_state->swb && + p_wblock_state->state != WBLOCK_STATE_DEFRAG && + inuse_sz != 0 && + inuse_sz < ssd->ns->defrag_lwm_size) { + push_wblock_to_defrag_q(ssd, wblock_id); + n_queued++; + } + + cf_mutex_unlock(&p_wblock_state->LOCK); + } + + cf_info(AS_DRV_SSD, "... %s sweep queued %u wblocks for defrag", ssd->name, + n_queued); +} + + +static inline uint64_t +next_time(uint64_t now, uint64_t job_interval, uint64_t next) +{ + uint64_t next_job = now + job_interval; + + return next_job < next ? next_job : next; +} + + +// All in microseconds since we're using usleep(). +#define MAX_INTERVAL (1000 * 1000) +#define LOG_STATS_INTERVAL (1000 * 1000 * LOG_STATS_INTERVAL_sec) +#define FREE_SWBS_INTERVAL (1000 * 1000 * 20) + +// Thread "run" function to perform various background jobs per device. +void * +run_ssd_maintenance(void *udata) +{ + drv_ssd *ssd = (drv_ssd*)udata; + as_namespace *ns = ssd->ns; + + uint64_t prev_n_total_writes = 0; + uint64_t prev_n_defrag_reads = 0; + uint64_t prev_n_defrag_writes = 0; + uint64_t prev_n_tomb_raider_reads = 0; + + uint64_t prev_n_writes_flush = 0; + uint32_t prev_size_flush = 0; + + uint64_t now = cf_getus(); + uint64_t next = now + MAX_INTERVAL; + + uint64_t prev_log_stats = now; + uint64_t prev_free_swbs = now; + uint64_t prev_flush = now; + uint64_t prev_fsync = now; + + // If any job's (initial) interval is less than MAX_INTERVAL and we want it + // done on its interval the first time through, add a next_time() call for + // that job here to adjust 'next'. (No such jobs for now.) 
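+	// (Jobs may run up to MAX_INTERVAL late - each pass sleeps at most
+	// MAX_INTERVAL, and next_time() shortens the sleep after running a job
+	// whose interval is less than MAX_INTERVAL.)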
+ + uint64_t sleep_us = next - now; + + while (true) { + usleep((uint32_t)sleep_us); + + now = cf_getus(); + next = now + MAX_INTERVAL; + + if (now >= prev_log_stats + LOG_STATS_INTERVAL) { + ssd_log_stats(ssd, &prev_n_total_writes, &prev_n_defrag_reads, + &prev_n_defrag_writes, &prev_n_tomb_raider_reads); + prev_log_stats = now; + next = next_time(now, LOG_STATS_INTERVAL, next); + } + + if (now >= prev_free_swbs + FREE_SWBS_INTERVAL) { + ssd_free_swbs(ssd); + prev_free_swbs = now; + next = next_time(now, FREE_SWBS_INTERVAL, next); + } + + uint64_t flush_max_us = ssd_flush_max_us(ns); + + if (flush_max_us != 0 && now >= prev_flush + flush_max_us) { + ssd_flush_current_swb(ssd, &prev_n_writes_flush, &prev_size_flush); + prev_flush = now; + next = next_time(now, flush_max_us, next); + } + + uint64_t fsync_max_us = ns->storage_fsync_max_us; + + if (fsync_max_us != 0 && now >= prev_fsync + fsync_max_us) { + ssd_fsync(ssd); + prev_fsync = now; + next = next_time(now, fsync_max_us, next); + } + + if (cf_atomic32_get(ssd->defrag_sweep) != 0) { + // May take long enough to mess up other jobs' schedules, but it's a + // very rare manually-triggered intervention. + ssd_defrag_sweep(ssd); + cf_atomic32_decr(&ssd->defrag_sweep); + } + + now = cf_getus(); // refresh in case jobs took significant time + sleep_us = next > now ? next - now : 1; + } + + return NULL; +} + + +void +ssd_start_maintenance_threads(drv_ssds *ssds) +{ + cf_info(AS_DRV_SSD, "{%s} starting device maintenance threads", + ssds->ns->name); + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd* ssd = &ssds->ssds[i]; + + pthread_create(&ssd->maintenance_thread, 0, run_ssd_maintenance, ssd); + } +} + + +//========================================================== +// Device header utilities. +// + +// -1 means unrecoverable error +// -2 means not formatted, please overwrite me +int +ssd_read_header(drv_ssd *ssd, as_namespace *ns, ssd_device_header **header_r) +{ + *header_r = 0; + + int rv = -1; + + bool use_shadow = ns->cold_start && ssd->shadow_name; + const char *ssd_name = use_shadow ? ssd->shadow_name : ssd->name; + int fd = use_shadow ? ssd_shadow_fd_get(ssd) : ssd_fd_get(ssd); + + size_t peek_size = BYTES_UP_TO_IO_MIN(ssd, sizeof(ssd_device_header)); + ssd_device_header *header = cf_valloc(peek_size); + + if (lseek(fd, 0, SEEK_SET) != 0) { + cf_warning(AS_DRV_SSD, "%s: seek failed: errno %d (%s)", ssd_name, + errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Fail; + } + + ssize_t sz = read(fd, (void*)header, peek_size); + + if (sz != (ssize_t)peek_size) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): errno %d (%s)", + ssd_name, sz, errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Fail; + } + + // Make sure all following checks that return -1 or -2 are also done in + // peek_devices() in the enterprise repo. 
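+	// Checks below, in order: magic, version, write-block-size
+	// compatibility, device count, header length, namespace name.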
+ + if (header->magic != SSD_HEADER_MAGIC) { // normal path for a fresh drive + cf_detail(AS_DRV_SSD, "read_header: device %s no magic, not a Citrusleaf drive", + ssd_name); + rv = -2; + goto Fail; + } + + if (header->version != SSD_VERSION) { + if (can_convert_storage_version(header->version)) { + cf_info(AS_DRV_SSD, "read_header: device %s converting storage version %u to %u", + ssd_name, header->version, SSD_VERSION); + } + else { + cf_warning(AS_DRV_SSD, "read_header: device %s bad version %u, not a current Citrusleaf drive", + ssd_name, header->version); + goto Fail; + } + } + + if (header->write_block_size != 0 && + ns->storage_write_block_size % header->write_block_size != 0) { + cf_warning(AS_DRV_SSD, "read header: device %s can't change write-block-size from %u to %u", + ssd_name, header->write_block_size, + ns->storage_write_block_size); + goto Fail; + } + + if (header->devices_n > AS_STORAGE_MAX_DEVICES) { + cf_warning(AS_DRV_SSD, "read header: device %s bad number of devices %u", + ssd_name, header->devices_n); + goto Fail; + } + + if (header->header_length != SSD_HEADER_SIZE) { + cf_warning(AS_DRV_SSD, "read header: device %s incompatible header size %u", + ssd_name, header->header_length); + goto Fail; + } + + if (strcmp(header->namespace, ns->name) != 0) { + cf_warning(AS_DRV_SSD, "read header: device %s previous namespace %s now %s, check config or erase device", + ssd_name, header->namespace, ns->name); + goto Fail; + } + + size_t h_len = header->header_length; + + cf_free(header); + + header = cf_valloc(h_len); + + if (lseek(fd, 0, SEEK_SET) != 0) { + cf_warning(AS_DRV_SSD, "%s: seek failed: errno %d (%s)", ssd_name, + errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Fail; + } + + sz = read(fd, (void*)header, h_len); + + if (sz != (ssize_t)header->header_length) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): errno %d (%s)", + ssd_name, sz, errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Fail; + } + + cf_detail(AS_DRV_SSD, "device %s: header read success: version %d devices %d random %lu", + ssd_name, header->version, header->devices_n, header->random); + + if (! ssd_header_is_valid_cfg(ns, header)) { + goto Fail; + } + + // In case we're bumping the version - ensure the new version gets written. + header->version = SSD_VERSION; + + // In case we're increasing write-block-size - ensure new value is recorded. + header->write_block_size = ns->storage_write_block_size; + + *header_r = header; + + use_shadow ? ssd_shadow_fd_put(ssd, fd) : ssd_fd_put(ssd, fd); + + return 0; + +Fail: + + if (header) { + cf_free(header); + } + + if (fd != -1) { + use_shadow ? 
ssd_shadow_fd_put(ssd, fd) : ssd_fd_put(ssd, fd); + } + + return rv; +} + + +ssd_device_header * +ssd_init_header(as_namespace *ns) +{ + ssd_device_header *h = cf_valloc(SSD_HEADER_SIZE); + + memset(h, 0, SSD_HEADER_SIZE); + + h->magic = SSD_HEADER_MAGIC; + h->random = 0; + h->write_block_size = ns->storage_write_block_size; + h->last_evict_void_time = 0; + h->version = SSD_VERSION; + h->flags = 0; + h->devices_n = 0; + h->header_length = SSD_HEADER_SIZE; + memset(h->namespace, 0, sizeof(h->namespace)); + strcpy(h->namespace, ns->name); + h->info_n = AS_PARTITIONS; + h->info_stride = SSD_HEADER_INFO_STRIDE; + + ssd_header_init_cfg(ns, h); + + return h; +} + + +bool +ssd_empty_header(int fd, const char* device_name) +{ + void *h = cf_valloc(SSD_HEADER_SIZE); + + memset(h, 0, SSD_HEADER_SIZE); + + if (0 != lseek(fd, 0, SEEK_SET)) { + cf_warning(AS_DRV_SSD, "device %s: empty header: seek error: %s", + device_name, cf_strerror(errno)); + cf_free(h); + return false; + } + + if (SSD_HEADER_SIZE != write(fd, h, SSD_HEADER_SIZE)) { + cf_warning(AS_DRV_SSD, "device %s: empty header: write error: %s", + device_name, cf_strerror(errno)); + cf_free(h); + return false; + } + + cf_free(h); + fsync(fd); + + return true; +} + + +void +ssd_write_header(drv_ssd *ssd, ssd_device_header *header, off_t offset, + size_t size) +{ + int fd = ssd_fd_get(ssd); + + if (lseek(fd, offset, SEEK_SET) != offset) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED seek: errno %d (%s)", + ssd->name, errno, cf_strerror(errno)); + } + + uint8_t *from = (uint8_t*)header + offset; + + ssize_t sz = write(fd, (void*)from, size); + + if (sz != (ssize_t)size) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)", + ssd->name, errno, cf_strerror(errno)); + } + + ssd_fd_put(ssd, fd); + + if (! ssd->shadow_name) { + return; + } + + fd = ssd_shadow_fd_get(ssd); + + if (lseek(fd, offset, SEEK_SET) != offset) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED seek: errno %d (%s)", + ssd->shadow_name, errno, cf_strerror(errno)); + } + + sz = write(fd, (void*)from, size); + + if (sz != (ssize_t)size) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)", + ssd->shadow_name, errno, cf_strerror(errno)); + } + + ssd_shadow_fd_put(ssd, fd); +} + + +//========================================================== +// Cold start utilities. +// + +bool +prefer_existing_record(drv_ssd* ssd, drv_ssd_block* block, as_index* r) +{ + int result = as_record_resolve_conflict(ssd_cold_start_policy(ssd->ns), + r->generation, r->last_update_time, + block->generation, block->last_update_time); + + if (result != 0) { + return result == -1; // -1 means block record < existing record + } + + // Finally, compare void-times. Note that defragged records will generate + // identical copies on drive, so they'll get here and return true. + return r->void_time == 0 || + (block->void_time != 0 && block->void_time <= r->void_time); +} + + +bool +is_set_evictable(as_namespace* ns, const as_rec_props* p_props) +{ + if (p_props->size == 0) { + return true; + } + + const char* set_name; + + if (as_rec_props_get_value(p_props, CL_REC_PROPS_FIELD_SET_NAME, NULL, + (uint8_t**)&set_name) != 0) { + return true; + } + + as_set *p_set; + + if (cf_vmapx_get_by_name(ns->p_sets_vmap, set_name, (void**)&p_set) != + CF_VMAPX_OK) { + return true; + } + + return ! 
IS_SET_EVICTION_DISABLED(p_set); +} + + +bool +is_record_expired(as_namespace* ns, const drv_ssd_block* block, + const as_rec_props* p_props) +{ + if (block->void_time == 0 || + block->void_time > ns->cold_start_threshold_void_time) { + return false; + } + + // If set is not evictable, may have expired but wasn't evicted. + return block->void_time < as_record_void_time_get() || + is_set_evictable(ns, p_props); +} + + +void +apply_rec_props(as_record* r, as_namespace* ns, const as_rec_props* p_props) +{ + // Set record's set-id. (If it already has one, assume they're the same.) + if (! as_index_has_set(r) && p_props->size != 0) { + const char* set_name; + + if (as_rec_props_get_value(p_props, CL_REC_PROPS_FIELD_SET_NAME, NULL, + (uint8_t**)&set_name) == 0) { + as_index_set_set(r, ns, set_name, false); + } + } + + uint32_t key_size; + uint8_t* key; + bool got_key = p_props->size != 0 && + as_rec_props_get_value(p_props, CL_REC_PROPS_FIELD_KEY, &key_size, + &key) == 0; + + // If a key wasn't stored, and we got one, accommodate it. + if (r->key_stored == 0) { + if (got_key) { + if (ns->storage_data_in_memory) { + as_record_allocate_key(r, key, key_size); + } + + r->key_stored = 1; + } + } + // If a key was stored, but we didn't get one, remove the key. + else if (! got_key) { + if (ns->storage_data_in_memory) { + as_record_remove_key(r); + } + + r->key_stored = 0; + } +} + + +// Add a record just read from drive to the index, if all is well. +void +ssd_cold_start_add_record(drv_ssds* ssds, drv_ssd* ssd, drv_ssd_block* block, + uint64_t rblock_id, uint32_t n_rblocks) +{ + uint32_t pid = as_partition_getid(&block->keyd); + + // If this isn't a partition we're interested in, skip this record. + if (! ssds->get_state_from_storage[pid]) { + return; + } + + as_namespace* ns = ssds->ns; + + // If eviction is necessary, evict previously added records closest to + // expiration. (If evicting, this call will block for a long time.) This + // call may also update the cold start threshold void-time. + if (! as_cold_start_evict_if_needed(ns)) { + cf_crash(AS_DRV_SSD, "hit stop-writes limit before drive scan completed"); + } + + // Sanity-check the record. + if (! is_valid_record(block, ns->name)) { + cf_warning_digest(AS_DRV_SSD, &block->keyd, "invalid data on device - ignoring record "); + return; // caller will continue and try next record + } + + // Don't bother with reservations - partition trees aren't going anywhere. + as_partition* p_partition = &ns->partitions[pid]; + + // Get or create the record. + as_index_ref r_ref; + r_ref.skip_lock = false; + + // Prepare to read rec-props. + as_rec_props props = { .p_data = block->data, .size = block->bins_offset }; + + if (ssd_cold_start_is_record_truncated(ns, block, &props)) { + return; + } + + // Get/create the record from/in the appropriate index tree. + int rv = as_record_get_create(p_partition->vp, &block->keyd, &r_ref, ns); + + if (rv < 0) { + cf_warning_digest(AS_DRV_SSD, &block->keyd, "record-add as_record_get_create() failed "); + return; + } + + bool is_create = rv == 1; + + // Fix 0 generations coming off device. + if (block->generation == 0) { + block->generation = 1; + cf_warning_digest(AS_DRV_SSD, &block->keyd, "record-add found generation 0 - changed to 1 "); + } + + as_index* r = r_ref.r; + uint32_t wblock_id = RBLOCK_ID_TO_WBLOCK_ID(ssd, rblock_id); + // TODO - pass in wblock_id when we do boundary check in sweep. + + if (! is_create) { + // Record already existed. Ignore this one if existing record is newer. 
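+		// (prefer_existing_record() resolves by the cold start policy -
+		// generation or last-update-time - then falls back to void-time, so
+		// identical defragged copies keep the already-indexed version.)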
+ if (prefer_existing_record(ssd, block, r)) { + ssd_cold_start_adjust_cenotaph(ns, block, r); + as_record_done(&r_ref, ns); + ssd->record_add_older_counter++; + return; + } + } + // The record we're now reading is the latest version (so far) ... + + // Skip records that have expired. + if (is_record_expired(ns, block, &props)) { + as_index_delete(p_partition->vp, &block->keyd); + as_record_done(&r_ref, ns); + ssd->record_add_expired_counter++; + return; + } + + // We'll keep the record we're now reading ... + + ssd_cold_start_init_repl_state(ns, r); + + // Set/reset the record's last-update-time and generation. + r->last_update_time = block->last_update_time; + r->generation = block->generation; + + // Set/reset the record's void-time, truncating it if beyond max-ttl. + if (block->void_time > ns->cold_start_max_void_time) { + cf_detail(AS_DRV_SSD, "record-add truncating void-time %lu > max %u", + block->void_time, ns->cold_start_max_void_time); + + r->void_time = ns->cold_start_max_void_time; + ssd->record_add_max_ttl_counter++; + } + else { + r->void_time = block->void_time; + } + + // Update maximum void-time. + cf_atomic64_setmax(&p_partition->max_void_time, r->void_time); + + // If data is in memory, load bins and particles, adjust secondary index. + if (ns->storage_data_in_memory) { + uint8_t* block_head = (uint8_t*)block; + drv_ssd_bin* ssd_bin = (drv_ssd_bin*)(block->data + block->bins_offset); + as_storage_rd rd; + + if (is_create) { + as_storage_record_create(ns, r, &rd); + } + else { + as_storage_record_open(ns, r, &rd); + } + + as_storage_rd_load_n_bins(&rd); + as_storage_rd_load_bins(&rd, NULL); + + uint64_t bytes_memory = as_storage_record_get_n_bytes_memory(&rd); + + // Do this early since set-id is needed for the secondary index update. + apply_rec_props(r, ns, &props); + + uint16_t old_n_bins = rd.n_bins; + + bool has_sindex = record_has_sindex(r, ns); + int sbins_populated = 0; + + if (has_sindex) { + SINDEX_GRLOCK(); + } + + SINDEX_BINS_SETUP(sbins, 2 * ns->sindex_cnt); + as_sindex* si_arr[2 * ns->sindex_cnt]; + int si_arr_index = 0; + const char* set_name = as_index_get_set_name(r, ns); + + if (has_sindex) { + for (uint16_t i = 0; i < old_n_bins; i++) { + si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, rd.bins[i].id, &si_arr[si_arr_index]); + } + } + + int32_t delta_bins = (int32_t)block->n_bins - (int32_t)old_n_bins; + + if (rd.ns->single_bin) { + if (delta_bins < 0) { + as_record_destroy_bins(&rd); + } + } + else if (delta_bins != 0) { + if (has_sindex && delta_bins < 0) { + sbins_populated += as_sindex_sbins_from_rd(&rd, (uint16_t)block->n_bins, old_n_bins, sbins, AS_SINDEX_OP_DELETE); + } + + as_bin_allocate_bin_space(&rd, delta_bins); + } + + for (uint16_t i = 0; i < block->n_bins; i++) { + as_bin* b; + + if (i < old_n_bins) { + b = &rd.bins[i]; + + if (has_sindex) { + sbins_populated += as_sindex_sbins_from_bin(ns, set_name, b, &sbins[sbins_populated], AS_SINDEX_OP_DELETE); + } + + as_bin_set_id_from_name(ns, b, ssd_bin->name); + } + else { + // TODO - what if this fails? + b = as_bin_create(&rd, ssd_bin->name); + } + + // TODO - what if this fails? 
+ as_bin_particle_replace_from_flat(b, block_head + ssd_bin->offset, + ssd_bin->len); + + ssd_bin = (drv_ssd_bin*)(block_head + ssd_bin->next); + + if (has_sindex) { + si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, b->id, &si_arr[si_arr_index]); + sbins_populated += as_sindex_sbins_from_bin(ns, set_name, b, &sbins[sbins_populated], AS_SINDEX_OP_INSERT); + } + } + + if (has_sindex) { + SINDEX_GRUNLOCK(); + + if (sbins_populated > 0) { + as_sindex_update_by_sbin(ns, as_index_get_set_name(r, ns), sbins, sbins_populated, &r->keyd); + as_sindex_sbin_freeall(sbins, sbins_populated); + } + + as_sindex_release_arr(si_arr, si_arr_index); + } + + as_storage_record_adjust_mem_stats(&rd, bytes_memory); + as_storage_record_close(&rd); + } + else { + apply_rec_props(r, ns, &props); + } + + if (is_create) { + ssd->record_add_unique_counter++; + } + else if (STORAGE_RBLOCK_IS_VALID(r->rblock_id)) { + // Replacing an existing record, undo its previous storage accounting. + ssd_block_free(&ssds->ssds[r->file_id], r->rblock_id, r->n_rblocks, + "record-add"); + ssd->record_add_replace_counter++; + } + else { + cf_warning(AS_DRV_SSD, "replacing record with invalid rblock-id"); + } + + ssd_cold_start_transition_record(ns, block, r, is_create); + + // Update storage accounting to include this record. + // TODO - pass in size instead of n_rblocks. + uint32_t size = (uint32_t)RBLOCKS_TO_BYTES(n_rblocks); + + ssd->inuse_size += size; + ssd->alloc_table->wblock_state[wblock_id].inuse_sz += size; + + // Set/reset the record's storage information. + r->file_id = ssd->file_id; + r->rblock_id = rblock_id; + r->n_rblocks = n_rblocks; + + as_record_done(&r_ref, ns); +} + + +// Sweep through a storage device to rebuild the index. +void +ssd_cold_start_sweep(drv_ssds *ssds, drv_ssd *ssd) +{ + size_t wblock_size = ssd->write_block_size; + + uint8_t *buf = cf_valloc(wblock_size); + + bool read_shadow = ssd->shadow_name; + char *read_ssd_name = read_shadow ? ssd->shadow_name : ssd->name; + int fd = read_shadow ? ssd_shadow_fd_get(ssd) : ssd_fd_get(ssd); + int write_fd = read_shadow ? ssd_fd_get(ssd) : -1; + + // Seek past the header. + + if (lseek(fd, SSD_HEADER_SIZE, SEEK_SET) != SSD_HEADER_SIZE) { + cf_crash(AS_DRV_SSD, "%s: seek failed: errno %d (%s)", read_ssd_name, + errno, cf_strerror(errno)); + } + + if (read_shadow && + lseek(write_fd, SSD_HEADER_SIZE, SEEK_SET) != SSD_HEADER_SIZE) { + cf_crash(AS_DRV_SSD, "%s: seek failed: errno %d (%s)", ssd->name, + errno, cf_strerror(errno)); + } + + // Loop over all wblocks, unless we encounter 10 contiguous unused wblocks. + + ssd->sweep_wblock_id = SSD_HEADER_SIZE / (uint32_t)wblock_size; + + uint64_t file_offset = SSD_HEADER_SIZE; + uint32_t n_unused_wblocks = 0; + + while (file_offset < ssd->file_size && n_unused_wblocks < 10) { + if (read(fd, buf, wblock_size) != wblock_size) { + cf_crash(AS_DRV_SSD, "%s: read failed: errno %d (%s)", + read_ssd_name, errno, cf_strerror(errno)); + } + + if (read_shadow && + write(write_fd, (void*)buf, wblock_size) != wblock_size) { + cf_crash(AS_DRV_SSD, "%s: write failed: errno %d (%s)", ssd->name, + errno, cf_strerror(errno)); + } + + size_t indent = 0; // current offset within wblock, in bytes + + while (indent < wblock_size) { + drv_ssd_block *block = (drv_ssd_block*)&buf[indent]; + + ssd_decrypt(ssd, file_offset + indent, block); + + // Look for record magic. + if (block->magic != SSD_BLOCK_MAGIC) { + // Should always find a record at beginning of used wblock. 
if + // not, we've likely encountered the unused part of the device. + if (indent == 0) { + n_unused_wblocks++; + break; // try next wblock + } + // else - nothing more in this wblock, but keep looking for + // magic - necessary if we want to be able to increase + // write-block-size across restarts. + + indent += RBLOCK_SIZE; + continue; // try next rblock + } + + if (n_unused_wblocks != 0) { + cf_warning(AS_DRV_SSD, "%s: found used wblock after skipping %u unused", + ssd->name, n_unused_wblocks); + + n_unused_wblocks = 0; // restart contiguous count + } + + // Note - if block->length is sane, we don't need to round up to a + // multiple of RBLOCK_SIZE, but let's do it anyway just to be safe. + size_t next_indent = indent + + BYTES_TO_RBLOCK_BYTES(block->length + LENGTH_BASE); + + // Sanity-check for wblock overruns. + if (next_indent > wblock_size) { + cf_warning(AS_DRV_SSD, "%s: record crosses wblock boundary: block-length %u", + ssd->name, block->length); + break; // skip this record, try next wblock + } + + // Found a record - try to add it to the index. + ssd_cold_start_add_record(ssds, ssd, block, + BYTES_TO_RBLOCKS(file_offset + indent), + (uint32_t)BYTES_TO_RBLOCKS(next_indent - indent)); + + indent = next_indent; + } + + file_offset += wblock_size; + ssd->sweep_wblock_id++; + } + + ssd->sweep_wblock_id = (uint32_t)(ssd->file_size / wblock_size); + + if (fd != -1) { + read_shadow ? ssd_shadow_fd_put(ssd, fd) : ssd_fd_put(ssd, fd); + } + + if (write_fd != -1) { + ssd_fd_put(ssd, write_fd); + } + + cf_free(buf); +} + + +// Thread "run" function to read a storage device and rebuild the index. +void * +run_ssd_cold_start(void *udata) +{ + ssd_load_records_info *lri = (ssd_load_records_info*)udata; + drv_ssd *ssd = lri->ssd; + drv_ssds *ssds = lri->ssds; + cf_queue *complete_q = lri->complete_q; + void *complete_udata = lri->complete_udata; + void *complete_rc = lri->complete_rc; + + cf_free(lri); + + as_namespace* ns = ssds->ns; + + cf_info(AS_DRV_SSD, "device %s: reading device to load index", ssd->name); + + CF_ALLOC_SET_NS_ARENA(ns); + + ssd_cold_start_sweep(ssds, ssd); + + cf_info(AS_DRV_SSD, "device %s: read complete: UNIQUE %lu (REPLACED %lu) (OLDER %lu) (EXPIRED %lu) (MAX-TTL %lu) records", + ssd->name, ssd->record_add_unique_counter, + ssd->record_add_replace_counter, ssd->record_add_older_counter, + ssd->record_add_expired_counter, ssd->record_add_max_ttl_counter); + + if (cf_rc_release(complete_rc) == 0) { + // All drives are done reading. 
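+		// (complete_rc was allocated with one reference per device in
+		// start_loading_records(), so only the last device's thread gets
+		// here to finish namespace startup.)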
+ + ns->loading_records = false; + ssd_cold_start_drop_cenotaphs(ns); + ssd_load_wblock_queues(ssds); + + pthread_mutex_destroy(&ns->cold_start_evict_lock); + + cf_queue_push(complete_q, &complete_udata); + cf_rc_free(complete_rc); + + as_truncate_list_cenotaphs(ns); + as_truncate_done_startup(ns); // set truncate last-update-times in sets' vmap + + ssd_start_maintenance_threads(ssds); + ssd_start_write_worker_threads(ssds); + ssd_start_defrag_threads(ssds); + } + + return NULL; +} + + +void +start_loading_records(drv_ssds *ssds, cf_queue *complete_q, void *udata) +{ + as_namespace *ns = ssds->ns; + + ns->loading_records = true; + + void *p = cf_rc_alloc(1); + + for (int i = 1; i < ssds->n_ssds; i++) { + cf_rc_reserve(p); + } + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + ssd_load_records_info *lri = cf_malloc(sizeof(ssd_load_records_info)); + + lri->ssds = ssds; + lri->ssd = ssd; + lri->complete_q = complete_q; + lri->complete_udata = udata; + lri->complete_rc = p; + + pthread_create(&thread, &attrs, + ns->cold_start ? run_ssd_cold_start : run_ssd_cool_start, lri); + } +} + + +//========================================================== +// Generic startup utilities. +// + +static int +first_used_device(ssd_device_header *headers[], int n_ssds) +{ + for (int i = 0; i < n_ssds; i++) { + if (headers[i]->random != 0) { + return i; + } + } + + return -1; +} + + +static bool +stored_version_has_data(drv_ssds *ssds, uint32_t pid) +{ + info_buf *b = (info_buf*) + (ssds->header->info_data + (SSD_HEADER_INFO_STRIDE * pid)); + + return as_partition_version_has_data(&b->version); +} + + +bool +ssd_load_records(drv_ssds *ssds, cf_queue *complete_q, void *udata) +{ + uint64_t random = cf_get_rand64(); + + int n_ssds = ssds->n_ssds; + as_namespace *ns = ssds->ns; + + ssd_device_header *headers[n_ssds]; + + // Check all the headers. Pick one as the representative. + for (int i = 0; i < n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + int rvh = ssd_read_header(ssd, ns, &headers[i]); + + if (rvh == -1) { + cf_crash(AS_DRV_SSD, "unable to read disk header %s", ssd->name); + } + + if (rvh == -2) { + headers[i] = ssd_init_header(ns); + } + } + + int first_used = first_used_device(headers, n_ssds); + + if (first_used == -1) { + // Shouldn't find all fresh headers here during warm or cool restart. + if (! ns->cold_start) { + // There's no going back to cold start now - do so the harsh way. + cf_crash(AS_DRV_SSD, "{%s}: found all %d devices fresh during %s restart", + ns->name, n_ssds, as_namespace_start_mode_str(ns)); + } + + cf_info(AS_DRV_SSD, "namespace %s: found all %d devices fresh, initializing to random %lu", + ns->name, n_ssds, random); + + ssds->header = headers[0]; + + for (int i = 1; i < n_ssds; i++) { + cf_free(headers[i]); + } + + ssd_init_trusted(ns); + + ssds->header->random = random; + ssds->header->devices_n = n_ssds; + + ssd_adjust_versions(ns, ssds->header); + + as_storage_info_flush_ssd(ns); + + as_truncate_list_cenotaphs(ns); // all will show as cenotaph + as_truncate_done_startup(ns); + + return true; + } + + // At least one device is not fresh. Check that all non-fresh devices match. + + bool fresh_drive = false; + bool untrusted_drive = false; + + for (int i = 0; i < n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + // Skip fresh devices. 
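+		// (A header 'random' of 0 marks a never-written device - used
+		// devices share a common nonzero signature assigned at startup.)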
+ if (headers[i]->random == 0) { + ssd->started_fresh = true; // warm or cool restart needs to know + fresh_drive = true; + continue; + } + + if (headers[first_used]->random != headers[i]->random) { + cf_crash(AS_DRV_SSD, "namespace %s: drive set with unmatched headers - devices %s & %s have different signatures", + ns->name, ssds->ssds[first_used].name, ssd->name); + } + + if (headers[first_used]->devices_n != headers[i]->devices_n) { + cf_crash(AS_DRV_SSD, "namespace %s: drive set with unmatched headers - devices %s & %s have different device counts", + ns->name, ssds->ssds[first_used].name, ssd->name); + } + + if (headers[first_used]->last_evict_void_time != + headers[i]->last_evict_void_time) { + cf_warning(AS_DRV_SSD, "namespace %s: devices have inconsistent evict-void-times - ignoring", + ns->name); + headers[first_used]->last_evict_void_time = 0; + } + + untrusted_drive = ssd_is_untrusted(ns, headers[i]->flags); + } + + // Drive set OK - fix up header set. + ssds->header = headers[first_used]; + headers[first_used] = 0; + + for (int i = 0; i < n_ssds; i++) { + if (headers[i]) { + cf_free(headers[i]); + headers[i] = 0; + } + } + + ssd_init_trusted(ns); + + ssds->header->random = random; + ssds->header->devices_n = n_ssds; // may have added fresh drives + + if (fresh_drive || untrusted_drive) { + ssd_adjust_versions(ns, ssds->header); + } + + as_storage_info_flush_ssd(ns); + + // Cache booleans indicating whether partitions are owned or not. + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + ssds->get_state_from_storage[pid] = + stored_version_has_data(ssds, pid); + } + + // Warm or cool restart. + if (! ns->cold_start) { + as_truncate_done_startup(ns); // set truncate last-update-times in sets' vmap + ssd_resume_devices(ssds); + + if (as_namespace_cool_restarts(ns)) { + // Cool restart - fire off threads to load record data - will signal + // completion when threads are all done. + start_loading_records(ssds, complete_q, udata); + + // Make sure caller doesn't signal completion. + return false; + } + + return true; // warm restart (done) + } + + // Initialize the cold start eviction machinery. + + if (0 != pthread_mutex_init(&ns->cold_start_evict_lock, NULL)) { + cf_crash(AS_DRV_SSD, "failed cold start eviction mutex init"); + } + + uint32_t now = as_record_void_time_get(); + + if (ns->cold_start_evict_ttl == 0xFFFFffff) { + // Config file did NOT specify cold-start-evict-ttl. + ns->cold_start_threshold_void_time = ssds->header->last_evict_void_time; + + // Check that it's not already in the past. (Note - includes 0.) + if (ns->cold_start_threshold_void_time < now) { + ns->cold_start_threshold_void_time = now; + } + else { + cf_info(AS_DRV_SSD, "namespace %s: using saved cold start evict-ttl %u", + ns->name, ns->cold_start_threshold_void_time - now); + } + } + else { + // Config file specified cold-start-evict-ttl. (0 is a valid value.) + ns->cold_start_threshold_void_time = now + ns->cold_start_evict_ttl; + + cf_info(AS_DRV_SSD, "namespace %s: using config-specified cold start evict-ttl %u", + ns->name, ns->cold_start_evict_ttl); + } + + ns->cold_start_max_void_time = now + (uint32_t)ns->max_ttl; + + // Fire off threads to load record data - will signal completion when + // threads are all done. + start_loading_records(ssds, complete_q, udata); + + // Make sure caller doesn't signal completion. + return false; +} + + +// Set a device's system block scheduler mode. 
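+// E.g. for /dev/sda the mode is written to
+// /sys/class/block/sda/queue/scheduler; a partition such as /dev/sda1 is
+// redirected to its parent via /sys/class/block/sda1/../queue/scheduler.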
+static int +ssd_set_scheduler_mode(const char* device_name, const char* mode) +{ + if (strncmp(device_name, "/dev/", 5)) { + cf_warning(AS_DRV_SSD, "storage: invalid device name %s, did not set scheduler mode", + device_name); + return -1; + } + + char device_tag[(strlen(device_name) - 5) + 1]; + + strcpy(device_tag, device_name + 5); + + // Replace any slashes in the device tag with '!' - this is the naming + // convention in /sys/block. + char* p_char = device_tag; + + while (*p_char) { + if (*p_char == '/') { + *p_char = '!'; + } + + p_char++; + } + + char scheduler_file_name[17 + strlen(device_tag) + 3 + 16 + 1]; + + strcpy(scheduler_file_name, "/sys/class/block/"); + strcat(scheduler_file_name, device_tag); + + // Determine if this device is a partition. + char partition_file_name[strlen(scheduler_file_name) + 10 + 1]; + + strcpy(partition_file_name, scheduler_file_name); + strcat(partition_file_name, "/partition"); + + FILE* partition_file = fopen(partition_file_name, "r"); + + if (partition_file) { + fclose(partition_file); + + // This device is a partition, get parent device. + strcat(scheduler_file_name, "/.."); + } + + strcat(scheduler_file_name, "/queue/scheduler"); + + FILE* scheduler_file = fopen(scheduler_file_name, "w"); + + if (! scheduler_file) { + cf_warning(AS_DRV_SSD, "storage: couldn't open %s, did not set scheduler mode: %s", + scheduler_file_name, cf_strerror(errno)); + return -1; + } + + if (fwrite(mode, strlen(mode), 1, scheduler_file) != 1) { + fclose(scheduler_file); + + cf_warning(AS_DRV_SSD, "storage: couldn't write %s to %s, did not set scheduler mode", + mode, scheduler_file_name); + return -1; + } + + fclose(scheduler_file); + + cf_info(AS_DRV_SSD, "storage: set device %s scheduler mode to %s", + device_name, mode); + + return 0; +} + + +static uint64_t +check_file_size(as_namespace *ns, uint64_t file_size, const char *tag) +{ + cf_assert(sizeof(off_t) > 4, AS_DRV_SSD, "this OS supports only 32-bit (4g) files - compile with 64 bit offsets"); + + if (file_size > SSD_HEADER_SIZE) { + off_t unusable_size = + (file_size - SSD_HEADER_SIZE) % ns->storage_write_block_size; + + if (unusable_size != 0) { + cf_info(AS_DRV_SSD, "%s size must be header size %u + multiple of %u, rounding down", + tag, SSD_HEADER_SIZE, ns->storage_write_block_size); + file_size -= unusable_size; + } + + if (file_size > AS_STORAGE_MAX_DEVICE_SIZE) { + cf_warning(AS_DRV_SSD, "%s size must be <= %ld, trimming original size %ld", + tag, AS_STORAGE_MAX_DEVICE_SIZE, file_size); + file_size = AS_STORAGE_MAX_DEVICE_SIZE; + } + } + + if (file_size <= SSD_HEADER_SIZE) { + cf_crash(AS_DRV_SSD, "%s size %ld must be greater than header size %d", + tag, file_size, SSD_HEADER_SIZE); + } + + return file_size; +} + + +static uint64_t +find_io_min_size(int fd, const char *ssd_name) +{ + off_t off = lseek(fd, 0, SEEK_SET); + + if (off != 0) { + cf_crash(AS_DRV_SSD, "%s: seek error %s", ssd_name, cf_strerror(errno)); + } + + uint8_t *buf = cf_valloc(HI_IO_MIN_SIZE); + size_t read_sz = LO_IO_MIN_SIZE; + + while (read_sz <= HI_IO_MIN_SIZE) { + if (read(fd, (void*)buf, read_sz) == (ssize_t)read_sz) { + cf_free(buf); + return read_sz; + } + + read_sz <<= 1; // LO_IO_MIN_SIZE and HI_IO_MIN_SIZE are powers of 2 + } + + cf_crash(AS_DRV_SSD, "%s: read failed at all sizes from %u to %u bytes", + ssd_name, LO_IO_MIN_SIZE, HI_IO_MIN_SIZE); + + return 0; +} + + +int +ssd_init_devices(as_namespace *ns, drv_ssds **ssds_p) +{ + int n_ssds; + + for (n_ssds = 0; n_ssds < AS_STORAGE_MAX_DEVICES; n_ssds++) { + if (! 
ns->storage_devices[n_ssds]) { + break; + } + } + + size_t ssds_size = sizeof(drv_ssds) + (n_ssds * sizeof(drv_ssd)); + drv_ssds *ssds = cf_malloc(ssds_size); + + memset(ssds, 0, ssds_size); + ssds->n_ssds = n_ssds; + ssds->ns = ns; + + // Raw device-specific initialization of drv_ssd structures. + for (int i = 0; i < n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd->name = ns->storage_devices[i]; + + ssd->open_flag = O_RDWR | + (ns->storage_disable_odirect ? 0 : O_DIRECT) | + (ns->storage_enable_osync ? O_SYNC : 0); + + int fd = open(ssd->name, ssd->open_flag, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_warning(AS_DRV_SSD, "unable to open device %s: %s", ssd->name, + cf_strerror(errno)); + return -1; + } + + uint64_t size = 0; + + ioctl(fd, BLKGETSIZE64, &size); // gets the number of bytes + + ssd->file_size = check_file_size(ns, size, "usable device"); + ssd->io_min_size = find_io_min_size(fd, ssd->name); + + if (ns->cold_start && ns->storage_cold_start_empty) { + if (! ssd_empty_header(fd, ssd->name)) { + close(fd); + return -1; + } + + cf_info(AS_DRV_SSD, "cold-start-empty - erased header of %s", + ssd->name); + } + + close(fd); + + ns->ssd_size += ssd->file_size; // increment total storage size + + cf_info(AS_DRV_SSD, "opened device %s: usable size %lu, io-min-size %lu", + ssd->name, ssd->file_size, ssd->io_min_size); + + if (ns->storage_scheduler_mode) { + // Set scheduler mode specified in config file. + ssd_set_scheduler_mode(ssd->name, ns->storage_scheduler_mode); + } + } + + *ssds_p = ssds; + + return 0; +} + + +int +ssd_init_shadows(as_namespace *ns, drv_ssds *ssds) +{ + int n_shadows = 0; + + for (int n = 0; n < ssds->n_ssds; n++) { + if (ns->storage_shadows[n]) { + n_shadows++; + } + } + + if (n_shadows == 0) { + // No shadows - a normal deployment. + return 0; + } + + if (n_shadows != ssds->n_ssds) { + cf_warning(AS_DRV_SSD, "configured %d devices but only %d shadows", + ssds->n_ssds, n_shadows); + return -1; + } + + // Check shadow devices. + for (int i = 0; i < n_shadows; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd->shadow_name = ns->storage_shadows[i]; + + int fd = open(ssd->shadow_name, ssd->open_flag, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_warning(AS_DRV_SSD, "unable to open shadow device %s: %s", + ssd->shadow_name, cf_strerror(errno)); + return -1; + } + + uint64_t size = 0; + + ioctl(fd, BLKGETSIZE64, &size); // gets the number of bytes + + if (size < ssd->file_size) { + cf_warning(AS_DRV_SSD, "shadow device %s is smaller than main device - %lu < %lu", + ssd->shadow_name, size, ssd->file_size); + close(fd); + return -1; + } + + if (ns->cold_start && ns->storage_cold_start_empty) { + if (! ssd_empty_header(fd, ssd->shadow_name)) { + close(fd); + return -1; + } + + cf_info(AS_DRV_SSD, "cold-start-empty - erased header of %s", + ssd->shadow_name); + } + + close(fd); + + cf_info(AS_DRV_SSD, "shadow device %s is compatible with main device", + ssd->shadow_name); + + if (ns->storage_scheduler_mode) { + // Set scheduler mode specified in config file. + ssd_set_scheduler_mode(ssd->shadow_name, + ns->storage_scheduler_mode); + } + } + + return 0; +} + + +int +ssd_init_files(as_namespace *ns, drv_ssds **ssds_p) +{ + int n_ssds; + + for (n_ssds = 0; n_ssds < AS_STORAGE_MAX_FILES; n_ssds++) { + if (! 
ns->storage_files[n_ssds]) { + break; + } + } + + size_t ssds_size = sizeof(drv_ssds) + (n_ssds * sizeof(drv_ssd)); + drv_ssds *ssds = cf_malloc(ssds_size); + + memset(ssds, 0, ssds_size); + ssds->n_ssds = n_ssds; + ssds->ns = ns; + + // File-specific initialization of drv_ssd structures. + for (int i = 0; i < n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd->name = ns->storage_files[i]; + + if (ns->cold_start && ns->storage_cold_start_empty) { + if (0 == remove(ssd->name)) { + cf_info(AS_DRV_SSD, "cold-start-empty - removed %s", ssd->name); + } + else if (errno == ENOENT) { + cf_info(AS_DRV_SSD, "cold-start-empty - no file %s", ssd->name); + } + else { + cf_warning(AS_DRV_SSD, "failed remove: errno %d", errno); + return -1; + } + } + + ssd->open_flag = O_RDWR; + + // Validate that file can be opened, create it if it doesn't exist. + int fd = open(ssd->name, ssd->open_flag | O_CREAT, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_warning(AS_DRV_SSD, "unable to open file %s: %s", ssd->name, + cf_strerror(errno)); + return -1; + } + + ssd->file_size = check_file_size(ns, ns->storage_filesize, "file"); + ssd->io_min_size = LO_IO_MIN_SIZE; + + // Truncate will grow or shrink the file to the correct size. + if (0 != ftruncate(fd, (off_t)ssd->file_size)) { + cf_warning(AS_DRV_SSD, "unable to truncate file: errno %d", errno); + close(fd); + return -1; + } + + close(fd); + + ns->ssd_size += ssd->file_size; // increment total storage size + + cf_info(AS_DRV_SSD, "opened file %s: usable size %lu", ssd->name, + ssd->file_size); + } + + *ssds_p = ssds; + + return 0; +} + + +//========================================================== +// Storage API implementation: startup, shutdown, etc. +// + +int +as_storage_namespace_init_ssd(as_namespace *ns, cf_queue *complete_q, + void *udata) +{ + drv_ssds *ssds; + + if (ns->storage_devices[0]) { + if (0 != ssd_init_devices(ns, &ssds)) { + cf_warning(AS_DRV_SSD, "{%s} can't initialize devices", ns->name); + return -1; + } + + if (0 != ssd_init_shadows(ns, ssds)) { + cf_warning(AS_DRV_SSD, "{%s} can't initialize shadows", ns->name); + return -1; + } + } + else if (ns->storage_files[0]) { + if (0 != ssd_init_files(ns, &ssds)) { + cf_warning(AS_DRV_SSD, "{%s} can't initialize files", ns->name); + return -1; + } + } + else { + cf_warning(AS_DRV_SSD, "{%s} has no devices or files", ns->name); + return -1; + } + + // Allow defrag to go full speed during startup - restore the configured + // settings when startup is done. + ns->saved_defrag_sleep = ns->storage_defrag_sleep; + ns->storage_defrag_sleep = 0; + + // The queue limit is more efficient to work with. + ns->storage_max_write_q = (int) + (ns->storage_max_write_cache / ns->storage_write_block_size); + + // Minimize how often we recalculate this. + ns->defrag_lwm_size = + (ns->storage_write_block_size * ns->storage_defrag_lwm_pct) / 100; + + ns->storage_private = (void*)ssds; + + char histname[HISTOGRAM_NAME_SIZE]; + + snprintf(histname, sizeof(histname), "{%s}-device-read-size", ns->name); + ns->device_read_size_hist = histogram_create(histname, HIST_SIZE); + + snprintf(histname, sizeof(histname), "{%s}-device-write-size", ns->name); + ns->device_write_size_hist = histogram_create(histname, HIST_SIZE); + + // Finish initializing drv_ssd structures (non-zero-value members). 
+ for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd->ns = ns; + ssd->file_id = i; + + pthread_mutex_init(&ssd->write_lock, 0); + pthread_mutex_init(&ssd->defrag_lock, 0); + + ssd->running = true; + + ssd->data_in_memory = ns->storage_data_in_memory; + ssd->write_block_size = ns->storage_write_block_size; + + ssd_wblock_init(ssd); + + // Note: free_wblock_q, defrag_wblock_q created after loading devices. + + ssd->fd_q = cf_queue_create(sizeof(int), true); + + if (ssd->shadow_name) { + ssd->shadow_fd_q = cf_queue_create(sizeof(int), true); + } + + ssd->swb_write_q = cf_queue_create(sizeof(void*), true); + + if (ssd->shadow_name) { + ssd->swb_shadow_q = cf_queue_create(sizeof(void*), true); + } + + ssd->swb_free_q = cf_queue_create(sizeof(void*), true); + + if (! ns->storage_data_in_memory) { + ssd->post_write_q = cf_queue_create(sizeof(void*), false); + } + + snprintf(histname, sizeof(histname), "{%s}-%s-read", ns->name, ssd->name); + ssd->hist_read = histogram_create(histname, HIST_MILLISECONDS); + + snprintf(histname, sizeof(histname), "{%s}-%s-large-block-read", ns->name, ssd->name); + ssd->hist_large_block_read = histogram_create(histname, HIST_MILLISECONDS); + + snprintf(histname, sizeof(histname), "{%s}-%s-write", ns->name, ssd->name); + ssd->hist_write = histogram_create(histname, HIST_MILLISECONDS); + + if (ssd->shadow_name) { + snprintf(histname, sizeof(histname), "{%s}-%s-shadow-write", ns->name, ssd->name); + ssd->hist_shadow_write = histogram_create(histname, HIST_MILLISECONDS); + } + + snprintf(histname, sizeof(histname), "{%s}-%s-fsync", ns->name, ssd->name); + ssd->hist_fsync = histogram_create(histname, HIST_MILLISECONDS); + + ssd_init_commit(ssd); + } + + // Attempt to load the data. + // + // Return value 'false' means it's going to take a long time and will later + // asynchronously signal completion via the complete_q, 'true' means it's + // finished, signal here. + + if (ssd_load_records(ssds, complete_q, udata)) { + ssd_load_wblock_queues(ssds); + + cf_queue_push(complete_q, &udata); + + ssd_start_maintenance_threads(ssds); + ssd_start_write_worker_threads(ssds); + ssd_start_defrag_threads(ssds); + } + + return 0; +} + + +void +as_storage_loading_records_ticker_ssd() +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (ns->loading_records) { + char buf[2048]; + int pos = 0; + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int j = 0; j < ssds->n_ssds; j++) { + drv_ssd *ssd = &ssds->ssds[j]; + uint32_t pct = (uint32_t)((ssd->sweep_wblock_id * 100UL) / + (ssd->file_size / ssd->write_block_size)); + + pos += sprintf(buf + pos, ", %s %u%%", ssd->name, pct); + } + + // TODO - conform with new log standard? + if (ns->n_tombstones == 0) { + cf_info(AS_DRV_SSD, "{%s} loaded %lu objects%s", ns->name, + ns->n_objects, buf); + } + else { + cf_info(AS_DRV_SSD, "{%s} loaded %lu objects, %lu tombstones%s", + ns->name, ns->n_objects, ns->n_tombstones, buf); + } + } + } +} + + +int +as_storage_namespace_destroy_ssd(as_namespace *ns) +{ + // This is not called - for now we don't bother unwinding. + return 0; +} + + +// Note that this is *NOT* the counterpart to as_storage_record_create_ssd()! +// That would be as_storage_record_close_ssd(). This is what gets called when a +// record is destroyed, to dereference storage. 
+int +as_storage_record_destroy_ssd(as_namespace *ns, as_record *r) +{ + if (STORAGE_RBLOCK_IS_VALID(r->rblock_id) && r->n_rblocks != 0) { + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + drv_ssd *ssd = &ssds->ssds[r->file_id]; + + ssd_block_free(ssd, r->rblock_id, r->n_rblocks, "destroy"); + + r->rblock_id = 0; + r->n_rblocks = 0; + } + + return 0; +} + + +//========================================================== +// Storage API implementation: as_storage_rd cycle. +// + +int +as_storage_record_create_ssd(as_storage_rd *rd) +{ + rd->block = NULL; + rd->must_free_block = NULL; + rd->ssd = NULL; + + cf_assert(rd->r->rblock_id == 0, AS_DRV_SSD, "unexpected - uninitialized rblock-id"); + + return 0; +} + + +int +as_storage_record_open_ssd(as_storage_rd *rd) +{ + drv_ssds *ssds = (drv_ssds*)rd->ns->storage_private; + + rd->block = NULL; + rd->must_free_block = NULL; + rd->ssd = &ssds->ssds[rd->r->file_id]; + + return 0; +} + + +int +as_storage_record_close_ssd(as_storage_rd *rd) +{ + if (rd->must_free_block) { + cf_free(rd->must_free_block); + rd->must_free_block = NULL; + rd->block = NULL; + } + + return 0; +} + + +// These are near the top of this file: +// as_storage_record_get_n_bins_ssd() +// as_storage_record_read_ssd() +// as_storage_particle_read_all_ssd() +// as_storage_particle_read_and_size_all_ssd() + + +bool +as_storage_record_size_and_check_ssd(as_storage_rd *rd) +{ + return rd->ns->storage_write_block_size >= ssd_record_size(rd); +} + + +//========================================================== +// Storage API implementation: storage capacity monitoring. +// + +void +as_storage_wait_for_defrag_ssd(as_namespace *ns) +{ + if (ns->storage_defrag_startup_minimum > 0) { + while (true) { + int avail_pct; + + if (0 != as_storage_stats_ssd(ns, &avail_pct, 0)) { + cf_crash(AS_DRV_SSD, "namespace %s storage stats failed", + ns->name); + } + + if (avail_pct >= ns->storage_defrag_startup_minimum) { + break; + } + + cf_info(AS_DRV_SSD, "namespace %s waiting for defrag: %d pct available, waiting for %d ...", + ns->name, avail_pct, ns->storage_defrag_startup_minimum); + + sleep(2); + } + } + + // Restore configured defrag throttling values. + ns->storage_defrag_sleep = ns->saved_defrag_sleep; +} + + +bool +as_storage_overloaded_ssd(as_namespace *ns) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + int max_write_q = ns->storage_max_write_q; + + // TODO - would be nice to not do this loop every single write transaction! + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + int qsz = cf_queue_sz(ssd->swb_write_q); + + if (qsz > max_write_q) { + cf_ticker_warning(AS_DRV_SSD, "{%s} write fail: queue too deep: exceeds max %d", + ns->name, max_write_q); + return true; + } + + if (ssd->shadow_name) { + qsz = cf_queue_sz(ssd->swb_shadow_q); + + if (qsz > max_write_q) { + cf_ticker_warning(AS_DRV_SSD, "{%s} write fail: shadow queue too deep: exceeds max %d", + ns->name, max_write_q); + return true; + } + } + } + + return false; +} + + +bool +as_storage_has_space_ssd(as_namespace *ns) +{ + // Shortcut - assume we can't go from 5% to 0% in 1 ticker interval. + if (ns->storage_last_avail_pct > 5) { + return true; + } + // else - running low on available percent, check rigorously... 
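+	// (Rigorous check - every device must keep at least min_free_wblocks()
+	// wblocks on its free queue, otherwise the namespace is out of space.)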
+ + drv_ssds* ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + if (cf_queue_sz(ssds->ssds[i].free_wblock_q) < min_free_wblocks(ns)) { + return false; + } + } + + return true; +} + + +void +as_storage_defrag_sweep_ssd(as_namespace *ns) +{ + cf_info(AS_DRV_SSD, "{%s} sweeping all devices for wblocks to defrag ...", ns->name); + + drv_ssds* ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + cf_atomic32_incr(&ssds->ssds[i].defrag_sweep); + } +} + + +//========================================================== +// Storage API implementation: data in device headers. +// + +void +as_storage_info_set_ssd(as_namespace *ns, const as_partition *p, bool flush) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + info_buf *b = (info_buf*) + (ssds->header->info_data + (SSD_HEADER_INFO_STRIDE * p->id)); + + // TODO - until future storage format change, we'll use partition 0 to save + // and restore ns->eventual_regime. + b->regime = p->id == 0 ? ns->eventual_regime : 0; + + b->version = p->version; + + if (flush) { + // TODO - in future storage format change, arrange for each stride to + // never cross an io-min-size boundary, so we can do less math here. + + uint64_t offset = (uint8_t*)b - (uint8_t*)ssds->header; + uint64_t end_offset = offset + SSD_HEADER_INFO_STRIDE; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + uint64_t flush_offset = BYTES_DOWN_TO_IO_MIN(ssd, offset); + uint64_t flush_end_offset = BYTES_UP_TO_IO_MIN(ssd, end_offset); + + ssd_write_header(ssd, ssds->header, + flush_offset, flush_end_offset - flush_offset); + } + } +} + + +void +as_storage_info_get_ssd(as_namespace *ns, as_partition *p) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + info_buf *b = (info_buf*) + (ssds->header->info_data + (SSD_HEADER_INFO_STRIDE * p->id)); + + if (p->id == 0) { + ns->eventual_regime = b->regime; + ns->rebalance_regime = b->regime; + } + + p->version = b->version; +} + + +int +as_storage_info_flush_ssd(as_namespace *ns) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd_write_header(ssd, ssds->header, 0, SSD_HEADER_SIZE); + } + + return 0; +} + + +void +as_storage_save_evict_void_time_ssd(as_namespace *ns, uint32_t evict_void_time) +{ + drv_ssds* ssds = (drv_ssds*)ns->storage_private; + + ssds->header->last_evict_void_time = evict_void_time; + + // Customized write instead of using as_storage_info_flush_ssd() so we can + // write 512-4096b instead of 1Mb (and not interfere with potentially + // concurrent writes for partition info). + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd* ssd = &ssds->ssds[i]; + size_t peek_size = BYTES_UP_TO_IO_MIN(ssd, sizeof(ssd_device_header)); + + ssd_write_header(ssd, ssds->header, 0, peek_size); + } +} + + +//========================================================== +// Storage API implementation: statistics. +// + +int +as_storage_stats_ssd(as_namespace *ns, int *available_pct, + uint64_t *used_disk_bytes) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + if (available_pct) { + *available_pct = 100; + + // Find the device with the lowest available percent. 
+ for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + uint64_t pct = (available_size(ssd) * 100) / ssd->file_size; + + if (pct < (uint64_t)*available_pct) { + *available_pct = pct; + } + } + + // Used for shortcut in as_storage_has_space_ssd(), which is done on a + // per-transaction basis: + ns->storage_last_avail_pct = *available_pct; + } + + if (used_disk_bytes) { + uint64_t sz = 0; + + for (int i = 0; i < ssds->n_ssds; i++) { + sz += ssds->ssds[i].inuse_size; + } + + *used_disk_bytes = sz; + } + + return 0; +} + + +int +as_storage_ticker_stats_ssd(as_namespace *ns) +{ + histogram_dump(ns->device_read_size_hist); + histogram_dump(ns->device_write_size_hist); + + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + histogram_dump(ssd->hist_read); + histogram_dump(ssd->hist_large_block_read); + histogram_dump(ssd->hist_write); + + if (ssd->hist_shadow_write) { + histogram_dump(ssd->hist_shadow_write); + } + + histogram_dump(ssd->hist_fsync); + } + + return 0; +} + + +int +as_storage_histogram_clear_ssd(as_namespace *ns) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + histogram_clear(ssd->hist_read); + histogram_clear(ssd->hist_large_block_read); + histogram_clear(ssd->hist_write); + + if (ssd->hist_shadow_write) { + histogram_clear(ssd->hist_shadow_write); + } + + histogram_clear(ssd->hist_fsync); + } + + return 0; +} + + +//========================================================== +// Shutdown. +// + +void +as_storage_shutdown_ssd(as_namespace *ns) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + // Stop the maintenance thread from (also) flushing the swbs. + pthread_mutex_lock(&ssd->write_lock); + pthread_mutex_lock(&ssd->defrag_lock); + + // Flush current swb by pushing it to write-q. + if (ssd->current_swb) { + // Clean the end of the buffer before pushing to write-q. + if (ssd->write_block_size > ssd->current_swb->pos) { + memset(&ssd->current_swb->buf[ssd->current_swb->pos], 0, + ssd->write_block_size - ssd->current_swb->pos); + } + + cf_queue_push(ssd->swb_write_q, &ssd->current_swb); + ssd->current_swb = NULL; + } + + // Flush defrag swb by pushing it to write-q. + if (ssd->defrag_swb) { + // Clean the end of the buffer before pushing to write-q. 
+			if (ssd->write_block_size > ssd->defrag_swb->pos) {
+				memset(&ssd->defrag_swb->buf[ssd->defrag_swb->pos], 0,
+						ssd->write_block_size - ssd->defrag_swb->pos);
+			}
+
+			cf_queue_push(ssd->swb_write_q, &ssd->defrag_swb);
+			ssd->defrag_swb = NULL;
+		}
+	}
+
+	for (int i = 0; i < ssds->n_ssds; i++) {
+		drv_ssd *ssd = &ssds->ssds[i];
+
+		while (cf_queue_sz(ssd->swb_write_q)) {
+			usleep(1000);
+		}
+
+		if (ssd->shadow_name) {
+			while (cf_queue_sz(ssd->swb_shadow_q)) {
+				usleep(1000);
+			}
+		}
+
+		ssd->running = false;
+	}
+
+	for (int i = 0; i < ssds->n_ssds; i++) {
+		drv_ssd *ssd = &ssds->ssds[i];
+		void *p_void;
+
+		for (uint32_t j = 0; j < ssds->ns->storage_write_threads; j++) {
+			pthread_join(ssd->write_worker_thread[j], &p_void);
+		}
+
+		if (ssd->shadow_name) {
+			pthread_join(ssd->shadow_worker_thread, &p_void);
+		}
+	}
+
+	ssd_set_trusted(ns);
+}
diff --git a/as/src/storage/drv_ssd_ce.c b/as/src/storage/drv_ssd_ce.c
new file mode 100644
index 00000000..9b52934c
--- /dev/null
+++ b/as/src/storage/drv_ssd_ce.c
@@ -0,0 +1,181 @@
+/*
+ * drv_ssd_ce.c
+ *
+ * Copyright (C) 2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "storage/drv_ssd.h"
+#include
+#include
+#include
+#include "fault.h"
+#include "base/datamodel.h"
+#include "base/rec_props.h"
+#include "storage/storage.h"
+
+
+void
+ssd_resume_devices(drv_ssds* ssds)
+{
+	// Should not get here - for enterprise version only.
+	cf_crash(AS_DRV_SSD, "cold start called ssd_resume_devices()");
+}
+
+void*
+run_ssd_cool_start(void* udata)
+{
+	// Should not get here - for enterprise version only.
+	cf_crash(AS_DRV_SSD, "community edition called run_ssd_cool_start()");
+
+	return NULL;
+}
+
+void
+ssd_header_init_cfg(const as_namespace* ns, ssd_device_header* header)
+{
+}
+
+bool
+ssd_header_is_valid_cfg(const as_namespace* ns, const ssd_device_header* header)
+{
+	return true;
+}
+
+bool
+ssd_cold_start_is_valid_n_bins(uint32_t n_bins)
+{
+	// FIXME - what should we do here?
+	cf_assert(n_bins != 0, AS_DRV_SSD,
+			"community edition found tombstone - erase drive and restart");
+
+	return n_bins <= BIN_NAMES_QUOTA;
+}
+
+bool
+ssd_cold_start_is_record_truncated(as_namespace* ns, const drv_ssd_block* block,
+		const as_rec_props* p_props)
+{
+	return false;
+}
+
+void
+ssd_cold_start_adjust_cenotaph(as_namespace* ns, const drv_ssd_block* block,
+		as_record* r)
+{
+	// Nothing to do - relevant for enterprise version only.
+}
+
+void
+ssd_cold_start_transition_record(as_namespace* ns, const drv_ssd_block* block,
+		as_record* r, bool is_create)
+{
+	// Nothing to do - relevant for enterprise version only.
+}
+
+void
+ssd_cold_start_drop_cenotaphs(as_namespace* ns)
+{
+	// Nothing to do - relevant for enterprise version only.
+} + +void +ssd_adjust_versions(as_namespace* ns, ssd_device_header* header) +{ + // Nothing to do - relevant for enterprise version only. +} + +conflict_resolution_pol +ssd_cold_start_policy(as_namespace *ns) +{ + return AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME; +} + +void +ssd_cold_start_init_repl_state(as_namespace* ns, as_record* r) +{ + // Nothing to do - relevant for enterprise version only. +} + +void +ssd_init_commit(drv_ssd *ssd) +{ + // Nothing to do - relevant for enterprise version only. +} + +uint64_t +ssd_flush_max_us(const as_namespace *ns) +{ + return ns->storage_flush_max_us; +} + +int +ssd_write_bins(as_storage_rd *rd) +{ + return ssd_buffer_bins(rd); +} + +void +ssd_init_trusted(as_namespace* ns) +{ + // Nothing to do - relevant for enterprise version only. +} + +bool +ssd_is_untrusted(as_namespace *ns, uint8_t header_flags) +{ + return false; +} + +void +ssd_set_trusted(as_namespace* ns) +{ + // Nothing to do - relevant for enterprise version only. +} + +void +as_storage_start_tomb_raider_ssd(as_namespace* ns) +{ + // Tomb raider is for enterprise version only. +} + +int +as_storage_record_write_ssd(as_storage_rd* rd) +{ + // All record writes except defrag come through here! + return as_bin_inuse_has(rd) ? ssd_write(rd) : 0; +} + +void +ssd_init_encryption_key(as_namespace* ns) +{ +} + +void +ssd_do_encrypt(const uint8_t* key, uint64_t off, drv_ssd_block* block) +{ + // Should not get here - for enterprise version only. + cf_crash(AS_DRV_SSD, "community edition called ssd_do_encrypt()"); +} + +void +ssd_do_decrypt(const uint8_t* key, uint64_t off, drv_ssd_block* block) +{ + // Should not get here - for enterprise version only. + cf_crash(AS_DRV_SSD, "community edition called ssd_do_decrypt()"); +} diff --git a/as/src/storage/storage.c b/as/src/storage/storage.c new file mode 100644 index 00000000..e4e1fb35 --- /dev/null +++ b/as/src/storage/storage.c @@ -0,0 +1,688 @@ +/* + * storage.c + * + * Copyright (C) 2009-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "storage/storage.h" + +#include +#include +#include +#include + +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" + +#include "cf_mutex.h" +#include "fault.h" +#include "olock.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/rec_props.h" +#include "base/thr_info.h" +#include "fabric/partition.h" + + +//========================================================== +// Generic "base class" functions that call through +// storage-engine "v-tables". 
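The header above names the pattern used by every wrapper in this file: a typedef'd function pointer, a small table indexed by the namespace's storage engine, and a NULL slot meaning the engine treats that call as a no-op. A minimal sketch of the dispatch, with hypothetical engine ids and a hypothetical flush call:

#include <stdio.h>

typedef enum { ENGINE_MEMORY, ENGINE_DEVICE, NUM_ENGINES } engine_type;

typedef int (*flush_fn)(const char* ns_name);

static int
device_flush(const char* ns_name)
{
	printf("flushing %s\n", ns_name);
	return 0;
}

// One slot per engine - a NULL slot means "nothing to do".
static const flush_fn flush_table[NUM_ENGINES] = {
	NULL,        // ENGINE_MEMORY - no device to flush
	device_flush // ENGINE_DEVICE
};

static int
ns_flush(engine_type type, const char* ns_name)
{
	// Dispatch if the engine implements the call, otherwise no-op.
	return flush_table[type] ? flush_table[type](ns_name) : 0;
}

int
main(void)
{
	ns_flush(ENGINE_MEMORY, "ns1"); // silent no-op
	ns_flush(ENGINE_DEVICE, "ns2"); // prints "flushing ns2"
	return 0;
}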
+// + +//-------------------------------------- +// as_storage_init +// + +typedef int (*as_storage_namespace_init_fn)(as_namespace *ns, cf_queue *complete_q, void *udata); +static const as_storage_namespace_init_fn as_storage_namespace_init_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_namespace_init_memory, + as_storage_namespace_init_ssd +}; + +void +as_storage_init() +{ + cf_queue *complete_q = cf_queue_create(sizeof(void*), true); + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (as_storage_namespace_init_table[ns->storage_type]) { + if (0 != as_storage_namespace_init_table[ns->storage_type](ns, complete_q, NULL)) { + cf_crash(AS_STORAGE, "could not initialize storage for namespace %s", ns->name); + } + } + else { + cf_crash(AS_STORAGE, "invalid storage type for namespace %s", ns->name); + } + } + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + void *_t; + + while (CF_QUEUE_OK != cf_queue_pop(complete_q, &_t, 2000)) { + as_storage_loading_records_ticker_ssd(); + } + } + + cf_queue_destroy(complete_q); +} + +//-------------------------------------- +// as_storage_start_tomb_raider +// + +typedef void (*as_storage_start_tomb_raider_fn)(as_namespace *ns); +static const as_storage_start_tomb_raider_fn as_storage_start_tomb_raider_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_start_tomb_raider_memory, + as_storage_start_tomb_raider_ssd +}; + +void +as_storage_start_tomb_raider() +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (as_storage_start_tomb_raider_table[ns->storage_type]) { + as_storage_start_tomb_raider_table[ns->storage_type](ns); + } + } +} + +//-------------------------------------- +// as_storage_namespace_destroy +// + +typedef int (*as_storage_namespace_destroy_fn)(as_namespace *ns); +static const as_storage_namespace_destroy_fn as_storage_namespace_destroy_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no destroy + as_storage_namespace_destroy_ssd +}; + +int +as_storage_namespace_destroy(as_namespace *ns) +{ + if (as_storage_namespace_destroy_table[ns->storage_type]) { + return as_storage_namespace_destroy_table[ns->storage_type](ns); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_destroy +// + +typedef int (*as_storage_record_destroy_fn)(as_namespace *ns, as_record *r); +static const as_storage_record_destroy_fn as_storage_record_destroy_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record destroy + as_storage_record_destroy_ssd +}; + +int +as_storage_record_destroy(as_namespace *ns, as_record *r) +{ + if (as_storage_record_destroy_table[ns->storage_type]) { + return as_storage_record_destroy_table[ns->storage_type](ns, r); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_create +// + +typedef int (*as_storage_record_create_fn)(as_storage_rd *rd); +static const as_storage_record_create_fn as_storage_record_create_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record create + as_storage_record_create_ssd +}; + +int +as_storage_record_create(as_namespace *ns, as_record *r, as_storage_rd *rd) +{ + rd->r = r; + rd->ns = ns; + as_rec_props_clear(&rd->rec_props); + rd->bins = 0; + rd->n_bins = 0; + rd->record_on_device = false; + rd->ignore_record_on_device = false; + rd->key_size = 0; + rd->key = NULL; + rd->is_durable_delete = false; + + if (as_storage_record_create_table[ns->storage_type]) { + return 
as_storage_record_create_table[ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_open +// + +typedef int (*as_storage_record_open_fn)(as_storage_rd *rd); +static const as_storage_record_open_fn as_storage_record_open_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record open + as_storage_record_open_ssd +}; + +int +as_storage_record_open(as_namespace *ns, as_record *r, as_storage_rd *rd) +{ + rd->r = r; + rd->ns = ns; + as_rec_props_clear(&rd->rec_props); + rd->bins = 0; + rd->n_bins = 0; + rd->record_on_device = true; + rd->ignore_record_on_device = false; + rd->key_size = 0; + rd->key = NULL; + rd->is_durable_delete = false; + + if (as_storage_record_open_table[ns->storage_type]) { + return as_storage_record_open_table[ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_close +// + +typedef int (*as_storage_record_close_fn)(as_storage_rd *rd); +static const as_storage_record_close_fn as_storage_record_close_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record close + as_storage_record_close_ssd +}; + +int +as_storage_record_close(as_storage_rd *rd) +{ + if (as_storage_record_close_table[rd->ns->storage_type]) { + return as_storage_record_close_table[rd->ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_load_n_bins +// + +typedef int (*as_storage_record_load_n_bins_fn)(as_storage_rd *rd); +static const as_storage_record_load_n_bins_fn as_storage_record_load_n_bins_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record load n bins + as_storage_record_load_n_bins_ssd +}; + +int +as_storage_record_load_n_bins(as_storage_rd *rd) +{ + if (as_storage_record_load_n_bins_table[rd->ns->storage_type]) { + return as_storage_record_load_n_bins_table[rd->ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_load_bins +// + +typedef int (*as_storage_record_load_bins_fn)(as_storage_rd *rd); +static const as_storage_record_load_bins_fn as_storage_record_load_bins_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record load bins + as_storage_record_load_bins_ssd +}; + +int +as_storage_record_load_bins(as_storage_rd *rd) +{ + if (as_storage_record_load_bins_table[rd->ns->storage_type]) { + return as_storage_record_load_bins_table[rd->ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_size_and_check +// + +typedef bool (*as_storage_record_size_and_check_fn)(as_storage_rd *rd); +static const as_storage_record_size_and_check_fn as_storage_record_size_and_check_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // no limit if no persistent storage - flat size is irrelevant + as_storage_record_size_and_check_ssd +}; + +bool +as_storage_record_size_and_check(as_storage_rd *rd) +{ + if (as_storage_record_size_and_check_table[rd->ns->storage_type]) { + return as_storage_record_size_and_check_table[rd->ns->storage_type](rd); + } + + return true; +} + +//-------------------------------------- +// as_storage_record_write +// + +typedef int (*as_storage_record_write_fn)(as_storage_rd *rd); +static const as_storage_record_write_fn as_storage_record_write_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_record_write_memory, + as_storage_record_write_ssd +}; + +int +as_storage_record_write(as_storage_rd *rd) +{ + if (as_storage_record_write_table[rd->ns->storage_type]) { + return 
as_storage_record_write_table[rd->ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_wait_for_defrag +// + +typedef void (*as_storage_wait_for_defrag_fn)(as_namespace *ns); +static const as_storage_wait_for_defrag_fn as_storage_wait_for_defrag_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't do defrag + as_storage_wait_for_defrag_ssd +}; + +void +as_storage_wait_for_defrag() +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (as_storage_wait_for_defrag_table[ns->storage_type]) { + as_storage_wait_for_defrag_table[ns->storage_type](ns); + } + } +} + +//-------------------------------------- +// as_storage_overloaded +// + +typedef bool (*as_storage_overloaded_fn)(as_namespace *ns); +static const as_storage_overloaded_fn as_storage_overloaded_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no overload check + as_storage_overloaded_ssd +}; + +bool +as_storage_overloaded(as_namespace *ns) +{ + if (as_storage_overloaded_table[ns->storage_type]) { + return as_storage_overloaded_table[ns->storage_type](ns); + } + + return false; +} + +//-------------------------------------- +// as_storage_has_space +// + +typedef bool (*as_storage_has_space_fn)(as_namespace *ns); +static const as_storage_has_space_fn as_storage_has_space_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no space check + as_storage_has_space_ssd +}; + +bool +as_storage_has_space(as_namespace *ns) +{ + if (as_storage_has_space_table[ns->storage_type]) { + return as_storage_has_space_table[ns->storage_type](ns); + } + + return true; +} + +//-------------------------------------- +// as_storage_defrag_sweep +// + +typedef void (*as_storage_defrag_sweep_fn)(as_namespace *ns); +static const as_storage_defrag_sweep_fn as_storage_defrag_sweep_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't do defrag + as_storage_defrag_sweep_ssd +}; + +void +as_storage_defrag_sweep(as_namespace *ns) +{ + if (as_storage_defrag_sweep_table[ns->storage_type]) { + as_storage_defrag_sweep_table[ns->storage_type](ns); + } +} + +//-------------------------------------- +// as_storage_info_set +// + +typedef void (*as_storage_info_set_fn)(as_namespace *ns, const as_partition *p, bool flush); +static const as_storage_info_set_fn as_storage_info_set_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't support info + as_storage_info_set_ssd +}; + +void +as_storage_info_set(as_namespace *ns, const as_partition *p, bool flush) +{ + if (as_storage_info_set_table[ns->storage_type]) { + as_storage_info_set_table[ns->storage_type](ns, p, flush); + } +} + +//-------------------------------------- +// as_storage_info_get +// + +typedef void (*as_storage_info_get_fn)(as_namespace *ns, as_partition *p); +static const as_storage_info_get_fn as_storage_info_get_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_info_get_memory, + as_storage_info_get_ssd +}; + +void +as_storage_info_get(as_namespace *ns, as_partition *p) +{ + if (as_storage_info_get_table[ns->storage_type]) { + as_storage_info_get_table[ns->storage_type](ns, p); + } +} + +//-------------------------------------- +// as_storage_info_flush +// + +typedef int (*as_storage_info_flush_fn)(as_namespace *ns); +static const as_storage_info_flush_fn as_storage_info_flush_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't support info + as_storage_info_flush_ssd +}; + +int +as_storage_info_flush(as_namespace *ns) +{ + if 
(as_storage_info_flush_table[ns->storage_type]) { + return as_storage_info_flush_table[ns->storage_type](ns); + } + + return 0; +} + +//-------------------------------------- +// as_storage_save_evict_void_time +// + +typedef void (*as_storage_save_evict_void_time_fn)(as_namespace *ns, uint32_t evict_void_time); +static const as_storage_save_evict_void_time_fn as_storage_save_evict_void_time_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't store info + as_storage_save_evict_void_time_ssd +}; + +void +as_storage_save_evict_void_time(as_namespace *ns, uint32_t evict_void_time) +{ + if (as_storage_save_evict_void_time_table[ns->storage_type]) { + as_storage_save_evict_void_time_table[ns->storage_type](ns, evict_void_time); + } +} + +//-------------------------------------- +// as_storage_stats +// + +typedef int (*as_storage_stats_fn)(as_namespace *ns, int *available_pct, uint64_t *used_disk_bytes); +static const as_storage_stats_fn as_storage_stats_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_stats_memory, + as_storage_stats_ssd +}; + +int +as_storage_stats(as_namespace *ns, int *available_pct, uint64_t *used_disk_bytes) +{ + if (as_storage_stats_table[ns->storage_type]) { + return as_storage_stats_table[ns->storage_type](ns, available_pct, used_disk_bytes); + } + + return 0; +} + +//-------------------------------------- +// as_storage_ticker_stats +// + +typedef int (*as_storage_ticker_stats_fn)(as_namespace *ns); +static const as_storage_ticker_stats_fn as_storage_ticker_stats_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't support per-disk histograms... for now. + as_storage_ticker_stats_ssd +}; + +int +as_storage_ticker_stats(as_namespace *ns) +{ + if (as_storage_ticker_stats_table[ns->storage_type]) { + return as_storage_ticker_stats_table[ns->storage_type](ns); + } + + return 0; +} + +//-------------------------------------- +// as_storage_histogram_clear_all +// + +typedef int (*as_storage_histogram_clear_fn)(as_namespace *ns); +static const as_storage_histogram_clear_fn as_storage_histogram_clear_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't support per-disk histograms... for now. + as_storage_histogram_clear_ssd +}; + +int +as_storage_histogram_clear_all(as_namespace *ns) +{ + if (as_storage_histogram_clear_table[ns->storage_type]) { + return as_storage_histogram_clear_table[ns->storage_type](ns); + } + + return 0; +} + + +//========================================================== +// Generic functions that don't use "v-tables". +// + +// Get size of record's in-memory data - everything except index bytes. +uint64_t +as_storage_record_get_n_bytes_memory(as_storage_rd *rd) +{ + if (! rd->ns->storage_data_in_memory) { + return 0; + } + + uint64_t n_bytes_memory = 0; + + for (uint16_t i = 0; i < rd->n_bins; i++) { + n_bytes_memory += as_bin_particle_size(&rd->bins[i]); + } + + if (! rd->ns->single_bin) { + if (rd->r->key_stored == 1) { + n_bytes_memory += sizeof(as_rec_space) + + ((as_rec_space*)rd->r->dim)->key_size; + } + + if (as_index_get_bin_space(rd->r)) { + n_bytes_memory += sizeof(as_bin_space) + + (sizeof(as_bin) * rd->n_bins); + } + } + + return n_bytes_memory; +} + +void +as_storage_record_adjust_mem_stats(as_storage_rd *rd, uint64_t start_bytes) +{ + if (! 
rd->ns->storage_data_in_memory) { + return; + } + + uint64_t end_bytes = as_storage_record_get_n_bytes_memory(rd); + int64_t delta_bytes = (int64_t)end_bytes - (int64_t)start_bytes; + + if (delta_bytes != 0) { + cf_atomic_int_add(&rd->ns->n_bytes_memory, delta_bytes); + as_namespace_adjust_set_memory(rd->ns, as_index_get_set_id(rd->r), + delta_bytes); + } +} + +void +as_storage_record_drop_from_mem_stats(as_storage_rd *rd) +{ + if (! rd->ns->storage_data_in_memory) { + return; + } + + uint64_t drop_bytes = as_storage_record_get_n_bytes_memory(rd); + + cf_atomic_int_sub(&rd->ns->n_bytes_memory, drop_bytes); + as_namespace_adjust_set_memory(rd->ns, as_index_get_set_id(rd->r), + -(int64_t)drop_bytes); +} + +bool +as_storage_record_get_key(as_storage_rd *rd) +{ + if (rd->r->key_stored == 0) { + return false; + } + + if (rd->ns->storage_data_in_memory) { + rd->key_size = ((as_rec_space*)rd->r->dim)->key_size; + rd->key = ((as_rec_space*)rd->r->dim)->key; + return true; + } + + if (rd->record_on_device && ! rd->ignore_record_on_device) { + return as_storage_record_get_key_ssd(rd); + } + + return false; +} + +size_t +as_storage_record_rec_props_size(as_storage_rd *rd) +{ + size_t rec_props_data_size = 0; + + const char *set_name = as_index_get_set_name(rd->r, rd->ns); + + if (set_name) { + rec_props_data_size += as_rec_props_sizeof_field(strlen(set_name) + 1); + } + + if (rd->key) { + rec_props_data_size += as_rec_props_sizeof_field(rd->key_size); + } + + return rec_props_data_size; +} + +// Populates rec_props struct in rd, using index info where possible. Assumes +// relevant information is ready: +// - set name +// - record key +// Relies on caller's properly allocated rec_props_data. +void +as_storage_record_set_rec_props(as_storage_rd *rd, uint8_t* rec_props_data) +{ + as_rec_props_init(&(rd->rec_props), rec_props_data); + + if (as_index_has_set(rd->r)) { + const char *set_name = as_index_get_set_name(rd->r, rd->ns); + as_rec_props_add_field(&(rd->rec_props), CL_REC_PROPS_FIELD_SET_NAME, + strlen(set_name) + 1, (uint8_t *)set_name); + } + + if (rd->key) { + as_rec_props_add_field(&(rd->rec_props), CL_REC_PROPS_FIELD_KEY, + rd->key_size, rd->key); + } +} + +void +as_storage_shutdown(void) +{ + cf_info(AS_STORAGE, "initiating storage shutdown ..."); + + // Pull all record locks - stops everything writing to current swbs such + // that each write's record lock scope is either completed or never entered. + + for (uint32_t n = 0; n < g_record_locks->n_locks; n++) { + cf_mutex_lock(&g_record_locks->locks[n]); + } + + // Now flush everything outstanding to storage devices. + + cf_info(AS_STORAGE, "flushing data to storage ..."); + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + + // For now this is only needed for warm-restartable namespaces. + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition_shutdown(ns, pid); + } + + as_storage_shutdown_ssd(ns); + as_namespace_xmem_trusted(ns); + } + } + + cf_info(AS_STORAGE, "completed flushing to storage"); +} diff --git a/as/src/transaction/delete.c b/as/src/transaction/delete.c new file mode 100644 index 00000000..9ab8387d --- /dev/null +++ b/as/src/transaction/delete.c @@ -0,0 +1,486 @@ +/* + * delete.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/delete.h" + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "dynbuf.h" +#include "fault.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/secondary_index.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "base/xdr_serverside.h" +#include "fabric/partition.h" +#include "storage/storage.h" +#include "transaction/duplicate_resolve.h" +#include "transaction/proxy.h" +#include "transaction/replica_write.h" +#include "transaction/rw_request.h" +#include "transaction/rw_request_hash.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Forward declarations. +// + +void start_delete_dup_res(rw_request* rw, as_transaction* tr); +void start_delete_repl_write(rw_request* rw, as_transaction* tr); +void start_delete_repl_write_forget(rw_request* rw, as_transaction* tr); +bool delete_dup_res_cb(rw_request* rw); +void delete_repl_write_after_dup_res(rw_request* rw, as_transaction* tr); +void delete_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr); +void delete_repl_write_cb(rw_request* rw); + +void send_delete_response(as_transaction* tr); +void delete_timeout_cb(rw_request* rw); + + +//========================================================== +// Inlines & macros. +// + +static inline void +client_delete_update_stats(as_namespace* ns, uint8_t result_code, + bool is_xdr_op) +{ + switch (result_code) { + case AS_PROTO_RESULT_OK: + cf_atomic64_incr(&ns->n_client_delete_success); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_delete_success); + } + break; + case AS_PROTO_RESULT_FAIL_TIMEOUT: + cf_atomic64_incr(&ns->n_client_delete_timeout); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_delete_timeout); + } + break; + default: + cf_atomic64_incr(&ns->n_client_delete_error); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_delete_error); + } + break; + case AS_PROTO_RESULT_FAIL_NOT_FOUND: + cf_atomic64_incr(&ns->n_client_delete_not_found); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_delete_not_found); + } + break; + } +} + + +//========================================================== +// Public API. +// + +transaction_status +as_delete_start(as_transaction* tr) +{ + // Apply XDR filter. + if (! xdr_allows_write(tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_ALWAYS_FORBIDDEN; + send_delete_response(tr); + return TRANS_DONE_ERROR; + } + + if (! 
validate_delete_durability(tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_FORBIDDEN; + send_delete_response(tr); + return TRANS_DONE_ERROR; + } + + if (delete_storage_overloaded(tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_DEVICE_OVERLOAD; + send_delete_response(tr); + return TRANS_DONE_ERROR; + } + + // Create rw_request and add to hash. + rw_request_hkey hkey = { tr->rsv.ns->id, tr->keyd }; + rw_request* rw = rw_request_create(&tr->keyd); + transaction_status status = rw_request_hash_insert(&hkey, rw, tr); + + // If rw_request wasn't inserted in hash, transaction is finished. + if (status != TRANS_IN_PROGRESS) { + rw_request_release(rw); + + if (status != TRANS_WAITING) { + send_delete_response(tr); + } + + return status; + } + // else - rw_request is now in hash, continue... + + if (tr->rsv.ns->write_dup_res_disabled || + as_transaction_is_nsup_delete(tr)) { + // Note - preventing duplicate resolution this way allows + // rw_request_destroy() to handle dup_msg[] cleanup correctly. + tr->rsv.n_dupl = 0; + } + + // If there are duplicates to resolve, start doing so. + // TODO - should we bother if there's no generation check? + if (tr->rsv.n_dupl != 0) { + start_delete_dup_res(rw, tr); + + // Started duplicate resolution. + return TRANS_IN_PROGRESS; + } + // else - no duplicate resolution phase, apply operation to master. + + // Set up the nodes to which we'll write replicas. + rw->n_dest_nodes = as_partition_get_other_replicas(tr->rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr->rsv.ns, rw->n_dest_nodes)) { + rw_request_hash_delete(&hkey, rw); + tr->result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_delete_response(tr); + return TRANS_DONE_ERROR; + } + + // If error, transaction is finished. + if ((status = delete_master(tr, rw)) != TRANS_IN_PROGRESS) { + rw_request_hash_delete(&hkey, rw); + + if (status != TRANS_WAITING) { + send_delete_response(tr); + } + + return status; + } + + // If we don't need replica writes, transaction is finished. + if (rw->n_dest_nodes == 0) { + finished_replicated(tr); + rw_request_hash_delete(&hkey, rw); + send_delete_response(tr); + return TRANS_DONE_SUCCESS; + } + + // If we don't need to wait for replica write acks, fire and forget. + if (as_transaction_is_nsup_delete(tr) || respond_on_master_complete(tr)) { + start_delete_repl_write_forget(rw, tr); + rw_request_hash_delete(&hkey, rw); + send_delete_response(tr); + return TRANS_DONE_SUCCESS; + } + + start_delete_repl_write(rw, tr); + + // Started replica write. + return TRANS_IN_PROGRESS; +} + + +//========================================================== +// Local helpers - transaction flow. +// + +void +start_delete_dup_res(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw, construct and send dup-res message. + + dup_res_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + dup_res_setup_rw(rw, tr, delete_dup_res_cb, delete_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +void +start_delete_repl_write(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw, construct and send repl-delete message. + + repl_write_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + repl_write_setup_rw(rw, tr, delete_repl_write_cb, delete_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +void +start_delete_repl_write_forget(rw_request* rw, as_transaction* tr) +{ + // Construct and send repl-write message. No need to finish rw setup. 
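The forget path can skip the usual rw_request setup because no acks will ever arrive: there is nothing to retransmit and no completion callback to run. A small sketch of that split, with hypothetical send helpers standing in for the fabric calls:

#include <stdbool.h>
#include <stdio.h>

typedef struct {
	int n_dest_nodes;   // how many replicas to write
	bool wait_for_acks; // does the client response wait on them?
} replica_plan;

static void
send_forget(const replica_plan* p)
{
	printf("sent to %d nodes, not tracking acks\n", p->n_dest_nodes);
}

static void
send_tracked(const replica_plan* p)
{
	printf("sent to %d nodes, awaiting acks\n", p->n_dest_nodes);
}

// Returns true if the transaction can be answered immediately.
static bool
replicate(const replica_plan* p)
{
	if (p->n_dest_nodes == 0) {
		return true; // nothing to replicate
	}

	if (! p->wait_for_acks) {
		send_forget(p); // fire and forget - no callbacks, no retransmit
		return true;
	}

	send_tracked(p); // completion happens later, in the ack callback
	return false;
}

int
main(void)
{
	replica_plan p = { 2, false };

	return replicate(&p) ? 0 : 1;
}

The two calls that follow build the repl-write message and send it without registering the rw_request for acks.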
+ + repl_write_make_message(rw, tr); + send_rw_messages_forget(rw); +} + + +bool +delete_dup_res_cb(rw_request* rw) +{ + as_transaction tr; + as_transaction_init_from_rw(&tr, rw); + + if (tr.result_code != AS_PROTO_RESULT_OK) { + send_delete_response(&tr); + return true; + } + + // Set up the nodes to which we'll write replicas. + rw->n_dest_nodes = as_partition_get_other_replicas(tr.rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr.rsv.ns, rw->n_dest_nodes)) { + tr.result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_delete_response(&tr); + return true; + } + + transaction_status status = delete_master(&tr, rw); + + if (status == TRANS_WAITING) { + // Note - new tr now owns msgp, make sure rw destructor doesn't free it. + // Also, rw will release rsv - new tr will get a new one. + rw->msgp = NULL; + return true; + } + + if (status == TRANS_DONE_ERROR) { + send_delete_response(&tr); + return true; + } + + // If we don't need replica writes, transaction is finished. + if (rw->n_dest_nodes == 0) { + finished_replicated(&tr); + send_delete_response(&tr); + return true; + } + + // If we don't need to wait for replica write acks, fire and forget. + // (Remember that nsup deletes can't get here, so no need to check.) + if (respond_on_master_complete(&tr)) { + delete_repl_write_forget_after_dup_res(rw, &tr); + send_delete_response(&tr); + return true; + } + + delete_repl_write_after_dup_res(rw, &tr); + + // Started replica write - don't delete rw_request from hash. + return false; +} + + +void +delete_repl_write_after_dup_res(rw_request* rw, as_transaction* tr) +{ + // Recycle rw_request that was just used for duplicate resolution to now do + // replica writes. Note - we are under the rw_request lock here! + + repl_write_make_message(rw, tr); + repl_write_reset_rw(rw, tr, delete_repl_write_cb); + send_rw_messages(rw); +} + + +void +delete_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr) +{ + // Send replica writes. Not waiting for acks, so need to reset rw_request. + // Note - we are under the rw_request lock here! + + repl_write_make_message(rw, tr); + send_rw_messages_forget(rw); +} + + +void +delete_repl_write_cb(rw_request* rw) +{ + as_transaction tr; + as_transaction_init_from_rw(&tr, rw); + + finished_replicated(&tr); + send_delete_response(&tr); + + // Finished transaction - rw_request cleans up reservation and msgp! +} + + +//========================================================== +// Local helpers - transaction end. +// + +void +send_delete_response(as_transaction* tr) +{ + // Paranoia - shouldn't get here on losing race with timeout. + if (! tr->from.any && tr->origin != FROM_NSUP) { + cf_warning(AS_RW, "transaction origin %u has null 'from'", tr->origin); + return; + } + + // Note - if tr was setup from rw, rw->from.any has been set null and + // informs timeout it lost the race. 
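This pattern recurs throughout the transaction code: the 'from' pointer doubles as a one-shot ownership token, and whichever of the response and timeout paths claims it first sends the reply while the loser backs off. A simplified, mutex-based sketch of the claim step - request and claim_reply are illustrative names:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct {
	pthread_mutex_t lock;
	void* from; // non-NULL while a reply is still owed
} request;

bool
claim_reply(request* r)
{
	pthread_mutex_lock(&r->lock);

	void* from = r->from;
	r->from = NULL; // the other callback will see NULL and back off

	pthread_mutex_unlock(&r->lock);

	return from != NULL; // true - the caller won and may send the reply
}

int
main(void)
{
	request r = { PTHREAD_MUTEX_INITIALIZER, (void*)1 };

	// First claim wins; the second (whichever path lost the race) fails.
	return claim_reply(&r) && ! claim_reply(&r) ? 0 : 1;
}

The switch that follows is the reply side of this pattern; delete_timeout_cb() further down is the competing claimant.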
+
+	switch (tr->origin) {
+	case FROM_CLIENT:
+		as_msg_send_reply(tr->from.proto_fd_h, tr->result_code, 0, 0, NULL,
+				NULL, 0, tr->rsv.ns, as_transaction_trid(tr));
+		client_delete_update_stats(tr->rsv.ns, tr->result_code,
+				as_transaction_is_xdr(tr));
+		break;
+	case FROM_PROXY:
+		as_proxy_send_response(tr->from.proxy_node, tr->from_data.proxy_tid,
+				tr->result_code, 0, 0, NULL, NULL, 0, tr->rsv.ns,
+				as_transaction_trid(tr));
+		break;
+	case FROM_NSUP:
+		break;
+	default:
+		cf_crash(AS_RW, "unexpected transaction origin %u", tr->origin);
+		break;
+	}
+
+	tr->from.any = NULL; // pattern, not needed
+}
+
+
+void
+delete_timeout_cb(rw_request* rw)
+{
+	// Paranoia - remove eventually.
+	cf_assert(rw->origin != FROM_NSUP, AS_RW, "nsup delete got timeout cb");
+
+	if (! rw->from.any) {
+		return; // lost race against dup-res or repl-write callback
+	}
+
+	finished_not_replicated(rw);
+
+	switch (rw->origin) {
+	case FROM_CLIENT:
+		as_msg_send_reply(rw->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0,
+				0, NULL, NULL, 0, rw->rsv.ns, rw_request_trid(rw));
+		client_delete_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT,
+				as_msg_is_xdr(&rw->msgp->msg));
+		break;
+	case FROM_PROXY:
+		break;
+	default:
+		cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin);
+		break;
+	}
+
+	rw->from.any = NULL; // inform other callback it lost the race
+}
+
+
+//==========================================================
+// Local helpers - delete master.
+//
+
+transaction_status
+drop_master(as_transaction* tr, as_index_ref* r_ref, rw_request* rw)
+{
+	as_msg* m = &tr->msgp->msg;
+	as_namespace* ns = tr->rsv.ns;
+	as_index_tree* tree = tr->rsv.tree;
+	as_record* r = r_ref->r;
+
+	// Check generation requirement, if any.
+	if (! generation_check(r, m, ns)) {
+		as_record_done(r_ref, ns);
+		cf_atomic64_incr(&ns->n_fail_generation);
+		tr->result_code = AS_PROTO_RESULT_FAIL_GENERATION;
+		return TRANS_DONE_ERROR;
+	}
+
+	bool check_key = as_transaction_has_key(tr);
+
+	if (ns->storage_data_in_memory || check_key) {
+		as_storage_rd rd;
+		as_storage_record_open(ns, r, &rd);
+
+		// Check the key if required.
+		// Note - for data-not-in-memory a key check is expensive!
+		if (check_key && as_storage_record_get_key(&rd) &&
+				! check_msg_key(m, &rd)) {
+			as_storage_record_close(&rd);
+			as_record_done(r_ref, ns);
+			tr->result_code = AS_PROTO_RESULT_FAIL_KEY_MISMATCH;
+			return TRANS_DONE_ERROR;
+		}
+
+		if (ns->storage_data_in_memory) {
+			delete_adjust_sindex(&rd);
+		}
+
+		as_storage_record_close(&rd);
+	}
+
+	// Generate a binless pickle, but don't generate pickled rec-props - these
+	// are useless for a drop.
+	rw->pickled_sz = sizeof(uint16_t);
+	rw->pickled_buf = cf_malloc(rw->pickled_sz);
+	*(uint16_t*)rw->pickled_buf = 0;
+
+	// Save the set-ID for XDR.
+	uint16_t set_id = as_index_get_set_id(r);
+
+	as_index_delete(tree, &tr->keyd);
+	as_record_done(r_ref, ns);
+
+	if (xdr_must_ship_delete(ns, as_transaction_is_nsup_delete(tr),
+			as_msg_is_xdr(m))) {
+		xdr_write(ns, &tr->keyd, 0, 0, XDR_OP_TYPE_DROP, set_id, NULL);
+	}
+
+	return TRANS_IN_PROGRESS;
+}
diff --git a/as/src/transaction/delete_ce.c b/as/src/transaction/delete_ce.c
new file mode 100644
index 00000000..2872e806
--- /dev/null
+++ b/as/src/transaction/delete_ce.c
@@ -0,0 +1,69 @@
+/*
+ * delete_ce.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/delete.h" + +#include + +#include "fault.h" + +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/transaction.h" +#include "transaction/rw_request.h" + + +//========================================================== +// Private API - for enterprise separation only. +// + +bool +delete_storage_overloaded(as_transaction* tr) +{ + return false; +} + + +transaction_status +delete_master(as_transaction* tr, rw_request* rw) +{ + if (as_transaction_is_durable_delete(tr)) { + cf_warning(AS_RW, "durable delete is an enterprise feature"); + tr->result_code = AS_PROTO_RESULT_FAIL_ENTERPRISE_ONLY; + return TRANS_DONE_ERROR; + } + + as_index_ref r_ref; + r_ref.skip_lock = false; + + if (0 != as_record_get(tr->rsv.tree, &tr->keyd, &r_ref)) { + tr->result_code = AS_PROTO_RESULT_FAIL_NOT_FOUND; + return TRANS_DONE_ERROR; + } + + return drop_master(tr, &r_ref, rw); +} diff --git a/as/src/transaction/duplicate_resolve.c b/as/src/transaction/duplicate_resolve.c new file mode 100644 index 00000000..c131cf33 --- /dev/null +++ b/as/src/transaction/duplicate_resolve.c @@ -0,0 +1,578 @@ +/* + * duplicate_resolve.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/duplicate_resolve.h" + +#include +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_digest.h" + +#include "fault.h" +#include "msg.h" +#include "node.h" + +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/thr_tsvc.h" +#include "base/transaction.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/partition.h" +#include "storage/storage.h" +#include "transaction/rw_request.h" +#include "transaction/rw_request_hash.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Forward declarations. 
+// + +void done_handle_request(as_partition_reservation* rsv, as_index_ref* r_ref, as_storage_rd* rd); +void send_dup_res_ack(cf_node node, msg* m, uint32_t result); +void send_ack_for_bad_request(cf_node node, msg* m); +uint32_t parse_dup_meta(msg* m, uint32_t* p_generation, uint64_t* p_last_update_time); +void apply_winner(rw_request* rw); + + +//========================================================== +// Public API. +// + +void +dup_res_make_message(rw_request* rw, as_transaction* tr) +{ + rw->dest_msg = as_fabric_msg_get(M_TYPE_RW); + + as_namespace* ns = tr->rsv.ns; + msg* m = rw->dest_msg; + + msg_set_uint32(m, RW_FIELD_OP, RW_OP_DUP); + msg_set_buf(m, RW_FIELD_NAMESPACE, (uint8_t*)ns->name, strlen(ns->name), + MSG_SET_COPY); + msg_set_uint32(m, RW_FIELD_NS_ID, ns->id); + msg_set_buf(m, RW_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest), + MSG_SET_COPY); + msg_set_uint32(m, RW_FIELD_TID, rw->tid); + + // TODO - JUMP - send this only because versions up to 3.14.x require it. + msg_set_uint64(m, RW_FIELD_CLUSTER_KEY, as_exchange_cluster_key()); + + as_index_ref r_ref; + r_ref.skip_lock = false; + + if (as_record_get(tr->rsv.tree, &tr->keyd, &r_ref) == 0) { + as_record* r = r_ref.r; + + msg_set_uint32(m, RW_FIELD_GENERATION, r->generation); + msg_set_uint64(m, RW_FIELD_LAST_UPDATE_TIME, r->last_update_time); + + as_record_done(&r_ref, ns); + } +} + + +void +dup_res_setup_rw(rw_request* rw, as_transaction* tr, dup_res_done_cb dup_res_cb, + timeout_done_cb timeout_cb) +{ + rw->msgp = tr->msgp; + tr->msgp = NULL; + + rw->msg_fields = tr->msg_fields; + rw->origin = tr->origin; + rw->from_flags = tr->from_flags; + + rw->from.any = tr->from.any; + rw->from_data.any = tr->from_data.any; + tr->from.any = NULL; + + rw->start_time = tr->start_time; + rw->benchmark_time = tr->benchmark_time; + + as_partition_reservation_copy(&rw->rsv, &tr->rsv); + // Hereafter, rw must release the reservation - happens in destructor. + + rw->end_time = tr->end_time; + // Note - don't need as_transaction's other 'container' members. + + rw->dup_res_cb = dup_res_cb; + rw->timeout_cb = timeout_cb; + + rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms; + rw->retry_interval_ms = g_config.transaction_retry_ms; + + rw->n_dest_nodes = tr->rsv.n_dupl; + + for (uint32_t i = 0; i < rw->n_dest_nodes; i++) { + rw->dest_complete[i] = false; + rw->dest_nodes[i] = tr->rsv.dupl_nodes[i]; + } + + // Allow retransmit thread to destroy rw as soon as we unlock. + rw->is_set_up = true; +} + + +void +dup_res_handle_request(cf_node node, msg* m) +{ + cf_digest* keyd; + + if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, NULL, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_RW, "dup-res handler: no digest"); + send_ack_for_bad_request(node, m); + return; + } + + uint8_t* ns_name; + size_t ns_name_len; + + if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_RW, "dup-res handler: no namespace"); + send_ack_for_bad_request(node, m); + return; + } + + as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len); + + if (! ns) { + cf_warning(AS_RW, "dup-res handler: invalid namespace"); + send_ack_for_bad_request(node, m); + return; + } + + uint32_t generation = 0; + uint64_t last_update_time = 0; + + bool local_conflict_check = + msg_get_uint32(m, RW_FIELD_GENERATION, &generation) == 0 && + msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, + &last_update_time) == 0; + + // Done reading message fields, may now set fields for ack. 
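Shortly after this point the handler weighs the proxyer's generation and last-update-time against the local record. A simplified sketch of how a last-update-time comparison can work - a stand-in for, not the definition of, the server's as_record_resolve_conflict():

#include <stdint.h>

// Returns < 0 if the remote copy loses, 0 on an exact tie, > 0 if it wins.
int
resolve_lut(uint32_t remote_gen, uint64_t remote_lut, uint32_t local_gen,
		uint64_t local_lut)
{
	if (remote_lut != local_lut) {
		return remote_lut > local_lut ? 1 : -1; // later write wins
	}

	if (remote_gen != local_gen) {
		return remote_gen > local_gen ? 1 : -1; // tie-break on generation
	}

	return 0; // identical metadata - treat as the same version
}

In the handler below, a non-positive result means the local copy is at least as good, so the ack is sent without shipping a record.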
+ msg_preserve_fields(m, 3, RW_FIELD_NS_ID, RW_FIELD_DIGEST, RW_FIELD_TID); + + as_partition_reservation rsv; + + as_partition_reserve(ns, as_partition_getid(keyd), &rsv); + + as_index_ref r_ref; + r_ref.skip_lock = false; + + if (as_record_get(rsv.tree, keyd, &r_ref) != 0) { + done_handle_request(&rsv, NULL, NULL); + send_dup_res_ack(node, m, AS_PROTO_RESULT_FAIL_NOT_FOUND); + return; + } + + as_record* r = r_ref.r; + + int result; + + if ((result = as_partition_check_source(ns, rsv.p, node, NULL)) != + AS_PROTO_RESULT_OK) { + done_handle_request(&rsv, &r_ref, NULL); + send_dup_res_ack(node, m, result); + return; + } + + if (local_conflict_check && + (result = as_record_resolve_conflict(ns->conflict_resolution_policy, + generation, last_update_time, r->generation, + r->last_update_time)) <= 0) { + uint32_t info = dup_res_pack_repl_state_info(r, ns); + + if (info != 0) { + msg_set_uint32(m, RW_FIELD_INFO, info); + } + + done_handle_request(&rsv, &r_ref, NULL); + send_dup_res_ack(node, m, result == 0 ? + AS_PROTO_RESULT_FAIL_RECORD_EXISTS : + AS_PROTO_RESULT_FAIL_GENERATION); + return; + } + + as_storage_rd rd; + + as_storage_record_open(ns, r, &rd); + + if ((result = as_storage_rd_load_n_bins(&rd)) < 0) { + done_handle_request(&rsv, &r_ref, &rd); + send_dup_res_ack(node, m, (uint32_t)-result); + return; + } + + as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins]; + + if ((result = as_storage_rd_load_bins(&rd, stack_bins)) < 0) { + done_handle_request(&rsv, &r_ref, &rd); + send_dup_res_ack(node, m, (uint32_t)-result); + return; + } + + size_t buf_len; + uint8_t* buf = as_record_pickle(&rd, &buf_len); + + msg_set_buf(m, RW_FIELD_RECORD, (void*)buf, buf_len, + MSG_SET_HANDOFF_MALLOC); + + const char* set_name = as_index_get_set_name(r, ns); + + if (set_name) { + msg_set_buf(m, RW_FIELD_SET_NAME, (const uint8_t *)set_name, + strlen(set_name), MSG_SET_COPY); + } + + as_storage_record_get_key(&rd); + + if (rd.key) { + msg_set_buf(m, RW_FIELD_KEY, rd.key, rd.key_size, MSG_SET_COPY); + } + + msg_set_uint32(m, RW_FIELD_GENERATION, r->generation); + msg_set_uint64(m, RW_FIELD_LAST_UPDATE_TIME, r->last_update_time); + + if (r->void_time != 0) { + msg_set_uint32(m, RW_FIELD_VOID_TIME, r->void_time); + } + + uint32_t info = dup_res_pack_info(r, ns); + + if (info != 0) { + msg_set_uint32(m, RW_FIELD_INFO, info); + } + + done_handle_request(&rsv, &r_ref, &rd); + send_dup_res_ack(node, m, AS_PROTO_RESULT_OK); +} + + +void +dup_res_handle_ack(cf_node node, msg* m) +{ + uint32_t ns_id; + + if (msg_get_uint32(m, RW_FIELD_NS_ID, &ns_id) != 0) { + cf_warning(AS_RW, "dup-res ack: no ns-id"); + as_fabric_msg_put(m); + return; + } + + cf_digest* keyd; + + if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, NULL, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_RW, "dup-res ack: no digest"); + as_fabric_msg_put(m); + return; + } + + uint32_t tid; + + if (msg_get_uint32(m, RW_FIELD_TID, &tid) != 0) { + cf_warning(AS_RW, "dup-res ack: no tid"); + as_fabric_msg_put(m); + return; + } + + rw_request_hkey hkey = { ns_id, *keyd }; + rw_request* rw = rw_request_hash_get(&hkey); + + if (! rw) { + // Extra ack, after rw_request is already gone. + as_fabric_msg_put(m); + return; + } + + pthread_mutex_lock(&rw->lock); + + if (rw->tid != tid || rw->dup_res_complete) { + // Extra ack - rw_request is newer transaction for same digest, or ack + // is arriving after rw_request was aborted or finished dup-res. 
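The tid check above matters because the (ns-id, digest) hash key can be reused: a slow ack may arrive after the original rw_request has finished and a newer transaction has taken its slot. A small sketch of that stale-ack filter, with pending_req as a hypothetical stand-in for the rw_request fields involved:

#include <stdbool.h>
#include <stdint.h>

typedef struct {
	uint32_t tid;  // id of the transaction that sent the request
	bool complete; // this phase already finished or was aborted
} pending_req;

// Returns true only if the ack belongs to this request's transaction and
// the phase it acknowledges is still in progress.
bool
ack_is_current(const pending_req* req, uint32_t ack_tid)
{
	return req->tid == ack_tid && ! req->complete;
}

The unlock/release/put sequence that follows is the common exit for each of these early-out cases.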
+ pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + // Find remote node in duplicates list. + int i = index_of_node(rw->dest_nodes, rw->n_dest_nodes, node); + + if (i == -1) { + cf_warning(AS_RW, "dup-res ack: from non-dest node %lx", node); + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + if (rw->dest_complete[i]) { + // Extra ack for this duplicate. + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + rw->dest_complete[i] = true; + + uint32_t generation = 0; + uint64_t last_update_time = 0; + uint32_t result_code = parse_dup_meta(m, &generation, &last_update_time); + + // If it makes sense, retry transaction from the beginning. + // TODO - is this retry too fast? Should there be a throttle? If so, how? + if (dup_res_should_retry_transaction(rw, result_code)) { + if (! rw->from.any) { + // Lost race against timeout in retransmit thread. + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + as_transaction tr; + as_transaction_init_head_from_rw(&tr, rw); + + // Note that tr now owns msgp - make sure rw destructor doesn't free it. + // Note also that rw will release rsv - tr will get a new one. + rw->msgp = NULL; + + tr.from_flags |= FROM_FLAG_RESTART; + as_tsvc_enqueue(&tr); + + rw->dup_res_complete = true; + + pthread_mutex_unlock(&rw->lock); + rw_request_hash_delete(&hkey, rw); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + dup_res_handle_tie(rw, m, result_code); + + // Compare this duplicate with previous best, if any. + bool keep_previous_best = rw->best_dup_msg && + as_record_resolve_conflict(rw->rsv.ns->conflict_resolution_policy, + rw->best_dup_gen, rw->best_dup_lut, + (uint16_t)generation, last_update_time) <= 0; + + if (keep_previous_best) { + // This duplicate is no better than previous best - keep previous best. + as_fabric_msg_put(m); + } + else { + // No previous best, or this duplicate is better - keep this one. + if (rw->best_dup_msg) { + as_fabric_msg_put(rw->best_dup_msg); + } + + msg_preserve_all_fields(m); + rw->best_dup_msg = m; + rw->best_dup_result_code = (uint8_t)result_code; + rw->best_dup_gen = generation; + rw->best_dup_lut = last_update_time; + } + + // Saved or discarded m - from here down don't call as_fabric_msg_put(m)! + + for (uint32_t j = 0; j < rw->n_dest_nodes; j++) { + if (! rw->dest_complete[j]) { + // Still haven't heard from all duplicates. + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + return; + } + } + + if (rw->best_dup_result_code == AS_PROTO_RESULT_OK) { + apply_winner(rw); // sets rw->result_code to pass along to callback + } + else { + apply_if_tie(rw); + } + + // Check for lost race against timeout in retransmit thread *after* applying + // winner - may save a future transaction from re-fetching the duplicates. + // Note - nsup deletes don't get here, so check using rw->from.any is ok. + if (! rw->from.any) { + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + return; + } + + dup_res_translate_result_code(rw); + + bool delete_from_hash = rw->dup_res_cb(rw); + + rw->dup_res_complete = true; + + pthread_mutex_unlock(&rw->lock); + + if (delete_from_hash) { + rw_request_hash_delete(&hkey, rw); + } + + rw_request_release(rw); +} + + +//========================================================== +// Local helpers. 
+// + +void +done_handle_request(as_partition_reservation* rsv, as_index_ref* r_ref, + as_storage_rd* rd) +{ + if (rd) { + as_storage_record_close(rd); + } + + if (r_ref) { + as_record_done(r_ref, rsv->ns); + } + + if (rsv) { + as_partition_release(rsv); + } +} + + +void +send_dup_res_ack(cf_node node, msg* m, uint32_t result) +{ + msg_set_uint32(m, RW_FIELD_OP, RW_OP_DUP_ACK); + msg_set_uint32(m, RW_FIELD_RESULT, result); + + if (as_fabric_send(node, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +void +send_ack_for_bad_request(cf_node node, msg* m) +{ + msg_preserve_fields(m, 3, RW_FIELD_NS_ID, RW_FIELD_DIGEST, RW_FIELD_TID); + + msg_set_uint32(m, RW_FIELD_OP, RW_OP_DUP_ACK); + msg_set_uint32(m, RW_FIELD_RESULT, AS_PROTO_RESULT_FAIL_UNKNOWN); // ??? + + if (as_fabric_send(node, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +uint32_t +parse_dup_meta(msg* m, uint32_t* p_generation, uint64_t* p_last_update_time) +{ + uint32_t result_code; + + if (msg_get_uint32(m, RW_FIELD_RESULT, &result_code) != 0) { + cf_warning(AS_RW, "dup-res ack: no result_code"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (result_code != AS_PROTO_RESULT_OK) { + return result_code; + } + + if (msg_get_uint32(m, RW_FIELD_GENERATION, p_generation) != 0 || + *p_generation == 0) { + cf_warning(AS_RW, "dup-res ack: no or bad generation"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, p_last_update_time) != 0) { + cf_warning(AS_RW, "dup-res ack: no last-update-time"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return AS_PROTO_RESULT_OK; +} + + +void +apply_winner(rw_request* rw) +{ + msg* m = rw->best_dup_msg; + + as_remote_record rr = { + // Skipping .src for now. + .rsv = &rw->rsv, + .keyd = &rw->keyd, + .generation = rw->best_dup_gen, + .last_update_time = rw->best_dup_lut + }; + + if (msg_get_buf(m, RW_FIELD_RECORD, &rr.record_buf, &rr.record_buf_sz, + MSG_GET_DIRECT) != 0 || rr.record_buf_sz < 2) { + cf_warning_digest(AS_RW, &rw->keyd, "dup-res ack: no record "); + rw->result_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + return; + } + + uint32_t info = 0; + + msg_get_uint32(m, RW_FIELD_INFO, &info); + + if (dup_res_ignore_pickle(rr.record_buf, info)) { + cf_warning_digest(AS_RW, &rw->keyd, "dup-res ack: binless pickle "); + rw->result_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + return; + } + + msg_get_uint32(m, RW_FIELD_VOID_TIME, &rr.void_time); + + msg_get_buf(m, RW_FIELD_SET_NAME, (uint8_t **)&rr.set_name, + &rr.set_name_len, MSG_GET_DIRECT); + + msg_get_buf(m, RW_FIELD_KEY, (uint8_t **)&rr.key, &rr.key_size, + MSG_GET_DIRECT); + + dup_res_init_repl_state(&rr, info); + + rw->result_code = (uint8_t)as_record_replace_if_better(&rr, false, false, + false); + + // Duplicate resolution just treats these errors as successful no-ops: + if (rw->result_code == AS_PROTO_RESULT_FAIL_RECORD_EXISTS || + rw->result_code == AS_PROTO_RESULT_FAIL_GENERATION) { + rw->result_code = AS_PROTO_RESULT_OK; + } +} diff --git a/as/src/transaction/proxy.c b/as/src/transaction/proxy.c new file mode 100644 index 00000000..d97ac7d9 --- /dev/null +++ b/as/src/transaction/proxy.c @@ -0,0 +1,698 @@ +/* + * proxy.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/proxy.h" + +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" + +#include "dynbuf.h" +#include "fault.h" +#include "msg.h" +#include "node.h" +#include "shash.h" +#include "socket.h" + +#include "base/batch.h" +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/thr_tsvc.h" +#include "base/transaction.h" +#include "base/stats.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/partition.h" +#include "transaction/rw_request.h" +#include "transaction/rw_request_hash.h" +#include "transaction/rw_utils.h" +#include "transaction/udf.h" + + +//========================================================== +// Typedefs & constants. +// + +typedef enum { + // These values go on the wire, so mind backward compatibility if changing. + PROXY_FIELD_OP, + PROXY_FIELD_TID, + PROXY_FIELD_DIGEST, + PROXY_FIELD_REDIRECT, + PROXY_FIELD_AS_PROTO, // request as_proto - currently contains only as_msg's + PROXY_FIELD_UNUSED_5, + PROXY_FIELD_UNUSED_6, + PROXY_FIELD_UNUSED_7, + + NUM_PROXY_FIELDS +} proxy_msg_field; + +#define PROXY_OP_REQUEST 1 +#define PROXY_OP_RESPONSE 2 +#define PROXY_OP_RETURN_TO_SENDER 3 + +const msg_template proxy_mt[] = { + { PROXY_FIELD_OP, M_FT_UINT32 }, + { PROXY_FIELD_TID, M_FT_UINT32 }, + { PROXY_FIELD_DIGEST, M_FT_BUF }, + { PROXY_FIELD_REDIRECT, M_FT_UINT64 }, + { PROXY_FIELD_AS_PROTO, M_FT_BUF }, + { PROXY_FIELD_UNUSED_5, M_FT_UINT64 }, + { PROXY_FIELD_UNUSED_6, M_FT_UINT32 }, + { PROXY_FIELD_UNUSED_7, M_FT_UINT32 }, +}; + +COMPILER_ASSERT(sizeof(proxy_mt) / sizeof(msg_template) == NUM_PROXY_FIELDS); + +#define PROXY_MSG_SCRATCH_SIZE 128 + +typedef struct proxy_request_s { + uint32_t msg_fields; + + uint8_t origin; + uint8_t from_flags; + + union { + void* any; + as_file_handle* proto_fd_h; + as_batch_shared* batch_shared; + // No need yet for other members of this union. + } from; + + // No need yet for a 'from_data" union. + uint32_t batch_index; + + uint64_t start_time; + uint64_t end_time; + + // The original proxy message. + msg* fab_msg; + + as_namespace* ns; +} proxy_request; + + +//========================================================== +// Globals. +// + +static cf_shash* g_proxy_hash = NULL; +static cf_atomic32 g_proxy_tid = 0; + + +//========================================================== +// Forward declarations. 
+// + +void* run_proxy_timeout(void* arg); +int proxy_timeout_reduce_fn(const void* key, void* data, void* udata); + +int proxy_msg_cb(cf_node src, msg* m, void* udata); + +void proxyer_handle_response(msg* m, uint32_t tid); +int proxyer_handle_client_response(msg* m, proxy_request* pr); +int proxyer_handle_batch_response(msg* m, proxy_request* pr); +void proxyer_handle_return_to_sender(msg* m, uint32_t tid); + +void proxyee_handle_request(cf_node src, msg* m, uint32_t tid); + + +//========================================================== +// Inlines & macros. +// + +static inline void +error_response(cf_node src, uint32_t tid, uint32_t error) +{ + as_proxy_send_response(src, tid, error, 0, 0, NULL, NULL, 0, NULL, 0); +} + +static inline void +client_proxy_update_stats(as_namespace* ns, uint8_t result_code) +{ + switch (result_code) { + case AS_PROTO_RESULT_OK: + cf_atomic64_incr(&ns->n_client_proxy_complete); + break; + case AS_PROTO_RESULT_FAIL_TIMEOUT: + cf_atomic64_incr(&ns->n_client_proxy_timeout); + break; + default: + cf_atomic64_incr(&ns->n_client_proxy_error); + break; + } +} + +static inline void +batch_sub_proxy_update_stats(as_namespace* ns, uint8_t result_code) +{ + switch (result_code) { + case AS_PROTO_RESULT_OK: + cf_atomic64_incr(&ns->n_batch_sub_proxy_complete); + break; + case AS_PROTO_RESULT_FAIL_TIMEOUT: + cf_atomic64_incr(&ns->n_batch_sub_proxy_timeout); + break; + default: + cf_atomic64_incr(&ns->n_batch_sub_proxy_error); + break; + } +} + + +//========================================================== +// Public API. +// + +void +as_proxy_init() +{ + g_proxy_hash = cf_shash_create(cf_shash_fn_u32, sizeof(uint32_t), + sizeof(proxy_request), 4 * 1024, CF_SHASH_MANY_LOCK); + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + if (pthread_create(&thread, &attrs, run_proxy_timeout, NULL) != 0) { + cf_crash(AS_PROXY, "failed to create proxy timeout thread"); + } + + as_fabric_register_msg_fn(M_TYPE_PROXY, proxy_mt, sizeof(proxy_mt), + PROXY_MSG_SCRATCH_SIZE, proxy_msg_cb, NULL); +} + + +uint32_t +as_proxy_hash_count() +{ + return cf_shash_get_size(g_proxy_hash); +} + + +// Proxyer - divert a transaction request to another node. +void +as_proxy_divert(cf_node dst, as_transaction* tr, as_namespace* ns) +{ + // Special log detail. + switch (tr->origin) { + case FROM_CLIENT: + cf_detail_digest(AS_PROXY_DIVERT, &tr->keyd, + "{%s} diverting from client %s to node %lx ", + ns->name, tr->from.proto_fd_h->client, dst); + break; + case FROM_BATCH: + cf_detail_digest(AS_PROXY_DIVERT, &tr->keyd, + "{%s} diverting batch-sub from client %s to node %lx ", + ns->name, as_batch_get_fd_h(tr->from.batch_shared)->client, + dst); + break; + default: + cf_crash(AS_PROXY, "unexpected transaction origin %u", tr->origin); + break; + } + + // Get a fabric message and fill it out. + + msg* m = as_fabric_msg_get(M_TYPE_PROXY); + + uint32_t tid = cf_atomic32_incr(&g_proxy_tid); + + msg_set_type set_type = tr->origin == FROM_BATCH ? + MSG_SET_COPY : MSG_SET_HANDOFF_MALLOC; + + msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST); + msg_set_uint32(m, PROXY_FIELD_TID, tid); + msg_set_buf(m, PROXY_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest), + MSG_SET_COPY); + msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void*)tr->msgp, + as_proto_size_get(&tr->msgp->proto), set_type); + + // Set up a proxy_request and insert it in the hash. 
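+	// Note - the tid is the hash key; the eventual PROXY_OP_RESPONSE (or
+	// PROXY_OP_RETURN_TO_SENDER) echoes it back, which is how a reply finds
+	// this pending proxy_request among all in-flight diverts.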
+ + proxy_request pr; + + pr.msg_fields = tr->msg_fields; + + pr.origin = tr->origin; + pr.from_flags = tr->from_flags; + pr.from.any = tr->from.any; + pr.batch_index = tr->from_data.batch_index; + + pr.start_time = tr->start_time; + pr.end_time = tr->end_time; + + pr.fab_msg = m; + + pr.ns = ns; + + cf_shash_put(g_proxy_hash, &tid, &pr); + + tr->msgp = NULL; // pattern, not needed + tr->from.any = NULL; // pattern, not needed + + // Send fabric message to remote node. + + msg_incr_ref(m); + + if (as_fabric_send(dst, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +// Proxyee - transaction reservation failed here, tell proxyer to try again. +void +as_proxy_return_to_sender(const as_transaction* tr, as_namespace* ns) +{ + msg* m = as_fabric_msg_get(M_TYPE_PROXY); + uint32_t pid = as_partition_getid(&tr->keyd); + cf_node redirect_node = as_partition_proxyee_redirect(ns, pid); + + msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RETURN_TO_SENDER); + msg_set_uint32(m, PROXY_FIELD_TID, tr->from_data.proxy_tid); + msg_set_uint64(m, PROXY_FIELD_REDIRECT, + redirect_node == (cf_node)0 ? tr->from.proxy_node : redirect_node); + + if (as_fabric_send(tr->from.proxy_node, m, AS_FABRIC_CHANNEL_RW) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +// Proxyee - transaction completed here, send response to proxyer. +void +as_proxy_send_response(cf_node dst, uint32_t proxy_tid, uint32_t result_code, + uint32_t generation, uint32_t void_time, as_msg_op** ops, as_bin** bins, + uint16_t bin_count, as_namespace* ns, uint64_t trid) +{ + msg* m = as_fabric_msg_get(M_TYPE_PROXY); + + msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RESPONSE); + msg_set_uint32(m, PROXY_FIELD_TID, proxy_tid); + + size_t msg_sz = 0; + uint8_t* msgp = (uint8_t*)as_msg_make_response_msg(result_code, generation, + void_time, ops, bins, bin_count, ns, 0, &msg_sz, trid); + + msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz, MSG_SET_HANDOFF_MALLOC); + + if (as_fabric_send(dst, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +// Proxyee - transaction completed here, send response to proxyer. +void +as_proxy_send_ops_response(cf_node dst, uint32_t proxy_tid, cf_dyn_buf* db) +{ + msg* m = as_fabric_msg_get(M_TYPE_PROXY); + + msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RESPONSE); + msg_set_uint32(m, PROXY_FIELD_TID, proxy_tid); + + uint8_t* msgp = db->buf; + size_t msg_sz = db->used_sz; + + if (db->is_stack) { + msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz, MSG_SET_COPY); + } + else { + msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz, + MSG_SET_HANDOFF_MALLOC); + db->buf = NULL; // the fabric owns the buffer now + } + + if (as_fabric_send(dst, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +//========================================================== +// Local helpers - proxyer. +// + +void +proxyer_handle_response(msg* m, uint32_t tid) +{ + proxy_request pr; + + if (cf_shash_get_and_delete(g_proxy_hash, &tid, &pr) != CF_SHASH_OK) { + // Some other response (or timeout) has already finished this pr. + return; + } + + cf_assert(pr.from.any, AS_PROXY, "origin %u has null 'from'", pr.origin); + + int result; + + switch (pr.origin) { + case FROM_CLIENT: + result = proxyer_handle_client_response(m, &pr); + client_proxy_update_stats(pr.ns, result); + break; + case FROM_BATCH: + result = proxyer_handle_batch_response(m, &pr); + batch_sub_proxy_update_stats(pr.ns, result); + // Note - no worries about msgp, proxy divert copied it. 
+ break; + default: + cf_crash(AS_PROXY, "unexpected transaction origin %u", pr.origin); + break; + } + + pr.from.any = NULL; // pattern, not needed + + as_fabric_msg_put(pr.fab_msg); + + // Note that this includes both origins. + if (pr.ns->proxy_hist_enabled) { + histogram_insert_data_point(pr.ns->proxy_hist, pr.start_time); + } +} + + +int +proxyer_handle_client_response(msg* m, proxy_request* pr) +{ + uint8_t* proto; + size_t proto_sz; + + if (msg_get_buf(m, PROXY_FIELD_AS_PROTO, &proto, &proto_sz, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_PROXY, "msg get for proto failed"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + as_file_handle* fd_h = pr->from.proto_fd_h; + + if (cf_socket_send_all(&fd_h->sock, proto, proto_sz, MSG_NOSIGNAL, + CF_SOCKET_TIMEOUT) < 0) { + // Common when a client aborts. + as_end_of_transaction_force_close(fd_h); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + as_end_of_transaction_ok(fd_h); + return AS_PROTO_RESULT_OK; +} + + +int +proxyer_handle_batch_response(msg* m, proxy_request* pr) +{ + cl_msg* msgp; + size_t msgp_sz; + + if (msg_get_buf(m, PROXY_FIELD_AS_PROTO, (uint8_t**)&msgp, &msgp_sz, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_PROXY, "msg get for proto failed"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + cf_digest* keyd; + + if (msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (uint8_t**)&keyd, NULL, + MSG_GET_DIRECT) != 0) { + cf_crash(AS_PROXY, "original msg get for digest failed"); + } + + as_batch_add_proxy_result(pr->from.batch_shared, pr->batch_index, keyd, + msgp, msgp_sz); + + return AS_PROTO_RESULT_OK; +} + + +void +proxyer_handle_return_to_sender(msg* m, uint32_t tid) +{ + proxy_request* pr; + pthread_mutex_t* lock; + + if (cf_shash_get_vlock(g_proxy_hash, &tid, (void**)&pr, &lock) != + CF_SHASH_OK) { + // Some other response (or timeout) has already finished this pr. + return; + } + + cf_node redirect_node; + + if (msg_get_uint64(m, PROXY_FIELD_REDIRECT, &redirect_node) == 0 + && redirect_node != g_config.self_node + && redirect_node != (cf_node)0) { + // If this node was a "random" node, i.e. neither acting nor eventual + // master, it diverts to the eventual master (the best it can do.) The + // eventual master must inform this node about the acting master. + + msg_incr_ref(pr->fab_msg); + + if (as_fabric_send(redirect_node, pr->fab_msg, AS_FABRIC_CHANNEL_RW) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(pr->fab_msg); + } + + pthread_mutex_unlock(lock); + return; + } + + cf_digest* keyd; + + if (msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (uint8_t**)&keyd, NULL, + MSG_GET_DIRECT) != 0) { + cf_crash(AS_PROXY, "original msg get for digest failed"); + } + + cl_msg* msgp; + + // TODO - inefficient! Should be a way to 'take' a buffer from msg. + if (msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (uint8_t**)&msgp, NULL, + MSG_GET_COPY_MALLOC) != 0) { + cf_crash(AS_PROXY, "original msg get for proto failed"); + } + + // Put the as_msg on the normal queue for processing. + as_transaction tr; + as_transaction_init_head(&tr, keyd, msgp); + // msgp might not have digest - batch sub-transactions, old clients. + // For old clients, will compute it again from msgp key and set. 
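+	// Note - restoring the saved origin state below and re-queueing locally
+	// re-runs the partition lookup, so the transaction chases the (possibly
+	// moved) master instead of reporting failure back to the origin.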
+ + tr.msg_fields = pr->msg_fields; + tr.origin = pr->origin; + tr.from_flags = pr->from_flags; + tr.from.any = pr->from.any; + tr.from_data.batch_index = pr->batch_index; + tr.start_time = pr->start_time; + + as_tsvc_enqueue(&tr); + + as_fabric_msg_put(pr->fab_msg); + + cf_shash_delete_lockfree(g_proxy_hash, &tid); + pthread_mutex_unlock(lock); +} + + +//========================================================== +// Local helpers - proxyee. +// + +void +proxyee_handle_request(cf_node src, msg* m, uint32_t tid) +{ + cf_digest* keyd; + + if (msg_get_buf(m, PROXY_FIELD_DIGEST, (uint8_t**)&keyd, NULL, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_PROXY, "msg get for digest failed"); + error_response(src, tid, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + cl_msg* msgp; + size_t msgp_sz; + + if (msg_get_buf(m, PROXY_FIELD_AS_PROTO, (uint8_t**)&msgp, &msgp_sz, + MSG_GET_COPY_MALLOC) != 0) { + cf_warning(AS_PROXY, "msg get for proto failed"); + error_response(src, tid, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + // Sanity check as_proto fields. + as_proto* proto = &msgp->proto; + + if (! as_proto_wrapped_is_valid(proto, msgp_sz)) { + cf_warning(AS_PROXY, "bad proto: version %u, type %u, sz %lu [%lu]", + proto->version, proto->type, (uint64_t)proto->sz, msgp_sz); + error_response(src, tid, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + // Put the as_msg on the normal queue for processing. + as_transaction tr; + as_transaction_init_head(&tr, keyd, msgp); + // msgp might not have digest - batch sub-transactions, old clients. + // For old clients, will compute it again from msgp key and set. + + tr.start_time = cf_getns(); + + tr.origin = FROM_PROXY; + tr.from.proxy_node = src; + tr.from_data.proxy_tid = tid; + + // Proxyer has already done byte swapping in as_msg. + if (! as_transaction_prepare(&tr, false)) { + cf_warning(AS_PROXY, "bad proxy msg"); + error_response(src, tid, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + // For batch sub-transactions, make sure we flag them so they're not + // mistaken for multi-record transactions (which never proxy). + if (as_transaction_has_no_key_or_digest(&tr)) { + tr.from_flags |= FROM_FLAG_BATCH_SUB; + } + + as_tsvc_enqueue(&tr); +} + + +//========================================================== +// Local helpers - timeout. +// + +void* +run_proxy_timeout(void* arg) +{ + while (true) { + usleep(75 * 1000); + + now_times now; + + now.now_ns = cf_getns(); + now.now_ms = now.now_ns / 1000000; + + cf_shash_reduce(g_proxy_hash, proxy_timeout_reduce_fn, &now); + } + + return NULL; +} + + +int +proxy_timeout_reduce_fn(const void* key, void* data, void* udata) +{ + proxy_request* pr = data; + now_times* now = (now_times*)udata; + + if (now->now_ns < pr->end_time) { + return CF_SHASH_OK; + } + + // Handle timeouts. + + cf_assert(pr->from.any, AS_PROXY, "origin %u has null 'from'", pr->origin); + + switch (pr->origin) { + case FROM_CLIENT: + // TODO - when it becomes important enough, find a way to echo trid. + as_msg_send_reply(pr->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0, + 0, NULL, NULL, 0, pr->ns, 0); + client_proxy_update_stats(pr->ns, AS_PROTO_RESULT_FAIL_TIMEOUT); + break; + case FROM_BATCH: + as_batch_add_error(pr->from.batch_shared, pr->batch_index, + AS_PROTO_RESULT_FAIL_TIMEOUT); + // Note - no worries about msgp, proxy divert copied it. 
+ batch_sub_proxy_update_stats(pr->ns, AS_PROTO_RESULT_FAIL_TIMEOUT); + break; + default: + cf_crash(AS_PROXY, "unexpected transaction origin %u", pr->origin); + break; + } + + pr->from.any = NULL; // pattern, not needed + as_fabric_msg_put(pr->fab_msg); + + return CF_SHASH_REDUCE_DELETE; +} + + +//========================================================== +// Local helpers - handle PROXY fabric messages. +// + +int +proxy_msg_cb(cf_node src, msg* m, void* udata) +{ + uint32_t op; + + if (msg_get_uint32(m, PROXY_FIELD_OP, &op) != 0) { + cf_warning(AS_PROXY, "msg get for op failed"); + as_fabric_msg_put(m); + return 0; + } + + uint32_t tid; + + if (msg_get_uint32(m, PROXY_FIELD_TID, &tid) != 0) { + cf_warning(AS_PROXY, "msg get for tid failed"); + as_fabric_msg_put(m); + return 0; + } + + switch (op) { + case PROXY_OP_REQUEST: + proxyee_handle_request(src, m, tid); + break; + case PROXY_OP_RESPONSE: + proxyer_handle_response(m, tid); + break; + case PROXY_OP_RETURN_TO_SENDER: + proxyer_handle_return_to_sender(m, tid); + break; + default: + cf_warning(AS_PROXY, "received unexpected message op %u", op); + break; + } + + as_fabric_msg_put(m); + return 0; +} diff --git a/as/src/transaction/re_replicate_ce.c b/as/src/transaction/re_replicate_ce.c new file mode 100644 index 00000000..ffb518e6 --- /dev/null +++ b/as/src/transaction/re_replicate_ce.c @@ -0,0 +1,43 @@ +/* + * re_replicate_ce.c + * + * Copyright (C) 2017-2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/re_replicate.h" + +#include "fault.h" + +#include "base/transaction.h" + + +//========================================================== +// Public API. +// + +transaction_status +as_re_replicate_start(as_transaction* tr) +{ + cf_crash(AS_RW, "CE code called as_re_replicate_start()"); + return TRANS_DONE_ERROR; +} diff --git a/as/src/transaction/read.c b/as/src/transaction/read.c new file mode 100644 index 00000000..7d9d7949 --- /dev/null +++ b/as/src/transaction/read.c @@ -0,0 +1,625 @@ +/* + * read.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/read.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+
+#include "base/batch.h"
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/transaction.h"
+#include "base/transaction_policy.h"
+#include "fabric/partition.h"
+#include "storage/storage.h"
+#include "transaction/duplicate_resolve.h"
+#include "transaction/proxy.h"
+#include "transaction/replica_ping.h"
+#include "transaction/rw_request.h"
+#include "transaction/rw_request_hash.h"
+#include "transaction/rw_utils.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+void start_read_dup_res(rw_request* rw, as_transaction* tr);
+void start_repl_ping(rw_request* rw, as_transaction* tr);
+bool read_dup_res_cb(rw_request* rw);
+void repl_ping_after_dup_res(rw_request* rw, as_transaction* tr);
+void repl_ping_cb(rw_request* rw);
+
+void send_read_response(as_transaction* tr, as_msg_op** ops,
+		as_bin** response_bins, uint16_t n_bins, cf_dyn_buf* db);
+void read_timeout_cb(rw_request* rw);
+
+transaction_status read_local(as_transaction* tr);
+void read_local_done(as_transaction* tr, as_index_ref* r_ref, as_storage_rd* rd,
+		int result_code);
+
+
+//==========================================================
+// Inlines & macros.
+//
+
+static inline bool
+read_must_duplicate_resolve(const as_transaction* tr)
+{
+	return tr->rsv.n_dupl != 0 &&
+			TR_READ_CONSISTENCY_LEVEL(tr) == AS_READ_CONSISTENCY_LEVEL_ALL;
+}
+
+static inline bool
+read_must_ping(const as_transaction* tr)
+{
+	return (tr->flags & AS_TRANSACTION_FLAG_MUST_PING) != 0;
+}
+
+static inline void
+client_read_update_stats(as_namespace* ns, uint8_t result_code)
+{
+	switch (result_code) {
+	case AS_PROTO_RESULT_OK:
+		cf_atomic64_incr(&ns->n_client_read_success);
+		break;
+	case AS_PROTO_RESULT_FAIL_TIMEOUT:
+		cf_atomic64_incr(&ns->n_client_read_timeout);
+		break;
+	default:
+		cf_atomic64_incr(&ns->n_client_read_error);
+		break;
+	case AS_PROTO_RESULT_FAIL_NOT_FOUND:
+		cf_atomic64_incr(&ns->n_client_read_not_found);
+		break;
+	}
+}
+
+static inline void
+batch_sub_read_update_stats(as_namespace* ns, uint8_t result_code)
+{
+	switch (result_code) {
+	case AS_PROTO_RESULT_OK:
+		cf_atomic64_incr(&ns->n_batch_sub_read_success);
+		break;
+	case AS_PROTO_RESULT_FAIL_TIMEOUT:
+		cf_atomic64_incr(&ns->n_batch_sub_read_timeout);
+		break;
+	default:
+		cf_atomic64_incr(&ns->n_batch_sub_read_error);
+		break;
+	case AS_PROTO_RESULT_FAIL_NOT_FOUND:
+		cf_atomic64_incr(&ns->n_batch_sub_read_not_found);
+		break;
+	}
+}
+
+
+//==========================================================
+// Public API.
+//
+
+transaction_status
+as_read_start(as_transaction* tr)
+{
+	BENCHMARK_START(tr, read, FROM_CLIENT);
+	BENCHMARK_START(tr, batch_sub, FROM_BATCH);
+
+	if (! repl_ping_check(tr)) {
+		send_read_response(tr, NULL, NULL, 0, NULL);
+		return TRANS_DONE_ERROR;
+	}
+
+	transaction_status status;
+	bool must_duplicate_resolve = read_must_duplicate_resolve(tr);
+	bool must_ping = read_must_ping(tr);
+
+	if (! must_duplicate_resolve && ! must_ping) {
+		// No network hops needed, try reading.
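+		// Note - this is an optimistic fast path: read_local() is attempted
+		// before paying for rw_request creation and hash insertion, which
+		// are only needed if it returns TRANS_IN_PROGRESS (e.g. blocked on
+		// the record's replication state).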
+ if ((status = read_local(tr)) != TRANS_IN_PROGRESS) { + return status; + } + // else - must try again under hash. + } + // else - there are duplicates, and we're configured to resolve them, or + // we're required to ping replicas. + + // Create rw_request and add to hash. + rw_request_hkey hkey = { tr->rsv.ns->id, tr->keyd }; + rw_request* rw = rw_request_create(&tr->keyd); + + // If rw_request isn't inserted in hash, transaction is finished. + if ((status = rw_request_hash_insert(&hkey, rw, tr)) != TRANS_IN_PROGRESS) { + rw_request_release(rw); + + if (status != TRANS_WAITING) { + send_read_response(tr, NULL, NULL, 0, NULL); + } + + return status; + } + // else - rw_request is now in hash, continue... + + if (must_duplicate_resolve) { + start_read_dup_res(rw, tr); + + // Started duplicate resolution. + return TRANS_IN_PROGRESS; + } + + if (must_ping) { + // Set up the nodes to which we'll ping. + rw->n_dest_nodes = as_partition_get_other_replicas(tr->rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr->rsv.ns, rw->n_dest_nodes)) { + rw_request_hash_delete(&hkey, rw); + tr->result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_read_response(tr, NULL, NULL, 0, NULL); + return TRANS_DONE_ERROR; + } + + start_repl_ping(rw, tr); + + // Started replica ping. + return TRANS_IN_PROGRESS; + } + + // Trying again under hash. + status = read_local(tr); + cf_assert(status != TRANS_IN_PROGRESS, AS_RW, "read in-progress"); + rw_request_hash_delete(&hkey, rw); + + return status; +} + + +//========================================================== +// Local helpers - transaction flow. +// + +void +start_read_dup_res(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw_request, construct and send dup-res message. + + dup_res_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + dup_res_setup_rw(rw, tr, read_dup_res_cb, read_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +void +start_repl_ping(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw, construct and send repl-ping message. + + repl_ping_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + repl_ping_setup_rw(rw, tr, repl_ping_cb, read_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +bool +read_dup_res_cb(rw_request* rw) +{ + BENCHMARK_NEXT_DATA_POINT(rw, read, dup_res); + BENCHMARK_NEXT_DATA_POINT(rw, batch_sub, dup_res); + + as_transaction tr; + as_transaction_init_from_rw(&tr, rw); + + if (tr.result_code != AS_PROTO_RESULT_OK) { + send_read_response(&tr, NULL, NULL, 0, NULL); + return true; + } + + if (read_must_ping(&tr)) { + // Set up the nodes to which we'll ping. + rw->n_dest_nodes = as_partition_get_other_replicas(tr.rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr.rsv.ns, rw->n_dest_nodes)) { + tr.result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_read_response(&tr, NULL, NULL, 0, NULL); + return true; + } + + repl_ping_after_dup_res(rw, &tr); + + return false; + } + + // Read the local copy and respond to origin. + transaction_status status = read_local(&tr); + + cf_assert(status != TRANS_IN_PROGRESS, AS_RW, "read in-progress"); + + if (status == TRANS_WAITING) { + // Note - new tr now owns msgp, make sure rw destructor doesn't free it. + // Also, rw will release rsv - new tr will get a new one. + rw->msgp = NULL; + } + + // Finished transaction - rw_request cleans up reservation and msgp! 
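+	// Note - returning true tells the dup-res machinery this rw_request is
+	// finished; returning false (above) means it was recycled in place for
+	// the replica ping round.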
+ return true; +} + + +void +repl_ping_after_dup_res(rw_request* rw, as_transaction* tr) +{ + // Recycle rw_request that was just used for duplicate resolution to now do + // replica pings. Note - we are under the rw_request lock here! + + repl_ping_make_message(rw, tr); + repl_ping_reset_rw(rw, tr, repl_ping_cb); + send_rw_messages(rw); +} + + +void +repl_ping_cb(rw_request* rw) +{ + BENCHMARK_NEXT_DATA_POINT(rw, read, repl_ping); + BENCHMARK_NEXT_DATA_POINT(rw, batch_sub, repl_ping); + + as_transaction tr; + as_transaction_init_from_rw(&tr, rw); + + // Read the local copy and respond to origin. + transaction_status status = read_local(&tr); + + cf_assert(status != TRANS_IN_PROGRESS, AS_RW, "read in-progress"); + + if (status == TRANS_WAITING) { + // Note - new tr now owns msgp, make sure rw destructor doesn't free it. + // Also, rw will release rsv - new tr will get a new one. + rw->msgp = NULL; + } +} + + +//========================================================== +// Local helpers - transaction end. +// + +void +send_read_response(as_transaction* tr, as_msg_op** ops, as_bin** response_bins, + uint16_t n_bins, cf_dyn_buf* db) +{ + // Paranoia - shouldn't get here on losing race with timeout. + if (! tr->from.any) { + cf_warning(AS_RW, "transaction origin %u has null 'from'", tr->origin); + return; + } + + // Note - if tr was setup from rw, rw->from.any has been set null and + // informs timeout it lost the race. + + switch (tr->origin) { + case FROM_CLIENT: + BENCHMARK_NEXT_DATA_POINT(tr, read, local); + if (db && db->used_sz != 0) { + as_msg_send_ops_reply(tr->from.proto_fd_h, db); + } + else { + as_msg_send_reply(tr->from.proto_fd_h, tr->result_code, + tr->generation, tr->void_time, ops, response_bins, n_bins, + tr->rsv.ns, as_transaction_trid(tr)); + } + BENCHMARK_NEXT_DATA_POINT(tr, read, response); + HIST_TRACK_ACTIVATE_INSERT_DATA_POINT(tr, read_hist); + client_read_update_stats(tr->rsv.ns, tr->result_code); + break; + case FROM_PROXY: + if (db && db->used_sz != 0) { + as_proxy_send_ops_response(tr->from.proxy_node, + tr->from_data.proxy_tid, db); + } + else { + as_proxy_send_response(tr->from.proxy_node, tr->from_data.proxy_tid, + tr->result_code, tr->generation, tr->void_time, ops, + response_bins, n_bins, tr->rsv.ns, as_transaction_trid(tr)); + } + break; + case FROM_BATCH: + BENCHMARK_NEXT_DATA_POINT(tr, batch_sub, read_local); + as_batch_add_result(tr, n_bins, response_bins, ops); + BENCHMARK_NEXT_DATA_POINT(tr, batch_sub, response); + batch_sub_read_update_stats(tr->rsv.ns, tr->result_code); + break; + default: + cf_crash(AS_RW, "unexpected transaction origin %u", tr->origin); + break; + } + + tr->from.any = NULL; // pattern, not needed +} + + +void +read_timeout_cb(rw_request* rw) +{ + if (! rw->from.any) { + return; // lost race against dup-res callback + } + + switch (rw->origin) { + case FROM_CLIENT: + as_msg_send_reply(rw->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0, + 0, NULL, NULL, 0, rw->rsv.ns, rw_request_trid(rw)); + // Timeouts aren't included in histograms. + client_read_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT); + break; + case FROM_PROXY: + break; + case FROM_BATCH: + as_batch_add_error(rw->from.batch_shared, rw->from_data.batch_index, + AS_PROTO_RESULT_FAIL_TIMEOUT); + // Timeouts aren't included in histograms. 
+ batch_sub_read_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT); + break; + default: + cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin); + break; + } + + rw->from.any = NULL; // inform other callback it lost the race +} + + +//========================================================== +// Local helpers - read local. +// + +transaction_status +read_local(as_transaction* tr) +{ + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + + as_index_ref r_ref; + r_ref.skip_lock = false; + + if (as_record_get(tr->rsv.tree, &tr->keyd, &r_ref) != 0) { + read_local_done(tr, NULL, NULL, AS_PROTO_RESULT_FAIL_NOT_FOUND); + return TRANS_DONE_ERROR; + } + + as_record* r = r_ref.r; + + // Check if it's an expired or truncated record. + if (as_record_is_doomed(r, ns)) { + read_local_done(tr, &r_ref, NULL, AS_PROTO_RESULT_FAIL_NOT_FOUND); + return TRANS_DONE_ERROR; + } + + int result = repl_state_check(r, tr); + + if (result != 0) { + // No response sent to origin. + as_record_done(&r_ref, ns); + return result == 1 ? TRANS_IN_PROGRESS : TRANS_WAITING; + } + + // Check if it's a tombstone. + if (! as_record_is_live(r)) { + read_local_done(tr, &r_ref, NULL, AS_PROTO_RESULT_FAIL_NOT_FOUND); + return TRANS_DONE_ERROR; + } + + as_storage_rd rd; + + as_storage_record_open(ns, r, &rd); + + // Check the key if required. + // Note - for data-not-in-memory "exists" ops, key check is expensive! + if (as_transaction_has_key(tr) && + as_storage_record_get_key(&rd) && ! check_msg_key(m, &rd)) { + read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_FAIL_KEY_MISMATCH); + return TRANS_DONE_ERROR; + } + + if ((m->info1 & AS_MSG_INFO1_GET_NO_BINS) != 0) { + tr->generation = r->generation; + tr->void_time = r->void_time; + tr->last_update_time = r->last_update_time; + + read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_OK); + return TRANS_DONE_SUCCESS; + } + + if ((result = as_storage_rd_load_n_bins(&rd)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_n_bins() ", ns->name); + read_local_done(tr, &r_ref, &rd, -result); + return TRANS_DONE_ERROR; + } + + as_bin stack_bins[ns->storage_data_in_memory ? 0 : rd.n_bins]; + + if ((result = as_storage_rd_load_bins(&rd, stack_bins)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_bins() ", ns->name); + read_local_done(tr, &r_ref, &rd, -result); + return TRANS_DONE_ERROR; + } + + if (! as_bin_inuse_has(&rd)) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: found record with no bins ", ns->name); + read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_FAIL_UNKNOWN); + return TRANS_DONE_ERROR; + } + + uint32_t bin_count = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0 ? 
+ rd.n_bins : m->n_ops; + + as_msg_op* ops[bin_count]; + as_msg_op** p_ops = ops; + as_bin* response_bins[bin_count]; + uint16_t n_bins = 0; + + as_bin result_bins[bin_count]; + uint32_t n_result_bins = 0; + + if ((m->info1 & AS_MSG_INFO1_GET_ALL) != 0) { + p_ops = NULL; + n_bins = as_bin_inuse_count(&rd); + as_bin_get_all_p(&rd, response_bins); + } + else { + if (m->n_ops == 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: bin op(s) expected, none present ", ns->name); + read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_FAIL_PARAMETER); + return TRANS_DONE_ERROR; + } + + bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0; + + as_msg_op* op = 0; + int n = 0; + + while ((op = as_msg_op_iterate(m, op, &n)) != NULL) { + if (op->op == AS_MSG_OP_READ) { + as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz); + + if (b || respond_all_ops) { + ops[n_bins] = op; + response_bins[n_bins++] = b; + } + } + else if (op->op == AS_MSG_OP_CDT_READ) { + as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz); + + if (b) { + as_bin* rb = &result_bins[n_result_bins]; + as_bin_set_empty(rb); + + if ((result = as_bin_cdt_read_from_client(b, op, rb)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_bin_cdt_read_from_client() ", ns->name); + destroy_stack_bins(result_bins, n_result_bins); + read_local_done(tr, &r_ref, &rd, -result); + return TRANS_DONE_ERROR; + } + + if (as_bin_inuse(rb)) { + n_result_bins++; + ops[n_bins] = op; + response_bins[n_bins++] = rb; + } + else if (respond_all_ops) { + ops[n_bins] = op; + response_bins[n_bins++] = NULL; + } + } + else if (respond_all_ops) { + ops[n_bins] = op; + response_bins[n_bins++] = NULL; + } + } + else { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: unexpected bin op %u ", ns->name, op->op); + destroy_stack_bins(result_bins, n_result_bins); + read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_FAIL_PARAMETER); + return TRANS_DONE_ERROR; + } + } + } + + cf_dyn_buf_define_size(db, 16 * 1024); + + if (tr->origin != FROM_BATCH) { + db.used_sz = db.alloc_sz; + db.buf = (uint8_t*)as_msg_make_response_msg(tr->result_code, + r->generation, r->void_time, p_ops, response_bins, n_bins, ns, + (cl_msg*)dyn_bufdb, &db.used_sz, as_transaction_trid(tr)); + + db.is_stack = db.buf == dyn_bufdb; + // Note - not bothering to correct alloc_sz if buf was allocated. + } + else { + tr->generation = r->generation; + tr->void_time = r->void_time; + tr->last_update_time = r->last_update_time; + + // Since as_batch_add_result() constructs response directly in shared + // buffer to avoid extra copies, can't use db. + send_read_response(tr, p_ops, response_bins, n_bins, NULL); + } + + destroy_stack_bins(result_bins, n_result_bins); + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + + // Now that we're not under the record lock, send the message we just built. 
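+	// Note - db is only built for client and proxy origins - the batch
+	// origin already responded above, directly into the shared batch buffer.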
+	if (db.used_sz != 0) {
+		send_read_response(tr, NULL, NULL, 0, &db);
+
+		cf_dyn_buf_free(&db);
+		tr->from.proto_fd_h = NULL;
+	}
+
+	return TRANS_DONE_SUCCESS;
+}
+
+
+void
+read_local_done(as_transaction* tr, as_index_ref* r_ref, as_storage_rd* rd,
+		int result_code)
+{
+	if (r_ref) {
+		if (rd) {
+			as_storage_record_close(rd);
+		}
+
+		as_record_done(r_ref, tr->rsv.ns);
+	}
+
+	tr->result_code = (uint8_t)result_code;
+
+	send_read_response(tr, NULL, NULL, 0, NULL);
+}
diff --git a/as/src/transaction/replica_ping_ce.c b/as/src/transaction/replica_ping_ce.c
new file mode 100644
index 00000000..c4a09df0
--- /dev/null
+++ b/as/src/transaction/replica_ping_ce.c
@@ -0,0 +1,88 @@
+/*
+ * replica_ping_ce.c
+ *
+ * Copyright (C) 2017-2018 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/replica_ping.h"
+
+#include <stdbool.h>
+
+#include "fault.h"
+#include "msg.h"
+#include "node.h"
+
+#include "base/datamodel.h"
+#include "base/transaction.h"
+#include "fabric/fabric.h"
+#include "transaction/rw_request.h"
+
+
+//==========================================================
+// Public API.
+//
+
+bool
+repl_ping_check(as_transaction* tr)
+{
+	if (as_transaction_is_linearized_read(tr)) {
+		cf_warning(AS_RW, "linearized read is an enterprise feature");
+		tr->result_code = AS_PROTO_RESULT_FAIL_ENTERPRISE_ONLY;
+		return false;
+	}
+
+	return true;
+}
+
+void
+repl_ping_make_message(rw_request* rw, as_transaction* tr)
+{
+	cf_crash(AS_RW, "CE code called repl_ping_make_message()");
+}
+
+void
+repl_ping_setup_rw(rw_request* rw, as_transaction* tr,
+		repl_ping_done_cb repl_ping_cb, timeout_done_cb timeout_cb)
+{
+	cf_crash(AS_RW, "CE code called repl_ping_setup_rw()");
+}
+
+void
+repl_ping_reset_rw(rw_request* rw, as_transaction* tr, repl_ping_done_cb cb)
+{
+	cf_crash(AS_RW, "CE code called repl_ping_reset_rw()");
+}
+
+void
+repl_ping_handle_op(cf_node node, msg* m)
+{
+	cf_warning(AS_RW, "CE code called repl_ping_handle_op()");
+	as_fabric_msg_put(m);
+}
+
+void
+repl_ping_handle_ack(cf_node node, msg* m)
+{
+	cf_warning(AS_RW, "CE code called repl_ping_handle_ack()");
+	as_fabric_msg_put(m);
+}
diff --git a/as/src/transaction/replica_write.c b/as/src/transaction/replica_write.c
new file mode 100644
index 00000000..ae0b45a0
--- /dev/null
+++ b/as/src/transaction/replica_write.c
@@ -0,0 +1,520 @@
+/*
+ * replica_write.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/replica_write.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "fault.h"
+#include "msg.h"
+#include "node.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/rec_props.h"
+#include "base/secondary_index.h"
+#include "base/transaction.h"
+#include "base/xdr_serverside.h"
+#include "fabric/fabric.h"
+#include "fabric/partition.h"
+#include "transaction/delete.h"
+#include "transaction/rw_request.h"
+#include "transaction/rw_request_hash.h"
+#include "transaction/rw_utils.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+uint32_t pack_info_bits(as_transaction* tr);
+void send_repl_write_ack(cf_node node, msg* m, uint32_t result);
+uint32_t parse_result_code(msg* m);
+void drop_replica(as_partition_reservation* rsv, cf_digest* keyd,
+		bool is_nsup_delete, bool is_xdr_op, cf_node master);
+
+
+//==========================================================
+// Public API.
+//
+
+void
+repl_write_make_message(rw_request* rw, as_transaction* tr)
+{
+	if (rw->dest_msg) {
+		msg_reset(rw->dest_msg);
+	}
+	else {
+		rw->dest_msg = as_fabric_msg_get(M_TYPE_RW);
+	}
+
+	// TODO - remove this when we're comfortable:
+	cf_assert(rw->pickled_buf, AS_RW, "making repl-write msg with null pickle");
+
+	as_namespace* ns = tr->rsv.ns;
+	msg* m = rw->dest_msg;
+
+	msg_set_uint32(m, RW_FIELD_OP, RW_OP_WRITE);
+	msg_set_buf(m, RW_FIELD_NAMESPACE, (uint8_t*)ns->name, strlen(ns->name),
+			MSG_SET_COPY);
+	msg_set_uint32(m, RW_FIELD_NS_ID, ns->id);
+	msg_set_buf(m, RW_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest),
+			MSG_SET_COPY);
+	msg_set_uint32(m, RW_FIELD_TID, rw->tid);
+	msg_set_uint32(m, RW_FIELD_GENERATION, tr->generation);
+	msg_set_uint64(m, RW_FIELD_LAST_UPDATE_TIME, tr->last_update_time);
+
+	if (tr->void_time != 0) {
+		msg_set_uint32(m, RW_FIELD_VOID_TIME, tr->void_time);
+	}
+
+	uint32_t info = pack_info_bits(tr);
+
+	repl_write_flag_pickle(tr, rw->pickled_buf, &info);
+
+	msg_set_buf(m, RW_FIELD_RECORD, (void*)rw->pickled_buf, rw->pickled_sz,
+			MSG_SET_HANDOFF_MALLOC);
+
+	// Make sure destructor doesn't free this.
+	rw->pickled_buf = NULL;
+
+	// TODO - replace rw->pickled_rec_props with individual fields.
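+	// For now, set name and key (if present) still travel inside the pickled
+	// rec-props blob - unpack them here so the replica receives them as
+	// first-class msg fields.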
+ if (rw->pickled_rec_props.p_data) { + const char* set_name; + uint32_t set_name_size; + + if (as_rec_props_get_value(&rw->pickled_rec_props, + CL_REC_PROPS_FIELD_SET_NAME, &set_name_size, + (uint8_t**)&set_name) == 0) { + msg_set_buf(m, RW_FIELD_SET_NAME, (const uint8_t *)set_name, + set_name_size - 1, MSG_SET_COPY); + } + + uint32_t key_size; + uint8_t* key; + + if (as_rec_props_get_value(&rw->pickled_rec_props, + CL_REC_PROPS_FIELD_KEY, &key_size, &key) == 0) { + msg_set_buf(m, RW_FIELD_KEY, key, key_size, MSG_SET_COPY); + } + } + + if (info != 0) { + msg_set_uint32(m, RW_FIELD_INFO, info); + } +} + + +void +repl_write_setup_rw(rw_request* rw, as_transaction* tr, + repl_write_done_cb repl_write_cb, timeout_done_cb timeout_cb) +{ + rw->msgp = tr->msgp; + tr->msgp = NULL; + + rw->msg_fields = tr->msg_fields; + rw->origin = tr->origin; + rw->from_flags = tr->from_flags; + + rw->from.any = tr->from.any; + rw->from_data.any = tr->from_data.any; + tr->from.any = NULL; + + rw->start_time = tr->start_time; + rw->benchmark_time = tr->benchmark_time; + + as_partition_reservation_copy(&rw->rsv, &tr->rsv); + // Hereafter, rw_request must release reservation - happens in destructor. + + rw->end_time = tr->end_time; + rw->flags = tr->flags; + rw->generation = tr->generation; + rw->void_time = tr->void_time; + rw->last_update_time = tr->last_update_time; + + rw->repl_write_cb = repl_write_cb; + rw->timeout_cb = timeout_cb; + + rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms; + rw->retry_interval_ms = g_config.transaction_retry_ms; + + for (uint32_t i = 0; i < rw->n_dest_nodes; i++) { + rw->dest_complete[i] = false; + } + + // Allow retransmit thread to destroy rw_request as soon as we unlock. + rw->is_set_up = true; +} + + +void +repl_write_reset_rw(rw_request* rw, as_transaction* tr, repl_write_done_cb cb) +{ + // Reset rw->from.any which was set null in tr setup. + rw->from.any = tr->from.any; + + // Needed for response to origin. + rw->flags = tr->flags; + rw->generation = tr->generation; + rw->void_time = tr->void_time; + rw->last_update_time = tr->last_update_time; + + rw->repl_write_cb = cb; + + // TODO - is this better than not resetting? Note - xmit_ms not volatile. + rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms; + rw->retry_interval_ms = g_config.transaction_retry_ms; + + for (uint32_t i = 0; i < rw->n_dest_nodes; i++) { + rw->dest_complete[i] = false; + } +} + + +void +repl_write_handle_op(cf_node node, msg* m) +{ + uint8_t* ns_name; + size_t ns_name_len; + + if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_RW, "repl_write_handle_op: no namespace"); + send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len); + + if (! 
ns) { + cf_warning(AS_RW, "repl_write_handle_op: invalid namespace"); + send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + cf_digest* keyd; + + if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, NULL, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_RW, "repl_write_handle_op: no digest"); + send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + as_partition_reservation rsv; + uint32_t result = as_partition_reserve_replica(ns, as_partition_getid(keyd), + &rsv); + + if (result != AS_PROTO_RESULT_OK) { + send_repl_write_ack(node, m, result); + return; + } + + as_remote_record rr = { .src = node, .rsv = &rsv, .keyd = keyd }; + + if (msg_get_buf(m, RW_FIELD_RECORD, (uint8_t**)&rr.record_buf, + &rr.record_buf_sz, MSG_GET_DIRECT) != 0 || rr.record_buf_sz < 2) { + cf_warning(AS_RW, "repl_write_handle_op: no or bad record"); + as_partition_release(&rsv); + send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + uint32_t info = 0; + + msg_get_uint32(m, RW_FIELD_INFO, &info); + + if (repl_write_pickle_is_drop(rr.record_buf, info)) { + drop_replica(&rsv, keyd, + (info & RW_INFO_NSUP_DELETE) != 0, + (info & RW_INFO_XDR) != 0, + node); + + as_partition_release(&rsv); + send_repl_write_ack(node, m, AS_PROTO_RESULT_OK); + + return; + } + + if (msg_get_uint32(m, RW_FIELD_GENERATION, &rr.generation) != 0 || + rr.generation == 0) { + cf_warning(AS_RW, "repl_write_handle_op: no or bad generation"); + as_partition_release(&rsv); + send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + if (msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, + &rr.last_update_time) != 0) { + cf_warning(AS_RW, "repl_write_handle_op: no last-update-time"); + as_partition_release(&rsv); + send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); + return; + } + + msg_get_uint32(m, RW_FIELD_VOID_TIME, &rr.void_time); + + msg_get_buf(m, RW_FIELD_SET_NAME, (uint8_t **)&rr.set_name, + &rr.set_name_len, MSG_GET_DIRECT); + + msg_get_buf(m, RW_FIELD_KEY, (uint8_t **)&rr.key, &rr.key_size, + MSG_GET_DIRECT); + + // Do XDR write if the write is a non-XDR write or forwarding is enabled. + bool do_xdr_write = (info & RW_INFO_XDR) == 0 || + is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes; + + // If source didn't touch sindex, may not need to touch it locally. + bool skip_sindex = (info & RW_INFO_SINDEX_TOUCHED) == 0; + + result = (uint32_t)as_record_replace_if_better(&rr, true, skip_sindex, + do_xdr_write); + + as_partition_release(&rsv); + send_repl_write_ack(node, m, result); +} + + +void +repl_write_handle_ack(cf_node node, msg* m) +{ + uint32_t ns_id; + + if (msg_get_uint32(m, RW_FIELD_NS_ID, &ns_id) != 0) { + cf_warning(AS_RW, "repl-write ack: no ns-id"); + as_fabric_msg_put(m); + return; + } + + cf_digest* keyd; + + if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, NULL, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_RW, "repl-write ack: no digest"); + as_fabric_msg_put(m); + return; + } + + uint32_t tid; + + if (msg_get_uint32(m, RW_FIELD_TID, &tid) != 0) { + cf_warning(AS_RW, "repl-write ack: no tid"); + as_fabric_msg_put(m); + return; + } + + rw_request_hkey hkey = { ns_id, *keyd }; + rw_request* rw = rw_request_hash_get(&hkey); + + if (! rw) { + // Extra ack, after rw_request is already gone. + as_fabric_msg_put(m); + return; + } + + pthread_mutex_lock(&rw->lock); + + if (rw->tid != tid || rw->repl_write_complete) { + // Extra ack - rw_request is newer transaction for same digest, or ack + // is arriving after rw_request was aborted. 
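+		// Either way, dropping the ack here keeps acks idempotent - the tid
+		// check catches stale acks, repl_write_complete catches extras.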
+ pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + // Paranoia - remove eventually. + cf_assert(rw->origin != FROM_NSUP, AS_RW, "nsup delete got repl-write ack"); + + if (! rw->from.any) { + // Lost race against timeout in retransmit thread. + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + // Find remote node in replicas list. + int i = index_of_node(rw->dest_nodes, rw->n_dest_nodes, node); + + if (i == -1) { + cf_warning(AS_RW, "repl-write ack: from non-dest node %lx", node); + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + if (rw->dest_complete[i]) { + // Extra ack for this replica write. + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + uint32_t result_code = parse_result_code(m); + + // If it makes sense, retransmit replicas. Note - rw->dest_complete[i] not + // yet set true, so that retransmit will go to this remote node. + if (repl_write_should_retransmit_replicas(rw, result_code)) { + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + + rw->dest_complete[i] = true; + + for (uint32_t j = 0; j < rw->n_dest_nodes; j++) { + if (! rw->dest_complete[j]) { + // Still haven't heard from all replicas. + pthread_mutex_unlock(&rw->lock); + rw_request_release(rw); + as_fabric_msg_put(m); + return; + } + } + + // Success for all replicas. + rw->repl_write_cb(rw); + repl_write_send_confirmation(rw); + + rw->repl_write_complete = true; + + pthread_mutex_unlock(&rw->lock); + rw_request_hash_delete(&hkey, rw); + rw_request_release(rw); + as_fabric_msg_put(m); +} + + +//========================================================== +// Local helpers. +// + +uint32_t +pack_info_bits(as_transaction* tr) +{ + uint32_t info = 0; + + if (as_transaction_is_xdr(tr)) { + info |= RW_INFO_XDR; + } + + if ((tr->flags & AS_TRANSACTION_FLAG_SINDEX_TOUCHED) != 0) { + info |= RW_INFO_SINDEX_TOUCHED; + } + + if (as_transaction_is_nsup_delete(tr)) { + info |= (RW_INFO_NSUP_DELETE | RW_INFO_NO_REPL_ACK); + } + + if (respond_on_master_complete(tr)) { + info |= RW_INFO_NO_REPL_ACK; + } + + return info; +} + + +void +send_repl_write_ack(cf_node node, msg* m, uint32_t result) +{ + uint32_t info = 0; + + msg_get_uint32(m, RW_FIELD_INFO, &info); + + if ((info & RW_INFO_NO_REPL_ACK) != 0) { + as_fabric_msg_put(m); + return; + } + + msg_preserve_fields(m, 3, RW_FIELD_NS_ID, RW_FIELD_DIGEST, RW_FIELD_TID); + + msg_set_uint32(m, RW_FIELD_OP, RW_OP_WRITE_ACK); + msg_set_uint32(m, RW_FIELD_RESULT, result); + + if (as_fabric_send(node, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +uint32_t +parse_result_code(msg* m) +{ + uint32_t result_code; + + if (msg_get_uint32(m, RW_FIELD_RESULT, &result_code) != 0) { + cf_warning(AS_RW, "repl-write ack: no result_code"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return result_code; +} + + +void +drop_replica(as_partition_reservation* rsv, cf_digest* keyd, + bool is_nsup_delete, bool is_xdr_op, cf_node master) +{ + // Shortcut pointers & flags. + as_namespace* ns = rsv->ns; + as_index_tree* tree = rsv->tree; + + as_index_ref r_ref; + r_ref.skip_lock = false; + + if (as_record_get(tree, keyd, &r_ref) != 0) { + return; // not found is ok from master's perspective. 
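+		// (E.g. this replica may never have seen the record, or it may have
+		// already expired locally.)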
+	}
+
+	as_record* r = r_ref.r;
+
+	if (ns->storage_data_in_memory) {
+		record_delete_adjust_sindex(r, ns);
+	}
+
+	// Save the set-ID for XDR.
+	uint16_t set_id = as_index_get_set_id(r);
+
+	as_index_delete(tree, keyd);
+	as_record_done(&r_ref, ns);
+
+	if (xdr_must_ship_delete(ns, is_nsup_delete, is_xdr_op)) {
+		xdr_write(ns, keyd, 0, master, XDR_OP_TYPE_DROP, set_id, NULL);
+	}
+}
diff --git a/as/src/transaction/rw_request.c b/as/src/transaction/rw_request.c
new file mode 100644
index 00000000..4493db09
--- /dev/null
+++ b/as/src/transaction/rw_request.c
@@ -0,0 +1,223 @@
+/*
+ * rw_request.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/rw_request.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/proto.h"
+#include "base/rec_props.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction.h"
+#include "fabric/fabric.h"
+#include "fabric/partition.h"
+
+
+//==========================================================
+// Globals.
+//
+
+static cf_atomic32 g_rw_tid = 0;
+
+
+//==========================================================
+// Public API.
+//
+
+rw_request*
+rw_request_create(cf_digest* keyd)
+{
+	rw_request* rw = cf_rc_alloc(sizeof(rw_request));
+
+	// as_transaction look-alike:
+	rw->msgp = NULL;
+	rw->msg_fields = 0;
+	rw->origin = 0;
+	rw->from_flags = 0;
+	rw->from.any = NULL;
+	rw->from_data.any = 0;
+	rw->keyd = *keyd;
+	rw->start_time = 0;
+	rw->benchmark_time = 0;
+
+	AS_PARTITION_RESERVATION_INIT(rw->rsv);
+
+	rw->end_time = 0;
+	rw->result_code = AS_PROTO_RESULT_OK;
+	rw->flags = 0;
+	rw->generation = 0;
+	rw->void_time = 0;
+	rw->last_update_time = 0;
+	// End of as_transaction look-alike.
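+	// Note - the fields above deliberately mirror the head of as_transaction,
+	// so callbacks can rebuild a transaction straight from an rw_request -
+	// see as_transaction_init_from_rw() usage in read.c.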
+ + pthread_mutex_init(&rw->lock, NULL); + + rw->wait_queue_head = NULL; + rw->wait_queue_tail = NULL; + rw->wait_queue_depth = 0; + + rw->is_set_up = false; + + rw->pickled_buf = NULL; + rw->pickled_sz = 0; + as_rec_props_clear(&rw->pickled_rec_props); + + rw->response_db.buf = NULL; + rw->response_db.is_stack = false; + rw->response_db.alloc_sz = 0; + rw->response_db.used_sz = 0; + + rw->tid = cf_atomic32_incr(&g_rw_tid); + rw->dup_res_complete = false; + rw->repl_write_complete = false; + rw->repl_ping_complete = false; + rw->dup_res_cb = NULL; + rw->repl_write_cb = NULL; + rw->repl_ping_cb = NULL; + rw->timeout_cb = NULL; + + rw->dest_msg = NULL; + rw->xmit_ms = 0; + rw->retry_interval_ms = 0; + + rw->n_dest_nodes = 0; + + rw->best_dup_msg = NULL; + rw->best_dup_result_code = AS_PROTO_RESULT_OK; + rw->best_dup_gen = 0; + rw->best_dup_lut = 0; + + rw->tie_was_replicated = false; + + return rw; +} + + +void +rw_request_destroy(rw_request* rw) +{ + // Paranoia: + if (rw->from.any) { + cf_crash(AS_RW, "rw_request_destroy: origin %d has non-null 'from'", + rw->origin); + } + + if (rw->msgp && rw->origin != FROM_BATCH) { + cf_free(rw->msgp); + } + + if (rw->pickled_buf) { + cf_free(rw->pickled_buf); + } + + if (rw->pickled_rec_props.p_data) { + cf_free(rw->pickled_rec_props.p_data); + } + + cf_dyn_buf_free(&rw->response_db); + + if (rw->dest_msg) { + as_fabric_msg_put(rw->dest_msg); + } + + if (rw->is_set_up) { + if (rw->best_dup_msg) { + as_fabric_msg_put(rw->best_dup_msg); + } + + as_partition_release(&rw->rsv); + } + + pthread_mutex_destroy(&rw->lock); + + rw_wait_ele* e = rw->wait_queue_head; + + while (e) { + rw_wait_ele* next = e->next; + + e->tr.from_flags |= FROM_FLAG_RESTART; + as_tsvc_enqueue(&e->tr); + + cf_free(e); + e = next; + } +} + + +void +rw_request_wait_q_push(rw_request* rw, as_transaction* tr) +{ + rw_wait_ele* e = cf_malloc(sizeof(rw_wait_ele)); + + as_transaction_copy_head(&e->tr, tr); + tr->from.any = NULL; + tr->msgp = NULL; + + e->next = NULL; + + if (rw->wait_queue_tail) { + rw->wait_queue_tail->next = e; + rw->wait_queue_tail = e; + } + else { + rw->wait_queue_head = e; + rw->wait_queue_tail = e; + } + + rw->wait_queue_depth++; +} + + +void +rw_request_wait_q_push_head(rw_request* rw, as_transaction* tr) +{ + rw_wait_ele* e = cf_malloc(sizeof(rw_wait_ele)); + cf_assert(e, AS_RW, "alloc rw_wait_ele"); + + as_transaction_copy_head(&e->tr, tr); + tr->from.any = NULL; + tr->msgp = NULL; + + e->next = rw->wait_queue_head; + rw->wait_queue_head = e; + + if (! rw->wait_queue_tail) { + rw->wait_queue_tail = e; + } + + rw->wait_queue_depth++; +} diff --git a/as/src/transaction/rw_request_hash.c b/as/src/transaction/rw_request_hash.c new file mode 100644 index 00000000..4b97d85a --- /dev/null +++ b/as/src/transaction/rw_request_hash.c @@ -0,0 +1,448 @@ +/* + * rw_request_hash.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/rw_request_hash.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_rchash.h"
+
+#include "fault.h"
+#include "msg.h"
+#include "node.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/proto.h"
+#include "base/transaction.h"
+#include "base/transaction_policy.h"
+#include "fabric/fabric.h"
+#include "transaction/duplicate_resolve.h"
+#include "transaction/replica_ping.h"
+#include "transaction/replica_write.h"
+#include "transaction/rw_request.h"
+#include "transaction/rw_utils.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+const msg_template rw_mt[] = {
+	{ RW_FIELD_OP, M_FT_UINT32 },
+	{ RW_FIELD_RESULT, M_FT_UINT32 },
+	{ RW_FIELD_NAMESPACE, M_FT_BUF },
+	{ RW_FIELD_NS_ID, M_FT_UINT32 },
+	{ RW_FIELD_GENERATION, M_FT_UINT32 },
+	{ RW_FIELD_DIGEST, M_FT_BUF },
+	{ RW_FIELD_UNUSED_6, M_FT_BUF },
+	{ RW_FIELD_UNUSED_7, M_FT_BUF },
+	{ RW_FIELD_CLUSTER_KEY, M_FT_UINT64 },
+	{ RW_FIELD_RECORD, M_FT_BUF },
+	{ RW_FIELD_TID, M_FT_UINT32 },
+	{ RW_FIELD_VOID_TIME, M_FT_UINT32 },
+	{ RW_FIELD_INFO, M_FT_UINT32 },
+	{ RW_FIELD_UNUSED_13, M_FT_BUF },
+	{ RW_FIELD_UNUSED_14, M_FT_BUF },
+	{ RW_FIELD_UNUSED_15, M_FT_UINT64 },
+	{ RW_FIELD_LAST_UPDATE_TIME, M_FT_UINT64 },
+	{ RW_FIELD_SET_NAME, M_FT_BUF },
+	{ RW_FIELD_KEY, M_FT_BUF },
+	{ RW_FIELD_REGIME, M_FT_UINT32 }
+};
+
+COMPILER_ASSERT(sizeof(rw_mt) / sizeof(msg_template) == NUM_RW_FIELDS);
+
+#define RW_MSG_SCRATCH_SIZE 192
+
+
+//==========================================================
+// Globals.
+//
+
+static cf_rchash* g_rw_request_hash = NULL;
+
+
+//==========================================================
+// Forward declarations.
+//
+
+uint32_t rw_request_hash_fn(const void* value, uint32_t value_len);
+transaction_status handle_hot_key(rw_request* rw0, as_transaction* tr);
+
+void* run_retransmit(void* arg);
+int retransmit_reduce_fn(const void* key, uint32_t keylen, void* data, void* udata);
+void update_retransmit_stats(const rw_request* rw);
+
+int rw_msg_cb(cf_node id, msg* m, void* udata);
+
+
+//==========================================================
+// Public API.
+// + +void +as_rw_init() +{ + cf_rchash_create(&g_rw_request_hash, rw_request_hash_fn, + rw_request_hdestroy, sizeof(rw_request_hkey), 32 * 1024, + CF_RCHASH_MANY_LOCK); + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + if (pthread_create(&thread, &attrs, run_retransmit, NULL) != 0) { + cf_crash(AS_RW, "failed to create retransmit thread"); + } + + as_fabric_register_msg_fn(M_TYPE_RW, rw_mt, sizeof(rw_mt), + RW_MSG_SCRATCH_SIZE, rw_msg_cb, NULL); +} + + +uint32_t +rw_request_hash_count() +{ + return cf_rchash_get_size(g_rw_request_hash); +} + + +transaction_status +rw_request_hash_insert(rw_request_hkey* hkey, rw_request* rw, + as_transaction* tr) +{ + int insert_rv; + + while ((insert_rv = cf_rchash_put_unique(g_rw_request_hash, hkey, + sizeof(*hkey), rw)) != CF_RCHASH_OK) { + cf_assert(insert_rv == CF_RCHASH_ERR_FOUND, AS_RW, "put-unique error"); + // rw_request with this digest already in hash - get it. + + rw_request* rw0; + int get_rv = cf_rchash_get(g_rw_request_hash, hkey, sizeof(*hkey), + (void**)&rw0); + + if (get_rv == CF_RCHASH_ERR_NOT_FOUND) { + // Try insertion again immediately. + continue; + } + // else - got it - handle "hot key" scenario. + cf_assert(get_rv == CF_RCHASH_OK, AS_RW, "cf_rchash_get error"); + + pthread_mutex_lock(&rw0->lock); + + transaction_status status = handle_hot_key(rw0, tr); + + pthread_mutex_unlock(&rw0->lock); + rw_request_release(rw0); + + return status; // rw_request was not inserted in the hash + } + + return TRANS_IN_PROGRESS; // rw_request was inserted in the hash +} + + +void +rw_request_hash_delete(rw_request_hkey* hkey, rw_request* rw) +{ + cf_rchash_delete_object(g_rw_request_hash, hkey, sizeof(*hkey), rw); +} + + +rw_request* +rw_request_hash_get(rw_request_hkey* hkey) +{ + rw_request* rw = NULL; + + cf_rchash_get(g_rw_request_hash, hkey, sizeof(*hkey), (void**)&rw); + + return rw; +} + + +// For debugging only. +void +rw_request_hash_dump() +{ + cf_info(AS_RW, "rw_request_hash dump not yet implemented"); + // TODO - implement something, or deprecate. +} + + +//========================================================== +// Local helpers - hash insertion. +// + +uint32_t +rw_request_hash_fn(const void* key, uint32_t key_size) +{ + rw_request_hkey* hkey = (rw_request_hkey*)key; + + return *(uint32_t*)&hkey->keyd.digest[DIGEST_SCRAMBLE_BYTE1]; +} + + +transaction_status +handle_hot_key(rw_request* rw0, as_transaction* tr) +{ + if (rw0->is_set_up && + rw0->origin == FROM_PROXY && tr->origin == FROM_PROXY && + rw0->from.proxy_node == tr->from.proxy_node && + rw0->from_data.proxy_tid == tr->from_data.proxy_tid) { + // If the new transaction is a retransmitted proxy request, don't + // queue it or reply to origin, just drop it and feign success. (Older + // servers will retransmit proxy requests - must handle them.) + + return TRANS_DONE_SUCCESS; + } + else if (tr->origin == FROM_RE_REPL) { + // Always put this transaction at the head of the original rw_request's + // queue - it will be retried (first) when the original is complete. + rw_request_wait_q_push_head(rw0, tr); + + return TRANS_WAITING; + } + else if (g_config.transaction_pending_limit != 0 && + rw0->wait_queue_depth > g_config.transaction_pending_limit) { + // If we're over the hot key pending limit, fail this transaction. 
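+		// (Note - a transaction-pending-limit of 0 disables this check, so a
+		// hot key's wait queue may then grow without bound; each queued
+		// element holds a copied transaction head, so depth costs memory as
+		// well as latency.)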
+ cf_detail_digest(AS_RW, &tr->keyd, "{%s} key busy ", tr->rsv.ns->name); + + cf_atomic64_incr(&tr->rsv.ns->n_fail_key_busy); + tr->result_code = AS_PROTO_RESULT_FAIL_KEY_BUSY; + + return TRANS_DONE_ERROR; + } + else { + // Queue this transaction on the original rw_request - it will be + // retried when the original is complete. + rw_request_wait_q_push(rw0, tr); + + return TRANS_WAITING; + } +} + + +//========================================================== +// Local helpers - retransmit. +// + +void* +run_retransmit(void* arg) +{ + while (true) { + usleep(130 * 1000); + + now_times now; + + now.now_ns = cf_getns(); + now.now_ms = now.now_ns / 1000000; + + cf_rchash_reduce(g_rw_request_hash, retransmit_reduce_fn, &now); + } + + return NULL; +} + + +int +retransmit_reduce_fn(const void* key, uint32_t keylen, void* data, void* udata) +{ + rw_request* rw = data; + now_times* now = (now_times*)udata; + + if (! rw->is_set_up) { + return 0; + } + + if (now->now_ns > rw->end_time) { + pthread_mutex_lock(&rw->lock); + + rw->timeout_cb(rw); + + pthread_mutex_unlock(&rw->lock); + + return CF_RCHASH_REDUCE_DELETE; + } + + if (rw->xmit_ms < now->now_ms) { + pthread_mutex_lock(&rw->lock); + + if (rw->from.any) { + rw->xmit_ms = now->now_ms + rw->retry_interval_ms; + rw->retry_interval_ms *= 2; + + send_rw_messages(rw); + update_retransmit_stats(rw); + } + // else - lost race against dup-res or repl-write callback. + + pthread_mutex_unlock(&rw->lock); + } + + return 0; +} + + +void +update_retransmit_stats(const rw_request* rw) +{ + as_namespace* ns = rw->rsv.ns; + as_msg* m = &rw->msgp->msg; + bool is_dup_res = rw->repl_write_cb == NULL; + + // Note - only one retransmit thread, so no need for atomic increments. + + switch (rw->origin) { + case FROM_CLIENT: { + bool is_write = (m->info2 & AS_MSG_INFO2_WRITE) != 0; + bool is_delete = (m->info2 & AS_MSG_INFO2_DELETE) != 0; + bool is_udf = (rw->msg_fields & AS_MSG_FIELD_BIT_UDF_FILENAME) != 0; + + if (is_dup_res) { + if (is_write) { + if (is_delete) { + ns->n_retransmit_client_delete_dup_res++; + } + else if (is_udf) { + ns->n_retransmit_client_udf_dup_res++; + } + else { + ns->n_retransmit_client_write_dup_res++; + } + } + else { + ns->n_retransmit_client_read_dup_res++; + } + } + else { + cf_assert(is_write, AS_RW, "read doing replica write"); + + if (is_delete) { + ns->n_retransmit_client_delete_repl_write++; + } + else if (is_udf) { + ns->n_retransmit_client_udf_repl_write++; + } + else { + ns->n_retransmit_client_write_repl_write++; + } + } + } + break; + case FROM_PROXY: + // For now we don't report proxyee stats. + break; + case FROM_BATCH: + // For now batch sub transactions are read-only. + ns->n_retransmit_batch_sub_dup_res++; + break; + case FROM_IUDF: + if (is_dup_res) { + ns->n_retransmit_udf_sub_dup_res++; + } + else { + ns->n_retransmit_udf_sub_repl_write++; + } + break; + case FROM_RE_REPL: + // For now we don't report re-replication retransmit stats. + break; + default: + cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin); + break; + } +} + + +//========================================================== +// Local helpers - handle RW fabric messages. 
+//
+
+int
+rw_msg_cb(cf_node id, msg* m, void* udata)
+{
+	uint32_t op;
+
+	if (msg_get_uint32(m, RW_FIELD_OP, &op) != 0) {
+		cf_warning(AS_RW, "got rw msg without op field");
+		as_fabric_msg_put(m);
+		return 0;
+	}
+
+	switch (op) {
+	//--------------------------------------------
+	// Duplicate resolution:
+	//
+	case RW_OP_DUP:
+		dup_res_handle_request(id, m);
+		break;
+	case RW_OP_DUP_ACK:
+		dup_res_handle_ack(id, m);
+		break;
+
+	//--------------------------------------------
+	// Replica writes:
+	//
+	case RW_OP_WRITE:
+		repl_write_handle_op(id, m);
+		break;
+	case RW_OP_WRITE_ACK:
+		repl_write_handle_ack(id, m);
+		break;
+	case RW_OP_REPL_CONFIRM:
+		repl_write_handle_confirmation(m);
+		break;
+
+	//--------------------------------------------
+	// Replica pings:
+	//
+	case RW_OP_REPL_PING:
+		repl_ping_handle_op(id, m);
+		break;
+	case RW_OP_REPL_PING_ACK:
+		repl_ping_handle_ack(id, m);
+		break;
+
+	default:
+		cf_warning(AS_RW, "got rw msg with unrecognized op %u", op);
+		as_fabric_msg_put(m);
+		break;
+	}
+
+	return 0;
+}
diff --git a/as/src/transaction/rw_utils.c b/as/src/transaction/rw_utils.c
new file mode 100644
index 00000000..72bb0c21
--- /dev/null
+++ b/as/src/transaction/rw_utils.c
@@ -0,0 +1,470 @@
+/*
+ * rw_utils.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/rw_utils.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "citrusleaf/cf_atomic.h" // xdr_allows_write
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "fault.h"
+#include "msg.h"
+
+#include "base/cfg.h" // xdr_allows_write
+#include "base/datamodel.h"
+#include "base/proto.h" // xdr_allows_write
+#include "base/secondary_index.h"
+#include "base/transaction.h"
+#include "base/xdr_serverside.h"
+#include "fabric/fabric.h"
+#include "storage/storage.h"
+#include "transaction/rw_request.h"
+
+
+//==========================================================
+// Public API.
+//
+
+// TODO - really? we can't hide this behind an XDR stub?
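+// (Summary of the logic below: XDR-originated writes require the namespace's
+// allow-xdr-writes setting, while all other writes require
+// allow-nonxdr-writes - except nsup-originated deletes, which are always
+// allowed. A refusal bumps n_fail_xdr_forbidden.)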
+bool +xdr_allows_write(as_transaction* tr) +{ + if (as_transaction_is_xdr(tr)) { + if (tr->rsv.ns->ns_allow_xdr_writes) { + return true; + } + } + else { + if (tr->rsv.ns->ns_allow_nonxdr_writes || tr->origin == FROM_NSUP) { + return true; + } + } + + cf_atomic_int_incr(&tr->rsv.ns->n_fail_xdr_forbidden); + + return false; +} + + +void +send_rw_messages(rw_request* rw) +{ + for (uint32_t i = 0; i < rw->n_dest_nodes; i++) { + if (rw->dest_complete[i]) { + continue; + } + + msg_incr_ref(rw->dest_msg); + + if (as_fabric_send(rw->dest_nodes[i], rw->dest_msg, + AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(rw->dest_msg); + rw->xmit_ms = 0; // force a retransmit on next cycle + } + } +} + + +void +send_rw_messages_forget(rw_request* rw) +{ + for (uint32_t i = 0; i < rw->n_dest_nodes; i++) { + msg_incr_ref(rw->dest_msg); + + if (as_fabric_send(rw->dest_nodes[i], rw->dest_msg, + AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(rw->dest_msg); + } + } +} + + +int +set_set_from_msg(as_record* r, as_namespace* ns, as_msg* m) +{ + as_msg_field* f = as_msg_field_get(m, AS_MSG_FIELD_TYPE_SET); + size_t name_len = (size_t)as_msg_field_get_value_sz(f); + + if (name_len == 0) { + return 0; + } + + // Given the name, find/assign the set-ID and write it in the as_index. + return as_index_set_set_w_len(r, ns, (const char*)f->data, name_len, true); +} + + +// Caller must have checked that key is present in message. +bool +check_msg_key(as_msg* m, as_storage_rd* rd) +{ + as_msg_field* f = as_msg_field_get(m, AS_MSG_FIELD_TYPE_KEY); + uint32_t key_size = as_msg_field_get_value_sz(f); + uint8_t* key = f->data; + + if (key_size != rd->key_size || memcmp(key, rd->key, key_size) != 0) { + cf_warning(AS_RW, "key mismatch - end of universe?"); + return false; + } + + return true; +} + + +bool +get_msg_key(as_transaction* tr, as_storage_rd* rd) +{ + if (! as_transaction_has_key(tr)) { + return true; + } + + as_msg_field* f = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_KEY); + + if (rd->ns->single_bin && rd->ns->storage_data_in_memory) { + cf_warning(AS_RW, "{%s} can't store key if data-in-memory & single-bin", + tr->rsv.ns->name); + return false; + } + + rd->key_size = as_msg_field_get_value_sz(f); + rd->key = f->data; + + return true; +} + + +int +handle_msg_key(as_transaction* tr, as_storage_rd* rd) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + + if (rd->r->key_stored == 1) { + // Key stored for this record - be sure it gets rewritten. + + // This will force a device read for non-data-in-memory, even if + // must_fetch_data is false! Since there's no advantage to using the + // loaded block after this if must_fetch_data is false, leave the + // subsequent code as-is. + if (! as_storage_record_get_key(rd)) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} can't get stored key ", + ns->name); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // Check the client-sent key, if any, against the stored key. + if (as_transaction_has_key(tr) && ! check_msg_key(m, rd)) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} key mismatch ", ns->name); + return AS_PROTO_RESULT_FAIL_KEY_MISMATCH; + } + } + // If we got a key without a digest, it's an old client, not a cue to store + // the key. (Remove this check when we're sure all old C clients are gone.) + else if (as_transaction_has_digest(tr)) { + // Key not stored for this record - store one if sent from client. For + // data-in-memory, don't allocate the key until we reach the point of no + // return. 
Also don't set AS_INDEX_FLAG_KEY_STORED flag until then. + if (! get_msg_key(tr, rd)) { + return AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE; + } + } + + return 0; +} + + +void +update_metadata_in_index(as_transaction* tr, bool increment_generation, + as_record* r) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + + uint64_t now = cf_clepoch_milliseconds(); + + switch (m->record_ttl) { + case TTL_NAMESPACE_DEFAULT: + if (ns->default_ttl != 0) { + // Set record void-time using default TTL value. + r->void_time = (now / 1000) + ns->default_ttl; + } + else { + // Default TTL is "never expire". + r->void_time = 0; + } + break; + case TTL_NEVER_EXPIRE: + // Set record to "never expire". + r->void_time = 0; + break; + case TTL_DONT_UPDATE: + // Do not change record's void time. + break; + default: + // Apply non-special m->record_ttl directly. Have already checked + // m->record_ttl <= 10 years, so no overflow etc. + r->void_time = (now / 1000) + m->record_ttl; + break; + } + + as_record_set_lut(r, tr->rsv.regime, now, ns); + + if (increment_generation) { + as_record_increment_generation(r, ns); + } +} + + +void +pickle_all(as_storage_rd* rd, rw_request* rw) +{ + if (rw->n_dest_nodes == 0) { + return; + } + + rw->pickled_buf = as_record_pickle(rd, &rw->pickled_sz); + + // TODO - we could avoid this copy (and maybe even not do this here at all) + // if all callers malloc'd rd->rec_props.p_data upstream for hand-off... + if (rd->rec_props.p_data) { + rw->pickled_rec_props.size = rd->rec_props.size; + rw->pickled_rec_props.p_data = cf_malloc(rd->rec_props.size); + memcpy(rw->pickled_rec_props.p_data, rd->rec_props.p_data, + rd->rec_props.size); + } +} + + +bool +write_sindex_update(as_namespace* ns, const char* set_name, cf_digest* keyd, + as_bin* old_bins, uint32_t n_old_bins, as_bin* new_bins, + uint32_t n_new_bins) +{ + int n_populated = 0; + bool not_just_created[n_new_bins]; + + memset(not_just_created, 0, sizeof(not_just_created)); + + // Maximum number of sindexes which can be changed in one transaction is + // 2 * ns->sindex_cnt. + + SINDEX_GRLOCK(); + SINDEX_BINS_SETUP(sbins, 2 * ns->sindex_cnt); + as_sindex* si_arr[2 * ns->sindex_cnt]; + int si_arr_index = 0; + + // Reserve matching SIs. + + for (int i = 0; i < n_old_bins; i++) { + si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, + old_bins[i].id, &si_arr[si_arr_index]); + } + + for (int i = 0; i < n_new_bins; i++) { + si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, + new_bins[i].id, &si_arr[si_arr_index]); + } + + // For every old bin, find the corresponding new bin (if any) and adjust the + // secondary index if the bin was modified. If no corresponding new bin is + // found, it means the old bin was deleted - also adjust the secondary index + // accordingly. + + for (int32_t i_old = 0; i_old < (int32_t)n_old_bins; i_old++) { + as_bin* b_old = &old_bins[i_old]; + bool found = false; + + // Loop over new bins. Start at old bin index (if possible) and go down, + // wrapping around to do the higher indexes last. This will find a match + // (if any) very quickly - instantly, unless there were bins deleted. + + bool any_new = n_new_bins != 0; + int32_t n_new_minus_1 = (int32_t)n_new_bins - 1; + int32_t i_new = n_new_minus_1 < i_old ? 
n_new_minus_1 : i_old; + + while (any_new) { + as_bin* b_new = &new_bins[i_new]; + + if (b_old->id == b_new->id) { + if (as_bin_get_particle_type(b_old) != + as_bin_get_particle_type(b_new) || + b_old->particle != b_new->particle) { + n_populated += as_sindex_sbins_populate( + &sbins[n_populated], ns, set_name, b_old, b_new); + } + + found = true; + not_just_created[i_new] = true; + break; + } + + if (--i_new < 0 && (i_new = n_new_minus_1) <= i_old) { + break; + } + + if (i_new == i_old) { + break; + } + } + + if (! found) { + n_populated += as_sindex_sbins_from_bin(ns, set_name, b_old, + &sbins[n_populated], AS_SINDEX_OP_DELETE); + } + } + + // Now find the new bins that are just-created bins. We've marked the others + // in the loop above, so any left are just-created. + + for (uint32_t i_new = 0; i_new < n_new_bins; i_new++) { + if (not_just_created[i_new]) { + continue; + } + + n_populated += as_sindex_sbins_from_bin(ns, set_name, &new_bins[i_new], + &sbins[n_populated], AS_SINDEX_OP_INSERT); + } + + SINDEX_GRUNLOCK(); + + if (n_populated != 0) { + as_sindex_update_by_sbin(ns, set_name, sbins, n_populated, keyd); + as_sindex_sbin_freeall(sbins, n_populated); + } + + as_sindex_release_arr(si_arr, si_arr_index); + + return n_populated != 0; +} + + +// If called for data-not-in-memory, this may read record from drive! +// TODO - rename as as_record_... and move to record.c? +void +record_delete_adjust_sindex(as_record* r, as_namespace* ns) +{ + if (! record_has_sindex(r, ns)) { + return; + } + + as_storage_rd rd; + + as_storage_record_open(ns, r, &rd); + as_storage_rd_load_n_bins(&rd); + + as_bin stack_bins[ns->storage_data_in_memory ? 0 : rd.n_bins]; + + as_storage_rd_load_bins(&rd, stack_bins); + + remove_from_sindex(ns, as_index_get_set_name(r, ns), &r->keyd, rd.bins, + rd.n_bins); + + as_storage_record_close(&rd); +} + + +// Remove record from secondary index. Called only for data-in-memory. If +// data-not-in-memory, existing record is not read, and secondary index entry is +// cleaned up by background sindex defrag thread. +// TODO - rename as as_record_... and move to record.c? +void +delete_adjust_sindex(as_storage_rd* rd) +{ + as_namespace* ns = rd->ns; + + if (! record_has_sindex(rd->r, ns)) { + return; + } + + as_storage_rd_load_n_bins(rd); + as_storage_rd_load_bins(rd, NULL); + + remove_from_sindex(ns, as_index_get_set_name(rd->r, ns), &rd->r->keyd, + rd->bins, rd->n_bins); +} + + +// TODO - rename as as_record_..., move to record.c, take r instead of set_name, +// and lose keyd parameter? +void +remove_from_sindex(as_namespace* ns, const char* set_name, cf_digest* keyd, + as_bin* bins, uint32_t n_bins) +{ + SINDEX_GRLOCK(); + + SINDEX_BINS_SETUP(sbins, ns->sindex_cnt); + + as_sindex* si_arr[ns->sindex_cnt]; + int si_arr_index = 0; + int sbins_populated = 0; + + // Reserve matching sindexes. + for (int i = 0; i < (int)n_bins; i++) { + si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, + bins[i].id, &si_arr[si_arr_index]); + } + + for (int i = 0; i < (int)n_bins; i++) { + sbins_populated += as_sindex_sbins_from_bin(ns, set_name, &bins[i], + &sbins[sbins_populated], AS_SINDEX_OP_DELETE); + } + + SINDEX_GRUNLOCK(); + + if (sbins_populated) { + as_sindex_update_by_sbin(ns, set_name, sbins, sbins_populated, keyd); + as_sindex_sbin_freeall(sbins, sbins_populated); + } + + as_sindex_release_arr(si_arr, si_arr_index); +} + + +bool +xdr_must_ship_delete(as_namespace* ns, bool is_nsup_delete, bool is_xdr_op) +{ + if (! 
is_xdr_delete_shipping_enabled()) {
+		return false;
+	}
+
+	// If this delete is a result of expiration/eviction, don't ship it unless
+	// configured to do so.
+	if (is_nsup_delete && ! is_xdr_nsup_deletes_enabled()) {
+		return false;
+	}
+
+	return ! is_xdr_op ||
+			// If this delete is a result of XDR shipping, don't ship it unless
+			// configured to do so.
+			is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes;
+}
diff --git a/as/src/transaction/rw_utils_ce.c b/as/src/transaction/rw_utils_ce.c
new file mode 100644
index 00000000..931c91b5
--- /dev/null
+++ b/as/src/transaction/rw_utils_ce.c
@@ -0,0 +1,259 @@
+/*
+ * rw_utils_ce.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/rw_utils.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "fault.h"
+#include "msg.h"
+
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/transaction.h"
+#include "base/udf_record.h"
+#include "storage/storage.h"
+#include "transaction/rw_request.h"
+#include "transaction/udf.h"
+
+
+//==========================================================
+// Public API.
+//
+
+bool
+validate_delete_durability(as_transaction* tr)
+{
+	return true;
+}
+
+
+int
+repl_state_check(as_record* r, as_transaction* tr)
+{
+	return 0;
+}
+
+
+void
+will_replicate(as_record* r, as_namespace* ns)
+{
+}
+
+
+bool
+insufficient_replica_destinations(const as_namespace* ns, uint32_t n_dests)
+{
+	return false;
+}
+
+
+void
+finished_replicated(as_transaction* tr)
+{
+}
+
+
+void
+finished_not_replicated(rw_request* rw)
+{
+}
+
+
+bool
+generation_check(const as_record* r, const as_msg* m, const as_namespace* ns)
+{
+	if ((m->info2 & AS_MSG_INFO2_GENERATION) != 0) {
+		return m->generation == r->generation;
+	}
+
+	if ((m->info2 & AS_MSG_INFO2_GENERATION_GT) != 0) {
+		return m->generation > r->generation;
+	}
+
+	return true; // no generation requirement
+}
+
+
+int
+set_delete_durablility(const as_transaction* tr, as_storage_rd* rd)
+{
+	if (as_transaction_is_durable_delete(tr)) {
+		cf_warning(AS_RW, "durable delete is an enterprise feature");
+		return AS_PROTO_RESULT_FAIL_ENTERPRISE_ONLY;
+	}
+
+	return 0;
+}
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+//
+
+bool
+create_only_check(const as_record* r, const as_msg* m)
+{
+	// Ok (return true) if no requirement.
+	return (m->info2 & AS_MSG_INFO2_CREATE_ONLY) == 0;
+}
+
+
+void
+write_delete_record(as_record* r, as_index_tree* tree)
+{
+	as_index_delete(tree, &r->keyd);
+}
+
+
+udf_optype
+udf_finish_delete(udf_record* urecord)
+{
+	return (urecord->flag & UDF_RECORD_FLAG_PREEXISTS) != 0 ?
+ UDF_OPTYPE_DELETE : UDF_OPTYPE_NONE; +} + + +uint32_t +dup_res_pack_repl_state_info(const as_record* r, as_namespace* ns) +{ + return 0; +} + + +uint32_t +dup_res_pack_info(const as_record* r, as_namespace* ns) +{ + return 0; +} + + +bool +dup_res_should_retry_transaction(rw_request* rw, uint32_t result_code) +{ + // TODO - JUMP - can get this from 3.14.x nodes or older - retry if so. + return result_code == AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH; +} + + +void +dup_res_handle_tie(rw_request* rw, const msg* m, uint32_t result_code) +{ +} + + +void +apply_if_tie(rw_request* rw) +{ +} + + +void +dup_res_translate_result_code(rw_request* rw) +{ + rw->result_code = AS_PROTO_RESULT_OK; +} + + +bool +dup_res_ignore_pickle(const uint8_t* buf, uint32_t info) +{ + return as_record_pickle_is_binless(buf); +} + + +void +dup_res_init_repl_state(as_remote_record* rr, uint32_t info) +{ +} + + +void +repl_write_flag_pickle(const as_transaction* tr, const uint8_t* buf, + uint32_t* info) +{ + // Do nothing. +} + + +bool +repl_write_pickle_is_drop(const uint8_t* buf, uint32_t info) +{ + return as_record_pickle_is_binless(buf); +} + + +void +repl_write_init_repl_state(as_remote_record* rr, bool from_replica) +{ +} + + +conflict_resolution_pol +repl_write_conflict_resolution_policy(const as_namespace* ns) +{ + return AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME; +} + + +bool +repl_write_should_retransmit_replicas(rw_request* rw, uint32_t result_code) +{ + switch (result_code) { + case AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH: + rw->xmit_ms = 0; // force retransmit on next cycle + return true; + default: + return false; + } +} + + +void +repl_write_send_confirmation(rw_request* rw) +{ +} + + +void +repl_write_handle_confirmation(msg* m) +{ +} + + +int +record_replace_check(as_record* r, as_namespace* ns) +{ + return 0; +} + + +void +record_replaced(as_record* r, as_remote_record* rr) +{ +} diff --git a/as/src/transaction/udf.c b/as/src/transaction/udf.c new file mode 100644 index 00000000..62cf9158 --- /dev/null +++ b/as/src/transaction/udf.c @@ -0,0 +1,1094 @@ +/* + * udf.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+//
+
+#include "transaction/udf.h"
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "aerospike/as_aerospike.h"
+#include "aerospike/as_buffer.h"
+#include "aerospike/as_log.h"
+#include "aerospike/as_list.h"
+#include "aerospike/as_module.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_serializer.h"
+#include "aerospike/as_types.h"
+#include "aerospike/as_udf_context.h"
+#include "aerospike/mod_lua.h"
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/proto.h"
+#include "base/secondary_index.h"
+#include "base/transaction.h"
+#include "base/transaction_policy.h"
+#include "base/udf_aerospike.h"
+#include "base/udf_arglist.h"
+#include "base/udf_cask.h"
+#include "base/udf_record.h"
+#include "base/udf_timer.h"
+#include "fabric/partition.h"
+#include "storage/storage.h"
+#include "transaction/duplicate_resolve.h"
+#include "transaction/proxy.h"
+#include "transaction/replica_write.h"
+#include "transaction/rw_request.h"
+#include "transaction/rw_request_hash.h"
+#include "transaction/rw_utils.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+static const cf_fault_severity as_log_level_map[5] = {
+	[AS_LOG_LEVEL_ERROR] = CF_WARNING,
+	[AS_LOG_LEVEL_WARN] = CF_WARNING,
+	[AS_LOG_LEVEL_INFO] = CF_INFO,
+	[AS_LOG_LEVEL_DEBUG] = CF_DEBUG,
+	[AS_LOG_LEVEL_TRACE] = CF_DETAIL
+};
+
+typedef struct udf_call_s {
+	udf_def* def;
+	as_transaction* tr;
+} udf_call;
+
+
+//==========================================================
+// Globals.
+//
+
+as_aerospike g_as_aerospike;
+
+
+//==========================================================
+// Forward declarations.
+//
+
+bool log_callback(as_log_level level, const char* func, const char* file,
+		uint32_t line, const char* fmt, ...);
+
+void start_udf_dup_res(rw_request* rw, as_transaction* tr);
+void start_udf_repl_write(rw_request* rw, as_transaction* tr);
+void start_udf_repl_write_forget(rw_request* rw, as_transaction* tr);
+bool udf_dup_res_cb(rw_request* rw);
+void udf_repl_write_after_dup_res(rw_request* rw, as_transaction* tr);
+void udf_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr);
+void udf_repl_write_cb(rw_request* rw);
+
+void send_udf_response(as_transaction* tr, cf_dyn_buf* db);
+void udf_timeout_cb(rw_request* rw);
+
+transaction_status udf_master(rw_request* rw, as_transaction* tr);
+udf_optype udf_master_apply(udf_call* call, rw_request* rw);
+int udf_apply_record(udf_call* call, as_rec* rec, as_result* result);
+uint64_t udf_end_time(time_tracker* tt);
+void udf_finish(udf_record* urecord, rw_request* rw, udf_optype* record_op);
+udf_optype udf_finish_op(udf_record* urecord);
+void udf_post_processing(udf_record* urecord, rw_request* rw,
+		udf_optype urecord_op);
+
+void update_lua_complete_stats(uint8_t origin, as_namespace* ns, udf_optype op,
+		int ret, bool is_success);
+
+void process_failure_str(udf_call* call, const char* err_str, size_t len,
+		cf_dyn_buf* db);
+void process_result(const as_result* result, udf_call* call, cf_dyn_buf* db);
+void process_response(udf_call* call, bool success, const as_val* val,
+		cf_dyn_buf* db);
+
+
+//==========================================================
+// Inlines & macros.
+//
+
+static inline void
+client_udf_update_stats(as_namespace* ns, uint8_t result_code)
+{
+	switch (result_code) {
+	case AS_PROTO_RESULT_OK:
+		cf_atomic64_incr(&ns->n_client_udf_complete);
+		break;
+	case AS_PROTO_RESULT_FAIL_TIMEOUT:
+		cf_atomic64_incr(&ns->n_client_udf_timeout);
+		break;
+	default:
+		cf_atomic64_incr(&ns->n_client_udf_error);
+		break;
+	}
+}
+
+static inline void
+udf_sub_udf_update_stats(as_namespace* ns, uint8_t result_code)
+{
+	switch (result_code) {
+	case AS_PROTO_RESULT_OK:
+		cf_atomic64_incr(&ns->n_udf_sub_udf_complete);
+		break;
+	case AS_PROTO_RESULT_FAIL_TIMEOUT:
+		cf_atomic64_incr(&ns->n_udf_sub_udf_timeout);
+		break;
+	default:
+		cf_atomic64_incr(&ns->n_udf_sub_udf_error);
+		break;
+	}
+}
+
+static inline bool
+udf_zero_bins_left(udf_record* urecord)
+{
+	return (urecord->flag & UDF_RECORD_FLAG_OPEN) != 0 &&
+			! as_bin_inuse_has(urecord->rd);
+}
+
+static inline void
+process_failure(udf_call* call, const as_val* val, cf_dyn_buf* db)
+{
+	process_response(call, false, val, db);
+}
+
+static inline void
+process_success(udf_call* call, const as_val* val, cf_dyn_buf* db)
+{
+	process_response(call, true, val, db);
+}
+
+
+//==========================================================
+// Public API.
+//
+
+void
+as_udf_init()
+{
+	as_module_configure(&mod_lua, &g_config.mod_lua);
+	as_log_set_callback(log_callback);
+	udf_cask_init();
+	as_aerospike_init(&g_as_aerospike, NULL, &udf_aerospike_hooks);
+}
+
+
+// Public API for udf_def class, not big enough for its own file.
+udf_def*
+udf_def_init_from_msg(udf_def* def, const as_transaction* tr)
+{
+	def->arglist = NULL;
+
+	as_msg* m = &tr->msgp->msg;
+	as_msg_field* filename =
+			as_msg_field_get(m, AS_MSG_FIELD_TYPE_UDF_FILENAME);
+
+	if (! filename) {
+		return NULL;
+	}
+
+	as_msg_field* function =
+			as_msg_field_get(m, AS_MSG_FIELD_TYPE_UDF_FUNCTION);
+
+	if (! function) {
+		return NULL;
+	}
+
+	as_msg_field* arglist = as_msg_field_get(m, AS_MSG_FIELD_TYPE_UDF_ARGLIST);
+
+	if (! arglist) {
+		return NULL;
+	}
+
+	as_msg_field_get_strncpy(filename, def->filename, sizeof(def->filename));
+	as_msg_field_get_strncpy(function, def->function, sizeof(def->function));
+
+	as_unpacker unpacker;
+
+	unpacker.buffer = (const unsigned char*)arglist->data;
+	unpacker.length = as_msg_field_get_value_sz(arglist);
+	unpacker.offset = 0;
+
+	if (unpacker.length > 0) {
+		as_val* val = NULL;
+		int ret = as_unpack_val(&unpacker, &val);
+
+		if (ret == 0 && as_val_type(val) == AS_LIST) {
+			def->arglist = (as_list*)val;
+		}
+	}
+
+	as_msg_field* op = as_transaction_has_udf_op(tr) ?
+			as_msg_field_get(m, AS_MSG_FIELD_TYPE_UDF_OP) : NULL;
+
+	def->type = op ? *op->data : AS_UDF_OP_KVS;
+
+	return def;
+}
+
+
+transaction_status
+as_udf_start(as_transaction* tr)
+{
+	BENCHMARK_START(tr, udf, FROM_CLIENT);
+	BENCHMARK_START(tr, udf_sub, FROM_IUDF);
+
+	// Apply XDR filter.
+	if (! xdr_allows_write(tr)) {
+		tr->result_code = AS_PROTO_RESULT_FAIL_ALWAYS_FORBIDDEN;
+		send_udf_response(tr, NULL);
+		return TRANS_DONE_ERROR;
+	}
+
+	// Don't know if UDF is read or delete - check that we aren't backed up.
+	if (as_storage_overloaded(tr->rsv.ns)) {
+		tr->result_code = AS_PROTO_RESULT_FAIL_DEVICE_OVERLOAD;
+		send_udf_response(tr, NULL);
+		return TRANS_DONE_ERROR;
+	}
+
+	// Create rw_request and add to hash.
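+	// (The hash key is { namespace-id, digest }, so there's at most one live
+	// rw_request per record - a second transaction on the same key is either
+	// wait-queued behind it or failed as a "hot key" by handle_hot_key().)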
+ rw_request_hkey hkey = { tr->rsv.ns->id, tr->keyd }; + rw_request* rw = rw_request_create(&tr->keyd); + transaction_status status = rw_request_hash_insert(&hkey, rw, tr); + + // If rw_request wasn't inserted in hash, transaction is finished. + if (status != TRANS_IN_PROGRESS) { + rw_request_release(rw); + + if (status != TRANS_WAITING) { + send_udf_response(tr, NULL); + } + + return status; + } + // else - rw_request is now in hash, continue... + + if (tr->rsv.ns->write_dup_res_disabled) { + // Note - preventing duplicate resolution this way allows + // rw_request_destroy() to handle dup_msg[] cleanup correctly. + tr->rsv.n_dupl = 0; + } + + // If there are duplicates to resolve, start doing so. + if (tr->rsv.n_dupl != 0) { + start_udf_dup_res(rw, tr); + + // Started duplicate resolution. + return TRANS_IN_PROGRESS; + } + // else - no duplicate resolution phase, apply operation to master. + + // Set up the nodes to which we'll write replicas. + rw->n_dest_nodes = as_partition_get_other_replicas(tr->rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr->rsv.ns, rw->n_dest_nodes)) { + rw_request_hash_delete(&hkey, rw); + tr->result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_udf_response(tr, NULL); + return TRANS_DONE_ERROR; + } + + status = udf_master(rw, tr); + + BENCHMARK_NEXT_DATA_POINT(tr, udf, master); + BENCHMARK_NEXT_DATA_POINT(tr, udf_sub, master); + + // If error or UDF was a read, transaction is finished. + if (status != TRANS_IN_PROGRESS) { + if (status != TRANS_WAITING) { + send_udf_response(tr, &rw->response_db); + } + + rw_request_hash_delete(&hkey, rw); + return status; + } + + // If we don't need replica writes, transaction is finished. + if (rw->n_dest_nodes == 0) { + finished_replicated(tr); + send_udf_response(tr, &rw->response_db); + rw_request_hash_delete(&hkey, rw); + return TRANS_DONE_SUCCESS; + } + + // If we don't need to wait for replica write acks, fire and forget. + if (respond_on_master_complete(tr)) { + start_udf_repl_write_forget(rw, tr); + send_udf_response(tr, &rw->response_db); + rw_request_hash_delete(&hkey, rw); + return TRANS_DONE_SUCCESS; + } + + start_udf_repl_write(rw, tr); + + // Started replica write. + return TRANS_IN_PROGRESS; +} + + +//========================================================== +// Local helpers - initialization. +// + +bool +log_callback(as_log_level level, const char* func, const char* file, + uint32_t line, const char* fmt, ...) +{ + cf_fault_severity severity = as_log_level_map[level]; + + if (severity > cf_fault_filter[AS_UDF]) { + return true; + } + + va_list ap; + + va_start(ap, fmt); + char message[1024] = { '\0' }; + vsnprintf(message, 1024, fmt, ap); + va_end(ap); + + cf_fault_event(AS_UDF, severity, file, line, "%s", message); + + return true; +} + + +//========================================================== +// Local helpers - transaction flow. +// + +void +start_udf_dup_res(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw, construct and send dup-res message. + + dup_res_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + dup_res_setup_rw(rw, tr, udf_dup_res_cb, udf_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +void +start_udf_repl_write(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw, construct and send repl-write message. 
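+	// (Note - setup and send happen under rw->lock, presumably so a fast ack
+	// arriving on a fabric thread can't run the callback against a partially
+	// initialized rw_request.)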
+ + repl_write_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + repl_write_setup_rw(rw, tr, udf_repl_write_cb, udf_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +void +start_udf_repl_write_forget(rw_request* rw, as_transaction* tr) +{ + // Construct and send repl-write message. No need to finish rw setup. + + repl_write_make_message(rw, tr); + send_rw_messages_forget(rw); +} + + +bool +udf_dup_res_cb(rw_request* rw) +{ + BENCHMARK_NEXT_DATA_POINT(rw, udf, dup_res); + BENCHMARK_NEXT_DATA_POINT(rw, udf_sub, dup_res); + + as_transaction tr; + as_transaction_init_from_rw(&tr, rw); + + if (tr.result_code != AS_PROTO_RESULT_OK) { + send_udf_response(&tr, NULL); + return true; + } + + // Set up the nodes to which we'll write replicas. + rw->n_dest_nodes = as_partition_get_other_replicas(tr.rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr.rsv.ns, rw->n_dest_nodes)) { + tr.result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_udf_response(&tr, NULL); + return true; + } + + transaction_status status = udf_master(rw, &tr); + + BENCHMARK_NEXT_DATA_POINT((&tr), udf, master); + BENCHMARK_NEXT_DATA_POINT((&tr), udf_sub, master); + + if (status == TRANS_WAITING) { + // Note - new tr now owns msgp, make sure rw destructor doesn't free it. + // Also, rw will release rsv - new tr will get a new one. + rw->msgp = NULL; + return true; + } + + if (status != TRANS_IN_PROGRESS) { + send_udf_response(&tr, &rw->response_db); + return true; + } + + // If we don't need replica writes, transaction is finished. + if (rw->n_dest_nodes == 0) { + finished_replicated(&tr); + send_udf_response(&tr, &rw->response_db); + return true; + } + + // If we don't need to wait for replica write acks, fire and forget. + if (respond_on_master_complete(&tr)) { + udf_repl_write_forget_after_dup_res(rw, &tr); + send_udf_response(&tr, &rw->response_db); + return true; + } + + udf_repl_write_after_dup_res(rw, &tr); + + // Started replica write - don't delete rw_request from hash. + return false; +} + + +void +udf_repl_write_after_dup_res(rw_request* rw, as_transaction* tr) +{ + // Recycle rw_request that was just used for duplicate resolution to now do + // replica writes. Note - we are under the rw_request lock here! + + repl_write_make_message(rw, tr); + repl_write_reset_rw(rw, tr, udf_repl_write_cb); + send_rw_messages(rw); +} + + +void +udf_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr) +{ + // Send replica writes. Not waiting for acks, so need to reset rw_request. + // Note - we are under the rw_request lock here! + + repl_write_make_message(rw, tr); + send_rw_messages_forget(rw); +} + + +void +udf_repl_write_cb(rw_request* rw) +{ + BENCHMARK_NEXT_DATA_POINT(rw, udf, repl_write); + BENCHMARK_NEXT_DATA_POINT(rw, udf_sub, repl_write); + + as_transaction tr; + as_transaction_init_from_rw(&tr, rw); + + finished_replicated(&tr); + send_udf_response(&tr, &rw->response_db); + + // Finished transaction - rw_request cleans up reservation and msgp! +} + + +//========================================================== +// Local helpers - transaction end. +// + +void +send_udf_response(as_transaction* tr, cf_dyn_buf* db) +{ + // Paranoia - shouldn't get here on losing race with timeout. + if (! tr->from.any) { + cf_warning(AS_RW, "transaction origin %u has null 'from'", tr->origin); + return; + } + + // Note - if tr was setup from rw, rw->from.any has been set null and + // informs timeout it lost the race. 
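+	// (Assumption, based on its name and call site: the call below scrubs
+	// generation and void-time from the response when the UDF turned this
+	// transaction into a delete.)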
+ + clear_delete_response_metadata(tr); + + switch (tr->origin) { + case FROM_CLIENT: + if (db && db->used_sz != 0) { + as_msg_send_ops_reply(tr->from.proto_fd_h, db); + } + else { + as_msg_send_reply(tr->from.proto_fd_h, tr->result_code, + tr->generation, tr->void_time, NULL, NULL, 0, tr->rsv.ns, + as_transaction_trid(tr)); + } + BENCHMARK_NEXT_DATA_POINT(tr, udf, response); + HIST_TRACK_ACTIVATE_INSERT_DATA_POINT(tr, udf_hist); + client_udf_update_stats(tr->rsv.ns, tr->result_code); + break; + case FROM_PROXY: + if (db && db->used_sz != 0) { + as_proxy_send_ops_response(tr->from.proxy_node, + tr->from_data.proxy_tid, db); + } + else { + as_proxy_send_response(tr->from.proxy_node, tr->from_data.proxy_tid, + tr->result_code, tr->generation, tr->void_time, NULL, NULL, + 0, tr->rsv.ns, as_transaction_trid(tr)); + } + break; + case FROM_IUDF: + if (db && db->used_sz != 0) { + cf_crash(AS_RW, "unexpected - internal udf has response"); + } + tr->from.iudf_orig->cb(tr->from.iudf_orig->udata, tr->result_code); + BENCHMARK_NEXT_DATA_POINT(tr, udf_sub, response); + udf_sub_udf_update_stats(tr->rsv.ns, tr->result_code); + break; + default: + cf_crash(AS_RW, "unexpected transaction origin %u", tr->origin); + break; + } + + tr->from.any = NULL; // pattern, not needed +} + + +void +udf_timeout_cb(rw_request* rw) +{ + if (! rw->from.any) { + return; // lost race against dup-res or repl-write callback + } + + finished_not_replicated(rw); + + switch (rw->origin) { + case FROM_CLIENT: + as_msg_send_reply(rw->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0, + 0, NULL, NULL, 0, rw->rsv.ns, rw_request_trid(rw)); + // Timeouts aren't included in histograms. + client_udf_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT); + break; + case FROM_PROXY: + break; + case FROM_IUDF: + rw->from.iudf_orig->cb(rw->from.iudf_orig->udata, + AS_PROTO_RESULT_FAIL_TIMEOUT); + // Timeouts aren't included in histograms. + udf_sub_udf_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT); + break; + default: + cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin); + break; + } + + rw->from.any = NULL; // inform other callback it lost the race +} + + +//========================================================== +// Local helpers - UDF. +// + +transaction_status +udf_master(rw_request* rw, as_transaction* tr) +{ + CF_ALLOC_SET_NS_ARENA(tr->rsv.ns); + + udf_def def; + udf_call call = { &def, tr }; + + if (tr->origin == FROM_IUDF) { + call.def = &tr->from.iudf_orig->def; + } + else if (! udf_def_init_from_msg(call.def, tr)) { + cf_warning(AS_UDF, "failed udf_def_init_from_msg"); + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + return TRANS_DONE_ERROR; + } + + udf_optype optype = udf_master_apply(&call, rw); + + if (tr->origin != FROM_IUDF && call.def->arglist) { + as_list_destroy(call.def->arglist); + } + + if (optype == UDF_OPTYPE_READ || optype == UDF_OPTYPE_NONE) { + // UDF is done, no replica writes needed. + return TRANS_DONE_SUCCESS; + } + + return optype == UDF_OPTYPE_WAITING ? TRANS_WAITING : TRANS_IN_PROGRESS; +} + + +udf_optype +udf_master_apply(udf_call* call, rw_request* rw) +{ + as_transaction* tr = call->tr; + as_namespace* ns = tr->rsv.ns; + + // Find record in index. + + as_index_ref r_ref; + r_ref.skip_lock = false; + + int get_rv = as_record_get(tr->rsv.tree, &tr->keyd, &r_ref); + + if (get_rv == 0 && as_record_is_doomed(r_ref.r, ns)) { + // If record is expired or truncated, pretend it was not found. 
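+		// (Doomed records still occupy index entries - expiration and
+		// truncation are applied lazily, so a successful lookup may return a
+		// record that must be treated as deleted.)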
+ as_record_done(&r_ref, ns); + get_rv = -1; + } + + if (get_rv == 0 && repl_state_check(r_ref.r, tr) < 0) { + as_record_done(&r_ref, ns); + return UDF_OPTYPE_WAITING; + } + + if (tr->origin == FROM_IUDF && + (get_rv == -1 || ! as_record_is_live(r_ref.r))) { + // Internal UDFs must not create records. + tr->result_code = AS_PROTO_RESULT_FAIL_NOT_FOUND; + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + + // Open storage record. + + as_storage_rd rd; + + udf_record urecord; + udf_record_init(&urecord, true); + + xdr_dirty_bins dirty_bins; + xdr_clear_dirty_bins(&dirty_bins); + + urecord.r_ref = &r_ref; + urecord.tr = tr; + urecord.rd = &rd; + urecord.dirty = &dirty_bins; + urecord.keyd = tr->keyd; + + if (get_rv == 0) { + urecord.flag |= (UDF_RECORD_FLAG_OPEN | UDF_RECORD_FLAG_PREEXISTS); + + if (udf_storage_record_open(&urecord) != 0) { + udf_record_close(&urecord); + tr->result_code = AS_PROTO_RESULT_FAIL_BIN_NAME; // overloaded... add bin_count error? + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + + if (tr->origin == FROM_IUDF && tr->from.iudf_orig->predexp) { + predexp_args_t predargs = { + .ns = ns, .md = r_ref.r, .vl = NULL, .rd = &rd + }; + + if (! predexp_matches_record(tr->from.iudf_orig->predexp, + &predargs)) { + udf_record_close(&urecord); + tr->result_code = AS_PROTO_RESULT_FAIL_NOT_FOUND; // not ideal + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + } + + as_msg* m = &tr->msgp->msg; + + // If both the record and the message have keys, check them. + if (rd.key) { + if (as_transaction_has_key(tr) && ! check_msg_key(m, &rd)) { + udf_record_close(&urecord); + tr->result_code = AS_PROTO_RESULT_FAIL_KEY_MISMATCH; + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + } + else { + // If the message has a key, apply it to the record. + if (! get_msg_key(tr, &rd)) { + udf_record_close(&urecord); + tr->result_code = AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE; + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + + urecord.flag |= UDF_RECORD_FLAG_METADATA_UPDATED; + } + } + else { + urecord.flag &= ~(UDF_RECORD_FLAG_OPEN | + UDF_RECORD_FLAG_STORAGE_OPEN | + UDF_RECORD_FLAG_PREEXISTS); + } + + // Run UDF. + + // This as_rec needs to be in the heap - once passed into the lua scope it + // gets garbage collected later. Also, the destroy hook is set to NULL so + // garbage collection has nothing to do. 
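+	// (The as_val_reserve() below adds a second ref for the lua scope - this
+	// code keeps its own ref and drops it via udf_record_destroy() after the
+	// result is processed.)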
+ as_rec* urec = as_rec_new(&urecord, &udf_record_hooks); + + as_val_reserve(urec); // for lua + + as_result result; + as_result_init(&result); + + int apply_rv = udf_apply_record(call, urec, &result); + + udf_optype optype = UDF_OPTYPE_NONE; + + if (apply_rv == 0) { + udf_finish(&urecord, rw, &optype); + process_result(&result, call, &rw->response_db); + } + else { + udf_record_close(&urecord); + + char* rs = as_module_err_string(apply_rv); + + tr->result_code = AS_PROTO_RESULT_FAIL_UDF_EXECUTION; + process_failure_str(call, rs, strlen(rs), &rw->response_db); + cf_free(rs); + } + + update_lua_complete_stats(tr->origin, ns, optype, apply_rv, + result.is_success); + + as_result_destroy(&result); + udf_record_destroy(urec); + + return optype; +} + + +int +udf_apply_record(udf_call* call, as_rec* rec, as_result* result) +{ + time_tracker udf_timer_tracker = { + .udata = as_rec_source(rec), + .end_time = udf_end_time + }; + + udf_timer_setup(&udf_timer_tracker); + + as_timer timer; + as_timer_init(&timer, &udf_timer_tracker, &udf_timer_hooks); + + as_udf_context ctx = { + .as = &g_as_aerospike, + .timer = &timer, + .memtracker = NULL + }; + + int apply_rv = as_module_apply_record(&mod_lua, &ctx, call->def->filename, + call->def->function, rec, call->def->arglist, result); + + udf_timer_cleanup(); + + return apply_rv; +} + + +uint64_t +udf_end_time(time_tracker* tt) +{ + udf_record* urecord = (udf_record*)tt->udata; + + if (! urecord) { + return -1; // TODO - should be impossible. + } + + return urecord->tr->end_time; +} + + +void +udf_finish(udf_record* urecord, rw_request* rw, udf_optype* record_op) +{ + *record_op = UDF_OPTYPE_READ; + + udf_optype final_op = udf_finish_op(urecord); + + if (final_op == UDF_OPTYPE_DELETE) { + *record_op = UDF_OPTYPE_DELETE; + urecord->tr->flags |= AS_TRANSACTION_FLAG_IS_DELETE; + } + else if (final_op == UDF_OPTYPE_WRITE) { + *record_op = UDF_OPTYPE_WRITE; + } + + udf_post_processing(urecord, rw, final_op); +} + + +udf_optype +udf_finish_op(udf_record* urecord) +{ + if (udf_zero_bins_left(urecord)) { + // Amazingly, with respect to stored key, memory statistics work out + // correctly regardless of what this returns. + return udf_finish_delete(urecord); + } + + if ((urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) != 0) { + if ((urecord->flag & UDF_RECORD_FLAG_OPEN) == 0) { + cf_crash(AS_UDF, "updated record not open"); + } + + return UDF_OPTYPE_WRITE; + } + + return UDF_OPTYPE_READ; +} + + +void +udf_post_processing(udf_record* urecord, rw_request* rw, udf_optype urecord_op) +{ + as_storage_rd* rd = urecord->rd; + as_transaction* tr = urecord->tr; + as_record* r = rd->r; + + uint16_t generation = 0; + uint16_t set_id = 0; + xdr_dirty_bins dirty_bins; + + if (urecord_op == UDF_OPTYPE_WRITE || urecord_op == UDF_OPTYPE_DELETE) { + size_t rec_props_data_size = as_storage_record_rec_props_size(rd); + uint8_t rec_props_data[rec_props_data_size]; + + if (rec_props_data_size > 0) { + as_storage_record_set_rec_props(rd, rec_props_data); + } + + as_msg* m = &tr->msgp->msg; + + // Convert message TTL special value if appropriate. + if (m->record_ttl == TTL_DONT_UPDATE && + (urecord->flag & UDF_RECORD_FLAG_PREEXISTS) == 0) { + m->record_ttl = TTL_NAMESPACE_DEFAULT; + } + + update_metadata_in_index(tr, true, r); + + pickle_all(rd, rw); + + tr->generation = r->generation; + tr->void_time = r->void_time; + tr->last_update_time = r->last_update_time; + + // Now ok to accommodate a new stored key... 
+ if (r->key_stored == 0 && rd->key) { + if (rd->ns->storage_data_in_memory) { + as_record_allocate_key(r, rd->key, rd->key_size); + } + + r->key_stored = 1; + } + // ... or drop a stored key. + else if (r->key_stored == 1 && ! rd->key) { + if (rd->ns->storage_data_in_memory) { + as_record_remove_key(r); + } + + r->key_stored = 0; + } + + as_storage_record_adjust_mem_stats(rd, urecord->starting_memory_bytes); + + will_replicate(r, rd->ns); + + // Collect information for XDR before closing the record. + generation = plain_generation(r->generation, rd->ns); + set_id = as_index_get_set_id(r); + + if (urecord->dirty && urecord_op == UDF_OPTYPE_WRITE) { + xdr_clear_dirty_bins(&dirty_bins); + xdr_copy_dirty_bins(urecord->dirty, &dirty_bins); + } + } + + // Close the record for all the cases. + udf_record_close(urecord); + + // Write to XDR pipe. + if (urecord_op == UDF_OPTYPE_WRITE) { + xdr_write(tr->rsv.ns, &tr->keyd, generation, 0, XDR_OP_TYPE_WRITE, + set_id, &dirty_bins); + } + else if (urecord_op == UDF_OPTYPE_DELETE) { + xdr_write(tr->rsv.ns, &tr->keyd, 0, 0, + as_transaction_is_durable_delete(tr) ? + XDR_OP_TYPE_DURABLE_DELETE : XDR_OP_TYPE_DROP, + set_id, NULL); + } +} + + +//========================================================== +// Local helpers - statistics. +// + +void +update_lua_complete_stats(uint8_t origin, as_namespace* ns, udf_optype op, + int ret, bool is_success) +{ + switch (origin) { + case FROM_CLIENT: + if (ret == 0 && is_success) { + if (op == UDF_OPTYPE_READ) { + cf_atomic_int_incr(&ns->n_client_lang_read_success); + } + else if (op == UDF_OPTYPE_DELETE) { + cf_atomic_int_incr(&ns->n_client_lang_delete_success); + } + else if (op == UDF_OPTYPE_WRITE) { + cf_atomic_int_incr(&ns->n_client_lang_write_success); + } + } + else { + cf_info(AS_UDF, "lua error, ret:%d", ret); + cf_atomic_int_incr(&ns->n_client_lang_error); + } + break; + case FROM_PROXY: + // TODO? + break; + case FROM_IUDF: + if (ret == 0 && is_success) { + if (op == UDF_OPTYPE_READ) { + // Note - this would be weird, since there's nowhere for a + // response to go in our current UDF scans & queries. + cf_atomic_int_incr(&ns->n_udf_sub_lang_read_success); + } + else if (op == UDF_OPTYPE_DELETE) { + cf_atomic_int_incr(&ns->n_udf_sub_lang_delete_success); + } + else if (op == UDF_OPTYPE_WRITE) { + cf_atomic_int_incr(&ns->n_udf_sub_lang_write_success); + } + } + else { + cf_info(AS_UDF, "lua error, ret:%d", ret); + cf_atomic_int_incr(&ns->n_udf_sub_lang_error); + } + break; + default: + cf_crash(AS_UDF, "unexpected transaction origin %u", origin); + break; + } +} + + +//========================================================== +// Local helpers - construct response to be sent to origin. +// + +void +process_failure_str(udf_call* call, const char* err_str, size_t len, + cf_dyn_buf* db) +{ + if (! err_str) { + // Better than sending an as_string with null value. + process_failure(call, NULL, db); + return; + } + + as_string stack_s; + as_string_init_wlen(&stack_s, (char*)err_str, len, false); + + process_failure(call, as_string_toval(&stack_s), db); +} + + +void +process_result(const as_result* result, udf_call* call, cf_dyn_buf* db) +{ + as_val* val = result->value; + + if (result->is_success) { + process_success(call, val, db); + return; + } + + // Failures... 
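+	// A string value here is a proper error() result from lua and is passed
+	// through as-is; any other type can't be forwarded as an error message,
+	// so one is synthesized below naming the UDF.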
+
+	if (as_val_type(val) == AS_STRING) {
+		call->tr->result_code = AS_PROTO_RESULT_FAIL_UDF_EXECUTION;
+		process_failure(call, val, db);
+		return;
+	}
+
+	char lua_err_str[1024];
+	size_t len = (size_t)sprintf(lua_err_str,
+			"%s:0: in function %s() - error() argument type not handled",
+			call->def->filename, call->def->function);
+
+	call->tr->result_code = AS_PROTO_RESULT_FAIL_UDF_EXECUTION;
+	process_failure_str(call, lua_err_str, len, db);
+}
+
+
+void
+process_response(udf_call* call, bool success, const as_val* val,
+		cf_dyn_buf* db)
+{
+	// No response for background (internal) UDF.
+	if (call->def->type == AS_UDF_OP_BACKGROUND) {
+		return;
+	}
+
+	as_transaction* tr = call->tr;
+
+	// Note - this function quietly handles a null val. The response call will
+	// be given a bin with a name but not 'in use', and it does the right thing.
+
+	size_t msg_sz = 0;
+
+	db->buf = (uint8_t *)as_msg_make_val_response(success, val, tr->result_code,
+			tr->generation, tr->void_time, as_transaction_trid(tr), &msg_sz);
+
+	db->is_stack = false;
+	db->alloc_sz = msg_sz;
+	db->used_sz = msg_sz;
+}
diff --git a/as/src/transaction/write.c b/as/src/transaction/write.c
new file mode 100644
index 00000000..c17bfcff
--- /dev/null
+++ b/as/src/transaction/write.c
@@ -0,0 +1,1958 @@
+/*
+ * write.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/write.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/secondary_index.h"
+#include "base/transaction.h"
+#include "base/transaction_policy.h"
+#include "base/truncate.h"
+#include "base/xdr_serverside.h"
+#include "fabric/partition.h"
+#include "storage/storage.h"
+#include "transaction/duplicate_resolve.h"
+#include "transaction/proxy.h"
+#include "transaction/replica_write.h"
+#include "transaction/rw_request.h"
+#include "transaction/rw_request_hash.h"
+#include "transaction/rw_utils.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+#define STACK_PARTICLES_SIZE (1024 * 1024)
+
+
+//==========================================================
+// Forward declarations.
+// + +void start_write_dup_res(rw_request* rw, as_transaction* tr); +void start_write_repl_write(rw_request* rw, as_transaction* tr); +void start_write_repl_write_forget(rw_request* rw, as_transaction* tr); +bool write_dup_res_cb(rw_request* rw); +void write_repl_write_after_dup_res(rw_request* rw, as_transaction* tr); +void write_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr); +void write_repl_write_cb(rw_request* rw); + +void send_write_response(as_transaction* tr, cf_dyn_buf* db); +void write_timeout_cb(rw_request* rw); + +transaction_status write_master(rw_request* rw, as_transaction* tr); +void write_master_failed(as_transaction* tr, as_index_ref* r_ref, + bool record_created, as_index_tree* tree, as_storage_rd* rd, + int result_code); +int write_master_preprocessing(as_transaction* tr); +int write_master_policies(as_transaction* tr, bool* p_must_not_create, + bool* p_record_level_replace, bool* p_must_fetch_data, + bool* p_increment_generation); +bool check_msg_set_name(as_transaction* tr, const char* set_name); + +int write_master_dim_single_bin(as_transaction* tr, as_storage_rd* rd, + bool increment_generation, rw_request* rw, bool* is_delete, + xdr_dirty_bins* dirty_bins); +int write_master_dim(as_transaction* tr, const char* set_name, + as_storage_rd* rd, bool record_level_replace, bool increment_generation, + rw_request* rw, bool* is_delete, xdr_dirty_bins* dirty_bins); +int write_master_ssd_single_bin(as_transaction* tr, as_storage_rd* rd, + bool must_fetch_data, bool increment_generation, rw_request* rw, + bool* is_delete, xdr_dirty_bins* dirty_bins); +int write_master_ssd(as_transaction* tr, const char* set_name, + as_storage_rd* rd, bool must_fetch_data, bool record_level_replace, + bool increment_generation, rw_request* rw, bool* is_delete, + xdr_dirty_bins* dirty_bins); + +void write_master_update_index_metadata(as_transaction* tr, + bool increment_generation, index_metadata* old, as_record* r); +int write_master_bin_ops(as_transaction* tr, as_storage_rd* rd, + cf_ll_buf* particles_llb, as_bin* cleanup_bins, + uint32_t* p_n_cleanup_bins, cf_dyn_buf* db, uint32_t* p_n_final_bins, + xdr_dirty_bins* dirty_bins); +int write_master_bin_ops_loop(as_transaction* tr, as_storage_rd* rd, + as_msg_op** ops, as_bin* response_bins, uint32_t* p_n_response_bins, + as_bin* result_bins, uint32_t* p_n_result_bins, + cf_ll_buf* particles_llb, as_bin* cleanup_bins, + uint32_t* p_n_cleanup_bins, xdr_dirty_bins* dirty_bins); + +void write_master_index_metadata_unwind(index_metadata* old, as_record* r); +void write_master_dim_single_bin_unwind(as_bin* old_bin, as_bin* new_bin, + as_bin* cleanup_bins, uint32_t n_cleanup_bins); +void write_master_dim_unwind(as_bin* old_bins, uint32_t n_old_bins, + as_bin* new_bins, uint32_t n_new_bins, as_bin* cleanup_bins, + uint32_t n_cleanup_bins); + + +//========================================================== +// Inlines & macros. 
+// + +static inline void +client_write_update_stats(as_namespace* ns, uint8_t result_code, bool is_xdr_op) +{ + switch (result_code) { + case AS_PROTO_RESULT_OK: + cf_atomic64_incr(&ns->n_client_write_success); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_write_success); + } + break; + case AS_PROTO_RESULT_FAIL_TIMEOUT: + cf_atomic64_incr(&ns->n_client_write_timeout); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_write_timeout); + } + break; + default: + cf_atomic64_incr(&ns->n_client_write_error); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_write_error); + } + break; + } +} + +static inline void +append_bin_to_destroy(as_bin* b, as_bin* bins, uint32_t* p_n_bins) +{ + if (as_bin_is_external_particle(b)) { + bins[(*p_n_bins)++] = *b; + } +} + + +//========================================================== +// Public API. +// + +transaction_status +as_write_start(as_transaction* tr) +{ + BENCHMARK_START(tr, write, FROM_CLIENT); + + // Apply XDR filter. + if (! xdr_allows_write(tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_ALWAYS_FORBIDDEN; + send_write_response(tr, NULL); + return TRANS_DONE_ERROR; + } + + // Check that we aren't backed up. + if (as_storage_overloaded(tr->rsv.ns)) { + tr->result_code = AS_PROTO_RESULT_FAIL_DEVICE_OVERLOAD; + send_write_response(tr, NULL); + return TRANS_DONE_ERROR; + } + + // Create rw_request and add to hash. + rw_request_hkey hkey = { tr->rsv.ns->id, tr->keyd }; + rw_request* rw = rw_request_create(&tr->keyd); + transaction_status status = rw_request_hash_insert(&hkey, rw, tr); + + // If rw_request wasn't inserted in hash, transaction is finished. + if (status != TRANS_IN_PROGRESS) { + rw_request_release(rw); + + if (status != TRANS_WAITING) { + send_write_response(tr, NULL); + } + + return status; + } + // else - rw_request is now in hash, continue... + + if (tr->rsv.ns->write_dup_res_disabled) { + // Note - preventing duplicate resolution this way allows + // rw_request_destroy() to handle dup_msg[] cleanup correctly. + tr->rsv.n_dupl = 0; + } + + // If there are duplicates to resolve, start doing so. + if (tr->rsv.n_dupl != 0) { + start_write_dup_res(rw, tr); + + // Started duplicate resolution. + return TRANS_IN_PROGRESS; + } + // else - no duplicate resolution phase, apply operation to master. + + // Set up the nodes to which we'll write replicas. + rw->n_dest_nodes = as_partition_get_other_replicas(tr->rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr->rsv.ns, rw->n_dest_nodes)) { + rw_request_hash_delete(&hkey, rw); + tr->result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_write_response(tr, NULL); + return TRANS_DONE_ERROR; + } + + status = write_master(rw, tr); + + BENCHMARK_NEXT_DATA_POINT(tr, write, master); + + // If error, transaction is finished. + if (status != TRANS_IN_PROGRESS) { + rw_request_hash_delete(&hkey, rw); + + if (status != TRANS_WAITING) { + send_write_response(tr, NULL); + } + + return status; + } + + // If we don't need replica writes, transaction is finished. + if (rw->n_dest_nodes == 0) { + finished_replicated(tr); + send_write_response(tr, &rw->response_db); + rw_request_hash_delete(&hkey, rw); + return TRANS_DONE_SUCCESS; + } + + // If we don't need to wait for replica write acks, fire and forget. + if (respond_on_master_complete(tr)) { + start_write_repl_write_forget(rw, tr); + send_write_response(tr, &rw->response_db); + rw_request_hash_delete(&hkey, rw); + return TRANS_DONE_SUCCESS; + } + + start_write_repl_write(rw, tr); + + // Started replica write. 
+ return TRANS_IN_PROGRESS; +} + + +//========================================================== +// Local helpers - transaction flow. +// + +void +start_write_dup_res(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw, construct and send dup-res message. + + dup_res_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + dup_res_setup_rw(rw, tr, write_dup_res_cb, write_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +void +start_write_repl_write(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw, construct and send repl-write message. + + repl_write_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + repl_write_setup_rw(rw, tr, write_repl_write_cb, write_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +void +start_write_repl_write_forget(rw_request* rw, as_transaction* tr) +{ + // Construct and send repl-write message. No need to finish rw setup. + + repl_write_make_message(rw, tr); + send_rw_messages_forget(rw); +} + + +bool +write_dup_res_cb(rw_request* rw) +{ + BENCHMARK_NEXT_DATA_POINT(rw, write, dup_res); + + as_transaction tr; + as_transaction_init_from_rw(&tr, rw); + + if (tr.result_code != AS_PROTO_RESULT_OK) { + send_write_response(&tr, NULL); + return true; + } + + // Set up the nodes to which we'll write replicas. + rw->n_dest_nodes = as_partition_get_other_replicas(tr.rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr.rsv.ns, rw->n_dest_nodes)) { + tr.result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_write_response(&tr, NULL); + return true; + } + + transaction_status status = write_master(rw, &tr); + + BENCHMARK_NEXT_DATA_POINT((&tr), write, master); + + if (status == TRANS_WAITING) { + // Note - new tr now owns msgp, make sure rw destructor doesn't free it. + // Also, rw will release rsv - new tr will get a new one. + rw->msgp = NULL; + return true; + } + + if (status == TRANS_DONE_ERROR) { + send_write_response(&tr, NULL); + return true; + } + + // If we don't need replica writes, transaction is finished. + if (rw->n_dest_nodes == 0) { + finished_replicated(&tr); + send_write_response(&tr, &rw->response_db); + return true; + } + + // If we don't need to wait for replica write acks, fire and forget. + if (respond_on_master_complete(&tr)) { + write_repl_write_forget_after_dup_res(rw, &tr); + send_write_response(&tr, &rw->response_db); + return true; + } + + write_repl_write_after_dup_res(rw, &tr); + + // Started replica write - don't delete rw_request from hash. + return false; +} + + +void +write_repl_write_after_dup_res(rw_request* rw, as_transaction* tr) +{ + // Recycle rw_request that was just used for duplicate resolution to now do + // replica writes. Note - we are under the rw_request lock here! + + repl_write_make_message(rw, tr); + repl_write_reset_rw(rw, tr, write_repl_write_cb); + send_rw_messages(rw); +} + + +void +write_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr) +{ + // Send replica writes. Not waiting for acks, so need to reset rw_request. + // Note - we are under the rw_request lock here! + + repl_write_make_message(rw, tr); + send_rw_messages_forget(rw); +} + + +void +write_repl_write_cb(rw_request* rw) +{ + BENCHMARK_NEXT_DATA_POINT(rw, write, repl_write); + + as_transaction tr; + as_transaction_init_from_rw(&tr, rw); + + finished_replicated(&tr); + send_write_response(&tr, &rw->response_db); + + // Finished transaction - rw_request cleans up reservation and msgp! 
+} + + +//========================================================== +// Local helpers - transaction end. +// + +void +send_write_response(as_transaction* tr, cf_dyn_buf* db) +{ + // Paranoia - shouldn't get here on losing race with timeout. + if (! tr->from.any) { + cf_warning(AS_RW, "transaction origin %u has null 'from'", tr->origin); + return; + } + + // Note - if tr was setup from rw, rw->from.any has been set null and + // informs timeout it lost the race. + + clear_delete_response_metadata(tr); + + switch (tr->origin) { + case FROM_CLIENT: + if (db && db->used_sz != 0) { + as_msg_send_ops_reply(tr->from.proto_fd_h, db); + } + else { + as_msg_send_reply(tr->from.proto_fd_h, tr->result_code, + tr->generation, tr->void_time, NULL, NULL, 0, tr->rsv.ns, + as_transaction_trid(tr)); + } + BENCHMARK_NEXT_DATA_POINT(tr, write, response); + HIST_TRACK_ACTIVATE_INSERT_DATA_POINT(tr, write_hist); + client_write_update_stats(tr->rsv.ns, tr->result_code, + as_transaction_is_xdr(tr)); + break; + case FROM_PROXY: + if (db && db->used_sz != 0) { + as_proxy_send_ops_response(tr->from.proxy_node, + tr->from_data.proxy_tid, db); + } + else { + as_proxy_send_response(tr->from.proxy_node, tr->from_data.proxy_tid, + tr->result_code, tr->generation, tr->void_time, NULL, NULL, + 0, tr->rsv.ns, as_transaction_trid(tr)); + } + break; + default: + cf_crash(AS_RW, "unexpected transaction origin %u", tr->origin); + break; + } + + tr->from.any = NULL; // pattern, not needed +} + + +void +write_timeout_cb(rw_request* rw) +{ + if (! rw->from.any) { + return; // lost race against dup-res or repl-write callback + } + + finished_not_replicated(rw); + + switch (rw->origin) { + case FROM_CLIENT: + as_msg_send_reply(rw->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0, + 0, NULL, NULL, 0, rw->rsv.ns, rw_request_trid(rw)); + // Timeouts aren't included in histograms. + client_write_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT, + as_msg_is_xdr(&rw->msgp->msg)); + break; + case FROM_PROXY: + break; + default: + cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin); + break; + } + + rw->from.any = NULL; // inform other callback it lost the race +} + + +//========================================================== +// Local helpers - write master. +// + +transaction_status +write_master(rw_request* rw, as_transaction* tr) +{ + CF_ALLOC_SET_NS_ARENA(tr->rsv.ns); + + //------------------------------------------------------ + // Perform checks that don't need to loop over ops, or + // create or find (and lock) the as_index. + // + + if (! write_master_preprocessing(tr)) { + // Failure cases all call write_master_failed(). + return TRANS_DONE_ERROR; + } + + //------------------------------------------------------ + // Loop over ops to set some essential policy flags. + // + + bool must_not_create; + bool record_level_replace; + bool must_fetch_data; + bool increment_generation; + + int result = write_master_policies(tr, &must_not_create, + &record_level_replace, &must_fetch_data, &increment_generation); + + if (result != 0) { + write_master_failed(tr, 0, false, 0, 0, result); + return TRANS_DONE_ERROR; + } + + //------------------------------------------------------ + // Find or create the as_index and get a reference - + // this locks the record. Perform all checks that don't + // need the as_storage_rd. + // + + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + as_index_tree* tree = tr->rsv.tree; + + // Find or create as_index, populate as_index_ref, lock record. 
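An aside on send_write_response() and write_timeout_cb() above, before write_master() continues: the two paths race, and whichever one nulls from.any first owns the response; the server arbitrates this under the rw_request lock. The same claim-once idea can be expressed lock-free with an atomic exchange (illustration only, not how the server implements it):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    typedef struct sketch_rw_s {
        _Atomic(void*) from_any; // non-null while the origin is unclaimed
    } sketch_rw;

    // Exactly one caller - responder or timeout - sees true.
    static bool
    claim_origin(sketch_rw* rw)
    {
        return atomic_exchange(&rw->from_any, NULL) != NULL;
    }

    int
    main(void)
    {
        int origin = 42;
        sketch_rw rw;

        atomic_store(&rw.from_any, &origin);

        printf("first claim won: %d\n", claim_origin(&rw));  // 1
        printf("second claim won: %d\n", claim_origin(&rw)); // 0

        return 0;
    }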
+ as_index_ref r_ref; + r_ref.skip_lock = false; + as_record* r = NULL; + bool record_created = false; + + if (must_not_create) { + if (as_record_get(tree, &tr->keyd, &r_ref) != 0) { + write_master_failed(tr, 0, record_created, tree, 0, AS_PROTO_RESULT_FAIL_NOT_FOUND); + return TRANS_DONE_ERROR; + } + + r = r_ref.r; + + if (as_record_is_doomed(r, ns)) { + write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_NOT_FOUND); + return TRANS_DONE_ERROR; + } + + if (repl_state_check(r, tr) < 0) { + as_record_done(&r_ref, ns); + return TRANS_WAITING; + } + + if (! as_record_is_live(r)) { + write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_NOT_FOUND); + return TRANS_DONE_ERROR; + } + } + else { + int rv = as_record_get_create(tree, &tr->keyd, &r_ref, ns); + + if (rv < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: fail as_record_get_create() ", ns->name); + write_master_failed(tr, 0, record_created, tree, 0, AS_PROTO_RESULT_FAIL_UNKNOWN); + return TRANS_DONE_ERROR; + } + + r = r_ref.r; + record_created = rv == 1; + + bool is_doomed = as_record_is_doomed(r, ns); + + if (! record_created && ! is_doomed && repl_state_check(r, tr) < 0) { + as_record_done(&r_ref, ns); + return TRANS_WAITING; + } + + // If it's an expired or truncated record, pretend it's a fresh create. + if (! record_created && is_doomed) { + as_record_rescue(&r_ref, ns); + record_created = true; + } + } + + // Enforce record-level create-only existence policy. + if (! record_created && ! create_only_check(r, m)) { + write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_RECORD_EXISTS); + return TRANS_DONE_ERROR; + } + + // Check generation requirement, if any. + if (! generation_check(r, m, ns)) { + write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_GENERATION); + return TRANS_DONE_ERROR; + } + + // If creating record, write set-ID into index. + if (record_created) { + int rv_set = as_transaction_has_set(tr) ? + set_set_from_msg(r, ns, m) : 0; + + if (rv_set == -1) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: set can't be added ", ns->name); + write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_PARAMETER); + return TRANS_DONE_ERROR; + } + else if (rv_set == -2) { + write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_FORBIDDEN); + return TRANS_DONE_ERROR; + } + + // Don't write record if it would be truncated. + if (as_truncate_now_is_truncated(ns, as_index_get_set_id(r))) { + write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_FORBIDDEN); + return TRANS_DONE_ERROR; + } + } + + // Shortcut set name. + const char* set_name = as_index_get_set_name(r, ns); + + // If record existed, check that as_msg set name matches. + if (! record_created && ! check_msg_set_name(tr, set_name)) { + write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_PARAMETER); + return TRANS_DONE_ERROR; + } + + //------------------------------------------------------ + // Open or create the as_storage_rd, and handle record + // metadata. + // + + as_storage_rd rd; + + if (record_created) { + as_storage_record_create(ns, r, &rd); + } + else { + as_storage_record_open(ns, r, &rd); + } + + // Deal with delete durability (enterprise only). + if ((result = set_delete_durablility(tr, &rd)) != 0) { + write_master_failed(tr, &r_ref, record_created, tree, &rd, result); + return TRANS_DONE_ERROR; + } + + // Deal with key storage as needed. 
+ if ((result = handle_msg_key(tr, &rd)) != 0) { + write_master_failed(tr, &r_ref, record_created, tree, &rd, result); + return TRANS_DONE_ERROR; + } + + // Assemble record properties from index information. + size_t rec_props_data_size = as_storage_record_rec_props_size(&rd); + uint8_t rec_props_data[rec_props_data_size]; + + if (rec_props_data_size > 0) { + as_storage_record_set_rec_props(&rd, rec_props_data); + } + + // Convert message TTL special value if appropriate. + if (record_created && m->record_ttl == TTL_DONT_UPDATE) { + m->record_ttl = TTL_NAMESPACE_DEFAULT; + } + + //------------------------------------------------------ + // Split write_master() according to configuration to + // handle record bins. + // + + xdr_dirty_bins dirty_bins; + xdr_clear_dirty_bins(&dirty_bins); + + bool is_delete = false; + + if (ns->storage_data_in_memory) { + if (ns->single_bin) { + result = write_master_dim_single_bin(tr, &rd, + increment_generation, + rw, &is_delete, &dirty_bins); + } + else { + result = write_master_dim(tr, set_name, &rd, + record_level_replace, increment_generation, + rw, &is_delete, &dirty_bins); + } + } + else { + if (ns->single_bin) { + result = write_master_ssd_single_bin(tr, &rd, + must_fetch_data, increment_generation, + rw, &is_delete, &dirty_bins); + } + else { + result = write_master_ssd(tr, set_name, &rd, + must_fetch_data, record_level_replace, increment_generation, + rw, &is_delete, &dirty_bins); + } + } + + if (result != 0) { + write_master_failed(tr, &r_ref, record_created, tree, &rd, result); + return TRANS_DONE_ERROR; + } + + //------------------------------------------------------ + // Done - complete function's output, release the record + // lock, and do XDR write if appropriate. + // + + tr->generation = r->generation; + tr->void_time = r->void_time; + tr->last_update_time = r->last_update_time; + + // Get set-id before releasing. + uint16_t set_id = as_index_get_set_id(r_ref.r); + + // Collect more info for XDR. + uint16_t generation = plain_generation(r->generation, ns); + xdr_op_type op_type = XDR_OP_TYPE_WRITE; + + // Handle deletion if appropriate. + if (is_delete) { + write_delete_record(r_ref.r, tree); + cf_atomic64_incr(&ns->n_deleted_last_bin); + tr->flags |= AS_TRANSACTION_FLAG_IS_DELETE; + + generation = 0; + op_type = as_transaction_is_durable_delete(tr) ? + XDR_OP_TYPE_DURABLE_DELETE : XDR_OP_TYPE_DROP; + } + // Or (normally) adjust max void-time. + else if (r->void_time != 0) { + cf_atomic64_setmax(&tr->rsv.p->max_void_time, r->void_time); + } + + will_replicate(r, ns); + + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + + // Don't send an XDR delete if it's disallowed. + if (is_delete && ! is_xdr_delete_shipping_enabled()) { + return TRANS_IN_PROGRESS; + } + + // Do an XDR write if the write is a non-XDR write or is an XDR write with + // forwarding enabled. + if (! 
as_msg_is_xdr(m) || is_xdr_forwarding_enabled() || + ns->ns_forward_xdr_writes) { + xdr_write(ns, &tr->keyd, generation, 0, op_type, set_id, &dirty_bins); + } + + return TRANS_IN_PROGRESS; +} + + +void +write_master_failed(as_transaction* tr, as_index_ref* r_ref, + bool record_created, as_index_tree* tree, as_storage_rd* rd, + int result_code) +{ + as_namespace* ns = tr->rsv.ns; + + if (r_ref) { + if (record_created) { + as_index_delete(tree, &tr->keyd); + } + + if (rd) { + as_storage_record_close(rd); + } + + as_record_done(r_ref, ns); + } + + switch (result_code) { + case AS_PROTO_RESULT_FAIL_GENERATION: + cf_atomic64_incr(&ns->n_fail_generation); + break; + case AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG: + cf_detail_digest(AS_RW, &tr->keyd, "{%s} write_master: record too big ", ns->name); + cf_atomic64_incr(&ns->n_fail_record_too_big); + break; + default: + // These either log warnings or aren't interesting enough to count. + break; + } + + tr->result_code = (uint8_t)result_code; +} + + +int +write_master_preprocessing(as_transaction* tr) +{ + as_namespace* ns = tr->rsv.ns; + as_msg* m = &tr->msgp->msg; + + if (ns->clock_skew_stop_writes) { + // TODO - new error code? + write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_FORBIDDEN); + return false; + } + + // ns->stop_writes is set by thr_nsup if configured threshold is breached. + if (cf_atomic32_get(ns->stop_writes) == 1) { + write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_OUT_OF_SPACE); + return false; + } + + if (! as_storage_has_space(ns)) { + cf_warning(AS_RW, "{%s}: write_master: drives full", ns->name); + write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_OUT_OF_SPACE); + return false; + } + + if (! is_valid_ttl(ns, m->record_ttl)) { + cf_warning(AS_RW, "write_master: invalid ttl %u", m->record_ttl); + write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_PARAMETER); + return false; + } + + // Fail if disallow_null_setname is true and set name is absent or empty. + if (ns->disallow_null_setname) { + as_msg_field* f = as_transaction_has_set(tr) ? + as_msg_field_get(m, AS_MSG_FIELD_TYPE_SET) : NULL; + + if (! f || as_msg_field_get_value_sz(f) == 0) { + cf_warning(AS_RW, "write_master: null/empty set name not allowed for namespace %s", ns->name); + write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_PARAMETER); + return false; + } + } + + return true; +} + + +int +write_master_policies(as_transaction* tr, bool* p_must_not_create, + bool* p_record_level_replace, bool* p_must_fetch_data, + bool* p_increment_generation) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + + if (m->n_ops == 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: bin op(s) expected, none present ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool info1_get_all = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0; + bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0; + + bool must_not_create = + (m->info3 & AS_MSG_INFO3_UPDATE_ONLY) != 0 || + (m->info3 & AS_MSG_INFO3_REPLACE_ONLY) != 0; + + bool record_level_replace = + (m->info3 & AS_MSG_INFO3_CREATE_OR_REPLACE) != 0 || + (m->info3 & AS_MSG_INFO3_REPLACE_ONLY) != 0; + + bool must_fetch_data = false; + + bool increment_generation = false; + + bool has_read_all_op = false; + bool generates_response_bin = false; + + // Loop over ops to check and modify flags. 
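Before the op loop, which continues below, note how the two key policies fall out of the info3 bits: update-only and replace-only both forbid create, while create-or-replace and replace-only both discard existing bins. A self-contained restatement of just that mapping (the flag values here are illustrative, not the real AS_MSG_INFO3_* constants):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    // Illustrative stand-ins - NOT the real AS_MSG_INFO3_* values.
    #define INFO3_UPDATE_ONLY       (1 << 0)
    #define INFO3_CREATE_OR_REPLACE (1 << 1)
    #define INFO3_REPLACE_ONLY      (1 << 2)

    typedef struct write_policies_s {
        bool must_not_create;      // record must already exist
        bool record_level_replace; // discard (and later destroy) existing bins
    } write_policies;

    static write_policies
    derive_policies(uint8_t info3)
    {
        write_policies p = {
            .must_not_create =
                    (info3 & (INFO3_UPDATE_ONLY | INFO3_REPLACE_ONLY)) != 0,
            .record_level_replace =
                    (info3 & (INFO3_CREATE_OR_REPLACE | INFO3_REPLACE_ONLY)) != 0
        };

        return p;
    }

    int
    main(void)
    {
        write_policies p = derive_policies(INFO3_REPLACE_ONLY);

        printf("must_not_create=%d record_level_replace=%d\n",
                p.must_not_create, p.record_level_replace);

        return 0;
    }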
+ as_msg_op* op = NULL; + int i = 0; + + while ((op = as_msg_op_iterate(m, op, &i)) != NULL) { + if (op->op != AS_MSG_OP_MC_TOUCH) { + increment_generation = true; + } + + if (OP_IS_TOUCH(op->op)) { + if (record_level_replace) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: touch op can't have record-level replace flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + must_not_create = true; + must_fetch_data = true; + continue; + } + + if (ns->data_in_index && + ! is_embedded_particle_type(op->particle_type) && + // Allow AS_PARTICLE_TYPE_NULL, although bin-delete operations + // are not likely in single-bin configuration. + op->particle_type != AS_PARTICLE_TYPE_NULL) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: can't write data type %u in data-in-index configuration ", ns->name, op->particle_type); + return AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + if (op->name_sz >= AS_ID_BIN_SZ) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: bin name too long (%d) ", ns->name, op->name_sz); + return AS_PROTO_RESULT_FAIL_BIN_NAME; + } + + if (op->op == AS_MSG_OP_WRITE) { + if (op->particle_type == AS_PARTICLE_TYPE_NULL && + record_level_replace) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: bin delete can't have record-level replace flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else if (OP_IS_MODIFY(op->op)) { + if (record_level_replace) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: modify op can't have record-level replace flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + must_fetch_data = true; + } + else if (op_is_read_all(op, m)) { + if (respond_all_ops) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: read-all op can't have respond-all-ops flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (has_read_all_op) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: can't have more than one read-all op ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + has_read_all_op = true; + must_fetch_data = true; + } + else if (op->op == AS_MSG_OP_READ) { + generates_response_bin = true; + must_fetch_data = true; + } + else if (op->op == AS_MSG_OP_CDT_MODIFY) { + if (record_level_replace) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: cdt modify op can't have record-level replace flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + generates_response_bin = true; // CDT modify may generate a response bin + must_fetch_data = true; + } + else if (op->op == AS_MSG_OP_CDT_READ) { + generates_response_bin = true; + must_fetch_data = true; + } + } + + if (has_read_all_op && generates_response_bin) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: read-all op can't mix with ops that generate response bins ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (info1_get_all && ! has_read_all_op) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: get-all flag set with no read-all op ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + *p_must_not_create = must_not_create; + *p_record_level_replace = record_level_replace; + *p_must_fetch_data = must_fetch_data; + *p_increment_generation = increment_generation; + + return 0; +} + + +bool +check_msg_set_name(as_transaction* tr, const char* set_name) +{ + as_msg_field* f = as_transaction_has_set(tr) ? + as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_SET) : NULL; + + if (! 
f || as_msg_field_get_value_sz(f) == 0) { + if (set_name) { + cf_warning_digest(AS_RW, &tr->keyd, "overwriting record in set '%s' but msg has no set name ", + set_name); + } + + return true; + } + + size_t msg_set_name_len = as_msg_field_get_value_sz(f); + + if (! set_name || + strncmp(set_name, (const char*)f->data, msg_set_name_len) != 0 || + set_name[msg_set_name_len] != 0) { + CF_ZSTR_DEFINE(msg_set_name, AS_SET_NAME_MAX_SIZE + 4, f->data, + msg_set_name_len); + + cf_warning_digest(AS_RW, &tr->keyd, "overwriting record in set '%s' but msg has different set name '%s' ", + set_name ? set_name : "(null)", msg_set_name); + return false; + } + + return true; +} + + +//========================================================== +// write_master() splits based on configuration - +// data-in-memory & single-bin. +// +// These handle the bin operations part of write_master() +// which are very different per configuration. +// + +int +write_master_dim_single_bin(as_transaction* tr, as_storage_rd* rd, + bool increment_generation, rw_request* rw, bool* is_delete, + xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + as_record* r = rd->r; + + rd->n_bins = 1; + + // Set rd->bins! + // For data-in-memory: + // - if just created record - sets rd->bins to empty bin embedded in index + // - otherwise - sets rd->bins to existing embedded bin + as_storage_rd_load_bins(rd, NULL); + + // For memory accounting, note current usage. + uint64_t memory_bytes = 0; + + if (as_bin_inuse(rd->bins)) { + memory_bytes = as_storage_record_get_n_bytes_memory(rd); + } + + //------------------------------------------------------ + // Copy existing bin into old_bin to enable unwinding. + // + + uint32_t n_old_bins = as_bin_inuse(rd->bins) ? 1 : 0; + as_bin old_bin; + + as_single_bin_copy(&old_bin, rd->bins); + + // Collect bins (old or intermediate versions) to destroy on cleanup. + as_bin cleanup_bins[m->n_ops]; + uint32_t n_cleanup_bins = 0; + + //------------------------------------------------------ + // Apply changes to metadata in as_index needed for + // response, pickling, and writing. + // + + index_metadata old_metadata; + + write_master_update_index_metadata(tr, increment_generation, &old_metadata, r); + + //------------------------------------------------------ + // Loop over bin ops to affect new bin space, creating + // the new record bin to write. + // + + uint32_t n_new_bins = 0; + int result = write_master_bin_ops(tr, rd, NULL, cleanup_bins, + &n_cleanup_bins, &rw->response_db, &n_new_bins, dirty_bins); + + if (result != 0) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_single_bin_unwind(&old_bin, rd->bins, cleanup_bins, n_cleanup_bins); + return result; + } + + //------------------------------------------------------ + // Created the new bin to write. + // + + if (n_new_bins == 0) { + if (n_old_bins == 0) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_single_bin_unwind(&old_bin, rd->bins, cleanup_bins, n_cleanup_bins); + return AS_PROTO_RESULT_FAIL_NOT_FOUND; + } + + if (! validate_delete_durability(tr)) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_single_bin_unwind(&old_bin, rd->bins, cleanup_bins, n_cleanup_bins); + return AS_PROTO_RESULT_FAIL_FORBIDDEN; + } + + *is_delete = true; + } + + // Pickle before writing - can't fail after. (Historic - now can't fail.) 
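The single-bin path above illustrates the unwind discipline used throughout write_master(): snapshot the old state, mutate, and restore the snapshot on any failure before the record lock is released. Stripped to a skeleton with hypothetical names and no particle management:

    #include <stdio.h>

    typedef struct sketch_bin_s { int value; } sketch_bin;

    // Apply op; restore the snapshot if it fails. Returns 0 on success.
    static int
    apply_with_unwind(sketch_bin* live, int (*op)(sketch_bin*))
    {
        sketch_bin snapshot = *live; // cheap copy, like as_single_bin_copy()

        int result = op(live);

        if (result != 0) {
            *live = snapshot; // unwind - caller never sees the partial write
        }

        return result;
    }

    static int
    failing_op(sketch_bin* b)
    {
        b->value = -1; // partial mutation...
        return 13;     // ...then failure
    }

    int
    main(void)
    {
        sketch_bin b = { .value = 7 };

        apply_with_unwind(&b, failing_op);
        printf("value still %d\n", b.value); // 7 - unwound

        return 0;
    }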
+ pickle_all(rd, rw); + + //------------------------------------------------------ + // Write the record to storage. + // + + if ((result = as_storage_record_write(rd)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_record_write() ", ns->name); + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_single_bin_unwind(&old_bin, rd->bins, cleanup_bins, n_cleanup_bins); + return -result; + } + + //------------------------------------------------------ + // Cleanup - destroy relevant bins, can't unwind after. + // + + destroy_stack_bins(cleanup_bins, n_cleanup_bins); + + as_storage_record_adjust_mem_stats(rd, memory_bytes); + + return 0; +} + + +int +write_master_dim(as_transaction* tr, const char* set_name, as_storage_rd* rd, + bool record_level_replace, bool increment_generation, rw_request* rw, + bool* is_delete, xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + as_record* r = rd->r; + + // Set rd->n_bins! + // For data-in-memory - number of bins in existing record. + as_storage_rd_load_n_bins(rd); + + // Set rd->bins! + // For data-in-memory: + // - if just created record - sets rd->bins to NULL + // - otherwise - sets rd->bins to existing (already populated) bins array + as_storage_rd_load_bins(rd, NULL); + + // For memory accounting, note current usage. + uint64_t memory_bytes = as_storage_record_get_n_bytes_memory(rd); + + //------------------------------------------------------ + // Copy existing bins to new space, and keep old bins + // intact for sindex adjustment and so it's possible to + // unwind on failure. + // + + uint32_t n_old_bins = (uint32_t)rd->n_bins; + uint32_t n_new_bins = n_old_bins + m->n_ops; // can't be more than this + + size_t old_bins_size = n_old_bins * sizeof(as_bin); + size_t new_bins_size = n_new_bins * sizeof(as_bin); + + as_bin* old_bins = rd->bins; + as_bin new_bins[n_new_bins]; + + if (old_bins_size == 0 || record_level_replace) { + memset(new_bins, 0, new_bins_size); + } + else { + memcpy(new_bins, old_bins, old_bins_size); + memset(new_bins + n_old_bins, 0, new_bins_size - old_bins_size); + } + + rd->n_bins = (uint16_t)n_new_bins; + rd->bins = new_bins; + + // Collect bins (old or intermediate versions) to destroy on cleanup. + as_bin cleanup_bins[m->n_ops]; + uint32_t n_cleanup_bins = 0; + + //------------------------------------------------------ + // Apply changes to metadata in as_index needed for + // response, pickling, and writing. + // + + index_metadata old_metadata; + + write_master_update_index_metadata(tr, increment_generation, &old_metadata, r); + + //------------------------------------------------------ + // Loop over bin ops to affect new bin space, creating + // the new record bins to write. + // + + int result = write_master_bin_ops(tr, rd, NULL, cleanup_bins, + &n_cleanup_bins, &rw->response_db, &n_new_bins, dirty_bins); + + if (result != 0) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_unwind(old_bins, n_old_bins, new_bins, n_new_bins, cleanup_bins, n_cleanup_bins); + return result; + } + + //------------------------------------------------------ + // Created the new bins to write. + // + + as_bin_space* new_bin_space = NULL; + + // Adjust - the actual number of new bins. 
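A note on the sizing just shown, before the code resumes below: the stack array is provisioned for n_old_bins + m->n_ops entries, the worst case where every op creates a bin, and rd->n_bins is later trimmed to the count actually used. The same pre-size-then-trim idea in isolation (illustrative types):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef struct sketch_bin_s { uint32_t id; } sketch_bin;

    int
    main(void)
    {
        sketch_bin old_bins[] = { { 1 }, { 2 } };
        uint32_t n_old_bins = 2;
        uint32_t n_ops = 3;

        // Worst case: every op creates a new bin.
        uint32_t n_new_bins = n_old_bins + n_ops;

        sketch_bin new_bins[n_new_bins]; // VLA, like the server's stack bins

        memcpy(new_bins, old_bins, n_old_bins * sizeof(sketch_bin));
        memset(new_bins + n_old_bins, 0,
                (n_new_bins - n_old_bins) * sizeof(sketch_bin));

        // ... ops run - say only one bin is actually added ...
        new_bins[n_old_bins] = (sketch_bin){ 3 };
        n_new_bins = n_old_bins + 1; // trim to the count actually used

        printf("%u bins to write\n", n_new_bins);

        return 0;
    }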
+ rd->n_bins = n_new_bins; + + if (n_new_bins != 0) { + new_bins_size = n_new_bins * sizeof(as_bin); + new_bin_space = (as_bin_space*) + cf_malloc_ns(sizeof(as_bin_space) + new_bins_size); + } + else { + if (n_old_bins == 0) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_unwind(old_bins, n_old_bins, new_bins, n_new_bins, cleanup_bins, n_cleanup_bins); + return AS_PROTO_RESULT_FAIL_NOT_FOUND; + } + + if (! validate_delete_durability(tr)) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_unwind(old_bins, n_old_bins, new_bins, n_new_bins, cleanup_bins, n_cleanup_bins); + return AS_PROTO_RESULT_FAIL_FORBIDDEN; + } + + *is_delete = true; + } + + // Pickle before writing - can't fail after. (Historic - now can't fail.) + pickle_all(rd, rw); + + //------------------------------------------------------ + // Write the record to storage. + // + + if ((result = as_storage_record_write(rd)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_record_write() ", ns->name); + + if (new_bin_space) { + cf_free(new_bin_space); + } + + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_unwind(old_bins, n_old_bins, new_bins, n_new_bins, cleanup_bins, n_cleanup_bins); + return -result; + } + + //------------------------------------------------------ + // Success - adjust sindex, looking at old and new bins. + // + + if (record_has_sindex(r, ns) && + write_sindex_update(ns, set_name, &tr->keyd, old_bins, n_old_bins, + new_bins, n_new_bins)) { + tr->flags |= AS_TRANSACTION_FLAG_SINDEX_TOUCHED; + } + + //------------------------------------------------------ + // Cleanup - destroy relevant bins, can't unwind after. + // + + if (record_level_replace) { + destroy_stack_bins(old_bins, n_old_bins); + } + + destroy_stack_bins(cleanup_bins, n_cleanup_bins); + + //------------------------------------------------------ + // Final changes to record data in as_index. + // + + // Fill out new_bin_space. + if (n_new_bins != 0) { + new_bin_space->n_bins = rd->n_bins; + memcpy((void*)new_bin_space->bins, new_bins, new_bins_size); + } + + // Swizzle the index element's as_bin_space pointer. + as_bin_space* old_bin_space = as_index_get_bin_space(r); + + if (old_bin_space) { + cf_free(old_bin_space); + } + + as_index_set_bin_space(r, new_bin_space); + + // Accommodate a new stored key - wasn't needed for pickling and writing. + if (r->key_stored == 0 && rd->key) { + as_record_allocate_key(r, rd->key, rd->key_size); + r->key_stored = 1; + } + + as_storage_record_adjust_mem_stats(rd, memory_bytes); + + return 0; +} + + +int +write_master_ssd_single_bin(as_transaction* tr, as_storage_rd* rd, + bool must_fetch_data, bool increment_generation, rw_request* rw, + bool* is_delete, xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_namespace* ns = tr->rsv.ns; + as_record* r = rd->r; + + rd->ignore_record_on_device = ! must_fetch_data; + rd->n_bins = 1; + + as_bin stack_bin; + + // Set rd->bins! + // For non-data-in-memory: + // - if just created record, or must_fetch_data is false - sets rd->bins to + // empty stack_bin + // - otherwise - sets rd->bins to stack_bin, reads existing record off + // device and populates bin (including particle pointer into block + // buffer) + int result = as_storage_rd_load_bins(rd, &stack_bin); + + if (result < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_rd_load_bins()", ns->name); + return -result; + } + + uint32_t n_old_bins = as_bin_inuse(rd->bins) ? 
1 : 0; + + //------------------------------------------------------ + // Apply changes to metadata in as_index needed for + // response, pickling, and writing. + // + + index_metadata old_metadata; + + write_master_update_index_metadata(tr, increment_generation, &old_metadata, r); + + //------------------------------------------------------ + // Loop over bin ops to affect new bin space, creating + // the new record bin to write. + // + + cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE); + + uint32_t n_new_bins = 0; + + if ((result = write_master_bin_ops(tr, rd, &particles_llb, NULL, NULL, + &rw->response_db, &n_new_bins, dirty_bins)) != 0) { + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return result; + } + + //------------------------------------------------------ + // Created the new bin to write. + // + + if (n_new_bins == 0) { + if (n_old_bins == 0) { + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return AS_PROTO_RESULT_FAIL_NOT_FOUND; + } + + if (! validate_delete_durability(tr)) { + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return AS_PROTO_RESULT_FAIL_FORBIDDEN; + } + + *is_delete = true; + } + + // Pickle before writing - bins may disappear on as_storage_record_close(). + pickle_all(rd, rw); + + //------------------------------------------------------ + // Write the record to storage. + // + + if ((result = as_storage_record_write(rd)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_record_write() ", ns->name); + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return -result; + } + + //------------------------------------------------------ + // Final changes to record data in as_index. + // + + // Accommodate a new stored key - wasn't needed for pickling and writing. + if (r->key_stored == 0 && rd->key) { + r->key_stored = 1; + } + + cf_ll_buf_free(&particles_llb); + + return 0; +} + + +int +write_master_ssd(as_transaction* tr, const char* set_name, as_storage_rd* rd, + bool must_fetch_data, bool record_level_replace, + bool increment_generation, rw_request* rw, bool* is_delete, + xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + as_record* r = rd->r; + bool has_sindex = record_has_sindex(r, ns); + + // If it's not touch or modify, determine if we must read existing record. + if (! must_fetch_data) { + must_fetch_data = has_sindex || ! record_level_replace; + } + + rd->ignore_record_on_device = ! must_fetch_data; + + // Set rd->n_bins! + // For non-data-in-memory: + // - if just created record, or must_fetch_data is false - 0 + // - otherwise - number of bins in existing record + int result = as_storage_rd_load_n_bins(rd); + + if (result < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_rd_load_n_bins()", ns->name); + return -result; + } + + uint32_t n_old_bins = (uint32_t)rd->n_bins; + uint32_t n_new_bins = n_old_bins + m->n_ops; // can't be more than this + + // Needed for as_storage_rd_load_bins() to clear all unused bins. + rd->n_bins = (uint16_t)n_new_bins; + + // Stack space for resulting record's bins. + as_bin old_bins[n_old_bins]; + as_bin new_bins[n_new_bins]; + + // Set rd->bins! 
+ // For non-data-in-memory: + // - if just created record, or must_fetch_data is false - sets rd->bins to + // empty new_bins + // - otherwise - sets rd->bins to new_bins, reads existing record off device + // and populates bins (including particle pointers into block buffer) + if ((result = as_storage_rd_load_bins(rd, new_bins)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_rd_load_bins()", ns->name); + return -result; + } + + //------------------------------------------------------ + // Copy old bins (if any) - which are currently in new + // bins array - to old bins array, for sindex purposes. + // + + if (has_sindex && n_old_bins != 0) { + memcpy(old_bins, new_bins, n_old_bins * sizeof(as_bin)); + + // If it's a replace, clear the new bins array. + if (record_level_replace) { + as_bin_set_all_empty(rd); + } + } + + //------------------------------------------------------ + // Apply changes to metadata in as_index needed for + // response, pickling, and writing. + // + + index_metadata old_metadata; + + write_master_update_index_metadata(tr, increment_generation, &old_metadata, r); + + //------------------------------------------------------ + // Loop over bin ops to affect new bin space, creating + // the new record bins to write. + // + + cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE); + + if ((result = write_master_bin_ops(tr, rd, &particles_llb, NULL, NULL, + &rw->response_db, &n_new_bins, dirty_bins)) != 0) { + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return result; + } + + //------------------------------------------------------ + // Created the new bins to write. + // + + // Adjust - the actual number of new bins. + rd->n_bins = n_new_bins; + + if (n_new_bins == 0) { + if (n_old_bins == 0) { + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return AS_PROTO_RESULT_FAIL_NOT_FOUND; + } + + if (! validate_delete_durability(tr)) { + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return AS_PROTO_RESULT_FAIL_FORBIDDEN; + } + + *is_delete = true; + } + + // Pickle before writing - bins may disappear on as_storage_record_close(). + pickle_all(rd, rw); + + //------------------------------------------------------ + // Write the record to storage. + // + + if ((result = as_storage_record_write(rd)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_record_write() ", ns->name); + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return -result; + } + + //------------------------------------------------------ + // Success - adjust sindex, looking at old and new bins. + // + + if (has_sindex && + write_sindex_update(ns, set_name, &tr->keyd, old_bins, n_old_bins, + new_bins, n_new_bins)) { + tr->flags |= AS_TRANSACTION_FLAG_SINDEX_TOUCHED; + } + + //------------------------------------------------------ + // Final changes to record data in as_index. + // + + // Accommodate a new stored key - wasn't needed for pickling and writing. + if (r->key_stored == 0 && rd->key) { + r->key_stored = 1; + } + + cf_ll_buf_free(&particles_llb); + + return 0; +} + + +//========================================================== +// write_master() - apply record updates. 
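The helpers in this section bracket every bin-op attempt with a metadata snapshot: write_master_update_index_metadata() below saves generation, void-time, and last-update-time before applying, and write_master_index_metadata_unwind() copies them back on failure. In miniature, with a hypothetical struct:

    #include <stdint.h>
    #include <stdio.h>

    // Hypothetical trio of index-metadata fields, as saved/restored below.
    typedef struct sketch_meta_s {
        uint32_t void_time;
        uint64_t last_update_time;
        uint16_t generation;
    } sketch_meta;

    static void
    meta_update(sketch_meta* m, sketch_meta* old, uint64_t now)
    {
        *old = *m; // snapshot for possible unwind

        m->generation++;
        m->last_update_time = now;
    }

    static void
    meta_unwind(const sketch_meta* old, sketch_meta* m)
    {
        *m = *old; // failure path - restore exactly what was saved
    }

    int
    main(void)
    {
        sketch_meta m = { .generation = 4 }, old;

        meta_update(&m, &old, 12345);
        meta_unwind(&old, &m); // pretend the bin ops failed
        printf("generation back to %u\n", (unsigned)m.generation); // 4

        return 0;
    }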
+// + +void +write_master_update_index_metadata(as_transaction* tr, + bool increment_generation, index_metadata* old, as_record* r) +{ + old->void_time = r->void_time; + old->last_update_time = r->last_update_time; + old->generation = r->generation; + + update_metadata_in_index(tr, increment_generation, r); +} + + +int +write_master_bin_ops(as_transaction* tr, as_storage_rd* rd, + cf_ll_buf* particles_llb, as_bin* cleanup_bins, + uint32_t* p_n_cleanup_bins, cf_dyn_buf* db, uint32_t* p_n_final_bins, + xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + as_record* r = rd->r; + bool has_read_all_op = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0; + + as_msg_op* ops[m->n_ops]; + as_bin response_bins[has_read_all_op ? rd->n_bins : m->n_ops]; + as_bin result_bins[m->n_ops]; + + uint32_t n_response_bins = 0; + uint32_t n_result_bins = 0; + + int result = write_master_bin_ops_loop(tr, rd, ops, response_bins, + &n_response_bins, result_bins, &n_result_bins, particles_llb, + cleanup_bins, p_n_cleanup_bins, dirty_bins); + + if (result != 0) { + destroy_stack_bins(result_bins, n_result_bins); + return result; + } + + *p_n_final_bins = as_bin_inuse_count(rd); + + if (n_response_bins == 0) { + // If 'ordered-ops' flag was not set, and there were no read ops or CDT + // ops with results, there's no response to build and send later. + return 0; + } + + as_bin* bins[n_response_bins]; + + for (uint32_t i = 0; i < n_response_bins; i++) { + as_bin* b = &response_bins[i]; + + bins[i] = as_bin_inuse(b) ? b : NULL; + } + + uint32_t generation = r->generation; + uint32_t void_time = r->void_time; + + // Deletes don't return metadata. + if (*p_n_final_bins == 0) { + generation = 0; + void_time = 0; + } + + size_t msg_sz = 0; + uint8_t* msgp = (uint8_t*)as_msg_make_response_msg(AS_PROTO_RESULT_OK, + generation, void_time, has_read_all_op ? NULL : ops, bins, + (uint16_t)n_response_bins, ns, NULL, &msg_sz, + as_transaction_trid(tr)); + + destroy_stack_bins(result_bins, n_result_bins); + + // Stash the message, to be sent later. + db->buf = msgp; + db->is_stack = false; + db->alloc_sz = msg_sz; + db->used_sz = msg_sz; + + return 0; +} + + +int +write_master_bin_ops_loop(as_transaction* tr, as_storage_rd* rd, + as_msg_op** ops, as_bin* response_bins, uint32_t* p_n_response_bins, + as_bin* result_bins, uint32_t* p_n_result_bins, + cf_ll_buf* particles_llb, as_bin* cleanup_bins, + uint32_t* p_n_cleanup_bins, xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0; + + int result; + + as_msg_op* op = NULL; + int i = 0; + + while ((op = as_msg_op_iterate(m, op, &i)) != NULL) { + if (OP_IS_TOUCH(op->op)) { + continue; + } + + if (op->op == AS_MSG_OP_WRITE) { + // AS_PARTICLE_TYPE_NULL means delete the bin. + // TODO - should this even be allowed for single-bin? + if (op->particle_type == AS_PARTICLE_TYPE_NULL) { + int32_t j = as_bin_get_index_from_buf(rd, op->name, op->name_sz); + + if (j != -1) { + if (ns->storage_data_in_memory) { + // Double copy necessary for single-bin, but doing it + // generally for code simplicity. + as_bin cleanup_bin; + as_bin_copy(ns, &cleanup_bin, &rd->bins[j]); + + append_bin_to_destroy(&cleanup_bin, cleanup_bins, p_n_cleanup_bins); + } + + as_bin_set_empty_shift(rd, j); + xdr_fill_dirty_bins(dirty_bins); + } + } + // It's a regular bin write. 
+			else {
+				as_bin* b = as_bin_get_or_create_from_buf(rd, op->name, op->name_sz, &result);
+
+				if (! b) {
+					return result;
+				}
+
+				if (ns->storage_data_in_memory) {
+					as_bin cleanup_bin;
+					as_bin_copy(ns, &cleanup_bin, b);
+
+					if ((result = as_bin_particle_alloc_from_client(b, op)) < 0) {
+						cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_particle_alloc_from_client() ", ns->name);
+						return -result;
+					}
+
+					append_bin_to_destroy(&cleanup_bin, cleanup_bins, p_n_cleanup_bins);
+				}
+				else {
+					if ((result = as_bin_particle_stack_from_client(b, particles_llb, op)) < 0) {
+						cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_particle_stack_from_client() ", ns->name);
+						return -result;
+					}
+				}
+
+				xdr_add_dirty_bin(ns, dirty_bins, (const char*)op->name, op->name_sz);
+			}
+
+			if (respond_all_ops) {
+				ops[*p_n_response_bins] = op;
+				as_bin_set_empty(&response_bins[(*p_n_response_bins)++]);
+			}
+		}
+		// Modify an existing bin value.
+		else if (OP_IS_MODIFY(op->op)) {
+			as_bin* b = as_bin_get_or_create_from_buf(rd, op->name, op->name_sz, &result);
+
+			if (! b) {
+				return result;
+			}
+
+			if (ns->storage_data_in_memory) {
+				as_bin cleanup_bin;
+				as_bin_copy(ns, &cleanup_bin, b);
+
+				if ((result = as_bin_particle_alloc_modify_from_client(b, op)) < 0) {
+					cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_particle_alloc_modify_from_client() ", ns->name);
+					return -result;
+				}
+
+				append_bin_to_destroy(&cleanup_bin, cleanup_bins, p_n_cleanup_bins);
+			}
+			else {
+				if ((result = as_bin_particle_stack_modify_from_client(b, particles_llb, op)) < 0) {
+					cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_particle_stack_modify_from_client() ", ns->name);
+					return -result;
+				}
+			}
+
+			xdr_add_dirty_bin(ns, dirty_bins, (const char*)op->name, op->name_sz);
+
+			if (respond_all_ops) {
+				ops[*p_n_response_bins] = op;
+				as_bin_set_empty(&response_bins[(*p_n_response_bins)++]);
+			}
+		}
+		else if (op_is_read_all(op, m)) {
+			for (uint16_t i = 0; i < rd->n_bins; i++) {
+				as_bin* b = &rd->bins[i];
+
+				if (! as_bin_inuse(b)) {
+					break;
+				}
+
+				// ops array will not be used in this case.
+				as_bin_copy(ns, &response_bins[(*p_n_response_bins)++], b);
+			}
+		}
+		else if (op->op == AS_MSG_OP_READ) {
+			as_bin* b = as_bin_get_from_buf(rd, op->name, op->name_sz);
+
+			if (b) {
+				ops[*p_n_response_bins] = op;
+				as_bin_copy(ns, &response_bins[(*p_n_response_bins)++], b);
+			}
+			else if (respond_all_ops) {
+				ops[*p_n_response_bins] = op;
+				as_bin_set_empty(&response_bins[(*p_n_response_bins)++]);
+			}
+		}
+		else if (op->op == AS_MSG_OP_CDT_MODIFY) {
+			as_bin* b = as_bin_get_or_create_from_buf(rd, op->name, op->name_sz, &result);
+
+			if (! b) {
+				return result;
+			}
+
+			as_bin result_bin;
+			as_bin_set_empty(&result_bin);
+
+			if (ns->storage_data_in_memory) {
+				as_bin cleanup_bin;
+				as_bin_copy(ns, &cleanup_bin, b);
+
+				if ((result = as_bin_cdt_alloc_modify_from_client(b, op, &result_bin)) < 0) {
+					cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_cdt_alloc_modify_from_client() ", ns->name);
+					return -result;
+				}
+
+				// Account for noop CDT operations. Modifying non-mutable
+				// particle contents in-place is still disallowed.
+ if (cleanup_bin.particle != b->particle) { + append_bin_to_destroy(&cleanup_bin, cleanup_bins, p_n_cleanup_bins); + } + } + else { + if ((result = as_bin_cdt_stack_modify_from_client(b, particles_llb, op, &result_bin)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_cdt_stack_modify_from_client() ", ns->name); + return -result; + } + } + + if (respond_all_ops || as_bin_inuse(&result_bin)) { + ops[*p_n_response_bins] = op; + response_bins[(*p_n_response_bins)++] = result_bin; + append_bin_to_destroy(&result_bin, result_bins, p_n_result_bins); + } + + if (! as_bin_inuse(b)) { + // TODO - could do better than finding index from name. + int32_t index = as_bin_get_index_from_buf(rd, op->name, op->name_sz); + + if (index >= 0) { + as_bin_set_empty_shift(rd, (uint32_t)index); + xdr_fill_dirty_bins(dirty_bins); + } + } + else { + xdr_add_dirty_bin(ns, dirty_bins, (const char*)op->name, op->name_sz); + } + } + else if (op->op == AS_MSG_OP_CDT_READ) { + as_bin* b = as_bin_get_from_buf(rd, op->name, op->name_sz); + + if (b) { + as_bin result_bin; + as_bin_set_empty(&result_bin); + + if ((result = as_bin_cdt_read_from_client(b, op, &result_bin)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_cdt_read_from_client() ", ns->name); + return -result; + } + + ops[*p_n_response_bins] = op; + response_bins[(*p_n_response_bins)++] = result_bin; + append_bin_to_destroy(&result_bin, result_bins, p_n_result_bins); + } + else if (respond_all_ops) { + ops[*p_n_response_bins] = op; + as_bin_set_empty(&response_bins[(*p_n_response_bins)++]); + } + } + else { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: unknown bin op %u ", ns->name, op->op); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + + return 0; +} + + +//========================================================== +// write_master() - unwind on failure or cleanup. +// + +void +write_master_index_metadata_unwind(index_metadata* old, as_record* r) +{ + r->void_time = old->void_time; + r->last_update_time = old->last_update_time; + r->generation = old->generation; +} + + +void +write_master_dim_single_bin_unwind(as_bin* old_bin, as_bin* new_bin, + as_bin* cleanup_bins, uint32_t n_cleanup_bins) +{ + as_particle* p_old = as_bin_get_particle(old_bin); + + if (as_bin_is_external_particle(new_bin) && new_bin->particle != p_old) { + as_bin_particle_destroy(new_bin, true); + } + + for (uint32_t i_cleanup = 0; i_cleanup < n_cleanup_bins; i_cleanup++) { + as_bin* b_cleanup = &cleanup_bins[i_cleanup]; + + if (b_cleanup->particle != p_old) { + as_bin_particle_destroy(b_cleanup, true); + } + } + + as_single_bin_copy(new_bin, old_bin); +} + + +void +write_master_dim_unwind(as_bin* old_bins, uint32_t n_old_bins, as_bin* new_bins, + uint32_t n_new_bins, as_bin* cleanup_bins, uint32_t n_cleanup_bins) +{ + for (uint32_t i_new = 0; i_new < n_new_bins; i_new++) { + as_bin* b_new = &new_bins[i_new]; + + if (! as_bin_inuse(b_new)) { + break; + } + + // Embedded particles have no-op destructors - skip loop over old bins. 
+ if (as_bin_is_embedded_particle(b_new)) { + continue; + } + + as_particle* p_new = b_new->particle; + uint32_t i_old; + + for (i_old = 0; i_old < n_old_bins; i_old++) { + as_bin* b_old = &old_bins[i_old]; + + if (b_new->id == b_old->id) { + if (p_new != as_bin_get_particle(b_old)) { + as_bin_particle_destroy(b_new, true); + } + + break; + } + } + + if (i_old == n_old_bins) { + as_bin_particle_destroy(b_new, true); + } + } + + for (uint32_t i_cleanup = 0; i_cleanup < n_cleanup_bins; i_cleanup++) { + as_bin* b_cleanup = &cleanup_bins[i_cleanup]; + as_particle* p_cleanup = b_cleanup->particle; + uint32_t i_old; + + for (i_old = 0; i_old < n_old_bins; i_old++) { + as_bin* b_old = &old_bins[i_old]; + + if (b_cleanup->id == b_old->id) { + if (p_cleanup != as_bin_get_particle(b_old)) { + as_bin_particle_destroy(b_cleanup, true); + } + + break; + } + } + + if (i_old == n_old_bins) { + as_bin_particle_destroy(b_cleanup, true); + } + } + + // The index element's as_bin_space pointer still points at old bins. +} diff --git a/build/VersionCheck.py b/build/VersionCheck.py new file mode 100755 index 00000000..5fffe13e --- /dev/null +++ b/build/VersionCheck.py @@ -0,0 +1,32 @@ +#!/usr/bin/python + +# +# VersionCheck.py: +# Execute the given command, which must output a version of the form: +# +# {..}, where all three fields are non-negative integers and missing components default to 0 +# +# and check against the supplied minimum version components. +# +# Returns 1 if the version is at least the minimum, 0 if not, or else -1 if an error occurs. +# + +import os, sys + +def VersionCheck(command, minVersion): + try: + minVers = minVersion.split('.') + while (len(minVers) < 3): + minVers.append(0) + minMajor, minMinor, minPatch = [int(c) for c in minVers] + vers = os.popen(command).read().strip().split('.') + while (len(vers) < 3): + vers.append(0) + major, minor, patch = [int(c) for c in vers] + return 1 if (major > minMajor or + (major == minMajor and (minor > minMinor or + (minor == minMinor and patch >= minPatch)))) else 0 + except: + return -1 + +sys.stdout.write(str(VersionCheck(*sys.argv[1:3]))) diff --git a/build/gen_version b/build/gen_version new file mode 100755 index 00000000..fdef4958 --- /dev/null +++ b/build/gen_version @@ -0,0 +1,18 @@ +#!/bin/bash + +EDITION=${1:-community} +BUILD_OS=${2:-unknown} +FEATURES="" +if [ $EDITION == enterprise ]; then + FEATURES=";xdr" +fi + +echo "//// +//// AUTOMATICALLY GENERATED BY BUILD SYSTEM +//// +const char aerospike_build_id[] = \"`git describe`\"; +const char aerospike_build_time[] = \"`date`\"; +const char aerospike_build_type[] = \"Aerospike ${EDITION^} Edition\"; +const char aerospike_build_os[] = \"${BUILD_OS}\"; +const char aerospike_build_features[] = \"${FEATURES}\"; +" diff --git a/build/os_version b/build/os_version new file mode 100755 index 00000000..e3d1afb2 --- /dev/null +++ b/build/os_version @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------------------ +# Copyright 2012-2015 Aerospike, Inc. +# +# Portions may be licensed to Aerospike, Inc. under one or more contributor +# license agreements. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. 
You may obtain a copy of
+# the License at http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ------------------------------------------------------------------------------
+
+OPT_LONG=0
+
+if [ "$1" = "-long" ]
+then
+	OPT_LONG=1
+fi
+
+error() {
+	echo 'error:' $* >&2
+}
+
+main() {
+
+	local kernel=''
+	local distro_id=''
+	local distro_version=''
+	local distro_long=''
+	local distro_short=''
+
+	# Make sure this script is running on Linux
+	# The script is not designed to work on non-Linux
+	# operating systems.
+	kernel=$(uname -s | tr '[:upper:]' '[:lower:]')
+	if [ "$kernel" != 'linux' ]
+	then
+		error "$kernel is not supported."
+		exit 1
+	fi
+
+	if [ -f /etc/os-release ]
+	then
+		. /etc/os-release
+		distro_id=${ID,,}
+		distro_version=${VERSION_ID}
+	elif [ -f /etc/issue ]
+	then
+		issue=$(cat /etc/issue | tr '[:upper:]' '[:lower:]')
+		case "$issue" in
+		*'centos'* )
+			distro_id='centos'
+			;;
+		*'redhat'* )
+			distro_id='redhat'
+			;;
+		*'debian'* )
+			distro_id='debian'
+			;;
+		* )
+			error "/etc/issue contained an unsupported linux distribution: $issue"
+			exit 1
+			;;
+		esac
+
+		case "$distro_id" in
+		'centos' | 'redhat' )
+			local release=''
+			if [ -f /etc/centos-release ]; then
+				release=$(cat /etc/centos-release | tr '[:upper:]' '[:lower:]')
+			elif [ -f /etc/redhat-release ]; then
+				release=$(cat /etc/redhat-release | tr '[:upper:]' '[:lower:]')
+			fi
+			release_version=${release##*release}
+			distro_version=${release_version%.*}
+			;;
+		'debian' )
+			debian_version=$(cat /etc/debian_version | tr '[:upper:]' '[:lower:]')
+			distro_version=${debian_version%%.*}
+			;;
+		* )
+			error "/etc/issue contained an unsupported linux distribution: $issue"
+			exit 1
+			;;
+		esac
+	fi
+
+	distro_id=${distro_id//[[:space:]]/}
+	distro_version=${distro_version//[[:space:]]/}
+
+	case "$distro_id" in
+	'centos' | 'redhat' )
+		distro_long="centos${distro_version}"
+		distro_short="el${distro_version}"
+		;;
+	'fedora' )
+		if [ "$distro_version" -gt "15" ]
+		then
+			distro_version=7
+		elif [ "$distro_version" -gt "10" ]
+		then
+			distro_version=6
+		else
+			error "Unsupported linux distribution: $distro_id $distro_version"
+			exit 1
+		fi
+		distro_long="centos${distro_version}"
+		distro_short="el${distro_version}"
+		;;
+	'amzn' )
+		distro_long="ami"
+		distro_short="ami"
+		;;
+	* )
+		distro_long="${distro_id}${distro_version}"
+		distro_short="${distro_id}${distro_version}"
+		;;
+	esac
+
+	if [ "$OPT_LONG" = "1" ]
+	then
+		echo "${distro_long}"
+	else
+		echo "${distro_short}"
+	fi
+	exit 0
+}
+
+main
diff --git a/build/prep-ce b/build/prep-ce
new file mode 100755
index 00000000..5f0b8f83
--- /dev/null
+++ b/build/prep-ce
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# File: build/prep-ce
+# Description: Prepare for building from Community Edition source distribution.
+# Usage: prompt$ build/prep-ce
+#
+# Executing this script replaces the version-related build scripts with versions
+# using frozen version information instead of regenerating it every time.
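For comparison with VersionCheck.py above: the same pad-to-three-components numeric comparison written as a small standalone C program (an illustration, not part of the build):

    #include <stdio.h>
    #include <stdlib.h>

    // Parse up to three dot-separated components; missing ones default to 0.
    static void
    parse3(const char* s, long v[3])
    {
        char* end;

        for (int i = 0; i < 3; i++) {
            v[i] = (*s != '\0') ? strtol(s, &end, 10) : 0;

            if (*s != '\0') {
                s = (*end == '.') ? end + 1 : end;
            }
        }
    }

    // 1 if version >= minimum, else 0.
    static int
    version_at_least(const char* version, const char* minimum)
    {
        long v[3], m[3];

        parse3(version, v);
        parse3(minimum, m);

        for (int i = 0; i < 3; i++) {
            if (v[i] != m[i]) {
                return v[i] > m[i] ? 1 : 0;
            }
        }

        return 1; // equal versions satisfy the minimum
    }

    int
    main(void)
    {
        printf("%d\n", version_at_least("4.9", "4.9.0"));  // 1
        printf("%d\n", version_at_least("4.8.7", "4.9"));  // 0

        return 0;
    }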
+#

+mv build/gen_version{,.ORIG}
+cat > build/gen_version < build/version
+chmod ugo+x build/version
diff --git a/build/version b/build/version
new file mode 100755
index 00000000..2545f25d
--- /dev/null
+++ b/build/version
@@ -0,0 +1,10 @@
+#!/bin/bash

+rev=`git describe`
+subbuild=`echo $rev | awk -F'-' '{print $2}'`

+if [ "$subbuild" != "" ]
+then
+	rev=`echo $rev | awk -F'-' '{printf("%s-%s\n",$1,$2)}'`
+fi
+echo $rev
diff --git a/cf/.gitignore b/cf/.gitignore
new file mode 100644
index 00000000..4c16076a
--- /dev/null
+++ b/cf/.gitignore
@@ -0,0 +1,2 @@
+.DS_Store
+target
\ No newline at end of file
diff --git a/cf/Makefile b/cf/Makefile
new file mode 100644
index 00000000..3acd1a10
--- /dev/null
+++ b/cf/Makefile
@@ -0,0 +1,9 @@
+# Citrusleaf Foundation
+# Makefile

+.PHONY: default
+default: all
+	@echo "done."

+%:
+	$(MAKE) -C src $@
diff --git a/cf/README.md b/cf/README.md
new file mode 100644
index 00000000..05cbd02c
--- /dev/null
+++ b/cf/README.md
@@ -0,0 +1,13 @@
+# Aerospike CF

+Library of objects shared between ASD and XDR.

+## Build

+To build:

+    $ make

+To clean:

+    $ make clean
diff --git a/cf/include/arenax.h b/cf/include/arenax.h
new file mode 100644
index 00000000..53e39d12
--- /dev/null
+++ b/cf/include/arenax.h
@@ -0,0 +1,131 @@
+/*
+ * arenax.h
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */

+#pragma once

+//==========================================================
+// Includes.
+//

+#include
+#include
+#include
+#include
+#include


+//==========================================================
+// Typedefs & constants.
+//

+#define CF_ARENAX_BIGLOCK (1 << 0)
+#define CF_ARENAX_CALLOC (1 << 1)

+#ifndef CF_ARENAX_MAX_STAGES
+#define CF_ARENAX_MAX_STAGES 256
+#endif

+typedef uint64_t cf_arenax_handle;

+// Must be in-sync with internal array ARENAX_ERR_STRINGS[]:
+typedef enum {
+	CF_ARENAX_OK = 0,
+	CF_ARENAX_ERR_BAD_PARAM,
+	CF_ARENAX_ERR_STAGE_CREATE,
+	CF_ARENAX_ERR_STAGE_ATTACH,
+	CF_ARENAX_ERR_STAGE_DETACH,
+	CF_ARENAX_ERR_UNKNOWN
+} cf_arenax_err;

+//------------------------------------------------
+// For enterprise separation only.
+//

+// Element is indexed by 24 bits.
+#define ELEMENT_ID_NUM_BITS 24
+#define ELEMENT_ID_MASK ((1UL << ELEMENT_ID_NUM_BITS) - 1) // 0xFFffff

+#define MAX_STAGE_CAPACITY (1 << ELEMENT_ID_NUM_BITS) // 16 M

+// DO NOT access this member data directly - use the API!
+typedef struct cf_arenax_s {
+	// Configuration (passed in constructors).
+	key_t key_base;
+	uint32_t element_size;
+	uint32_t stage_capacity;
+	uint32_t max_stages;
+	uint32_t flags;

+	// Configuration (derived).
+	size_t stage_size;

+	// Free-element list.
+	cf_arenax_handle free_h;

+	// Where to end-allocate.
+ uint32_t at_stage_id; + uint32_t at_element_id; + + // Thread safety. + pthread_mutex_t lock; + + // Current stages. + uint32_t stage_count; + uint8_t* stages[CF_ARENAX_MAX_STAGES]; +} cf_arenax; + +typedef struct free_element_s { + uint32_t magic; + cf_arenax_handle next_h; +} free_element; + +#define FREE_MAGIC 0xff1234ff + + +//========================================================== +// Public API. +// + +size_t cf_arenax_sizeof(); +const char* cf_arenax_errstr(cf_arenax_err err); + +void cf_arenax_init(cf_arenax* arena, key_t key_base, uint32_t element_size, + uint32_t stage_capacity, uint32_t max_stages, uint32_t flags); + +cf_arenax_handle cf_arenax_alloc(cf_arenax* arena); +void cf_arenax_free(cf_arenax* arena, cf_arenax_handle h); + +void* cf_arenax_resolve(cf_arenax* arena, cf_arenax_handle h); + + +//========================================================== +// Private API - for enterprise separation only. +// + +static inline void +cf_arenax_set_handle(cf_arenax_handle* h, uint32_t stage_id, + uint32_t element_id) +{ + *h = ((uint64_t)stage_id << ELEMENT_ID_NUM_BITS) | element_id; +} + +cf_arenax_err cf_arenax_add_stage(cf_arenax* arena); diff --git a/cf/include/bits.h b/cf/include/bits.h new file mode 100644 index 00000000..458cf179 --- /dev/null +++ b/cf/include/bits.h @@ -0,0 +1,80 @@ +/* + * bits.h + * + * Copyright (C) 2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + + +//========================================================== +// Public API. +// + +// Position of most significant bit, 0 ... 63 from low to high. -1 for value 0. +static inline int +cf_msb(uint64_t value) +{ + int n = -1; + + while (value != 0) { + value >>= 1; + n++; + } + + return n; +} + +// Returns number of trailing zeros in a uint64_t, 64 for x == 0. +static inline uint32_t +cf_lsb64(uint64_t x) +{ + if (x == 0) { + return 64; + } + + return (uint32_t)__builtin_ctzll(x); +} + +// Returns number of leading zeros in a uint64_t, 64 for x == 0. +static inline uint32_t +cf_msb64(uint64_t x) +{ + if (x == 0) { + return 64; + } + + return (uint32_t)__builtin_clzll(x); +} + +static inline uint32_t +cf_bit_count64(uint64_t x) +{ + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + + return (uint32_t)((x * 0x0101010101010101) >> 56); +} diff --git a/cf/include/cf_mutex.h b/cf/include/cf_mutex.h new file mode 100644 index 00000000..eaced35e --- /dev/null +++ b/cf/include/cf_mutex.h @@ -0,0 +1,63 @@ +/* + * cf_mutex.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. 
under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */

+#pragma once


+//==========================================================
+// Includes.
+//

+#include
+#include


+//==========================================================
+// Typedefs & constants.
+//

+typedef struct cf_mutex_s {
+	uint32_t u32;
+} cf_mutex __attribute__ ((aligned(4)));

+typedef struct cf_condition_s {
+	uint32_t seq;
+} cf_condition __attribute__ ((aligned(4)));

+#define CF_MUTEX_INIT { 0 }
+#define cf_mutex_init(__m) (__m)->u32 = 0
+#define cf_mutex_destroy(__m) // no-op


+//==========================================================
+// Public API.
+//

+void cf_mutex_lock(cf_mutex *m);
+void cf_mutex_unlock(cf_mutex *m);
+bool cf_mutex_trylock(cf_mutex *m);

+void cf_mutex_lock_spin(cf_mutex *m);
+void cf_mutex_unlock_spin(cf_mutex *m);

+void cf_condition_wait(cf_condition *c, cf_mutex *m);
+void cf_condition_signal(cf_condition *c);
diff --git a/cf/include/cf_str.h b/cf/include/cf_str.h
new file mode 100644
index 00000000..7feb6a36
--- /dev/null
+++ b/cf/include/cf_str.h
@@ -0,0 +1,73 @@
+/*
+ * cf_str.h
+ *
+ * Copyright (C) 2008-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */

+#pragma once

+#include

+// These functions convert integers into a string, writing into the provided
+// buffer, and return the number of bytes written.
+unsigned int cf_str_itoa(int value, char *s, int radix);
+unsigned int cf_str_itoa_u64(uint64_t value, char *s, int radix);
+unsigned int cf_str_itoa_u32(uint32_t value, char *s, int radix);

+// These functions convert a string to various integer types, and return 0 on
+// success.
+int cf_str_atoi(char *s, int *value);
+int cf_str_atoi_u32(char *s, uint32_t *value);
+int cf_str_atoi_64(char *s, int64_t *value);
+int cf_str_atoi_u64(char *s, uint64_t *value);
+int cf_str_atoi_x64(const char *s, uint64_t *value);
+int cf_str_atoi_seconds(char *s, uint64_t *value);

+// And this does the same, with radix.
+int cf_str_atoi_u64_x(char *s, uint64_t *value, int radix); + +// Split the string 'str' based on input breaks in 'fmt'. +// - The splitting is destructive. +// - The pointers will be added to the end of vector '*v'. +// - The vector better be created with object size 'void *'. +struct cf_vector_s; +extern void cf_str_split(char *fmt, char *str, struct cf_vector_s *v); + +static inline int +cf_str_strnchr(uint8_t *s, int sz, int c) +{ + for (int i = 0; i < sz; i++) { + if (s[i] == c) { + return i; + } + } + return -1; +} + +static inline const char * +cf_str_safe_as_empty(const char *s) +{ + return s ? s : ""; +} + +static inline const char * +cf_str_safe_as_null(const char *s) +{ + return s ? s : "null"; +} diff --git a/cf/include/compare.h b/cf/include/compare.h new file mode 100644 index 00000000..f2066806 --- /dev/null +++ b/cf/include/compare.h @@ -0,0 +1,52 @@ +/* + * compare.h + * + * Copyright (C) 2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + + +//========================================================== +// Public API - qsort() comparators. +// + +static inline int +cf_compare_uint64_desc(const void* pa, const void* pb) +{ + uint64_t a = *(const uint64_t*)pa; + uint64_t b = *(const uint64_t*)pb; + + return a > b ? -1 : (a == b ? 0 : 1); +} + +static inline int +cf_compare_uint32_desc(const void* pa, const void* pb) +{ + uint32_t a = *(const uint32_t*)pa; + uint32_t b = *(const uint32_t*)pb; + + return a > b ? -1 : (a == b ? 0 : 1); +} diff --git a/cf/include/daemon.h b/cf/include/daemon.h new file mode 100644 index 00000000..cb325bc8 --- /dev/null +++ b/cf/include/daemon.h @@ -0,0 +1,30 @@ +/* + * daemon.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include + +void cf_process_daemonize(int *fd_ignore_list, int list_size); +void cf_process_privsep(uid_t uid, gid_t gid); +void cf_process_holdcap(void); +void cf_process_clearcap(void); diff --git a/cf/include/dynbuf.h b/cf/include/dynbuf.h new file mode 100644 index 00000000..5c6fd93b --- /dev/null +++ b/cf/include/dynbuf.h @@ -0,0 +1,126 @@ +/* + * dynbuf.h + * + * Copyright (C) 2009 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * A simple dynamic buffer implementation + * Allows the first, simpler part of the buffer to be on the stack + * which is usually all that's needed + * + */ + +#pragma once + +#include +#include +#include + +typedef struct cf_dyn_buf_s { + uint8_t *buf; + bool is_stack; + size_t alloc_sz; + size_t used_sz; +} cf_dyn_buf; + +#define cf_dyn_buf_define(__x) uint8_t dyn_buf##__x[1024]; cf_dyn_buf __x = { dyn_buf##__x, true, 1024, 0 } +#define cf_dyn_buf_define_size(__x, __sz) uint8_t dyn_buf##__x[__sz]; cf_dyn_buf __x = { dyn_buf##__x, true, __sz, 0 } + +extern void cf_dyn_buf_init_heap(cf_dyn_buf *db, size_t sz); +extern void cf_dyn_buf_reserve(cf_dyn_buf *db, size_t sz, uint8_t **from); +extern void cf_dyn_buf_append_string(cf_dyn_buf *db, const char *s); +extern void cf_dyn_buf_append_char(cf_dyn_buf *db, char c); +extern void cf_dyn_buf_append_bool(cf_dyn_buf *db, bool b); +extern void cf_dyn_buf_append_buf(cf_dyn_buf *db, uint8_t *buf, size_t sz); +extern void cf_dyn_buf_append_int(cf_dyn_buf *db, int i); +extern void cf_dyn_buf_append_uint64_x(cf_dyn_buf *db, uint64_t i); // HEX FORMAT! 
+extern void cf_dyn_buf_append_uint64(cf_dyn_buf *db, uint64_t i); +extern void cf_dyn_buf_append_uint32(cf_dyn_buf *db, uint32_t i); +extern void cf_dyn_buf_chomp(cf_dyn_buf *db); +extern char *cf_dyn_buf_strdup(cf_dyn_buf *db); +extern void cf_dyn_buf_free(cf_dyn_buf *db); + +// Helpers to append name value pairs to a cf_dyn_buf in pattern: name=value; +void info_append_bool(cf_dyn_buf *db, const char *name, bool value); +void info_append_int(cf_dyn_buf *db, const char *name, int value); +void info_append_string(cf_dyn_buf *db, const char *name, const char *value); +void info_append_string_safe(cf_dyn_buf *db, const char *name, const char *value); +void info_append_uint32(cf_dyn_buf *db, const char *name, uint32_t value); +void info_append_uint64(cf_dyn_buf *db, const char *name, uint64_t value); +void info_append_uint64_x(cf_dyn_buf *db, const char *name, uint64_t value); + +typedef struct cf_buf_builder_s { + size_t alloc_sz; + size_t used_sz; + uint8_t buf[]; +} cf_buf_builder; + +extern cf_buf_builder *cf_buf_builder_create(); +extern cf_buf_builder *cf_buf_builder_create_size(size_t sz); +extern void cf_buf_builder_free(cf_buf_builder *bb); +extern void cf_buf_builder_reset(cf_buf_builder *bb); +extern void cf_buf_builder_chomp(cf_buf_builder *bb_r); +// If you use any binary components, this strdup thing is a bad idea: +extern char *cf_buf_builder_strdup(cf_buf_builder *bb_r); + +extern void cf_buf_builder_append_string(cf_buf_builder **bb_r, const char *s); +extern void cf_buf_builder_append_char(cf_buf_builder **bb_r, char c); +extern void cf_buf_builder_append_buf(cf_buf_builder **bb_r, uint8_t *buf, size_t sz); +// These append ASCII versions: +extern void cf_buf_builder_append_ascii_uint64_x(cf_buf_builder **bb_r, uint64_t i); // HEX FORMAT! +extern void cf_buf_builder_append_ascii_uint64(cf_buf_builder **bb_r, uint64_t i); +extern void cf_buf_builder_append_ascii_uint32(cf_buf_builder **bb_r, uint32_t i); +extern void cf_buf_builder_append_ascii_int(cf_buf_builder **bb_r, int i); +// These append network-order bytes: +extern void cf_buf_builder_append_uint64(cf_buf_builder **bb_r, uint64_t i); +extern void cf_buf_builder_append_uint32(cf_buf_builder **bb_r, uint32_t i); +extern void cf_buf_builder_append_uint16(cf_buf_builder **bb_r, uint16_t i); +extern void cf_buf_builder_append_uint8(cf_buf_builder **bb_r, uint8_t i); +// Reserve the bytes and give me the handle to the spot reserved: +extern void cf_buf_builder_reserve(cf_buf_builder **bb_r, int sz, uint8_t **buf); +extern int cf_buf_builder_size(cf_buf_builder *bb); +extern size_t get_new_size(int alloc, int used, int requested); + +// TODO - We've only implemented a few cf_ll_buf methods for now. We'll add more +// functionality if and when it's needed. 
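+
+// A quick sketch of typical cf_dyn_buf usage (illustrative only; assumes
+// cf_dyn_buf_chomp() trims the last byte, and that cf_dyn_buf_free() releases
+// the buffer only if it has grown off the stack onto the heap):
+//
+//	cf_dyn_buf_define(db);                 // db starts as a 1K stack buffer
+//	cf_dyn_buf_append_string(&db, "objects=");
+//	cf_dyn_buf_append_uint64(&db, 1234);
+//	cf_dyn_buf_append_char(&db, ';');
+//	cf_dyn_buf_chomp(&db);                 // drop the trailing ';'
+//	char *s = cf_dyn_buf_strdup(&db);      // null-terminated heap copy
+//	cf_free(s);
+//	cf_dyn_buf_free(&db);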
+ +typedef struct cf_ll_buf_stage_s { + struct cf_ll_buf_stage_s *next; + size_t buf_sz; + size_t used_sz; + uint8_t buf[]; +} cf_ll_buf_stage; + +typedef struct cf_ll_buf_s { + bool head_is_stack; + cf_ll_buf_stage *head; + cf_ll_buf_stage *tail; +} cf_ll_buf; + +#define cf_ll_buf_define(__x, __sz) \ + uint8_t llb_stage##__x[sizeof(cf_ll_buf_stage) + __sz]; \ + cf_ll_buf_stage* ll_buf_stage##__x = (cf_ll_buf_stage*)llb_stage##__x; \ + ll_buf_stage##__x->next = NULL; \ + ll_buf_stage##__x->buf_sz = __sz; \ + ll_buf_stage##__x->used_sz = 0; \ + cf_ll_buf __x = { true, ll_buf_stage##__x, ll_buf_stage##__x } + +extern void cf_ll_buf_reserve(cf_ll_buf *llb, size_t sz, uint8_t **from); +extern void cf_ll_buf_free(cf_ll_buf *llb); diff --git a/cf/include/enhanced_alloc.h b/cf/include/enhanced_alloc.h new file mode 100644 index 00000000..cb6b49fb --- /dev/null +++ b/cf/include/enhanced_alloc.h @@ -0,0 +1,126 @@ +/* + * enhanced_alloc.h + * + * Copyright (C) 2013-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" + + +//========================================================== +// Typedefs & constants. +// + +typedef struct cf_rc_header_s { + cf_atomic32 rc; + uint32_t sz; +} cf_rc_header; + +typedef enum { + CF_ALLOC_DEBUG_NONE, + CF_ALLOC_DEBUG_TRANSIENT, + CF_ALLOC_DEBUG_PERSISTENT, + CF_ALLOC_DEBUG_ALL +} cf_alloc_debug; + + +//========================================================== +// Public API - arena management and stats. +// + +extern __thread int32_t g_ns_arena; + +void cf_alloc_init(void); +void cf_alloc_set_debug(cf_alloc_debug debug); +int32_t cf_alloc_create_arena(void); + +#define CF_ALLOC_SET_NS_ARENA(_ns) \ + (g_ns_arena = _ns->storage_data_in_memory ? _ns->jem_arena : -1) + +static inline int32_t +cf_alloc_clear_ns_arena(void) +{ + int32_t old_arena = g_ns_arena; + g_ns_arena = -1; + return old_arena; +} + +static inline void +cf_alloc_restore_ns_arena(int32_t old_arena) +{ + g_ns_arena = old_arena; +} + +void cf_alloc_heap_stats(size_t *allocated_kbytes, size_t *active_kbytes, size_t *mapped_kbytes, double *efficiency_pct, uint32_t *site_count); +void cf_alloc_log_stats(const char *file, const char *opts); +void cf_alloc_log_site_infos(const char *file); + + +//========================================================== +// Public API - ordinary allocation. +// + +// Don't call these directly - use wrappers below. 
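+//
+// For instance, a namespace-arena allocation through the wrappers might look
+// like this (an illustrative sketch; 'ns' stands in for an object carrying
+// storage_data_in_memory and jem_arena fields):
+//
+//	CF_ALLOC_SET_NS_ARENA(ns);       // route cf_*_ns() calls to ns's arena
+//	uint8_t *p = cf_malloc_ns(128);  // arena-backed allocation
+//	cf_free(p);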
+void *cf_alloc_try_malloc(size_t sz); +void *cf_alloc_malloc_arena(size_t sz, int32_t arena); +void *cf_alloc_calloc_arena(size_t n, size_t sz, int32_t arena); +void *cf_alloc_realloc_arena(void *p, size_t sz, int32_t arena); + +#define cf_try_malloc(_sz) cf_alloc_try_malloc(_sz) + +#define cf_malloc(_sz) malloc(_sz) +#define cf_malloc_ns(_sz) cf_alloc_malloc_arena(_sz, g_ns_arena) + +#define cf_calloc(_n, _sz) calloc(_n, _sz) +#define cf_calloc_ns(_n, _sz) cf_alloc_calloc_arena(_n, _sz, g_ns_arena) + +#define cf_realloc(_p, _sz) realloc(_p, _sz) +#define cf_realloc_ns(_p, _sz) cf_alloc_realloc_arena(_p, _sz, g_ns_arena) + +#define cf_valloc(_sz) valloc(_sz) + +#define cf_strdup(_s) strdup(_s) +#define cf_strndup(_s, _n) strndup(_s, _n) +#define cf_asprintf(_s, _f, ...) asprintf(_s, _f, __VA_ARGS__) + +#define cf_free(_p) free(_p) + + +//========================================================== +// Public API - reference-counted allocation. +// + +void *cf_rc_alloc(size_t sz); +void cf_rc_free(void *p); + +int32_t cf_rc_count(const void *p); +int32_t cf_rc_reserve(void *p); +int32_t cf_rc_release(void *p); +int32_t cf_rc_releaseandfree(void *p); diff --git a/cf/include/fault.h b/cf/include/fault.h new file mode 100644 index 00000000..d0fc3c8c --- /dev/null +++ b/cf/include/fault.h @@ -0,0 +1,434 @@ +/* + * fault.h + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "dynbuf.h" + + +// Use COMPILER_ASSERT() for compile-time verification. +// +// Usage does not add any compiled code, or cost anything at runtime. When the +// evaluated expression is false, it causes a compile error which will draw +// attention to the relevant line. +// +// e.g. +// COMPILER_ASSERT(sizeof(my_int_array) / sizeof(int) == MY_INT_ARRAY_SIZE); +// +#define CGLUE(a, b) a##b +#define CVERIFY(expr, line) typedef char CGLUE(compiler_assert_failed_on_line_, line)[(expr) ? 1 : -1] +#define COMPILER_ASSERT(expr) CVERIFY(expr, __LINE__) + +// Use CF_MUST_CHECK with declarations to force caller to handle return value. +// +// e.g. +// CF_MUST_CHECK int my_function(); +// +#define CF_MUST_CHECK __attribute__((warn_unused_result)) + +// Use CF_IGNORE_ERROR() as caller to override CF_MUST_CHECK in declaration. +// +// e.g. +// CF_IGNORE_ERROR(my_function()); +// +#define CF_IGNORE_ERROR(x) ((void)((x) == 12345)) + +// Use CF_NEVER_FAILS() as caller to assert that returned value is not negative. +// +// e.g. +// CF_NEVER_FAILS(my_function()); +// +#define CF_NEVER_FAILS(x) \ +do { \ + if ((x) < 0) { \ + cf_crash(CF_MISC, "this cannot happen..."); \ + } \ +} while (false); + +// Use CF_ZSTR_DEFINE() to null-terminate strings conveniently. +// +// e.g. 
+// CF_ZSTR_DEFINE(zstr, 40, ns_name, name_sz); +// cf_warning(AS_NAMESPACE, "got namespace %s", zstr); +// +#define CF_ZSTR_DEFINE(zstr, max_sz, str, sz) \ + char zstr[max_sz]; \ + size_t zstr##len = sz < max_sz ? sz : max_sz - 1; \ + memcpy(zstr, str, zstr##len); \ + zstr[zstr##len] = 0; + +// Use CF_ZSTRxx() to null-terminate strings conveniently. Useful especially as +// cf_detail & cf_debug parameters where there's no cost unless the log level +// is enabled. (Cost may be more than CF_ZSTR_DEFINE() due to copying struct on +// function return.) +// +// e.g. +// cf_debug(AS_NAMESPACE, "got namespace %s", CF_ZSTR64(ns_name, name_sz)); +// + +typedef struct cf_zstr64_s { + char s[64]; +} cf_zstr64; + +typedef struct cf_zstr1k_s { + char s[1024]; +} cf_zstr1k; + +static inline cf_zstr64 +cf_null_terminate_64(const char *str, size_t sz) +{ + cf_zstr64 zstr; + size_t len = sz < sizeof(zstr.s) ? sz : sizeof(zstr.s) - 1; + + memcpy(zstr.s, str, len); + zstr.s[len] = 0; + + return zstr; +} + +static inline cf_zstr1k +cf_null_terminate_1k(const char *str, size_t sz) +{ + cf_zstr1k zstr; + size_t len = sz < sizeof(zstr.s) ? sz : sizeof(zstr.s) - 1; + + memcpy(zstr.s, str, len); + zstr.s[len] = 0; + + return zstr; +} + +#define CF_ZSTR64(str, sz) (cf_null_terminate_64((const char *)str, sz).s) +#define CF_ZSTR1K(str, sz) (cf_null_terminate_1k((const char *)str, sz).s) + + +/* SYNOPSIS + * Fault scoping + * + * Faults are identified by a context and severity. The context describes where + * the fault occurred, and the severity determines the required action. + * + * Examples: + * cf_info(CF_MISC, "important message: %s", my_msg); + * cf_crash(CF_MISC, "doom!"); + * cf_assert(my_test, CF_MISC, "gloom!"); + */ + +/* cf_fault_context + * NB: if you add or remove entries from this enum, you must also change + * the corresponding strings structure in fault.c */ +typedef enum { + CF_MISC, + + CF_ALLOC, + CF_ARENAX, + CF_HARDWARE, + CF_MSG, + CF_RBUFFER, + CF_SOCKET, + CF_TLS, + CF_VMAPX, + + AS_AGGR, + AS_APPEAL, + AS_AS, + AS_BATCH, + AS_BIN, + AS_CFG, + AS_CLUSTERING, + AS_COMPRESSION, + AS_DEMARSHAL, + AS_DRV_SSD, + AS_EXCHANGE, + AS_FABRIC, + AS_GEO, + AS_HB, + AS_HLC, + AS_INDEX, + AS_INFO, + AS_INFO_PORT, + AS_JOB, + AS_MIGRATE, + AS_MON, + AS_NAMESPACE, + AS_NSUP, + AS_PARTICLE, + AS_PARTITION, + AS_PAXOS, + AS_PREDEXP, + AS_PROTO, + AS_PROXY, + AS_PROXY_DIVERT, // special detail context + AS_QUERY, + AS_RECORD, + AS_ROSTER, + AS_RW, + AS_RW_CLIENT, // special detail context + AS_SCAN, + AS_SECURITY, + AS_SINDEX, + AS_SKEW, + AS_SMD, + AS_STORAGE, + AS_TRUNCATE, + AS_TSVC, + AS_UDF, + AS_XDR, + CF_FAULT_CONTEXT_UNDEF +} cf_fault_context; + +extern char *cf_fault_context_strings[]; + +/* cf_fault_severity + * CRITICAL fatal runtime panics + * WARNING runtime errors + * INFO informational or advisory messages + * DEBUG debugging messages + * DETAIL detailed debugging messages + */ +typedef enum { + CF_CRITICAL = 0, + CF_WARNING = 1, + CF_INFO = 2, + CF_DEBUG = 3, + CF_DETAIL = 4, + CF_FAULT_SEVERITY_UNDEF = 5 +} cf_fault_severity; + +/* cf_fault_sink + * An endpoint (sink) for a flow of fault messages */ +typedef struct cf_fault_sink { + int fd; + char *path; + int limit[CF_FAULT_CONTEXT_UNDEF]; +} cf_fault_sink; + +#define CF_FAULT_SINKS_MAX 8 + +/** + * When we want to dump out some binary data (like a digest, a bit string + * or a buffer), we want to be able to specify how we'll display the data. + * We expect this list to grow over time, as more binary representations + * are needed. 
(2014_03_20 tjl).
+ */
+typedef enum {
+	CF_DISPLAY_HEX_DIGEST,	// Show Special Case DIGEST in Packed Hex
+	CF_DISPLAY_HEX_SPACED,	// Show binary value in regular spaced hex
+	CF_DISPLAY_HEX_PACKED,	// Show binary value in packed hex
+	CF_DISPLAY_HEX_COLUMNS,	// Show binary value in Column Oriented Hex
+	CF_DISPLAY_BASE64,		// Show binary value in Base64
+	CF_DISPLAY_BITS_SPACED,	// Show binary value in a spaced bit string
+	CF_DISPLAY_BITS_COLUMNS	// Show binary value in Column Oriented Bits
+} cf_display_type;


+/* Function declarations */

+// note: passing a null sink sets the context for all currently known sinks
+extern int cf_fault_sink_addcontext(cf_fault_sink *s, char *context, char *severity);
+extern cf_fault_sink *cf_fault_sink_add(char *path);

+extern cf_fault_sink *cf_fault_sink_hold(char *path);
+extern bool cf_fault_console_is_held();
+extern int cf_fault_sink_activate_all_held();
+extern int cf_fault_sink_get_fd_list(int *fds);

+extern int cf_fault_sink_strlist(cf_dyn_buf *db); // pack all contexts into a string - using ids
+extern int cf_fault_sink_context_all_strlist(int sink_id, cf_dyn_buf *db);
+extern int cf_fault_sink_context_strlist(int sink_id, char *context, cf_dyn_buf *db);

+extern cf_fault_sink *cf_fault_sink_get_id(int id);

+extern void cf_fault_sink_logroll(void);

+extern void cf_fault_use_local_time(bool val);
+extern bool cf_fault_is_using_local_time();

+extern void cf_fault_log_millis(bool log_millis);
+extern bool cf_fault_is_logging_millis();

+// TODO: Rework cf_display_type-based logging to have a more useful
+// output format, instead of having this separate function.
+extern void cf_fault_hex_dump(const char *title, const void *data, size_t len);

+extern cf_fault_severity cf_fault_filter[];

+// Define the mechanism that we'll use to write into the Server Log.
+// cf_fault_event() is "regular" logging.
+extern void cf_fault_event(const cf_fault_context,
+		const cf_fault_severity severity, const char *file_name,
+		const int line, const char *msg, ...)
+		__attribute__ ((format (printf, 5, 6)));

+// cf_fault_event2() is for advanced logging, where we want to print some
+// binary object (often a digest).
+extern void cf_fault_event2(const cf_fault_context,
+		const cf_fault_severity severity, const char *file_name, const int line,
+		const void *mem_ptr, size_t len, cf_display_type dt, const char *msg, ...)
+		__attribute__ ((format (printf, 8, 9)));

+extern void cf_fault_event_nostack(const cf_fault_context,
+		const cf_fault_severity severity, const char *fn, const int line,
+		const char *msg, ...)
+		__attribute__ ((format (printf, 5, 6)));

+// For now there's only one cache, dumped by the ticker.
+extern void cf_fault_cache_event(cf_fault_context context,
+		cf_fault_severity severity, const char *file_name, int line,
+		char *msg, ...)
+		__attribute__ ((format (printf, 5, 6)));

+// This is ONLY to keep Eclipse happy without having to tell it __FILENAME__ is
+// defined. The make process will define it via the -D mechanism.
+#ifndef __FILENAME__
+#define __FILENAME__ ""
+#endif

+// The "regular" version.
+#define cf_assert(a, context, __msg, ...) \
+	((a) ? (void)0 : \
+		cf_fault_event((context), CF_CRITICAL, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__))

+// The "no stack" versions.
+#define cf_assert_nostack(a, context, __msg, ...) \
+	((a) ? (void)0 : \
+		cf_fault_event_nostack((context), CF_CRITICAL, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__))
+#define cf_crash_nostack(context, __msg, ...) \
+	cf_fault_event_nostack((context), CF_CRITICAL, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__)

+#define MAX_BACKTRACE_DEPTH 50

+// This must literally be the direct clib "free()", because "strings" is
+// allocated by "backtrace_symbols()".
+#define PRINT_STACKTRACE() \
+do { \
+	void *bt[MAX_BACKTRACE_DEPTH]; \
+	int sz = backtrace(bt, MAX_BACKTRACE_DEPTH); \
+	cf_fault_event(AS_AS, CF_WARNING, __FILENAME__, __LINE__, "stacktrace: found %d frames", sz); \
+	char **strings = backtrace_symbols(bt, sz); \
+	if (strings) { \
+		for (int i = 0; i < sz; i++) { \
+			cf_fault_event(AS_AS, CF_WARNING, __FILENAME__, __LINE__, "stacktrace: frame %d: %s", i, strings[i]); \
+		} \
+		free(strings); \
+	} \
+	else { \
+		cf_fault_event(AS_AS, CF_WARNING, __FILENAME__, __LINE__, "stacktrace: found no symbols"); \
+	} \
+} while (0);

+#define PRINT_CALL_STACK(severity) \
+do { \
+	void *bt[MAX_BACKTRACE_DEPTH]; \
+	int sz = backtrace(bt, MAX_BACKTRACE_DEPTH); \
+	cf_fault_event(AS_AS, severity, __FILENAME__, __LINE__, "call stack: found %d frames", sz); \
+	char **strings = backtrace_symbols(bt, sz); \
+	if (strings) { \
+		for (int i = 0; i < sz; i++) { \
+			cf_fault_event(AS_AS, severity, __FILENAME__, __LINE__, "call stack: frame %d: %s", i, strings[i]); \
+		} \
+		free(strings); \
+	} \
+	else { \
+		cf_fault_event(AS_AS, severity, __FILENAME__, __LINE__, "call stack: found no symbols"); \
+	} \
+} while (0);

+// The "regular" versions.
+#define __SEVLOG(severity, context, __msg, ...) \
+	(severity > cf_fault_filter[context] ? \
+		(void)0 : \
+		cf_fault_event((context), severity, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__))

+#define cf_crash(context, __msg, ...) \
+	cf_fault_event((context), CF_CRITICAL, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__)

+#define cf_warning(...) __SEVLOG(CF_WARNING, ##__VA_ARGS__)
+#define cf_info(...) __SEVLOG(CF_INFO, ##__VA_ARGS__)
+#define cf_debug(...) __SEVLOG(CF_DEBUG, ##__VA_ARGS__)
+#define cf_detail(...) __SEVLOG(CF_DETAIL, ##__VA_ARGS__)

+// In addition to the existing LOG calls, we will now add a new mechanism
+// that adds the ability to print out a BINARY ARRAY, in a general manner, at
+// the end of the passed in PRINT STRING.
+// This is a general mechanism that can be used to express a binary array as
+// a hex or Base64 value, but we'll often use it to print a full Digest Value,
+// in either Hex format or Base64 format.
+#define __BINARY_SEVLOG(severity, context, ptr, len, DT, __msg, ...) \
+	(severity > cf_fault_filter[context] ? \
+		(void)0 : \
+		cf_fault_event2((context), severity, __FILENAME__, __LINE__, ptr, len, DT, (__msg), ##__VA_ARGS__))

+#define cf_crash_binary(context, ptr, len, DT, __msg, ...) \
+	cf_fault_event2((context), CF_CRITICAL, __FILENAME__, __LINE__, ptr, len, DT, (__msg), ##__VA_ARGS__)

+#define cf_warning_binary(...) __BINARY_SEVLOG(CF_WARNING, ##__VA_ARGS__)
+#define cf_info_binary(...) __BINARY_SEVLOG(CF_INFO, ##__VA_ARGS__)
+#define cf_debug_binary(...) __BINARY_SEVLOG(CF_DEBUG, ##__VA_ARGS__)
+#define cf_detail_binary(...) __BINARY_SEVLOG(CF_DETAIL, ##__VA_ARGS__)

+// This set of log calls specifically handles DIGEST values.
+#define __DIGEST_SEVLOG(severity, context, ptr,__msg, ...) \
+	(severity > cf_fault_filter[context] ? \
+		(void)0 : \
+		cf_fault_event2((context), severity, __FILENAME__, __LINE__, ptr, 20, CF_DISPLAY_HEX_DIGEST, (__msg), ##__VA_ARGS__))

+#define cf_crash_digest(context, ptr,__msg, ...)
\ + cf_fault_event2((context), CF_CRITICAL, __FILENAME__, __LINE__, ptr, 20, CF_DISPLAY_HEX_DIGEST, (__msg), ##__VA_ARGS__) + +#define cf_warning_digest(...) __DIGEST_SEVLOG(CF_WARNING, ##__VA_ARGS__) +#define cf_info_digest(...) __DIGEST_SEVLOG(CF_INFO, ##__VA_ARGS__) +#define cf_debug_digest(...) __DIGEST_SEVLOG(CF_DEBUG, ##__VA_ARGS__) +#define cf_detail_digest(...) __DIGEST_SEVLOG(CF_DETAIL, ##__VA_ARGS__) + +// _GNU_SOURCE gives us a strerror_r() that returns (char *). +#define cf_strerror(err) strerror_r(err, (char *)alloca(200), 200) + +/* cf_context_at_severity + * Return whether the given context is set to this severity level or higher. */ +extern bool cf_context_at_severity(const cf_fault_context context, const cf_fault_severity severity); + +extern void cf_fault_init(); + +int generate_packed_hex_string(const void *mem_ptr, uint32_t len, char* output); + +// For now there's only one cache, dumped by the ticker. +extern void cf_fault_dump_cache(); + +#define cf_dump_ticker_cache() cf_fault_dump_cache() + +#define __CACHE_SEVLOG(severity, context, __msg, ...) \ + (severity > cf_fault_filter[context] ? \ + (void)0 : \ + cf_fault_cache_event((context), severity, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__)) + +#define cf_ticker_warning(...) __CACHE_SEVLOG(CF_WARNING, ##__VA_ARGS__) +#define cf_ticker_info(...) __CACHE_SEVLOG(CF_INFO, ##__VA_ARGS__) +#define cf_ticker_debug(...) __CACHE_SEVLOG(CF_DEBUG, ##__VA_ARGS__) +#define cf_ticker_detail(...) __CACHE_SEVLOG(CF_DETAIL, ##__VA_ARGS__) diff --git a/cf/include/hardware.h b/cf/include/hardware.h new file mode 100644 index 00000000..87ac526b --- /dev/null +++ b/cf/include/hardware.h @@ -0,0 +1,56 @@ +/* + * hardware.h + * + * Copyright (C) 2016-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include + +#include + +typedef enum { + CF_TOPO_AUTO_PIN_NONE, + CF_TOPO_AUTO_PIN_CPU, + CF_TOPO_AUTO_PIN_NUMA +} cf_topo_auto_pin; + +typedef uint16_t cf_topo_os_cpu_index; + +typedef uint16_t cf_topo_numa_node_index; +typedef uint16_t cf_topo_core_index; +typedef uint16_t cf_topo_cpu_index; + +void cf_topo_config(cf_topo_auto_pin auto_pin, cf_topo_numa_node_index a_numa_node, + const cf_addr_list *addrs); +void cf_topo_force_map_memory(const uint8_t *from, size_t size); +void cf_topo_migrate_memory(void); +void cf_topo_info(void); + +uint16_t cf_topo_count_cores(void); +uint16_t cf_topo_count_cpus(void); + +cf_topo_cpu_index cf_topo_current_cpu(void); +cf_topo_cpu_index cf_topo_socket_cpu(const cf_socket *sock); + +void cf_topo_pin_to_core(cf_topo_core_index i_core); +void cf_topo_pin_to_cpu(cf_topo_cpu_index i_cpu); diff --git a/cf/include/hist.h b/cf/include/hist.h new file mode 100644 index 00000000..341e268f --- /dev/null +++ b/cf/include/hist.h @@ -0,0 +1,67 @@ +/* + * hist.h + * + * Copyright (C) 2009-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include +#include "citrusleaf/cf_atomic.h" +#include "dynbuf.h" + + +//========================================================== +// Histogram with logarithmic buckets, used for all the +// latency metrics. +// + +#define N_BUCKETS (1 + 64) +#define HISTOGRAM_NAME_SIZE 512 + +typedef enum { + HIST_MILLISECONDS, + HIST_MICROSECONDS, + HIST_SIZE, + HIST_COUNT, + HIST_SCALE_MAX_PLUS_1 +} histogram_scale; + +#define HIST_TAG_MILLISECONDS "msec" +#define HIST_TAG_MICROSECONDS "usec" +#define HIST_TAG_SIZE "bytes" +#define HIST_TAG_COUNT "count" + +// DO NOT access this member data directly - use the API! +// (Except for cf_hist_track, for which histogram is a base class.) +typedef struct histogram_s { + char name[HISTOGRAM_NAME_SIZE]; + const char* scale_tag; + uint32_t time_div; + cf_atomic64 counts[N_BUCKETS]; +} histogram; + +extern histogram *histogram_create(const char *name, histogram_scale scale); +extern void histogram_clear(histogram *h); +extern void histogram_dump(histogram *h ); + +extern uint64_t histogram_insert_data_point(histogram *h, uint64_t start_ns); +extern void histogram_insert_raw(histogram *h, uint64_t value); diff --git a/cf/include/hist_track.h b/cf/include/hist_track.h new file mode 100644 index 00000000..8dfdb287 --- /dev/null +++ b/cf/include/hist_track.h @@ -0,0 +1,86 @@ +/* + * hist_track.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + + +//========================================================== +// Includes +// + +#include +#include +#include "dynbuf.h" +#include "hist.h" + + +//========================================================== +// Typedefs +// + +typedef struct cf_hist_track_s cf_hist_track; + +typedef enum { + CF_HIST_TRACK_FMT_PACKED, + CF_HIST_TRACK_FMT_TABLE +} cf_hist_track_info_format; + + +//========================================================== +// Public API +// + +//------------------------------------------------ +// Constructor/Destructor +// +cf_hist_track* cf_hist_track_create(const char* name, histogram_scale scale); +void cf_hist_track_destroy(cf_hist_track* _this); + +//------------------------------------------------ +// Start/Stop Caching Data +// +bool cf_hist_track_start(cf_hist_track* _this, uint32_t back_sec, + uint32_t slice_sec, const char* thresholds); +void cf_hist_track_stop(cf_hist_track* _this); + +//------------------------------------------------ +// Histogram API "Overrides" +// +void cf_hist_track_clear(cf_hist_track* _this); +void cf_hist_track_dump(cf_hist_track* _this); + +// These are just pass-throughs to histogram insertion methods: +uint64_t cf_hist_track_insert_data_point(cf_hist_track* _this, + uint64_t start_ns); +void cf_hist_track_insert_raw(cf_hist_track* _this, uint64_t value); + +//------------------------------------------------ +// Get Statistics from Cached Data +// +void cf_hist_track_get_info(cf_hist_track* _this, uint32_t back_sec, + uint32_t duration_sec, uint32_t slice_sec, bool throughput_only, + cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p); + +//------------------------------------------------ +// Get Current Settings +// +void cf_hist_track_get_settings(cf_hist_track* _this, cf_dyn_buf* db_p); diff --git a/cf/include/linear_hist.h b/cf/include/linear_hist.h new file mode 100644 index 00000000..da558f09 --- /dev/null +++ b/cf/include/linear_hist.h @@ -0,0 +1,61 @@ +/* + * linear_hist.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include "dynbuf.h" + + +typedef struct linear_hist_s linear_hist; + +typedef struct linear_hist_threshold_s { + uint32_t value; + uint32_t bucket_index; + uint32_t bucket_width; + uint64_t bucket_count; + uint64_t target_count; +} linear_hist_threshold; + +//------------------------------------------------ +// These must all be called from the same thread! +// + +linear_hist *linear_hist_create(const char *name, uint32_t start, uint32_t max_offset, uint32_t num_buckets); +void linear_hist_destroy(linear_hist *h); +void linear_hist_reset(linear_hist *h, uint32_t start, uint32_t max_offset, uint32_t num_buckets); +void linear_hist_clear(linear_hist *h, uint32_t start, uint32_t max_offset); + +uint64_t linear_hist_get_total(linear_hist *h); +void linear_hist_merge(linear_hist *h1, linear_hist *h2); +void linear_hist_insert_data_point(linear_hist *h, uint32_t point); +uint64_t linear_hist_get_threshold_for_fraction(linear_hist *h, uint32_t tenths_pct, linear_hist_threshold *p_threshold); +uint64_t linear_hist_get_threshold_for_subtotal(linear_hist *h, uint64_t subtotal, linear_hist_threshold *p_threshold); + +void linear_hist_dump(linear_hist *h); +void linear_hist_save_info(linear_hist *h); + +//------------------------------------------------ +// This call is thread-safe. +// + +void linear_hist_get_info(linear_hist *h, cf_dyn_buf *db); diff --git a/cf/include/mem_count.h b/cf/include/mem_count.h new file mode 100644 index 00000000..71652749 --- /dev/null +++ b/cf/include/mem_count.h @@ -0,0 +1,51 @@ +/* + * mem_count.h + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "dynbuf.h" + +/* + * Type for selecting the field to be sorted on for memory count reporting. + */ +typedef enum sort_field_e { + CF_ALLOC_SORT_NET_SZ, + CF_ALLOC_SORT_DELTA_SZ, + CF_ALLOC_SORT_NET_ALLOC_COUNT, + CF_ALLOC_SORT_TOTAL_ALLOC_COUNT, + CF_ALLOC_SORT_TIME_LAST_MODIFIED +} sort_field_t; + +/* + * Type for mode of enabling / disabling memory accounting. + */ +typedef enum mem_count_mode_e { + MEM_COUNT_DISABLE, // Disable memory accounting. + MEM_COUNT_ENABLE, // Enable memory accounting at daemon start-up time. + MEM_COUNT_ENABLE_DYNAMIC // Enable memory accounting at run-time. 
+} mem_count_mode_t; + +int mem_count_init(mem_count_mode_t mode); +void mem_count_stats(void); +int mem_count_alloc_info(char *file, int line, cf_dyn_buf *db); +int mem_count_report(sort_field_t sort_field, int top_n, cf_dyn_buf *db); +void mem_count_shutdown(void); diff --git a/cf/include/meminfo.h b/cf/include/meminfo.h new file mode 100644 index 00000000..9df7af07 --- /dev/null +++ b/cf/include/meminfo.h @@ -0,0 +1,33 @@ +/* + * meminfo.h + * + * Copyright (C) 2010 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + +/* SYNOPSIS + * We have the ability to evict data to protect the server. + */ + +int +cf_meminfo(uint64_t *physmem, uint64_t *freemem, int *freepct, bool *swapping); diff --git a/cf/include/msg.h b/cf/include/msg.h new file mode 100644 index 00000000..a2ef961c --- /dev/null +++ b/cf/include/msg.h @@ -0,0 +1,232 @@ +/* + * msg.h + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_vector.h" + +#include "dynbuf.h" + + +//========================================================== +// Typedefs & constants. +// + +// These values are used on the wire - don't change them. +typedef enum { + M_FT_UINT32 = 1, + M_FT_UNUSED_2 = 2, + M_FT_UINT64 = 3, + M_FT_UNUSED_4 = 4, + M_FT_STR = 5, + M_FT_BUF = 6, + M_FT_ARRAY_UINT32 = 7, + M_FT_ARRAY_UINT64 = 8, + M_FT_ARRAY_BUF = 9, + M_FT_ARRAY_STR = 10, + M_FT_MSGPACK = 11 +} msg_field_type; // encoded in uint8_t + +// These values are used on the wire - don't change them. 
+typedef enum { + M_TYPE_FABRIC = 0, + M_TYPE_HEARTBEAT_V2 = 1, + M_TYPE_PAXOS = 2, + M_TYPE_MIGRATE = 3, + M_TYPE_PROXY = 4, + M_TYPE_HEARTBEAT = 5, + M_TYPE_CLUSTERING = 6, + M_TYPE_RW = 7, + M_TYPE_INFO = 8, + M_TYPE_EXCHANGE = 9, + M_TYPE_APPEAL = 10, + M_TYPE_XDR = 11, + M_TYPE_UNUSED_12 = 12, + M_TYPE_UNUSED_13 = 13, + M_TYPE_UNUSED_14 = 14, + M_TYPE_SMD = 15, + M_TYPE_UNUSED_16 = 16, + M_TYPE_UNUSED_17 = 17, + M_TYPE_MAX = 18 +} msg_type; // encoded in uint16_t + +typedef struct msg_template_s { + uint16_t id; + msg_field_type type; +} msg_template; + +struct msg_str_array_s; +struct msg_buf_array_s; + +typedef struct msg_field_s { + uint16_t id; + bool is_set; + bool is_free; + uint32_t field_sz; + + union { + uint32_t ui32; + uint64_t ui64; + char *str; + uint8_t *buf; + uint32_t *ui32_a; + uint64_t *ui64_a; + struct msg_str_array_s *str_a; + struct msg_buf_array_s *buf_a; + void *any_buf; + } u; +} msg_field; + +typedef struct msg_s { + msg_type type; + uint16_t n_fields; + bool just_parsed; // fields point into fabric buffer + uint32_t bytes_used; + uint32_t bytes_alloc; + uint64_t benchmark_time; + msg_field f[]; // indexed by id +} msg; + +// msg header on wire. +typedef struct msg_hdr_s { + uint32_t size; + uint16_t type; +} __attribute__ ((__packed__)) msg_hdr; + +typedef enum { + MSG_GET_DIRECT, + MSG_GET_COPY_MALLOC +} msg_get_type; + +typedef enum { + MSG_SET_HANDOFF_MALLOC, + MSG_SET_COPY +} msg_set_type; + +typedef struct msg_buf_ele_s { + uint32_t sz; + uint8_t *ptr; +} msg_buf_ele; + + +//========================================================== +// Globals. +// + +extern cf_atomic_int g_num_msgs; +extern cf_atomic_int g_num_msgs_by_type[M_TYPE_MAX]; + + +//========================================================== +// Public API. +// + +//------------------------------------------------ +// Object accounting. +// + +// Free up a "msg" object. Call this function instead of freeing the msg +// directly in order to keep track of all msgs. +void msg_put(msg *m); + +//------------------------------------------------ +// Lifecycle. +// + +void msg_type_register(msg_type type, const msg_template *mt, size_t mt_sz, size_t scratch_sz); +msg *msg_create(msg_type type); +void msg_destroy(msg *m); +void msg_incr_ref(msg *m); + +//------------------------------------------------ +// Pack messages into flattened data. +// + +size_t msg_get_wire_size(const msg *m); +size_t msg_get_template_fixed_sz(const msg_template *mt, size_t mt_count); +size_t msg_to_wire(const msg *m, uint8_t *buf); + +//------------------------------------------------ +// Parse flattened data into messages. +// + +int msg_parse(msg *m, const uint8_t *buf, size_t bufsz); +int msg_get_initial(uint32_t *size_r, msg_type *type_r, const uint8_t *buf, uint32_t bufsz); + +void msg_reset(msg *m); +void msg_preserve_fields(msg *m, uint32_t n_field_ids, ...); +void msg_preserve_all_fields(msg *m); + +//------------------------------------------------ +// Set fields in messages. 
+// + +int msg_set_uint32(msg *m, int field_id, uint32_t v); +int msg_set_uint64(msg *m, int field_id, uint64_t v); +int msg_set_str(msg *m, int field_id, const char *v, msg_set_type type); +int msg_set_buf(msg *m, int field_id, const uint8_t *v, size_t sz, msg_set_type type); + +int msg_set_uint32_array_size(msg *m, int field_id, uint32_t count); +int msg_set_uint32_array(msg *m, int field_id, uint32_t idx, uint32_t v); +int msg_set_uint64_array_size(msg *m, int field_id, uint32_t count); +int msg_set_uint64_array(msg *m, int field_id, uint32_t idx, uint64_t v); + +void msg_msgpack_list_set_uint32(msg *m, int field_id, const uint32_t *buf, uint32_t count); +void msg_msgpack_list_set_buf(msg *m, int field_id, const cf_vector *v); + +//------------------------------------------------ +// Get fields from messages. +// + +msg_field_type msg_field_get_type(const msg *m, int field_id); +bool msg_is_set(const msg *m, int field_id); +int msg_get_uint32(const msg *m, int field_id, uint32_t *val_r); +int msg_get_uint64(const msg *m, int field_id, uint64_t *val_r); +int msg_get_str(const msg *m, int field_id, char **str_r, size_t *sz_r, msg_get_type type); +int msg_get_buf(const msg *m, int field_id, uint8_t **buf_r, size_t *sz_r, msg_get_type type); + +int msg_get_uint32_array(const msg *m, int field_id, uint32_t idx, uint32_t *val_r); +int msg_get_uint64_array_count(const msg *m, int field_id, uint32_t *count_r); +int msg_get_uint64_array(const msg *m, int field_id, uint32_t idx, uint64_t *val_r); + +bool msg_msgpack_container_get_count(const msg *m, int field_id, uint32_t *count_r); +bool msg_msgpack_list_get_uint32_array(const msg *m, int field_id, uint32_t *buf_r, uint32_t *count_r); +bool msg_msgpack_list_get_buf_array(const msg *m, int field_id, cf_vector *v_r, bool init_vec); + +static inline bool +msg_msgpack_list_get_buf_array_presized(const msg *m, int field_id, cf_vector *v_r) +{ + return msg_msgpack_list_get_buf_array(m, field_id, v_r, false); +} + + +//========================================================== +// Debugging API. +// + +void msg_dump(const msg *m, const char *info); diff --git a/cf/include/node.h b/cf/include/node.h new file mode 100644 index 00000000..4e2f81eb --- /dev/null +++ b/cf/include/node.h @@ -0,0 +1,71 @@ +/* + * node.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "compare.h"
+
+typedef uint64_t cf_node;
+
+uint32_t cf_nodeid_shash_fn(const void *key);
+uint32_t cf_nodeid_rchash_fn(const void *key, uint32_t key_size);
+char *cf_node_name();
+
+static inline int
+index_of_node(const cf_node* nodes, uint32_t n_nodes, cf_node node)
+{
+	for (uint32_t n = 0; n < n_nodes; n++) {
+		if (node == nodes[n]) {
+			return (int)n;
+		}
+	}
+
+	return -1;
+}
+
+static inline bool
+contains_node(const cf_node* nodes, uint32_t n_nodes, cf_node node)
+{
+	return index_of_node(nodes, n_nodes, node) != -1;
+}
+
+static inline uint32_t
+remove_node(cf_node* nodes, uint32_t n_nodes, cf_node node)
+{
+	int n = index_of_node(nodes, n_nodes, node);
+
+	if (n != -1) {
+		nodes[n] = nodes[--n_nodes];
+	}
+
+	return n_nodes;
+}
+
+static inline int
+cf_node_compare_desc(const void* pa, const void* pb)
+{
+	// Relies on cf_node being uint64_t.
+	return cf_compare_uint64_desc(pa, pb);
+}
diff --git a/cf/include/olock.h b/cf/include/olock.h
new file mode 100644
index 00000000..a7907b7d
--- /dev/null
+++ b/cf/include/olock.h
@@ -0,0 +1,49 @@
+/*
+ * olock.h
+ *
+ * Copyright (C) 2008-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * An object lock system allows fewer locks to be created
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <citrusleaf/cf_digest.h>
+
+#include "cf_mutex.h"
+
+
+typedef struct olock_s {
+	uint32_t n_locks;
+	uint32_t mask;
+	cf_mutex locks[];
+} olock;
+
+void olock_lock(olock *ol, cf_digest *d);
+void olock_vlock(olock *ol, cf_digest *d, cf_mutex **vlock);
+void olock_unlock(olock *ol, cf_digest *d);
+olock *olock_create(uint32_t n_locks, bool mutex);
+void olock_destroy(olock *o);
+
+extern olock *g_record_locks;
diff --git a/cf/include/shash.h b/cf/include/shash.h
new file mode 100644
index 00000000..c70e4d11
--- /dev/null
+++ b/cf/include/shash.h
@@ -0,0 +1,110 @@
+/*
+ * shash.h
+ *
+ * Copyright (C) 2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include <pthread.h>
+#include <stdint.h>
+
+#include <citrusleaf/cf_atomic.h>
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+// Return codes.
+#define CF_SHASH_ERR_FOUND -4
+#define CF_SHASH_ERR_NOT_FOUND -3
+#define CF_SHASH_ERR -1
+#define CF_SHASH_OK 0
+#define CF_SHASH_REDUCE_DELETE 1
+
+// Bit-values for 'flags' parameter.
+#define CF_SHASH_BIG_LOCK 0x01 // thread-safe with single big lock
+#define CF_SHASH_MANY_LOCK 0x02 // thread-safe with lock per bucket
+
+// User must provide the hash function at create time.
+typedef uint32_t (*cf_shash_hash_fn)(const void *key);
+
+// FIXME - explain or replace.
+typedef void (*cf_shash_update_fn)(const void *key, void *value_old, void *value_new, void *udata);
+
+// The "reduce" function called for every element. Returned value governs
+// behavior during reduce as follows:
+// - CF_SHASH_OK - continue iterating
+// - CF_SHASH_REDUCE_DELETE - delete the current element, continue iterating
+// - anything else (e.g. CF_SHASH_ERR) - stop iterating and return reduce_fn's
+// returned value
+typedef int (*cf_shash_reduce_fn)(const void *key, void *value, void *udata);
+
+// Private data.
+typedef struct cf_shash_s {
+	cf_shash_hash_fn h_fn;
+	uint32_t key_size;
+	uint32_t value_size;
+	uint32_t ele_size;
+	uint32_t n_buckets;
+	uint32_t flags;
+	cf_atomic32 n_elements;
+	void *table;
+	pthread_mutex_t *bucket_locks;
+	pthread_mutex_t big_lock;
+} cf_shash;
+
+
+//==========================================================
+// Public API - useful hash functions.
+//
+
+// TODO - hash function signature may change.
+uint32_t cf_shash_fn_u32(const void *key);
+uint32_t cf_shash_fn_ptr(const void *key);
+uint32_t cf_shash_fn_zstr(const void *key);
+
+
+//==========================================================
+// Public API.
+//
+
+cf_shash *cf_shash_create(cf_shash_hash_fn h_fn, uint32_t key_size, uint32_t value_size, uint32_t n_buckets, uint32_t flags);
+void cf_shash_destroy(cf_shash *h);
+uint32_t cf_shash_get_size(cf_shash *h);
+
+void cf_shash_put(cf_shash *h, const void *key, const void *value);
+int cf_shash_put_unique(cf_shash *h, const void *key, const void *value);
+
+void cf_shash_update(cf_shash *h, const void *key, void *value_old, void *value_new, cf_shash_update_fn update_fn, void *udata);
+
+int cf_shash_get(cf_shash *h, const void *key, void *value);
+int cf_shash_get_vlock(cf_shash *h, const void *key, void **value_r, pthread_mutex_t **vlock_r);
+
+int cf_shash_delete(cf_shash *h, const void *key);
+int cf_shash_delete_lockfree(cf_shash *h, const void *key);
+int cf_shash_get_and_delete(cf_shash *h, const void *key, void *value);
+void cf_shash_delete_all(cf_shash *h);
+
+int cf_shash_reduce(cf_shash *h, cf_shash_reduce_fn reduce_fn, void *udata);
diff --git a/cf/include/socket.h b/cf/include/socket.h
new file mode 100644
index 00000000..ca5f8d29
--- /dev/null
+++ b/cf/include/socket.h
@@ -0,0 +1,340 @@
+/*
+ * socket.h
+ *
+ * Copyright (C) 2008-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
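//
// A short usage sketch of the cf_shash API above: count occurrences keyed by
// uint32_t. Names are illustrative. Note the read-modify-write below is not
// atomic across threads -- a real counter would use cf_shash_update() or
// cf_shash_get_vlock() instead.
//
//   cf_shash *h = cf_shash_create(cf_shash_fn_u32, sizeof(uint32_t),
//           sizeof(uint32_t), 1024, CF_SHASH_MANY_LOCK);
//
//   uint32_t key = 42;
//   uint32_t count = 1;
//
//   if (cf_shash_get(h, &key, &count) == CF_SHASH_OK) {
//           count++; // was already present - bump the copied-out value
//   }
//
//   cf_shash_put(h, &key, &count); // insert or overwrite
//
//   cf_shash_reduce(h, count_cb, NULL); // count_cb: a cf_shash_reduce_fn
//   cf_shash_destroy(h);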
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include "fault.h" +#include "msg.h" +#include "node.h" + +// Use forward declaration instead of including openssl/ssl.h here. +struct ssl_st; + +#define CF_SOCKET_TIMEOUT 10000 +#define CF_SOCK_CFG_MAX 250 + +// Accesses the socket file descriptor as an rvalue, i.e., the socket file descriptor +// cannot be modified. +#define CSFD(sock) ((int32_t)((sock)->fd)) + +// CSFD() for epoll file descriptors. +#define CEFD(poll) ((int32_t)(poll).fd) + +// Like CEFD(), but produces an lvalue, i.e., the epoll file descriptor can be modified. +#define EFD(poll) ((poll).fd) + +#define cf_ip_addr_print(_addr) ({ \ + char *_tmp = alloca(250); \ + cf_ip_addr_to_string_safe(_addr, _tmp, 250); \ + _tmp; \ +}) + +#define cf_ip_addr_print_multi(_addrs, _n_addrs) ({ \ + char *_tmp = alloca(2500); \ + cf_ip_addr_to_string_multi_safe(_addrs, _n_addrs, _tmp, 2500); \ + _tmp; \ +}) + +#define cf_ip_port_print(_port) ({ \ + char *_tmp = alloca(25); \ + cf_ip_port_to_string_safe(_port, _tmp, 25); \ + _tmp; \ +}) + +#define cf_sock_addr_print(_addr) ({ \ + char *_tmp = alloca(250); \ + cf_sock_addr_to_string_safe(_addr, _tmp, 250); \ + _tmp; \ +}) + +typedef struct cf_ip_addr_s { + sa_family_t family; + + union { + struct in_addr v4; + struct in6_addr v6; + }; +} cf_ip_addr; + +typedef uint16_t cf_ip_port; + +typedef struct cf_addr_list_s { + uint32_t n_addrs; + const char *addrs[CF_SOCK_CFG_MAX]; +} cf_addr_list; + +typedef struct cf_serv_spec_s { + cf_ip_port bind_port; + cf_addr_list bind; + cf_ip_port std_port; + cf_addr_list std; + cf_ip_port alt_port; + cf_addr_list alt; + char *tls_our_name; + uint32_t n_tls_peer_names; + char *tls_peer_names[CF_SOCK_CFG_MAX]; +} cf_serv_spec; + +typedef struct cf_sock_addr_s { + cf_ip_addr addr; + cf_ip_port port; +} cf_sock_addr; + +typedef enum { + CF_SOCKET_STATE_NON_TLS, + CF_SOCKET_STATE_TLS_HANDSHAKE, + CF_SOCKET_STATE_TLS_READY +} cf_socket_state; + +typedef struct cf_socket_s { + int32_t fd; + cf_socket_state state; + void *cfg; + struct ssl_st *ssl; +} cf_socket; + +typedef struct cf_sockets_s { + uint32_t n_socks; + cf_socket socks[CF_SOCK_CFG_MAX]; +} cf_sockets; + +typedef enum { + CF_SOCK_OWNER_SERVICE, + CF_SOCK_OWNER_SERVICE_TLS, + CF_SOCK_OWNER_HEARTBEAT, + CF_SOCK_OWNER_HEARTBEAT_TLS, + CF_SOCK_OWNER_FABRIC, + CF_SOCK_OWNER_FABRIC_TLS, + CF_SOCK_OWNER_INFO, + CF_SOCK_OWNER_XDR, + CF_SOCK_OWNER_INVALID +} cf_sock_owner; + +typedef struct cf_sock_cfg_s { + cf_sock_owner owner; + cf_ip_port port; + cf_ip_addr addr; +} cf_sock_cfg; + +typedef struct cf_serv_cfg_s { + uint32_t n_cfgs; + cf_sock_cfg cfgs[CF_SOCK_CFG_MAX]; +} cf_serv_cfg; + +typedef struct cf_poll_s { + int32_t fd; +} __attribute__((packed)) cf_poll; + +// This precisely matches the epoll_event struct. 
+typedef struct cf_poll_event_s { + uint32_t events; + void *data; +} __attribute__((packed)) cf_poll_event; + +typedef struct cf_msock_cfg_s { + cf_sock_owner owner; + cf_ip_port port; + cf_ip_addr addr; + cf_ip_addr if_addr; + uint8_t ttl; +} cf_msock_cfg; + +typedef struct cf_mserv_cfg_s { + uint32_t n_cfgs; + cf_msock_cfg cfgs[CF_SOCK_CFG_MAX]; +} cf_mserv_cfg; + +void cf_socket_set_advertise_ipv6(bool advertise); +bool cf_socket_advertises_ipv6(void); + +CF_MUST_CHECK int32_t cf_ip_addr_from_string(const char *string, cf_ip_addr *addr); +CF_MUST_CHECK int32_t cf_ip_addr_from_string_multi(const char *string, cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t cf_ip_addr_to_string(const cf_ip_addr *addr, char *string, size_t size); +void cf_ip_addr_to_string_safe(const cf_ip_addr *addr, char *string, size_t size); +CF_MUST_CHECK int32_t cf_ip_addr_to_string_multi(const cf_ip_addr *addrs, uint32_t n_addrs, char *string, size_t size); +void cf_ip_addr_to_string_multi_safe(const cf_ip_addr *addrs, uint32_t n_addrs, char *string, size_t size); +CF_MUST_CHECK int32_t cf_ip_addr_from_binary(const uint8_t *binary, size_t size, cf_ip_addr *addr); +CF_MUST_CHECK int32_t cf_ip_addr_to_binary(const cf_ip_addr *addr, uint8_t *binary, size_t size); +void cf_ip_addr_to_rack_aware_id(const cf_ip_addr *addr, uint32_t *id); + +CF_MUST_CHECK int32_t cf_ip_addr_compare(const cf_ip_addr *lhs, const cf_ip_addr *rhs); +void cf_ip_addr_copy(const cf_ip_addr *from, cf_ip_addr *to); +void cf_ip_addr_sort(cf_ip_addr *addrs, uint32_t n_addrs); + +bool cf_ip_addr_is_dns_name(const char *string); +bool cf_ip_addr_str_is_legacy(const char *string); +bool cf_ip_addr_is_legacy(const cf_ip_addr *addr); +bool cf_ip_addr_legacy_only(void); + +void cf_ip_addr_set_local(cf_ip_addr *addr); +CF_MUST_CHECK bool cf_ip_addr_is_local(const cf_ip_addr *addr); + +void cf_ip_addr_set_any(cf_ip_addr *addr); +CF_MUST_CHECK bool cf_ip_addr_is_any(const cf_ip_addr *addr); + +CF_MUST_CHECK int32_t cf_ip_port_from_string(const char *string, cf_ip_port *port); +CF_MUST_CHECK int32_t cf_ip_port_to_string(cf_ip_port port, char *string, size_t size); +void cf_ip_port_to_string_safe(cf_ip_port port, char *string, size_t size); +CF_MUST_CHECK int32_t cf_ip_port_from_binary(const uint8_t *binary, size_t size, cf_ip_port *port); +CF_MUST_CHECK int32_t cf_ip_port_to_binary(cf_ip_port port, uint8_t *binary, size_t size); +void cf_ip_port_from_node_id(cf_node id, cf_ip_port *port); + +CF_MUST_CHECK int32_t cf_sock_addr_from_string(const char *string, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_sock_addr_to_string(const cf_sock_addr *addr, char *string, size_t size); +void cf_sock_addr_to_string_safe(const cf_sock_addr *addr, char *string, size_t size); +CF_MUST_CHECK int32_t cf_sock_addr_from_binary(const uint8_t *binary, size_t size, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_sock_addr_to_binary(const cf_sock_addr *addr, uint8_t *binary, size_t size); + +CF_MUST_CHECK int32_t cf_sock_addr_from_host_port(const char *host, cf_ip_port port, cf_sock_addr *addr); +void cf_sock_addr_from_addr_port(const cf_ip_addr *ip_addr, cf_ip_port port, cf_sock_addr *addr); + +CF_MUST_CHECK int32_t cf_sock_addr_compare(const cf_sock_addr *lhs, const cf_sock_addr *rhs); +void cf_sock_addr_copy(const cf_sock_addr *from, cf_sock_addr *to); + +void cf_sock_addr_from_native(const struct sockaddr *native, cf_sock_addr *addr); +void cf_sock_addr_to_native(const cf_sock_addr *addr, struct sockaddr *native); + +void cf_sock_addr_set_any(cf_sock_addr *addr); 
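//
// For example (a sketch -- assumes the codebase's 0-on-success convention and
// elides error handling): parse "127.0.0.1:3000" into a cf_sock_addr, then
// render it back via the alloca-based print macro defined above.
//
//   cf_sock_addr addr;
//
//   if (cf_sock_addr_from_string("127.0.0.1:3000", &addr) == 0) {
//           const char *s = cf_sock_addr_print(&addr);
//           // ... log or compare s ...
//   }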
+CF_MUST_CHECK bool cf_sock_addr_is_any(const cf_sock_addr *addr); + +void cf_sock_cfg_init(cf_sock_cfg *cfg, cf_sock_owner owner); +void cf_sock_cfg_copy(const cf_sock_cfg *from, cf_sock_cfg *to); + +void cf_serv_cfg_init(cf_serv_cfg *cfg); +CF_MUST_CHECK int32_t cf_serv_cfg_add_sock_cfg(cf_serv_cfg *serv_cfg, const cf_sock_cfg *sock_cfg); + +void cf_sockets_init(cf_sockets *socks); +CF_MUST_CHECK bool cf_sockets_has_socket(const cf_sockets *socks, const cf_socket *sock); +void cf_sockets_close(cf_sockets *socks); + +void cf_fd_disable_blocking(int32_t fd); + +void cf_socket_disable_blocking(cf_socket *sock); +void cf_socket_enable_blocking(cf_socket *sock); +void cf_socket_disable_nagle(cf_socket *sock); +void cf_socket_enable_nagle(cf_socket *sock); +void cf_socket_keep_alive(cf_socket *sock, int32_t idle, int32_t interval, int32_t count); +void cf_socket_set_send_buffer(cf_socket *sock, int32_t size); +void cf_socket_set_receive_buffer(cf_socket *sock, int32_t size); +void cf_socket_set_window(cf_socket *sock, int32_t size); + +void cf_socket_init(cf_socket *sock); +bool cf_socket_exists(cf_socket *sock); + +static inline void cf_socket_copy(const cf_socket *from, cf_socket *to) +{ + to->fd = from->fd; + to->state = from->state; + to->cfg = from->cfg; + to->ssl = from->ssl; +} + +CF_MUST_CHECK int32_t cf_socket_init_server(cf_serv_cfg *cfg, cf_sockets *socks); +void cf_socket_show_server(cf_fault_context cont, const char *tag, const cf_sockets *socks); +CF_MUST_CHECK int32_t cf_socket_init_client(cf_sock_cfg *cfg, int32_t timeout, cf_socket *sock); + +CF_MUST_CHECK int32_t cf_socket_accept(cf_socket *lsock, cf_socket *sock, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_remote_name(const cf_socket *sock, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_local_name(const cf_socket *sock, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_available(cf_socket *sock); + +CF_MUST_CHECK int32_t cf_socket_recv_from(cf_socket *sock, void *buff, size_t size, int32_t flags, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_recv(cf_socket *sock, void *buff, size_t size, int32_t flags); +CF_MUST_CHECK int32_t cf_socket_send_to(cf_socket *sock, const void *buff, size_t size, int32_t flags, const cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_send(cf_socket *sock, const void *buff, size_t size, int32_t flags); + +CF_MUST_CHECK int32_t cf_socket_recv_from_all(cf_socket *sock, void *buff, size_t size, int32_t flags, cf_sock_addr *addr, int32_t timeout); +CF_MUST_CHECK int32_t cf_socket_recv_all(cf_socket *sock, void *buff, size_t size, int32_t flags, int32_t timeout); +CF_MUST_CHECK int32_t cf_socket_send_to_all(cf_socket *sock, const void *buff, size_t size, int32_t flags, const cf_sock_addr *addr, int32_t timeout); +CF_MUST_CHECK int32_t cf_socket_send_all(cf_socket *sock, const void *buff, size_t size, int32_t flags, int32_t timeout); + +void cf_socket_write_shutdown(cf_socket *sock); +void cf_socket_shutdown(cf_socket *sock); +void cf_socket_close(cf_socket *sock); +void cf_socket_drain_close(cf_socket *sock); +void cf_socket_term(cf_socket *sock); + +void cf_msock_cfg_init(cf_msock_cfg *cfg, cf_sock_owner owner); +void cf_msock_cfg_copy(const cf_msock_cfg *from, cf_msock_cfg *to); + +void cf_mserv_cfg_init(cf_mserv_cfg *cfg); +CF_MUST_CHECK int32_t cf_mserv_cfg_add_msock_cfg(cf_mserv_cfg *serv_cfg, const cf_msock_cfg *sock_cfg); +CF_MUST_CHECK int32_t cf_mserv_cfg_add_combo(cf_mserv_cfg *serv_cfg, cf_sock_owner owner, cf_ip_port port, cf_ip_addr *addr, cf_ip_addr 
*if_addr, uint8_t ttl); + +CF_MUST_CHECK int32_t cf_socket_mcast_init(cf_mserv_cfg *cfg, cf_sockets *socks); +void cf_socket_mcast_show(cf_fault_context cont, const char *tag, const cf_sockets *socks); +CF_MUST_CHECK int32_t cf_socket_mcast_set_inter(cf_socket *sock, const cf_ip_addr *iaddr); +CF_MUST_CHECK int32_t cf_socket_mcast_set_ttl(cf_socket *sock, int32_t ttl); +CF_MUST_CHECK int32_t cf_socket_mcast_join_group(cf_socket *sock, const cf_ip_addr *iaddr, const cf_ip_addr *gaddr); + +void cf_poll_create(cf_poll *poll); +void cf_poll_add_fd(cf_poll poll, int32_t fd, uint32_t events, void *data); +void cf_poll_add_socket(cf_poll poll, const cf_socket *sock, uint32_t events, void *data); +CF_MUST_CHECK int32_t cf_poll_modify_socket_forgiving(cf_poll poll, const cf_socket *sock, uint32_t events, void *data, uint32_t n_err_ok, int32_t *err_ok); +CF_MUST_CHECK int32_t cf_poll_delete_socket_forgiving(cf_poll poll, const cf_socket *sock, uint32_t n_err_ok, int32_t *err_ok); +void cf_poll_add_sockets(cf_poll poll, cf_sockets *socks, uint32_t events); +void cf_poll_delete_sockets(cf_poll poll, cf_sockets *socks); +CF_MUST_CHECK int32_t cf_poll_wait(cf_poll poll, cf_poll_event *events, int32_t limit, int32_t timeout); +void cf_poll_destroy(cf_poll poll); + +static inline void cf_poll_modify_socket(cf_poll poll, const cf_socket *sock, uint32_t events, void *data) +{ + CF_IGNORE_ERROR(cf_poll_modify_socket_forgiving(poll, sock, events, data, 0, NULL)); +} + +static inline void cf_poll_delete_socket(cf_poll poll, const cf_socket *sock) +{ + CF_IGNORE_ERROR(cf_poll_delete_socket_forgiving(poll, sock, 0, NULL)); +} + +CF_MUST_CHECK int32_t cf_inter_get_addr_all(cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t cf_inter_get_addr_all_legacy(cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t cf_inter_get_addr_def(cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t cf_inter_get_addr_def_legacy(cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t cf_inter_get_addr_name(cf_ip_addr *addrs, uint32_t *n_addrs, const char *if_name); +bool cf_inter_is_inter_name(const char *if_name); +CF_MUST_CHECK int32_t cf_inter_addr_to_index_and_name(const cf_ip_addr *addr, int32_t *index, char **name); +void cf_inter_expand_bond(const char *if_name, char **out_names, uint32_t *n_out); +CF_MUST_CHECK int32_t cf_inter_mtu(const cf_ip_addr *inter_addr); +CF_MUST_CHECK int32_t cf_inter_min_mtu(void); +bool cf_inter_detect_changes(cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit); +bool cf_inter_detect_changes_legacy(cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit); + +CF_MUST_CHECK int32_t cf_node_id_get(cf_ip_port port, const char *if_hint, cf_node *id); + +#if defined CF_SOCKET_PRIVATE +CF_MUST_CHECK size_t cf_socket_addr_len(const struct sockaddr* sa); +CF_MUST_CHECK int32_t cf_socket_parse_netlink(bool allow_v6, uint32_t family, uint32_t flags, + const void *data, size_t len, cf_ip_addr *addr); +void cf_socket_fix_client(cf_socket *sock); +void cf_socket_fix_bind(cf_serv_cfg *serv_cfg); +void cf_socket_fix_server(cf_socket *sock); +#endif diff --git a/cf/include/tls.h b/cf/include/tls.h new file mode 100644 index 00000000..2bd77ca9 --- /dev/null +++ b/cf/include/tls.h @@ -0,0 +1,75 @@ +/* + * tls.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
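//
// A brief sketch of the cf_poll epoll wrapper above: create a poller,
// register a socket for reads, then wait. EPOLLIN comes from <sys/epoll.h>
// (cf_poll_event mirrors struct epoll_event); sock is assumed to be an
// established cf_socket, and the -1 timeout blocks indefinitely, per epoll
// convention.
//
//   cf_poll poll;
//   cf_poll_create(&poll);
//
//   cf_poll_add_socket(poll, &sock, EPOLLIN, &sock);
//
//   cf_poll_event events[64];
//   int32_t n = cf_poll_wait(poll, events, 64, -1);
//
//   for (int32_t i = 0; i < n; i++) {
//           cf_socket *s = (cf_socket *)events[i].data;
//           // ... service s ...
//   }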
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "socket.h" + +struct cf_tls_info_s; +typedef struct cf_tls_info_s cf_tls_info; + +typedef struct cf_tls_spec_s { + char *ca_file; + char *ca_path; + char *cert_blacklist; + char *cert_file; + char *cipher_suite; + char *key_file; + char *name; + char *protocols; +} cf_tls_spec; + +void tls_check_init(); + +void tls_cleanup(); +void tls_thread_cleanup(); + +void tls_socket_init(cf_socket *sock); +void tls_socket_term(cf_socket *sock); +int tls_socket_shutdown(cf_socket *sock); +void tls_socket_close(cf_socket *sock); + +cf_tls_info *tls_config_server_context(cf_tls_spec *tspec, bool auth_client, uint32_t n_peer_names, char **peer_names); +cf_tls_info *tls_config_intra_context(cf_tls_spec *tspec, const char *which); + +void tls_socket_prepare_server(cf_tls_info *info, cf_socket *sock); +void tls_socket_prepare_client(cf_tls_info *info, cf_socket *sock); + +static inline bool tls_socket_needs_handshake(cf_socket *sock) +{ + return sock->state == CF_SOCKET_STATE_TLS_HANDSHAKE; +} + +void tls_socket_must_not_have_data(cf_socket *sock, const char *caller); + +int tls_socket_accept(cf_socket *sock); +int tls_socket_connect(cf_socket *sock); +int tls_socket_accept_block(cf_socket *sock); +int tls_socket_connect_block(cf_socket *sock); + +int tls_socket_recv(cf_socket *sock, void *buf, size_t sz, int32_t flags, + uint64_t timeout_msec); + +int tls_socket_send(cf_socket *sock, void const *buf, size_t sz, int32_t flags, + uint64_t timeout_msec); + +int tls_socket_pending(cf_socket *sock); diff --git a/cf/include/vmapx.h b/cf/include/vmapx.h new file mode 100644 index 00000000..93b3c50d --- /dev/null +++ b/cf/include/vmapx.h @@ -0,0 +1,100 @@ +/* + * vmapx.h + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include + + +//========================================================== +// Typedefs & constants. +// + +typedef struct vhash_s vhash; + +// DO NOT access this member data directly - use the API! 
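//
// Typical use of the API below (a sketch; the vmap pointer and name are
// illustrative): map a name to a stable index once, then do cheap lookups by
// that index thereafter.
//
//   uint32_t idx;
//
//   if (cf_vmapx_put_unique(vmap, "example-name", &idx) == CF_VMAPX_OK) {
//           // idx now permanently identifies this entry.
//   }
//
//   void *val;
//
//   if (cf_vmapx_get_by_index(vmap, idx, &val) == CF_VMAPX_OK) {
//           // val points at the entry's fixed-size value slot.
//   }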
+// Caution - changing this struct could break warm or cool restart. +typedef struct cf_vmapx_s { + // Vector-related. + uint32_t value_size; + uint32_t max_count; + volatile uint32_t count; + + // Hash-related. + uint32_t key_size; + vhash* hash; + + // Generic. + pthread_mutex_t write_lock; + + //<><><><><><><><><><><> 64 bytes <><><><><><><><><><><> + + // Vector data. + uint8_t values[]; +} cf_vmapx; + +typedef enum { + CF_VMAPX_OK = 0, + CF_VMAPX_ERR_BAD_PARAM, + CF_VMAPX_ERR_FULL, + CF_VMAPX_ERR_NAME_EXISTS, + CF_VMAPX_ERR_NAME_NOT_FOUND, + CF_VMAPX_ERR_UNKNOWN +} cf_vmapx_err; + + +//========================================================== +// Public API. +// + +size_t cf_vmapx_sizeof(uint32_t value_size, uint32_t max_count); + +void cf_vmapx_init(cf_vmapx* vmap, uint32_t value_size, uint32_t max_count, uint32_t hash_size, uint32_t max_name_size); +void cf_vmapx_release(cf_vmapx* vmap); + +uint32_t cf_vmapx_count(const cf_vmapx* vmap); + +cf_vmapx_err cf_vmapx_get_by_index(const cf_vmapx* vmap, uint32_t index, void** pp_value); +cf_vmapx_err cf_vmapx_get_by_name(const cf_vmapx* vmap, const char* name, void** pp_value); + +cf_vmapx_err cf_vmapx_get_index(const cf_vmapx* vmap, const char* name, uint32_t* p_index); +cf_vmapx_err cf_vmapx_get_index_w_len(const cf_vmapx* vmap, const char* name, size_t name_len, uint32_t* p_index); + +cf_vmapx_err cf_vmapx_put_unique(cf_vmapx* vmap, const char* name, uint32_t* p_index); +cf_vmapx_err cf_vmapx_put_unique_w_len(cf_vmapx* vmap, const char* name, size_t name_len, uint32_t* p_index); + + +//========================================================== +// Private API - for enterprise separation only. +// + +void* vmapx_value_ptr(const cf_vmapx* vmap, uint32_t index); + +vhash* vhash_create(uint32_t key_size, uint32_t n_rows); +void vhash_destroy(vhash* h); +void vhash_put(vhash* h, const char* key, size_t key_len, uint32_t value); diff --git a/cf/include/warnings.h b/cf/include/warnings.h new file mode 100644 index 00000000..d17ca25b --- /dev/null +++ b/cf/include/warnings.h @@ -0,0 +1,28 @@ +/* + * warnings.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma GCC diagnostic warning "-Wall" +#pragma GCC diagnostic warning "-Wextra" +#pragma GCC diagnostic warning "-Wconversion" +#pragma GCC diagnostic warning "-Wsign-conversion" +#pragma GCC diagnostic warning "-Wshadow" +#pragma GCC diagnostic warning "-Wmissing-declarations" diff --git a/cf/src/Makefile b/cf/src/Makefile new file mode 100644 index 00000000..7cca26ed --- /dev/null +++ b/cf/src/Makefile @@ -0,0 +1,41 @@ +# Citrusleaf Foundation +# Makefile + +DEPTH = ../.. 
+include $(DEPTH)/make_in/Makefile.in + +ifeq ($(USE_EE),1) + include $(EEREPO)/cf/make_in/Makefile.vars +endif + +HEADERS += arenax.h bits.h cf_mutex.h cf_str.h compare.h daemon.h dynbuf.h +HEADERS += enhanced_alloc.h fault.h hist.h hist_track.h linear_hist.h mem_count.h +HEADERS += meminfo.h msg.h node.h olock.h shash.h socket.h tls.h +HEADERS += vmapx.h + +SOURCES += alloc.c arenax.c cf_mutex.c cf_str.c daemon.c dynbuf.c fault.c hardware.c +SOURCES += hist.c hist_track.c linear_hist.c meminfo.c msg.c node.c olock.c +SOURCES += shash.c socket.c vmapx.c +ifneq ($(USE_EE),1) + SOURCES += arenax_ce.c socket_ce.c tls_ce.c +endif + +LIBRARY = $(LIBRARY_DIR)/libcf.a + +INCLUDES += $(INCLUDE_DIR:%=-I%) -I$(COMMON)/src/include + +OBJECTS = $(SOURCES:%.c=$(OBJECT_DIR)/%.o) +DEPENDENCIES = $(OBJECTS:%.o=%.d) + +.PHONY: all +all: $(LIBRARY) + +.PHONY: clean +clean: + $(RM) $(OBJECTS) $(LIBRARY) + $(RM) $(DEPENDENCIES) + +$(LIBRARY): $(OBJECTS) + $(AR) rs $(LIBRARY) $(OBJECTS) + +include $(DEPTH)/make_in/Makefile.targets diff --git a/cf/src/alloc.c b/cf/src/alloc.c new file mode 100644 index 00000000..159d3269 --- /dev/null +++ b/cf/src/alloc.c @@ -0,0 +1,1075 @@ +/* + * alloc.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +// Make sure that stdlib.h gives us aligned_alloc(). +#define _ISOC11_SOURCE + +#include "enhanced_alloc.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "fault.h" +#include "mem_count.h" + +#include "aerospike/ck/ck_pr.h" +#include "citrusleaf/cf_atomic.h" + +#include "warnings.h" + +#undef strdup +#undef strndup + +#define N_ARENAS 150 +#define PAGE_SZ 4096 + +#define MAX_SITES 4096 +#define MAX_THREADS 256 + +#define MULT 3486784401u +#define MULT_INV 3396732273u + +#define STR_(x) #x +#define STR(x) STR_(x) + +typedef struct site_info_s { + uint32_t site_id; + pid_t thread_id; + size_t size_lo; + size_t size_hi; +} site_info; + +// Old glibc versions don't provide this; work around compiler warning. +void *aligned_alloc(size_t align, size_t sz); + +const char *jem_malloc_conf = "narenas:" STR(N_ARENAS); + +extern size_t je_chunksize_mask; +extern void *je_huge_aalloc(const void *p); + +__thread int32_t g_ns_arena = -1; +static __thread int32_t g_ns_tcache = -1; + +static const void *g_site_ras[MAX_SITES]; +static uint32_t g_n_site_ras; + +static site_info g_site_infos[MAX_SITES * MAX_THREADS]; +// Start at 1, then we can use site ID 0 to mean "no site ID". +static uint32_t g_n_site_infos = 1; + +static __thread uint32_t g_thread_site_infos[MAX_SITES]; + +static __thread pid_t g_tid; +// Start with *_ALL; see cf_alloc_set_debug() for details. 
+static cf_alloc_debug g_debug = CF_ALLOC_DEBUG_ALL; + +// All the hook_*() functions are invoked from hook functions that hook into +// malloc() and friends for memory accounting purposes. +// +// This means that we have no idea who called us and, for example, which locks +// they hold. Let's be careful when calling back into asd code. + +static int32_t +hook_get_arena(const void *p) +{ + int32_t **base = (int32_t **)((uint64_t)p & ~je_chunksize_mask); + int32_t *arena; + + if (base != p) { + // Small or large allocation. + arena = base[0]; + } + else { + // Huge allocation. + arena = je_huge_aalloc(p); + } + + return arena[0]; +} + +static void +hook_check_arena(const void *p, int32_t arena) +{ + if (g_debug == CF_ALLOC_DEBUG_NONE) { + return; + } + + int32_t arena_p = hook_get_arena(p); + + if (arena < 0 && arena_p < N_ARENAS) { + return; + } + + // The "arena" parameter is never < N_ARENAS. + + if (arena >= N_ARENAS && arena_p >= N_ARENAS) { + return; + } + + size_t jem_sz = jem_sallocx(p, 0); + cf_crash(CF_ALLOC, "arena change for %zu@%p: %d -> %d", jem_sz, p, arena_p, arena); +} + +static pid_t +hook_gettid(void) +{ + if (g_tid == 0) { + g_tid = (pid_t)syscall(SYS_gettid); + } + + return g_tid; +} + +// Map a 64-bit address to a 12-bit site ID. + +static uint32_t +hook_get_site_id(const void *ra) +{ + uint32_t site_id = (uint32_t)(uint64_t)ra & (MAX_SITES - 1); + + for (uint32_t i = 0; i < MAX_SITES; ++i) { + const void *site_ra = ck_pr_load_ptr(g_site_ras + site_id); + + // The allocation site is already registered and we found its + // slot. Return the slot index. + + if (site_ra == ra) { + return site_id; + } + + // We reached an empty slot, i.e., the allocation site isn't yet + // registered. Try to register it. If somebody else managed to grab + // this slot in the meantime, keep looping. Otherwise return the + // slot index. + + if (site_ra == NULL && ck_pr_cas_ptr(g_site_ras + site_id, NULL, (void *)ra)) { + ck_pr_inc_32(&g_n_site_ras); + return site_id; + } + + site_id = (site_id + 1) & (MAX_SITES - 1); + } + + // More than MAX_SITES call sites. + cf_crash(CF_ALLOC, "too many call sites"); + // Not reached. + return 0; +} + +static uint32_t +hook_new_site_info_id(void) +{ + uint32_t info_id = ck_pr_faa_32(&g_n_site_infos, 1); + + if (info_id >= g_n_site_infos) { + cf_crash(CF_ALLOC, "site info pool exhausted"); + } + + return info_id; +} + +// Get the info ID of the site_info record for the given site ID and the current +// thread. In case the current thread doesn't yet have a site_info record for the +// given site ID, a new site_info record is allocated. + +static uint32_t +hook_get_site_info_id(uint32_t site_id) +{ + uint32_t info_id = g_thread_site_infos[site_id]; + + // This thread encountered this allocation site before. We already + // have a site info record. + + if (info_id != 0) { + return info_id; + } + + // This is the first time that this thread encounters this allocation + // site. We need to allocate a site_info record. + + info_id = hook_new_site_info_id(); + site_info *info = g_site_infos + info_id; + + info->site_id = site_id; + info->thread_id = hook_gettid(); + info->size_lo = 0; + info->size_hi = 0; + + g_thread_site_infos[site_id] = info_id; + return info_id; +} + +// Account for an allocation by the current thread for the allocation site +// with the given address. 
+ +static void +hook_handle_alloc(const void *ra, void *p, size_t sz) +{ + if (p == NULL) { + return; + } + + size_t jem_sz = jem_sallocx(p, 0); + + uint32_t site_id = hook_get_site_id(ra); + uint32_t info_id = hook_get_site_info_id(site_id); + site_info *info = g_site_infos + info_id; + + size_t size_lo = info->size_lo; + info->size_lo += jem_sz; + + // Carry? + + if (info->size_lo < size_lo) { + ++info->size_hi; + } + + uint8_t *data = (uint8_t *)p + jem_sz - sizeof(uint32_t); + uint32_t *data32 = (uint32_t *)data; + + uint8_t *mark = (uint8_t *)p + sz; + size_t delta = (size_t)(data - mark); + + // Keep 0xffff as a marker for double free detection. + + if (delta > 0xfffe) { + delta = 0; + } + + *data32 = ((site_id << 16) | (uint32_t)delta) * MULT + 1; + + for (uint32_t i = 0; i < 4 && i < delta; ++i) { + mark[i] = data[i]; + } +} + +// Account for a deallocation by the current thread for the allocation +// site with the given address. + +static void +hook_handle_free(const void *ra, void *p, size_t jem_sz) +{ + uint8_t *data = (uint8_t *)p + jem_sz - sizeof(uint32_t); + uint32_t *data32 = (uint32_t *)data; + + uint32_t val = (*data32 - 1) * MULT_INV; + uint32_t site_id = val >> 16; + uint32_t delta = val & 0xffff; + + if (site_id >= MAX_SITES) { + cf_crash(CF_ALLOC, "corruption %zu@%p RA %p, invalid site ID", jem_sz, p, ra); + } + + const void *data_ra = ck_pr_load_ptr(g_site_ras + site_id); + + if (delta == 0xffff) { + cf_crash(CF_ALLOC, "corruption %zu@%p RA %p, potential double free, possibly freed before with RA %p", + jem_sz, p, ra, data_ra); + } + + if (delta > jem_sz - sizeof(uint32_t)) { + cf_crash(CF_ALLOC, "corruption %zu@%p RA %p, invalid delta length, possibly allocated with RA %p", + jem_sz, p, ra, data_ra); + } + + uint8_t *mark = data - delta; + + for (uint32_t i = 0; i < 4 && i < delta; ++i) { + if (mark[i] != data[i]) { + cf_crash(CF_ALLOC, "corruption %zu@%p RA %p, invalid mark, possibly allocated with RA %p", + jem_sz, p, ra, data_ra); + } + } + + uint32_t info_id = hook_get_site_info_id(site_id); + site_info *info = g_site_infos + info_id; + + size_t size_lo = info->size_lo; + info->size_lo -= jem_sz; + + // Borrow? + + if (info->size_lo > size_lo) { + --info->size_hi; + } + + // Replace the allocation site with the deallocation site to facilitate + // double-free debugging. + + site_id = hook_get_site_id(ra); + + // Also invalidate the delta length, so that we are more likely to detect + // double frees. + + *data32 = ((site_id << 16) | 0xffff) * MULT + 1; + + for (uint32_t i = 0; i < 4 && i < delta; ++i) { + mark[i] = data[i]; + } +} + +static void +valgrind_check(void) +{ + // Make sure that we actually call into JEMalloc when invoking malloc(). + // + // By default, Valgrind redirects the standard allocation API functions, + // i.e., malloc(), calloc(), etc., to glibc. + // + // The problem with this is that Valgrind only redirects the standard API + // functions. It does not know about, and thus doesn't redirect, our + // non-standard functions, e.g., cf_alloc_malloc_arena(). + // + // As we use both, standard and non-standard functions, to allocate memory, + // we would end up with an inconsistent mix of allocations, some allocated + // by JEMalloc and some by glibc's allocator. + // + // Sooner or later, we will thus end up passing a memory block allocated by + // JEMalloc to free(), which Valgrind has redirected to glibc's allocator. 
+ + void *p1 = malloc(1); + free(p1); + + void *p2 = jem_malloc(1); + jem_free(p2); + + // If both of the above allocations are handled by JEMalloc, then they will + // be located in the same memory page. If, however, the first allocation is + // handled by glibc, then the memory blocks will come from two different + // memory pages. + + uint64_t page1 = (uint64_t)p1 >> 12; + uint64_t page2 = (uint64_t)p2 >> 12; + + if (page1 != page2) { + cf_crash_nostack(CF_ALLOC, "Valgrind redirected malloc() to glibc; please run Valgrind with --soname-synonyms=somalloc=nouserintercepts"); + } +} + +void +cf_alloc_init(void) +{ + valgrind_check(); + + // Turn off libstdc++'s memory caching, as it just duplicates JEMalloc's. + + if (setenv("GLIBCXX_FORCE_NEW", "1", 1) < 0) { + cf_crash(CF_ALLOC, "setenv() failed: %d (%s)", errno, cf_strerror(errno)); + } + + // Double-check that hook_get_arena() works, as it depends on JEMalloc's + // internal data structures. + + int32_t err = jem_mallctl("thread.tcache.flush", NULL, NULL, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "error while flushing thread cache: %d (%s)", err, cf_strerror(err)); + } + + for (size_t sz = 1; sz <= 16 * 1024 * 1024; sz *= 2) { + void *p = cf_alloc_malloc_arena(sz, N_ARENAS / 2); + int32_t arena = hook_get_arena(p); + + if (arena != N_ARENAS / 2) { + cf_crash(CF_ALLOC, "arena mismatch: %d vs. %d", arena, N_ARENAS / 2); + } + + free(p); + } +} + +// Restrict memory debugging. +// +// We always start out with memory debugging fully enabled (*_ALL). Then, +// once we have parsed the configuration file, we restrict it to what the +// configuration file says (e.g., *_TRANSIENT). +// +// The reason is that we can safely go from "on" to "off", but not vice +// versa. +// +// When "off", we don't add accounting info to an allocation. Now, if we +// deallocated such an allocation when "on", then we'd erroneously detect +// a corruption, because we'd try to validate accounting info that isn't +// there. 
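//
// So the intended startup sequence (a sketch) is: run fully instrumented
// until the config is parsed, then only ever narrow -- e.g., assuming the
// config asked for transient-only debugging:
//
//   cf_alloc_init(); // g_debug is still CF_ALLOC_DEBUG_ALL here
//   // ... parse configuration file ...
//   cf_alloc_set_debug(CF_ALLOC_DEBUG_TRANSIENT); // narrow, never widen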
+ +void +cf_alloc_set_debug(cf_alloc_debug debug) +{ + g_debug = debug; +} + +int32_t +cf_alloc_create_arena(void) +{ + int32_t arena; + size_t arena_len = sizeof(arena); + + int32_t err = jem_mallctl("arenas.extend", &arena, &arena_len, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to create new arena: %d (%s)", err, cf_strerror(err)); + } + + cf_debug(CF_ALLOC, "created new arena %d", arena); + return arena; +} + +void +cf_alloc_heap_stats(size_t *allocated_kbytes, size_t *active_kbytes, size_t *mapped_kbytes, + double *efficiency_pct, uint32_t *site_count) +{ + uint64_t epoch = 1; + size_t len = sizeof(epoch); + + int32_t err = jem_mallctl("epoch", &epoch, &len, &epoch, len); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to retrieve epoch: %d (%s)", err, cf_strerror(err)); + } + + size_t allocated; + len = sizeof(allocated); + + err = jem_mallctl("stats.allocated", &allocated, &len, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to retrieve stats.allocated: %d (%s)", err, cf_strerror(err)); + } + + size_t active; + len = sizeof(active); + + err = jem_mallctl("stats.active", &active, &len, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to retrieve stats.active: %d (%s)", err, cf_strerror(err)); + } + + size_t mapped; + len = sizeof(mapped); + + err = jem_mallctl("stats.mapped", &mapped, &len, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to retrieve stats.mapped: %d (%s)", err, cf_strerror(err)); + } + + if (allocated_kbytes) { + *allocated_kbytes = allocated / 1024; + } + + if (active_kbytes) { + *active_kbytes = active / 1024; + } + + if (mapped_kbytes) { + *mapped_kbytes = mapped / 1024; + } + + if (efficiency_pct) { + *efficiency_pct = mapped != 0 ? + (double)allocated * 100.0 / (double)mapped : 0.0; + } + + if (site_count) { + *site_count = ck_pr_load_32(&g_n_site_ras); + } +} + +static void +line_to_log(void *data, const char *line) +{ + (void)data; + + char buff[1000]; + size_t i; + + for (i = 0; i < sizeof(buff) - 1 && line[i] != 0 && line[i] != '\n'; ++i) { + buff[i] = line[i]; + } + + buff[i] = 0; + cf_info(CF_ALLOC, "%s", buff); +} + +static void +line_to_file(void *data, const char *line) +{ + fprintf((FILE *)data, "%s", line); +} + +static void +time_to_file(FILE *fh) +{ + time_t now = time(NULL); + + if (now == (time_t)-1) { + cf_crash(CF_ALLOC, "time() failed: %d (%s)", errno, cf_strerror(errno)); + } + + struct tm gmt; + + if (gmtime_r(&now, &gmt) == NULL) { + cf_crash(CF_ALLOC, "gmtime_r() failed"); + } + + char text[250]; + + if (strftime(text, sizeof(text), "%b %d %Y %T %Z", &gmt) == 0) { + cf_crash(CF_ALLOC, "strftime() failed"); + } + + fprintf(fh, "---------- %s ----------\n", text); +} + +void +cf_alloc_log_stats(const char *file, const char *opts) +{ + if (file == NULL) { + jem_malloc_stats_print(line_to_log, NULL, opts); + return; + } + + FILE *fh = fopen(file, "a"); + + if (fh == NULL) { + cf_warning(CF_ALLOC, "failed to open allocation stats file %s: %d (%s)", + file, errno, cf_strerror(errno)); + return; + } + + time_to_file(fh); + jem_malloc_stats_print(line_to_file, fh, opts); + fclose(fh); +} + +void +cf_alloc_log_site_infos(const char *file) +{ + FILE *fh = fopen(file, "a"); + + if (fh == NULL) { + cf_warning(CF_ALLOC, "failed to open site info file %s: %d (%s)", + file, errno, cf_strerror(errno)); + return; + } + + time_to_file(fh); + uint32_t n_site_infos = ck_pr_load_32(&g_n_site_infos); + + for (uint32_t i = 1; i < n_site_infos; ++i) { + site_info *info = g_site_infos + i; + const void *ra = 
ck_pr_load_ptr(g_site_ras + info->site_id); + fprintf(fh, "0x%016" PRIx64 " %9d 0x%016zx 0x%016zx\n", (uint64_t)ra, info->thread_id, + info->size_hi, info->size_lo); + } + + fclose(fh); +} + +static bool +is_transient(int32_t arena) +{ + // Note that this also considers -1 (i.e., the default thread arena) + // to be transient, in addition to arenas 0 .. (N_ARENAS - 1). + + return arena < N_ARENAS; +} + +static bool +want_debug(int32_t arena) +{ + switch (g_debug) { + case CF_ALLOC_DEBUG_NONE: + return false; + + case CF_ALLOC_DEBUG_TRANSIENT: + return is_transient(arena); + + case CF_ALLOC_DEBUG_PERSISTENT: + return !is_transient(arena); + + case CF_ALLOC_DEBUG_ALL: + return true; + } + + // Not reached. + return false; +} + +static int32_t +calc_free_flags(int32_t arena) +{ + // If it's a transient allocation, then simply use the default + // thread-local cache. No flags needed. Same, if we don't debug + // at all; then we can save ourselves the second cache. + + if (is_transient(arena) || g_debug == CF_ALLOC_DEBUG_NONE) { + return 0; + } + + // If it's a persistent allocation, then use the second per-thread + // cache. Add it to the flags. See calc_alloc_flags() for more on + // this second cache. + + return MALLOCX_TCACHE(g_ns_tcache); +} + +static void +do_free(void *p, const void *ra) +{ + if (p == NULL) { + return; + } + + int32_t arena = hook_get_arena(p); + int32_t flags = calc_free_flags(arena); + + if (!want_debug(arena)) { + jem_dallocx(p, flags); + return; + } + + size_t jem_sz = jem_sallocx(p, 0); + hook_handle_free(ra, p, jem_sz); + jem_sdallocx(p, jem_sz, flags); +} + +void +__attribute__ ((noinline)) +free(void *p) +{ + do_free(p, __builtin_return_address(0)); +} + +static int32_t +calc_alloc_flags(int32_t flags, int32_t arena) +{ + // Default arena and default thread-local cache. No additional flags + // needed. + + if (arena < 0) { + return flags; + } + + // We're allocating from a specific arena. Add it to the flags. + + flags |= MALLOCX_ARENA(arena); + + // If it's an arena for transient allocations, then we use the default + // thread-local cache. No additional flags needed. Same, if we don't + // debug at all; then we can save ourselves the second cache. + + if (is_transient(arena) || g_debug == CF_ALLOC_DEBUG_NONE) { + return flags; + } + + // We have a second per-thread cache for persistent allocations. In this + // way we never mix persistent allocations and transient allocations in + // the same cache. We need to keep them apart, because debugging may be + // enabled for one, but not the other. + + // Create the second per-thread cache, if we haven't already done so. + + if (g_ns_tcache < 0) { + size_t len = sizeof(g_ns_tcache); + int32_t err = jem_mallctl("tcache.create", &g_ns_tcache, &len, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to create new cache: %d (%s)", err, cf_strerror(err)); + } + } + + // Add the second (non-default) per-thread cache to the flags. + + flags |= MALLOCX_TCACHE(g_ns_tcache); + return flags; +} + +static void * +do_mallocx(size_t sz, int32_t arena, const void *ra) +{ + int32_t flags = calc_alloc_flags(0, arena); + + if (!want_debug(arena)) { + return jem_mallocx(sz == 0 ? 1 : sz, flags); + } + + size_t ext_sz = sz + sizeof(uint32_t); + + void *p = jem_mallocx(ext_sz, flags); + hook_handle_alloc(ra, p, sz); + + return p; +} + +void * +cf_alloc_try_malloc(size_t sz) +{ + // Allowed to return NULL. 
+ return do_mallocx(sz, -1, __builtin_return_address(0)); +} + +void * +cf_alloc_malloc_arena(size_t sz, int32_t arena) +{ + void *p = do_mallocx(sz, arena, __builtin_return_address(0)); + cf_assert(p, CF_ALLOC, "malloc_ns failed sz %zu arena %d", sz, arena); + return p; +} + +void * +__attribute__ ((noinline)) +malloc(size_t sz) +{ + void *p = do_mallocx(sz, -1, __builtin_return_address(0)); + cf_assert(p, CF_ALLOC, "malloc failed sz %zu", sz); + return p; +} + +static void * +do_callocx(size_t n, size_t sz, int32_t arena, const void *ra) +{ + int32_t flags = calc_alloc_flags(MALLOCX_ZERO, arena); + size_t tot_sz = n * sz; + + if (!want_debug(arena)) { + return jem_mallocx(tot_sz == 0 ? 1 : tot_sz, flags); + } + + size_t ext_sz = tot_sz + sizeof(uint32_t); + + void *p = jem_mallocx(ext_sz, flags); + hook_handle_alloc(ra, p, tot_sz); + + return p; +} + +void * +cf_alloc_calloc_arena(size_t n, size_t sz, int32_t arena) +{ + void *p = do_callocx(n, sz, arena, __builtin_return_address(0)); + cf_assert(p, CF_ALLOC, "calloc_ns failed n %zu sz %zu arena %d", n, sz, arena); + return p; +} + +void * +calloc(size_t n, size_t sz) +{ + void *p = do_callocx(n, sz, -1, __builtin_return_address(0)); + cf_assert(p, CF_ALLOC, "calloc failed n %zu sz %zu", n, sz); + return p; +} + +static void * +do_rallocx(void *p, size_t sz, int32_t arena, const void *ra) +{ + if (p == NULL) { + return do_mallocx(sz, arena, ra); + } + + hook_check_arena(p, arena); + + if (sz == 0) { + do_free(p, ra); + return NULL; + } + + int32_t flags = calc_alloc_flags(0, arena); + + if (!want_debug(arena)) { + return jem_rallocx(p, sz, flags); + } + + size_t jem_sz = jem_sallocx(p, 0); + hook_handle_free(ra, p, jem_sz); + + size_t ext_sz = sz + sizeof(uint32_t); + + void *p2 = jem_rallocx(p, ext_sz, flags); + hook_handle_alloc(ra, p2, sz); + + return p2; +} + +void * +cf_alloc_realloc_arena(void *p, size_t sz, int32_t arena) +{ + void *p2 = do_rallocx(p, sz, arena, __builtin_return_address(0)); + cf_assert(p2 || sz == 0, CF_ALLOC, "realloc_ns failed sz %zu arena %d", sz, arena); + return p2; +} + +void * +realloc(void *p, size_t sz) +{ + void *p2 = do_rallocx(p, sz, -1, __builtin_return_address(0)); + cf_assert(p2 || sz == 0, CF_ALLOC, "realloc failed sz %zu", sz); + return p2; +} + +static char * +do_strdup(const char *s, size_t n, const void *ra) +{ + size_t sz = n + 1; + size_t ext_sz = want_debug(-1) ? sz + sizeof(uint32_t) : sz; + + char *s2 = jem_mallocx(ext_sz, 0); + cf_assert(s2, CF_ALLOC, "strdup failed len %zu", n); + + if (want_debug(-1)) { + hook_handle_alloc(ra, s2, sz); + } + + memcpy(s2, s, sz); + return s2; +} + +char * +strdup(const char *s) +{ + return do_strdup(s, strlen(s), __builtin_return_address(0)); +} + +char * +strndup(const char *s, size_t n) +{ + size_t n2 = 0; + + while (n2 < n && s[n2] != 0) { + ++n2; + } + + size_t sz = n2 + 1; + size_t ext_sz = want_debug(-1) ? sz + sizeof(uint32_t) : sz; + + char *s2 = jem_mallocx(ext_sz, 0); + cf_assert(s2, CF_ALLOC, "strndup failed limit %zu", n); + + if (want_debug(-1)) { + hook_handle_alloc(__builtin_return_address(0), s2, sz); + } + + memcpy(s2, s, n2); + s2[n2] = 0; + + return s2; +} + +int32_t +asprintf(char **res, const char *form, ...) 
+{ + char buff[25000]; + + va_list va; + va_start(va, form); + + int32_t n = vsnprintf(buff, sizeof(buff), form, va); + + va_end(va); + + if ((size_t)n >= sizeof(buff)) { + cf_crash(CF_ALLOC, "asprintf overflow len %d", n); + } + + *res = do_strdup(buff, (size_t)n, __builtin_return_address(0)); + return n; +} + +int32_t +posix_memalign(void **p, size_t align, size_t sz) +{ + if (!want_debug(-1)) { + return jem_posix_memalign(p, align, sz == 0 ? 1 : sz); + } + + size_t ext_sz = sz + sizeof(uint32_t); + int32_t err = jem_posix_memalign(p, align, ext_sz); + + if (err != 0) { + return err; + } + + hook_handle_alloc(__builtin_return_address(0), *p, sz); + return 0; +} + +void * +aligned_alloc(size_t align, size_t sz) +{ + if (!want_debug(-1)) { + return jem_aligned_alloc(align, sz == 0 ? 1 : sz); + } + + size_t ext_sz = sz + sizeof(uint32_t); + + void *p = jem_aligned_alloc(align, ext_sz); + hook_handle_alloc(__builtin_return_address(0), p, sz); + + return p; +} + +static void * +do_valloc(size_t sz) +{ + if (!want_debug(-1)) { + return jem_aligned_alloc(PAGE_SZ, sz == 0 ? 1 : sz); + } + + size_t ext_sz = sz + sizeof(uint32_t); + + void *p = jem_aligned_alloc(PAGE_SZ, ext_sz); + hook_handle_alloc(__builtin_return_address(0), p, sz); + + return p; +} + +void * +valloc(size_t sz) +{ + void *p = do_valloc(sz); + cf_assert(p, CF_ALLOC, "valloc failed sz %zu", sz); + return p; +} + +void * +memalign(size_t align, size_t sz) +{ + if (!want_debug(-1)) { + return jem_aligned_alloc(align, sz == 0 ? 1 : sz); + } + + size_t ext_sz = sz + sizeof(uint32_t); + + void *p = jem_aligned_alloc(align, ext_sz); + hook_handle_alloc(__builtin_return_address(0), p, sz); + + return p; +} + +void * +pvalloc(size_t sz) +{ + (void)sz; + cf_crash(CF_ALLOC, "obsolete pvalloc() called"); + // Not reached. + return NULL; +} + +void * +cf_rc_alloc(size_t sz) +{ + size_t tot_sz = sizeof(cf_rc_header) + sz; + size_t ext_sz = want_debug(-1) ? 
tot_sz + sizeof(uint32_t) : tot_sz; + + cf_rc_header *head = jem_malloc(ext_sz); + cf_assert(head, CF_ALLOC, "rc_alloc failed sz %zu", sz); + + if (want_debug(-1)) { + hook_handle_alloc(__builtin_return_address(0), head, tot_sz); + } + + head->rc = 1; + head->sz = (uint32_t)sz; + + return head + 1; +} + +void +cf_rc_free(void *p) +{ + if (p == NULL) { + cf_crash(CF_ALLOC, "trying to cf_rc_free() null pointer"); + } + + cf_rc_header *head = (cf_rc_header *)p - 1; + + if (!want_debug(-1)) { + jem_dallocx(head, 0); + return; + } + + size_t jem_sz = jem_sallocx(head, 0); + hook_handle_free(__builtin_return_address(0), head, jem_sz); + jem_sdallocx(head, jem_sz, 0); +} + +int32_t +cf_rc_reserve(void *p) +{ + cf_rc_header *head = (cf_rc_header *)p - 1; + return cf_atomic32_incr(&head->rc); +} + +int32_t +cf_rc_release(void *p) +{ + cf_rc_header *head = (cf_rc_header *)p - 1; + int32_t rc = cf_atomic32_decr(&head->rc); + cf_assert(rc >= 0, CF_ALLOC, "reference count underflow"); + return rc; +} + +int32_t +cf_rc_releaseandfree(void *p) +{ + cf_rc_header *head = (cf_rc_header *)p - 1; + int32_t rc = cf_atomic32_decr(&head->rc); + cf_assert(rc >= 0, CF_ALLOC, "reference count underflow"); + + if (rc > 0) { + return rc; + } + + if (!want_debug(-1)) { + jem_dallocx(head, 0); + return 0; + } + + size_t jem_sz = jem_sallocx(head, 0); + hook_handle_free(__builtin_return_address(0), head, jem_sz); + jem_sdallocx(head, jem_sz, 0); + return 0; +} + +int32_t +cf_rc_count(const void *p) +{ + const cf_rc_header *head = (const cf_rc_header *)p - 1; + return (int32_t)head->rc; +} diff --git a/cf/src/arenax.c b/cf/src/arenax.c new file mode 100644 index 00000000..4bcf8131 --- /dev/null +++ b/cf/src/arenax.c @@ -0,0 +1,201 @@ +/* + * arenax.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "arenax.h" + +#include +#include +#include +#include +#include +#include + +#include "fault.h" + + +//========================================================== +// Typedefs & constants. +// + +// Must be in-sync with cf_arenax_err: +const char* ARENAX_ERR_STRINGS[] = { + "ok", + "bad parameter", + "error creating stage", + "error attaching stage", + "error detaching stage", + "unknown error" +}; + + +//========================================================== +// Public API. +// + +// Return persistent memory size needed. Excludes stages, which cf_arenax +// handles internally. +size_t +cf_arenax_sizeof() +{ + return sizeof(cf_arenax); +} + +// Convert cf_arenax_err to meaningful string. 
+const char* +cf_arenax_errstr(cf_arenax_err err) +{ + if (err < 0 || err > CF_ARENAX_ERR_UNKNOWN) { + err = CF_ARENAX_ERR_UNKNOWN; + } + + return ARENAX_ERR_STRINGS[err]; +} + +// Create a cf_arenax object in persistent memory. Also create and attach the +// first arena stage in persistent memory. +void +cf_arenax_init(cf_arenax* arena, key_t key_base, uint32_t element_size, + uint32_t stage_capacity, uint32_t max_stages, uint32_t flags) +{ + if (stage_capacity == 0) { + stage_capacity = MAX_STAGE_CAPACITY; + } + else if (stage_capacity > MAX_STAGE_CAPACITY) { + cf_crash(CF_ARENAX, "stage capacity %u too large", stage_capacity); + } + + if (max_stages == 0) { + max_stages = CF_ARENAX_MAX_STAGES; + } + else if (max_stages > CF_ARENAX_MAX_STAGES) { + cf_crash(CF_ARENAX, "max stages %u too large", max_stages); + } + + arena->key_base = key_base; + arena->element_size = element_size; + arena->stage_capacity = stage_capacity; + arena->max_stages = max_stages; + arena->flags = flags; + + arena->stage_size = (size_t)stage_capacity * element_size; + + arena->free_h = 0; + + // Skip 0:0 so null handle is never used. + arena->at_stage_id = 0; + arena->at_element_id = 1; + + if ((flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_init(&arena->lock, NULL); + } + + arena->stage_count = 0; + memset(arena->stages, 0, sizeof(arena->stages)); + + // Add first stage. + if (cf_arenax_add_stage(arena) != CF_ARENAX_OK) { + cf_crash(CF_ARENAX, "failed to add first stage"); + } + + // Clear the null element - allocation bypasses it, but it may be read. + memset(cf_arenax_resolve(arena, 0), 0, element_size); +} + +// Allocate an element within the arena. +cf_arenax_handle +cf_arenax_alloc(cf_arenax* arena) +{ + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_lock(&arena->lock); + } + + cf_arenax_handle h; + + // Check free list first. + if (arena->free_h != 0) { + h = arena->free_h; + + free_element* p_free_element = cf_arenax_resolve(arena, h); + + arena->free_h = p_free_element->next_h; + } + // Otherwise keep end-allocating. + else { + if (arena->at_element_id >= arena->stage_capacity) { + if (cf_arenax_add_stage(arena) != CF_ARENAX_OK) { + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_unlock(&arena->lock); + } + + return 0; + } + + arena->at_stage_id++; + arena->at_element_id = 0; + } + + cf_arenax_set_handle(&h, arena->at_stage_id, arena->at_element_id); + + arena->at_element_id++; + } + + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_unlock(&arena->lock); + } + + if ((arena->flags & CF_ARENAX_CALLOC) != 0) { + memset(cf_arenax_resolve(arena, h), 0, arena->element_size); + } + + return h; +} + +// Free an element. +void +cf_arenax_free(cf_arenax* arena, cf_arenax_handle h) +{ + free_element* p_free_element = cf_arenax_resolve(arena, h); + + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_lock(&arena->lock); + } + + p_free_element->magic = FREE_MAGIC; + p_free_element->next_h = arena->free_h; + arena->free_h = h; + + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_unlock(&arena->lock); + } +} + +// Convert cf_arenax_handle to memory address. +void* +cf_arenax_resolve(cf_arenax* arena, cf_arenax_handle h) +{ + return arena->stages[h >> ELEMENT_ID_NUM_BITS] + + ((h & ELEMENT_ID_MASK) * arena->element_size); +} diff --git a/cf/src/arenax_ce.c b/cf/src/arenax_ce.c new file mode 100644 index 00000000..fd6d4571 --- /dev/null +++ b/cf/src/arenax_ce.c @@ -0,0 +1,59 @@ +/* + * arenax_cold.c + * + * Copyright (C) 2014 Aerospike, Inc. 
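//
// Usage sketch for the cf_arenax API above (illustrative; my_element and
// key_base are assumptions, and the arena block itself -- cf_arenax_sizeof()
// bytes -- would come from persistent memory in practice):
//
//   cf_arenax_init(arena, key_base, sizeof(my_element), 0, 0,
//           CF_ARENAX_BIGLOCK); // the zeros select the default capacities
//
//   cf_arenax_handle h = cf_arenax_alloc(arena);
//
//   if (h != 0) { // 0 means allocation failed - couldn't add a stage
//           my_element *e = (my_element *)cf_arenax_resolve(arena, h);
//           // ... use e ...
//           cf_arenax_free(arena, h);
//   }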
+ * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "arenax.h" + +#include +#include "citrusleaf/alloc.h" +#include "fault.h" + + +//========================================================== +// Private API - for enterprise separation only. +// + +// Allocate an arena stage, and store its pointer in the stages array. +cf_arenax_err +cf_arenax_add_stage(cf_arenax* arena) +{ + if (arena->stage_count >= arena->max_stages) { + cf_warning(CF_ARENAX, "can't allocate more than %u arena stages", + arena->max_stages); + return CF_ARENAX_ERR_STAGE_CREATE; + } + + uint8_t* p_stage = (uint8_t*)cf_try_malloc(arena->stage_size); + + if (! p_stage) { + cf_warning(CF_ARENAX, "could not allocate %zu-byte arena stage %u", + arena->stage_size, arena->stage_count); + return CF_ARENAX_ERR_STAGE_CREATE; + } + + arena->stages[arena->stage_count++] = p_stage; + + return CF_ARENAX_OK; +} diff --git a/cf/src/cf_mutex.c b/cf/src/cf_mutex.c new file mode 100644 index 00000000..84777eb2 --- /dev/null +++ b/cf/src/cf_mutex.c @@ -0,0 +1,175 @@ +/* + * cf_mutex.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + +//========================================================== +// Includes. +// + +#include + +#include +#include +#include +#include + +#include +#include + +#include "fault.h" + + +//========================================================== +// Typedefs & constants. +// + +#define FUTEX_SPIN_MAX 100 + + +//========================================================== +// Inlines & macros. +// + +inline static void +sys_futex(void *uaddr, int op, int val) +{ + syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0); +} + +#define xchg(__ptr, __val) __sync_lock_test_and_set(__ptr, __val) +#define cmpxchg(__ptr, __cmp, __set) __sync_val_compare_and_swap(__ptr, __cmp, __set) +#define cpu_relax() asm volatile("pause\n": : :"memory") +#define unlikely(__expr) __builtin_expect(!! (__expr), 0) +#define likely(__expr) __builtin_expect(!! 
(__expr), 1) + + +//========================================================== +// Public API - cf_mutex. +// + +void +cf_mutex_lock(cf_mutex *m) +{ + if (likely(cmpxchg((uint32_t *)m, 0, 1) == 0)) { + return; // was not locked + } + + if (m->u32 == 2) { + sys_futex(m, FUTEX_WAIT_PRIVATE, 2); + } + + while (xchg((uint32_t *)m, 2) != 0) { + sys_futex(m, FUTEX_WAIT_PRIVATE, 2); + } +} + +void +cf_mutex_unlock(cf_mutex *m) +{ + uint32_t check = xchg((uint32_t *)m, 0); + + if (unlikely(check == 2)) { + sys_futex(m, FUTEX_WAKE_PRIVATE, 1); + } + else if (unlikely(check == 0)) { + cf_crash(CF_MISC, "cf_mutex_unlock() on already unlocked mutex"); + } +} + +// Return true if lock success. +bool +cf_mutex_trylock(cf_mutex *m) +{ + if (cmpxchg((uint32_t *)m, 0, 1) == 0) { + return true; // was not locked + } + + return false; +} + +void +cf_mutex_lock_spin(cf_mutex *m) +{ + for (int i = 0; i < FUTEX_SPIN_MAX; i++) { + if (cmpxchg((uint32_t *)m, 0, 1) == 0) { + return; // was not locked + } + + cpu_relax(); + } + + if (m->u32 == 2) { + sys_futex(m, FUTEX_WAIT_PRIVATE, 2); + } + + while (xchg((uint32_t *)m, 2) != 0) { + sys_futex(m, FUTEX_WAIT_PRIVATE, 2); + } +} + +void +cf_mutex_unlock_spin(cf_mutex *m) +{ + uint32_t check = xchg((uint32_t *)m, 0); + + if (unlikely(check == 2)) { + // Spin and hope someone takes the lock. + for (int i = 0; i < FUTEX_SPIN_MAX; i++) { + if (m->u32 != 0) { + if (cmpxchg((uint32_t *)m, 1, 2) == 0) { + break; + } + + return; // someone else took the lock + } + + cpu_relax(); + } + + sys_futex(m, FUTEX_WAKE_PRIVATE, 1); + } + else if (unlikely(check == 0)) { + cf_crash(CF_MISC, "cf_mutex_unlock_spin() on already unlocked mutex"); + } +} + + +//========================================================== +// Public API - cf_condition. +// + +void +cf_condition_wait(cf_condition *c, cf_mutex *m) +{ + uint32_t seq = c->seq; + + cf_mutex_unlock(m); + sys_futex(&c->seq, FUTEX_WAIT_PRIVATE, seq); + cf_mutex_lock(m); +} + +void +cf_condition_signal(cf_condition *c) +{ + __sync_fetch_and_add(&c->seq, 1); + sys_futex(&c->seq, FUTEX_WAKE_PRIVATE, 1); +} diff --git a/cf/src/cf_str.c b/cf/src/cf_str.c new file mode 100644 index 00000000..57a465c8 --- /dev/null +++ b/cf/src/cf_str.c @@ -0,0 +1,419 @@ +/* + * cf_str.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +/* + * String helper functions + * + */ + +#include "cf_str.h" + +#include +#include +#include +#include + +#include + + +static char itoa_table[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N' }; + +// return 0 on success, -1 on fail +int cf_str_atoi(char *s, int *value) +{ + int i = 0; + bool neg = false; + + if (*s == '-') { neg = true; s++; } + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'k': + case 'K': + i *= 1024L; + s++; + break; + case 'M': + case 'm': + i *= (1024L * 1024L); + s++; + break; + case 'G': + case 'g': + i *= (1024L * 1024L * 1024L); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = neg ? -i : i; + return(0); +} + +// return 0 on success, -1 on fail +int cf_str_atoi_u32(char *s, unsigned int *value) +{ + unsigned int i = 0; + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'k': + case 'K': + i *= 1024L; + s++; + break; + case 'M': + case 'm': + i *= (1024L * 1024L); + s++; + break; + case 'G': + case 'g': + i *= (1024L * 1024L * 1024L); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = i; + return(0); +} + +int cf_str_atoi_64(char *s, int64_t *value) +{ + int64_t i = 0; + bool neg = false; + + if (*s == '-') { neg = true; s++; } + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'k': + case 'K': + i *= 1024L; + s++; + break; + case 'M': + case 'm': + i *= (1024L * 1024L); + s++; + break; + case 'G': + case 'g': + i *= (1024L * 1024L * 1024L); + s++; + break; + case 'T': + case 't': + i *= (1024L * 1024L * 1024L * 1024L); + s++; + break; + case 'P': + case 'p': + i *= (1024L * 1024L * 1024L * 1024L * 1024L); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = neg ? -i : i; + return(0); +} + +int cf_str_atoi_u64(char *s, uint64_t *value) +{ + uint64_t i = 0; + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'k': + case 'K': + i *= 1024L; + s++; + break; + case 'M': + case 'm': + i *= (1024L * 1024L); + s++; + break; + case 'G': + case 'g': + i *= (1024L * 1024L * 1024L); + s++; + break; + case 'T': + case 't': + i *= (1024L * 1024L * 1024L * 1024L); + s++; + break; + case 'P': + case 'p': + i *= (1024L * 1024L * 1024L * 1024L * 1024L); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = i; + return(0); +} + +int cf_str_atoi_x64(const char *s, uint64_t *value) +{ + if (! ((*s >= '0' && *s <= '9') || + (*s >= 'a' && *s <= 'f') || + (*s >= 'A' && *s <= 'F'))) { + return -1; + } + + char* tail = NULL; + uint64_t i = strtoul(s, &tail, 16); + + // Check for overflow. + if (errno == ERANGE) { + return -1; + } + + // Don't allow trailing non-hex characters. + if (tail && *tail != 0) { + return -1; + } + + *value = i; + + return 0; +} + +int cf_str_atoi_seconds(char *s, uint64_t *value) +{ + // Special case: treat -1 the same as 0. 
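+ // Worked examples, derived from the parsing below: "-1" -> 0, "30" -> 30, + // "5m" -> 300, "2h" -> 7200, "1d" -> 86400.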
+ if (*s == '-' && *(s + 1) == '1') { + *value = 0; + return 0; + } + + uint64_t i = 0; + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'S': + case 's': + s++; + break; + case 'M': + case 'm': + i *= 60; + s++; + break; + case 'H': + case 'h': + i *= (60 * 60); + s++; + break; + case 'D': + case 'd': + i *= (60 * 60 * 24); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = i; + return(0); +} + + +unsigned int +cf_str_itoa(int _value, char *_s, int _radix) +{ + // special case is the easy way + if (_value == 0) { + _s[0] = itoa_table[0]; + _s[1] = 0; + return(1); + } + + // Account for negatives + unsigned int sign_len = 0; + if (_value < 0) { + *_s++ = '-'; + _value = - _value; + sign_len = 1; + } + int _v = _value; + unsigned int _nd = 0; + while (_v) { + _nd++; + _v /= _radix; + } + + unsigned int rv = sign_len + _nd; + _s[_nd] = 0; + while (_nd) { + _nd --; + _s[_nd ] = itoa_table [ _value % _radix ]; + _value = _value / _radix; + } + return(rv); +} + +unsigned int +cf_str_itoa_u64(uint64_t _value, char *_s, int _radix) +{ + // special case is the easy way + if (_value == 0) { + _s[0] = itoa_table[0]; + _s[1] = 0; + return(1); + } + + uint64_t _v = _value; + unsigned int _nd = 0; + while (_v) { + _nd++; + _v /= _radix; + } + + unsigned int rv = _nd; + _s[_nd] = 0; + while (_nd) { + _nd --; + _s[_nd ] = itoa_table [ _value % _radix ]; + _value = _value / _radix; + } + return(rv); +} + +unsigned int +cf_str_itoa_u32(uint32_t _value, char *_s, int _radix) +{ + // special case is the easy way + if (_value == 0) { + _s[0] = itoa_table[0]; + _s[1] = 0; + return(1); + } + + uint32_t _v = _value; + unsigned int _nd = 0; + while (_v) { + _nd++; + _v /= _radix; + } + + unsigned int rv = _nd; + _s[_nd] = 0; + while (_nd) { + _nd --; + _s[_nd ] = itoa_table [ _value % _radix ]; + _value = _value / _radix; + } + return(rv); +} + +#define ATOI_ILLEGAL -1 + + +static int8_t atoi_table[] = { +/* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */ +/* 00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +/* 10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +/* 20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +/* 30 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, +/* 40 */ -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +/* 50 */ 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, -1, -1, -1, -1, +/* 60 */ -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +/* 70 */ 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, -1, -1, -1, -1 }; + + +int +cf_str_atoi_u64_x(char *s, uint64_t *value, int radix) +{ + uint64_t i = 0; + while (*s) { + if (*s < 0) return(-1); + int8_t cv = atoi_table[(uint8_t)*s]; + if (cv < 0 || cv >= radix) return(-1); + i *= radix; + i += cv; + s++; + } + *value = i; + return(0); +} + + + +void +cf_str_split(char *fmt, char *str, cf_vector *v) +{ + char c; + char *prev = str; + while ((c = *str)) { + for (uint32_t j = 0; fmt[j]; j++) { + if (fmt[j] == c) { + *str = 0; + cf_vector_append(v, &prev); + prev = str+1; + break; + } + } + str++; + } + if (prev != str) + cf_vector_append(v, &prev); +} diff --git a/cf/src/daemon.c b/cf/src/daemon.c new file mode 100644 index 00000000..bd59d9ab --- /dev/null +++ b/cf/src/daemon.c @@ -0,0 +1,167 @@ +/* + * daemon.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. 
under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * process utilities + */ + +#include "daemon.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fault.h" + +extern int capset(cap_user_header_t header, cap_user_data_t data); + + +static bool g_hold_caps = false; +static bool g_clear_caps = false; + + +void +cf_process_privsep(uid_t uid, gid_t gid) +{ + if (0 != getuid() || (uid == getuid() && gid == getgid())) { + return; + } + + // If appropriate, make all capabilities survive the UID/GID switch. + if (g_hold_caps) { + if (0 > prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0)) { + cf_crash(CF_MISC, "prctl: %s", cf_strerror(errno)); + } + + g_clear_caps = true; + } + + // Drop all auxiliary groups. + if (0 > setgroups(0, (const gid_t *)0)) { + cf_crash(CF_MISC, "setgroups: %s", cf_strerror(errno)); + } + + // Change privileges. + if (0 > setgid(gid)) { + cf_crash(CF_MISC, "setgid: %s", cf_strerror(errno)); + } + + if (0 > setuid(uid)) { + cf_crash(CF_MISC, "setuid: %s", cf_strerror(errno)); + } +} + + +// TODO - if we get more customers of this API, we could switch to either using +// a 'hold counter', or a more involved scheme where individual capabilities can +// be kept and revoked. + +void +cf_process_holdcap(void) +{ + g_hold_caps = true; +} + + +void +cf_process_clearcap(void) +{ + if (! g_clear_caps) { + return; + } + + struct __user_cap_header_struct cap_head = { + .version = _LINUX_CAPABILITY_VERSION_2 + }; + + struct __user_cap_data_struct cap_data[2] = { { 0 } }; + + if (0 > capset(&cap_head, cap_data)) { + cf_crash(CF_MISC, "capset: %s", cf_strerror(errno)); + } +} + + +// Daemonize the server - fork a new child process and exit the parent process. +// Close all the file descriptors opened except the ones specified in the +// fd_ignore_list. Redirect console messages to a file. +void +cf_process_daemonize(int *fd_ignore_list, int list_size) +{ + int FD, j; + char cfile[128]; + pid_t p; + + // Fork ourselves, then let the parent expire. + if (-1 == (p = fork())) { + cf_crash(CF_MISC, "couldn't fork: %s", cf_strerror(errno)); + } + + if (0 != p) { + // Prefer _exit() over exit(), as we don't want the parent to + // do any cleanups. + _exit(0); + } + + // Get a new session. + if (-1 == setsid()) { + cf_crash(CF_MISC, "couldn't set session: %s", cf_strerror(errno)); + } + + // Drop all the file descriptors except the ones in fd_ignore_list. + for (int i = getdtablesize(); i > 2; i--) { + for (j = 0; j < list_size; j++) { + if (fd_ignore_list[j] == i) { + break; + } + } + + if (j == list_size) { + close(i); + } + } + + // Open a temporary file for console message redirection. 
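+ // (e.g. /tmp/aerospike-console.12345 for PID 12345 - stdin, stdout and + // stderr are all dup2()'d onto it below.)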
+ snprintf(cfile, 128, "/tmp/aerospike-console.%d", getpid()); + + if (-1 == (FD = open(cfile, O_WRONLY|O_CREAT|O_APPEND, S_IRUSR|S_IWUSR))) { + cf_crash(CF_MISC, "couldn't open console redirection file %s: %s", cfile, cf_strerror(errno)); + } + + if (-1 == chmod(cfile, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH)) { + cf_crash(CF_MISC, "couldn't set mode on console redirection file %s: %s", cfile, cf_strerror(errno)); + } + + // Redirect stdout, stderr, and stdin to the console file. + for (int i = 0; i < 3; i++) { + if (-1 == dup2(FD, i)) { + cf_crash(CF_MISC, "couldn't duplicate FD: %s", cf_strerror(errno)); + } + } +} diff --git a/cf/src/dynbuf.c b/cf/src/dynbuf.c new file mode 100644 index 00000000..18b69c52 --- /dev/null +++ b/cf/src/dynbuf.c @@ -0,0 +1,534 @@ +/* + * dynbuf.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "dynbuf.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include "cf_str.h" + + +#define MAX_BACKOFF (1024 * 256) + +size_t +get_new_size(int alloc, int used, int requested) +{ + if (alloc - used > requested) { + return alloc; + } + + size_t new_sz = alloc + requested + sizeof(cf_buf_builder); + int backoff; + + if (new_sz < 1024 * 8) { + backoff = 1024; + } + else if (new_sz < 1024 * 32) { + backoff = 1024 * 4; + } + else if (new_sz < 1024 * 128) { + backoff = 1024 * 32; + } + else { + backoff = MAX_BACKOFF; + } + + return new_sz + (backoff - (new_sz % backoff)); +} + +void +cf_dyn_buf_reserve_internal(cf_dyn_buf *db, size_t sz) +{ + size_t new_sz = get_new_size(db->alloc_sz, db->used_sz, sz); + + if (new_sz > db->alloc_sz) { + uint8_t *_t; + + if (db->is_stack) { + _t = cf_malloc(new_sz); + memcpy(_t, db->buf, db->used_sz); + db->is_stack = false; + } + else { + _t = cf_realloc(db->buf, new_sz); + } + + db->buf = _t; + db->alloc_sz = new_sz; + } +} + +#define DB_RESERVE(_n) \ + if (db->alloc_sz - db->used_sz < _n) { \ + cf_dyn_buf_reserve_internal(db, _n); \ + } + +void +cf_dyn_buf_init_heap(cf_dyn_buf *db, size_t sz) +{ + db->buf = cf_malloc(sz); + db->is_stack = false; + db->alloc_sz = sz; + db->used_sz = 0; +} + +void +cf_dyn_buf_reserve(cf_dyn_buf *db, size_t sz, uint8_t **from) +{ + DB_RESERVE(sz); + + if (from) { + *from = &db->buf[db->used_sz]; + } + + db->used_sz += sz; +} + +void +cf_dyn_buf_append_buf(cf_dyn_buf *db, uint8_t *buf, size_t sz) +{ + DB_RESERVE(sz); + memcpy(&db->buf[db->used_sz], buf, sz); + db->used_sz += sz; +} + +void +cf_dyn_buf_append_string(cf_dyn_buf *db, const char *s) +{ + size_t len = strlen(s); + + DB_RESERVE(len); + memcpy(&db->buf[db->used_sz], s, len); + db->used_sz += len; +} + +void +cf_dyn_buf_append_char(cf_dyn_buf *db, char c) +{ + DB_RESERVE(1); + db->buf[db->used_sz] = (uint8_t)c; + db->used_sz++; +} + +void 
+cf_dyn_buf_append_bool(cf_dyn_buf *db, bool b) +{ + if (b) { + DB_RESERVE(4); + memcpy(&db->buf[db->used_sz], "true", 4); + db->used_sz += 4; + } + else { + DB_RESERVE(5); + memcpy(&db->buf[db->used_sz], "false", 5); + db->used_sz += 5; + } +} + +void +cf_dyn_buf_append_int(cf_dyn_buf *db, int i) +{ + DB_RESERVE(12); + db->used_sz += cf_str_itoa(i, (char *)&db->buf[db->used_sz], 10); +} + +void +cf_dyn_buf_append_uint64_x(cf_dyn_buf *db, uint64_t i) +{ + DB_RESERVE(18); + db->used_sz += cf_str_itoa_u64(i, (char *)&db->buf[db->used_sz], 16); +} + +void +cf_dyn_buf_append_uint64(cf_dyn_buf *db, uint64_t i) +{ + DB_RESERVE(22); + db->used_sz += cf_str_itoa_u64(i, (char *)&db->buf[db->used_sz], 10); +} + +void +cf_dyn_buf_append_uint32(cf_dyn_buf *db, uint32_t i) +{ + DB_RESERVE(12); + db->used_sz += cf_str_itoa_u32(i, (char *)&db->buf[db->used_sz], 10); +} + +void +cf_dyn_buf_chomp(cf_dyn_buf *db) +{ + if (db->used_sz > 0) { + db->used_sz--; + } +} + +char * +cf_dyn_buf_strdup(cf_dyn_buf *db) +{ + if (db->used_sz == 0) { + return NULL; + } + + char *s = cf_malloc(db->used_sz + 1); + + memcpy(s, db->buf, db->used_sz); + s[db->used_sz] = 0; + + return s; +} + +void +cf_dyn_buf_free(cf_dyn_buf *db) +{ + if (! db->is_stack && db->buf) { + cf_free(db->buf); + } +} + +// Helpers to append name value pairs to a cf_dyn_buf in pattern: name=value; + +void +info_append_bool(cf_dyn_buf *db, const char *name, bool value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_bool(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_int(cf_dyn_buf *db, const char *name, int value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_int(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_string(cf_dyn_buf *db, const char *name, const char *value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_string(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_string_safe(cf_dyn_buf *db, const char *name, const char *value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_string(db, value ? value : "null"); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_uint32(cf_dyn_buf *db, const char *name, uint32_t value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint32(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_uint64(cf_dyn_buf *db, const char *name, uint64_t value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint64(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_uint64_x(cf_dyn_buf *db, const char *name, uint64_t value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint64_x(db, value); + cf_dyn_buf_append_char(db, ';'); +} + + + +void +cf_buf_builder_reserve_internal(cf_buf_builder **bb_r, size_t sz) +{ + cf_buf_builder *bb = *bb_r; + size_t new_sz = get_new_size(bb->alloc_sz, bb->used_sz, sz); + + if (new_sz > bb->alloc_sz) { + if (bb->alloc_sz - bb->used_sz < MAX_BACKOFF) { + bb = cf_realloc(bb, new_sz); + } + else { + // Only possible if buffer was reset. Avoids potential expensive + // copy within realloc. 
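+ // (realloc() would copy the whole alloc_sz region, including the unused + // tail above used_sz, while malloc() plus a memcpy() of used_sz copies + // only the live bytes.)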
+ cf_buf_builder *_t = cf_malloc(new_sz); + + memcpy(_t->buf, bb->buf, bb->used_sz); + _t->used_sz = bb->used_sz; + cf_free(bb); + bb = _t; + } + + bb->alloc_sz = new_sz - sizeof(cf_buf_builder); + *bb_r = bb; + } +} + +#define BB_RESERVE(_n) \ + if ((*bb_r)->alloc_sz - (*bb_r)->used_sz < _n) { \ + cf_buf_builder_reserve_internal(bb_r, _n); \ + } + +void +cf_buf_builder_append_buf(cf_buf_builder **bb_r, uint8_t *buf, size_t sz) +{ + BB_RESERVE(sz); + cf_buf_builder *bb = *bb_r; + memcpy(&bb->buf[bb->used_sz], buf, sz); + bb->used_sz += sz; +} + +void +cf_buf_builder_append_string(cf_buf_builder **bb_r, const char *s) +{ + size_t len = strlen(s); + BB_RESERVE(len); + cf_buf_builder *bb = *bb_r; + memcpy(&bb->buf[bb->used_sz], s, len); + bb->used_sz += len; +} + +void +cf_buf_builder_append_char(cf_buf_builder **bb_r, char c) +{ + BB_RESERVE(1); + cf_buf_builder *bb = *bb_r; + bb->buf[bb->used_sz] = (uint8_t)c; + bb->used_sz++; +} + +void +cf_buf_builder_append_ascii_int(cf_buf_builder **bb_r, int i) +{ + BB_RESERVE(12); + cf_buf_builder *bb = *bb_r; + bb->used_sz += cf_str_itoa(i, (char *)&bb->buf[bb->used_sz], 10); +} + +void +cf_buf_builder_append_ascii_uint64_x(cf_buf_builder **bb_r, uint64_t i) +{ + BB_RESERVE(18); + cf_buf_builder *bb = *bb_r; + bb->used_sz += cf_str_itoa_u64(i, (char *)&bb->buf[bb->used_sz], 16); +} + +void +cf_buf_builder_append_ascii_uint64(cf_buf_builder **bb_r, uint64_t i) +{ + BB_RESERVE(12); + cf_buf_builder *bb = *bb_r; + bb->used_sz += cf_str_itoa_u64(i, (char *)&bb->buf[bb->used_sz], 10); +} + +void +cf_buf_builder_append_ascii_uint32(cf_buf_builder **bb_r, uint32_t i) +{ + BB_RESERVE(12); + cf_buf_builder *bb = *bb_r; + bb->used_sz += cf_str_itoa_u32(i, (char *)&bb->buf[bb->used_sz], 10); +} + +void +cf_buf_builder_append_uint64(cf_buf_builder **bb_r, uint64_t i) +{ + BB_RESERVE(8); + cf_buf_builder *bb = *bb_r; + uint64_t *i_p = (uint64_t *)&bb->buf[bb->used_sz]; + *i_p = __swab64(i); + bb->used_sz += 8; +} + +void +cf_buf_builder_append_uint32(cf_buf_builder **bb_r, uint32_t i) +{ + BB_RESERVE(4); + cf_buf_builder *bb = *bb_r; + uint32_t *i_p = (uint32_t *)&bb->buf[bb->used_sz]; + *i_p = htonl(i); + bb->used_sz += 4; +} + +void +cf_buf_builder_append_uint16(cf_buf_builder **bb_r, uint16_t i) +{ + BB_RESERVE(2); + cf_buf_builder *bb = *bb_r; + uint16_t *i_p = (uint16_t *)&bb->buf[bb->used_sz]; + *i_p = htons(i); + bb->used_sz += 2; +} + +void +cf_buf_builder_append_uint8(cf_buf_builder **bb_r, uint8_t i) +{ + BB_RESERVE(1); + cf_buf_builder *bb = *bb_r; + bb->buf[bb->used_sz] = i; + bb->used_sz ++; +} + +void +cf_buf_builder_reserve(cf_buf_builder **bb_r, int sz, uint8_t **buf) +{ + BB_RESERVE(sz); + cf_buf_builder *bb = *bb_r; + + if (buf) { + *buf = &bb->buf[bb->used_sz]; + } + + bb->used_sz += sz; +} + +int +cf_buf_builder_size(cf_buf_builder *bb) +{ + return bb->alloc_sz + sizeof(cf_buf_builder); +} + +void +cf_buf_builder_chomp(cf_buf_builder *bb) +{ + if (bb->used_sz > 0) { + bb->used_sz--; + } +} + +char * +cf_buf_builder_strdup(cf_buf_builder *bb) +{ + if (bb->used_sz == 0) { + return NULL; + } + + char *s = cf_malloc(bb->used_sz+1); + + memcpy(s, bb->buf, bb->used_sz); + s[bb->used_sz] = 0; + + return s; +} + +cf_buf_builder * +cf_buf_builder_create() +{ + cf_buf_builder *bb = cf_malloc(1024); + + bb->alloc_sz = 1024 - sizeof(cf_buf_builder); + bb->used_sz = 0; + + return bb; +} + +cf_buf_builder * +cf_buf_builder_create_size(size_t sz) +{ + size_t malloc_sz = (sz < 1024) ? 
1024 : sz; + cf_buf_builder *bb = cf_malloc(malloc_sz); + + bb->alloc_sz = malloc_sz - sizeof(cf_buf_builder); + bb->used_sz = 0; + + return bb; +} + +void +cf_buf_builder_free(cf_buf_builder *bb) +{ + cf_free(bb); +} + +void +cf_buf_builder_reset(cf_buf_builder *bb) +{ + bb->used_sz = 0; +} + + + +// TODO - We've only implemented a few cf_ll_buf methods for now. We'll add more +// functionality if and when it's needed. + +void +cf_ll_buf_grow(cf_ll_buf *llb, size_t sz) +{ + size_t buf_sz = sz > llb->head->buf_sz ? sz : llb->head->buf_sz; + cf_ll_buf_stage *new_tail = cf_malloc(sizeof(cf_ll_buf_stage) + buf_sz); + + new_tail->next = NULL; + new_tail->buf_sz = buf_sz; + new_tail->used_sz = 0; + + llb->tail->next = new_tail; + llb->tail = new_tail; +} + +#define LLB_RESERVE(_n) \ + if (_n > llb->tail->buf_sz - llb->tail->used_sz) { \ + cf_ll_buf_grow(llb, _n); \ + } + +void +cf_ll_buf_reserve(cf_ll_buf *llb, size_t sz, uint8_t **from) +{ + LLB_RESERVE(sz); + + if (from) { + *from = llb->tail->buf + llb->tail->used_sz; + } + + llb->tail->used_sz += sz; +} + +void +cf_ll_buf_free(cf_ll_buf *llb) +{ + cf_ll_buf_stage *cur = llb->head_is_stack ? llb->head->next : llb->head; + + while (cur) { + cf_ll_buf_stage *temp = cur; + + cur = cur->next; + cf_free(temp); + } +} diff --git a/cf/src/fault.c b/cf/src/fault.c new file mode 100644 index 00000000..793a5dc0 --- /dev/null +++ b/cf/src/fault.c @@ -0,0 +1,1138 @@ +/* + * fault.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "fault.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aerospike/as_log.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_b64.h" + +#include "shash.h" + + +/* + * Maximum length for logging binary (i.e., hexadecimal or bit string) data. 
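+ * (Longer values are truncated before formatting - see the len clamp in + * cf_fault_event2() below.)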
+ */ +#define MAX_BINARY_BUF_SZ (64 * 1024) + +#define SINK_OPEN_FLAGS (O_WRONLY | O_CREAT | O_NONBLOCK | O_APPEND) +#define SINK_OPEN_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) + +/* cf_fault_context_strings, cf_fault_severity_strings, cf_fault_scope_strings + * Strings describing fault states */ + +/* MUST BE KEPT IN SYNC WITH FAULT.H */ + +char *cf_fault_context_strings[] = { + "misc", + + "alloc", + "arenax", + "hardware", + "msg", + "rbuffer", + "socket", + "tls", + "vmapx", + + "aggr", + "appeal", + "as", + "batch", + "bin", + "config", + "clustering", + "compression", + "demarshal", + "drv_ssd", + "exchange", + "fabric", + "geo", + "hb", + "hlc", + "index", + "info", + "info-port", + "job", + "migrate", + "mon", + "namespace", + "nsup", + "particle", + "partition", + "paxos", + "predexp", + "proto", + "proxy", + "proxy-divert", + "query", + "record", + "roster", + "rw", + "rw-client", + "scan", + "security", + "sindex", + "skew", + "smd", + "storage", + "truncate", + "tsvc", + "udf", + "xdr" +}; + +COMPILER_ASSERT(sizeof(cf_fault_context_strings) / sizeof(char*) == CF_FAULT_CONTEXT_UNDEF); + +static const char *cf_fault_severity_strings[] = { + "CRITICAL", + "WARNING", + "INFO", + "DEBUG", + "DETAIL" +}; + +COMPILER_ASSERT(sizeof(cf_fault_severity_strings) / sizeof(const char*) == CF_FAULT_SEVERITY_UNDEF); + +cf_fault_sink cf_fault_sinks[CF_FAULT_SINKS_MAX]; +cf_fault_severity cf_fault_filter[CF_FAULT_CONTEXT_UNDEF]; +int cf_fault_sinks_inuse = 0; +int num_held_fault_sinks = 0; + +cf_shash *g_ticker_hash = NULL; +#define CACHE_MSG_MAX_SIZE 128 + +typedef struct cf_fault_cache_hkey_s { + // Members most likely to be unique come first: + int line; + cf_fault_context context; + const char *file_name; + cf_fault_severity severity; + char msg[CACHE_MSG_MAX_SIZE]; +} __attribute__((__packed__)) cf_fault_cache_hkey; + +bool g_use_local_time = false; + +static bool g_log_millis = false; + +// Filter stderr logging at this level when there are no sinks: +#define NO_SINKS_LIMIT CF_WARNING + +static inline const char* +severity_tag(cf_fault_severity severity) +{ + return severity == CF_CRITICAL ? + "FAILED ASSERTION" : cf_fault_severity_strings[severity]; +} + +/* cf_context_at_severity + * Return whether the given context is set to this severity level or higher. */ +bool +cf_context_at_severity(const cf_fault_context context, const cf_fault_severity severity) +{ + return (severity <= cf_fault_filter[context]); +} + +static inline void +cf_fault_set_severity(const cf_fault_context context, const cf_fault_severity severity) +{ + cf_fault_filter[context] = severity; + + // UDF logging relies on the common as_log facility. + // Set as_log_level whenever AS_UDF severity changes. + if (context == AS_UDF && severity < CF_FAULT_SEVERITY_UNDEF) { + as_log_set_level((as_log_level)severity); + } +} + +static inline uint32_t +cache_hash_fn(const void *key) +{ + return (uint32_t)((const cf_fault_cache_hkey*)key)->line + + *(uint32_t*)((const cf_fault_cache_hkey*)key)->msg; +} + +/* cf_fault_init + * This code MUST be the first thing executed by main(). */ +void +cf_fault_init() +{ + // Initialize the fault filter. + for (int j = 0; j < CF_FAULT_CONTEXT_UNDEF; j++) { + // We start with no sinks, so let's be in-sync with that. + cf_fault_set_severity(j, NO_SINKS_LIMIT); + } + + // Create the ticker hash. 
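+ // (The hash backs the "(repeated:N)" de-duplication done by + // cf_fault_cache_event(), and is drained by cf_fault_dump_cache() below.)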
+ g_ticker_hash = cf_shash_create(cache_hash_fn, sizeof(cf_fault_cache_hkey), + sizeof(uint32_t), 256, CF_SHASH_MANY_LOCK); +} + + +/* cf_fault_sink_add + * Register a sink for faults */ +cf_fault_sink * +cf_fault_sink_add(char *path) +{ + cf_fault_sink *s; + + if ((CF_FAULT_SINKS_MAX - 1) == cf_fault_sinks_inuse) + return(NULL); + + s = &cf_fault_sinks[cf_fault_sinks_inuse++]; + s->path = cf_strdup(path); + if (0 == strncmp(path, "stderr", 6)) + s->fd = 2; + else { + if (-1 == (s->fd = open(path, SINK_OPEN_FLAGS, SINK_OPEN_MODE))) { + cf_fault_sinks_inuse--; + return(NULL); + } + } + + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) + s->limit[i] = CF_INFO; + + return(s); +} + + +/* cf_fault_sink_hold + * Register but don't activate a sink for faults - return sink object pointer on + * success, NULL on failure. Only use at startup when parsing config file. After + * all sinks are registered, activate via cf_fault_sink_activate_all_held(). */ +cf_fault_sink * +cf_fault_sink_hold(char *path) +{ + if (num_held_fault_sinks >= CF_FAULT_SINKS_MAX) { + cf_warning(CF_MISC, "too many fault sinks"); + return NULL; + } + + cf_fault_sink *s = &cf_fault_sinks[num_held_fault_sinks]; + + s->path = cf_strdup(path); + + // If a context is not added, its runtime default will be CF_INFO. + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + s->limit[i] = CF_INFO; + } + + num_held_fault_sinks++; + + return s; +} + + +/* cf_fault_console_is_held + * Return whether the console is held. + */ +bool +cf_fault_console_is_held() +{ + for (int i = 0; i < num_held_fault_sinks; i++) { + cf_fault_sink *s = &cf_fault_sinks[i]; + if (!strcmp(s->path, "stderr")) { + return true; + } + } + + return false; +} + + +static void +fault_filter_adjust(cf_fault_sink *s, cf_fault_context ctx) +{ + // Don't adjust filter while adding contexts during config file parsing. + if (cf_fault_sinks_inuse == 0) { + return; + } + + // Fault filter must allow logs at a less critical severity. + if (s->limit[ctx] > cf_fault_filter[ctx]) { + cf_fault_set_severity(ctx, s->limit[ctx]); + } + // Fault filter might be able to become stricter - check all sinks. + else if (s->limit[ctx] < cf_fault_filter[ctx]) { + cf_fault_severity severity = CF_CRITICAL; + + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_fault_sink *t = &cf_fault_sinks[i]; + + if (t->limit[ctx] > severity) { + severity = t->limit[ctx]; + } + } + + cf_fault_set_severity(ctx, severity); + } +} + + +/* cf_fault_sink_activate_all_held + * Activate all sinks on hold - return 0 on success, -1 on failure. Only use + * once at startup, after parsing config file. On failure there's no cleanup, + * assumes caller will stop the process. */ +int +cf_fault_sink_activate_all_held() +{ + for (int i = 0; i < num_held_fault_sinks; i++) { + if (cf_fault_sinks_inuse >= CF_FAULT_SINKS_MAX) { + // In case this isn't first sink, force logging as if no sinks: + cf_fault_sinks_inuse = 0; + cf_warning(CF_MISC, "too many fault sinks"); + return -1; + } + + cf_fault_sink *s = &cf_fault_sinks[i]; + + // "Activate" the sink. + if (0 == strncmp(s->path, "stderr", 6)) { + s->fd = 2; + } + else if (-1 == (s->fd = open(s->path, SINK_OPEN_FLAGS, SINK_OPEN_MODE))) { + // In case this isn't first sink, force logging as if no sinks: + cf_fault_sinks_inuse = 0; + cf_warning(CF_MISC, "can't open %s: %s", s->path, cf_strerror(errno)); + return -1; + } + + cf_fault_sinks_inuse++; + + // Adjust the fault filter to the runtime levels.
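+ // (fault_filter_adjust() widens or narrows the global per-context filter + // so it always tracks the most verbose severity any active sink allows.)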
+ for (int j = 0; j < CF_FAULT_CONTEXT_UNDEF; j++) { + fault_filter_adjust(s, (cf_fault_context)j); + } + } + + return 0; +} + + +/* cf_fault_sink_get_fd_list + * Fill list with all active sink fds, excluding stderr - return list count. */ +int +cf_fault_sink_get_fd_list(int *fds) +{ + int num_open_fds = 0; + + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_fault_sink *s = &cf_fault_sinks[i]; + + // Exclude stderr. + if (s->fd > 2 && 0 != strncmp(s->path, "stderr", 6)) { + fds[num_open_fds++] = s->fd; + } + } + + return num_open_fds; +} + + +static int +cf_fault_sink_addcontext_all(char *context, char *severity) +{ + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_fault_sink *s = &cf_fault_sinks[i]; + int rv = cf_fault_sink_addcontext(s, context, severity); + if (rv != 0) return(rv); + } + return(0); +} + + +int +cf_fault_sink_addcontext(cf_fault_sink *s, char *context, char *severity) +{ + if (s == 0) return(cf_fault_sink_addcontext_all(context, severity)); + + cf_fault_context ctx = CF_FAULT_CONTEXT_UNDEF; + cf_fault_severity sev = CF_FAULT_SEVERITY_UNDEF; + + for (int i = 0; i < CF_FAULT_SEVERITY_UNDEF; i++) { + if (0 == strncasecmp(cf_fault_severity_strings[i], severity, strlen(severity))) + sev = (cf_fault_severity)i; + } + if (CF_FAULT_SEVERITY_UNDEF == sev) + return(-1); + + if (0 == strncasecmp(context, "any", 3)) { + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + s->limit[i] = sev; + fault_filter_adjust(s, (cf_fault_context)i); + } + } else { + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + // Compare using the length of the table entry, not the caller's + // context string - "info" is a prefix of "info-port", so matching on + // the caller's length would set info-port whenever info is set. + if (0 == strncasecmp(cf_fault_context_strings[i], context, strlen(cf_fault_context_strings[i]))) + ctx = (cf_fault_context)i; + } + if (CF_FAULT_CONTEXT_UNDEF == ctx) + return(-1); + + s->limit[ctx] = sev; + fault_filter_adjust(s, ctx); + } + + return(0); +} + + +void +cf_fault_use_local_time(bool val) +{ + g_use_local_time = val; +} + +bool +cf_fault_is_using_local_time() +{ + return g_use_local_time; +} + +void +cf_fault_log_millis(bool log_millis) +{ + g_log_millis = log_millis; +} + +bool +cf_fault_is_logging_millis() +{ + return g_log_millis; +} + +int +cf_sprintf_now(char* mbuf, size_t limit) +{ + struct tm nowtm; + + if (cf_fault_is_logging_millis()) { + // Logging milliseconds as well. + struct timeval curTime; + gettimeofday(&curTime, NULL); + int millis = curTime.tv_usec / 1000; + int pos = 0; + if (g_use_local_time) { + localtime_r(&curTime.tv_sec, &nowtm); + pos = strftime(mbuf, limit, "%b %d %Y %T.", &nowtm); + pos += + snprintf(mbuf + pos, limit - pos, "%03d", millis); + pos += + strftime(mbuf + pos, limit - pos, " GMT%z: ", &nowtm); + return pos; + } else { + gmtime_r(&curTime.tv_sec, &nowtm); + pos = strftime(mbuf, limit, "%b %d %Y %T.", &nowtm); + pos += + snprintf(mbuf + pos, limit - pos, "%03d", millis); + pos += + strftime(mbuf + pos, limit - pos, " %Z: ", &nowtm); + return pos; + } + } + + // Logging only seconds.
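+ // (Output sketch: "Mar 02 2018 18:57:11 GMT: " here, versus e.g. + // "Mar 02 2018 18:57:11.042 GMT: " from the milliseconds branch above.)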
+ time_t now = time(NULL); + + if (g_use_local_time) { + localtime_r(&now, &nowtm); + return strftime(mbuf, limit, "%b %d %Y %T GMT%z: ", &nowtm); + } else { + gmtime_r(&now, &nowtm); + return strftime(mbuf, limit, "%b %d %Y %T %Z: ", &nowtm); + } +} + +/* cf_fault_event + * Respond to a fault */ +void +cf_fault_event(const cf_fault_context context, const cf_fault_severity severity, + const char *file_name, const int line, const char *msg, ...) +{ + va_list argp; + char mbuf[1024]; + size_t pos; + + + /* Make sure there's always enough space for the \n\0. */ + size_t limit = sizeof(mbuf) - 2; + + /* Set the timestamp */ + pos = cf_sprintf_now(mbuf, limit); + + /* Set the context/scope/severity tag */ + pos += snprintf(mbuf + pos, limit - pos, "%s (%s): ", severity_tag(severity), cf_fault_context_strings[context]); + + /* + * snprintf() and vsnprintf() will not write more than the size specified, + * but they return the size that would have been written without truncation. + * These checks make sure there's enough space for the final \n\0. + */ + if (pos > limit) { + pos = limit; + } + + /* Set the location: filename and line number */ + if (file_name) { + pos += snprintf(mbuf + pos, limit - pos, "(%s:%d) ", file_name, line); + } + + if (pos > limit) { + pos = limit; + } + + /* Append the message */ + va_start(argp, msg); + pos += vsnprintf(mbuf + pos, limit - pos, msg, argp); + va_end(argp); + + if (pos > limit) { + pos = limit; + } + + pos += snprintf(mbuf + pos, 2, "\n"); + + /* Route the message to the correct destinations */ + if (0 == cf_fault_sinks_inuse) { + /* If no fault sinks are defined, use stderr for important messages */ + if (severity <= NO_SINKS_LIMIT) + fprintf(stderr, "%s", mbuf); + } else { + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + if ((severity <= cf_fault_sinks[i].limit[context]) || (CF_CRITICAL == severity)) { + if (0 >= write(cf_fault_sinks[i].fd, mbuf, pos)) { + // this is OK for a bit in case of a HUP. It's even better to queue the buffers and apply them + // after the hup. TODO. + fprintf(stderr, "internal failure in fault message write: %s\n", cf_strerror(errno)); + } + } + } + } + + /* Critical errors */ + if (CF_CRITICAL == severity) { + fflush(NULL); + + // Our signal handler will log a stack trace. + raise(SIGUSR1); + } +} // end cf_fault_event() + + +/** + * Generate a Packed Hex String Representation of the binary string. + * e.g. 0xfc86e83a6d6d3024659e6fe48c351aaaf6e964a5 + * The value is preceded by a "0x" to denote Hex (which allows it to be + * used in other contexts as a hex number). + */ +int +generate_packed_hex_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + char* startp = p; // Remember where we started. + + *p++ = '0'; + *p++ = 'x'; + + for (uint32_t i = 0; i < len; i++) { + sprintf(p, "%02x", d[i]); + p += 2; + } + *p++ = 0; // Null terminate the output buffer. + return (int) (p - startp); // show how much space we used. +} // end generate_packed_hex_string() + + +/** + * Generate a Spaced Hex String Representation of the binary string. + * e.g. fc 86 e8 3a 6d 6d 30 24 65 9e 6f e4 8c 35 1a aa f6 e9 64 a5 + */ +int +generate_spaced_hex_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + char* startp = p; // Remember where we started. + + for (uint32_t i = 0; i < len; i++) { + sprintf(p, "%02x ", d[i]); // Notice the space after the 02x. + p += 3; + } + *p++ = 0; // Null terminate the output buffer.
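+ // (The output buffer must hold 3 * len + 1 bytes - three characters per + // input byte plus the terminator.)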
+ return (int) (p - startp); // show how much space we used. +} // end generate_spaced_hex_string() + + +/** + * Generate a Column Hex String Representation of the binary string. + * The Columns will be four two-byte values, with spaces between the bytes: + * fc86 e83a 6d6d 3024 + * 659e 6fe4 8c35 1aaa + * f6e9 64a5 + */ +int +generate_column_hex_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + uint32_t i; + char* startp = p; // Remember where we started. + + *p++ = '\n'; // Start out on a new line + + for (i = 0; i < len; i++) { + sprintf(p, "%02x ", d[i]); // Two chars and a space + p += 3; + if ((i+1) % 8 == 0 && i != 0) { + *p++ = '\n'; // add a line return + } + } + *p++ = '\n'; // Finish with a new line + *p++ = 0; // Null terminate the output buffer. + return (int) (p - startp); // show how much space we used. +} // end generate_column_hex_string() + + +/** + * Generate a Base64 String Representation of the binary string. + * Base64 encoding converts three octets into four 6-bit encoded characters. + * So, the string 8-bit bytes are broken down into 6 bit values, each of which + * is then converted into a base64 value. + * So, for example, the string "Man" :: M[77(0x4d)] a[97(0x61)] n[110(0x6e)] + * Bits: (4)0100 (d)1101 (6)0110 (1)0001 (6)0110 (e)1110 + * Base 64 bits: 010011 010110 000101 101110 + * Base 64 Rep: 010011(19) 010110(22) 000101(5) 101110(46) + * Base 64 Chars: T(19) W(22) F(5) u(46) + * and so this string is converted into the Base 64 string: "TWFu" + */ +int generate_base64_string(const void *mem_ptr, uint32_t len, char output_buf[]) +{ + uint32_t encoded_len = cf_b64_encoded_len(len); + // TODO - check that output_buf is big enough, and/or truncate. + + cf_b64_encode((const uint8_t*)mem_ptr, len, output_buf); + + output_buf[encoded_len] = 0; // null-terminate + + return (int)(encoded_len + 1); // bytes we used, including null-terminator +} // end generate_base64_string() + + +/** + * Generate a BIT representation with spaces between the four bit groups. + * Print the bits left to right (big to small). + * This is assuming BIG ENDIAN representation (most significant bit is left). + */ +int generate_4spaced_bits_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + uint8_t uint_val; + uint8_t mask = 0x80; // largest single bit value in a byte + char* startp = p; // Remember where we started. + + // For each byte in the string + for (uint32_t i = 0; i < len; i++) { + uint_val = d[i]; + for (int j = 0; j < 8; j++) { + sprintf(p, "%1d", ((uint_val << j) & mask) ? 1 : 0); // print 0/1, not the raw mask value + p++; + // Add a space after every 4th bit + if ( (j+1) % 4 == 0 ) *p++ = ' '; + } + } + *p++ = 0; // Null terminate the output buffer. + return (int) (p - startp); // show how much space we used. +} // end generate_4spaced_bits_string() + +/** + * Generate a BIT representation of columns with spaces between the + * four bit groups. Columns will be 8 columns of 4 bits. + * (1 32 bit word per row) + */ +int generate_column_bits_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + uint8_t uint_val; + uint8_t mask = 0x80; // largest single bit value in a byte + char* startp = p; // Remember where we started.
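+ // (Worst-case output is 10 characters per input byte - 8 bits plus two + // spaces - before the per-word newlines added below.)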
+ + // Start on a new line + *p++ = '\n'; + + // For each byte in the string + for (uint32_t i = 0; i < len; i++) { + uint_val = d[i]; + for (int j = 0; j < 8; j++) { + sprintf(p, "%1d", ((uint_val << j) & mask) ? 1 : 0); // print 0/1, not the raw mask value + p++; + // Add a space after every 4th bit + if ((j + 1) % 4 == 0) *p++ = ' '; + } + // Add a line return after every 4th byte + if ((i + 1) % 4 == 0) *p++ = '\n'; + } + *p++ = 0; // Null terminate the output buffer. + return (int) (p - startp); // show how much space we used. +} // end generate_column_bits_string() + + +/* cf_fault_event -- TWO: Expand on the LOG ability by being able to + * print the contents of a BINARY array if we're passed a valid ptr (not NULL). + * We will print the array according to "format". + * Params: + * (*) scope: The module family (e.g. AS_RW, AS_UDF...) + * (*) severity: The scope severity (e.g. INFO, DEBUG, DETAIL) + * (*) file_name: Ptr to the FILE generating the call + * (*) line: The function (really, the FILE) line number of the source call + * (*) mem_ptr: Ptr to memory location of binary array (or NULL) + * (*) len: Length of the binary string + * (*) format: The single char showing the format (e.g. 'D', 'B', etc) + * (*) msg: The format msg string + * (*) ... : The variable set of parameters that correspond to the msg string. + * + * NOTE: We will eventually merge this function with the original cf_fault_event() + **/ +void +cf_fault_event2(const cf_fault_context context, + const cf_fault_severity severity, const char *file_name, const int line, + const void *mem_ptr, size_t len, cf_display_type dt, const char *msg, ...) +{ + va_list argp; + char mbuf[MAX_BINARY_BUF_SZ]; + size_t pos; + + char binary_buf[MAX_BINARY_BUF_SZ]; + + // Arbitrarily limit output to a fixed maximum length. + if (len > MAX_BINARY_BUF_SZ) { + len = MAX_BINARY_BUF_SZ; + } + char * labelp = NULL; // initialize to quiet build warning + + /* Make sure there's always enough space for the \n\0. */ + size_t limit = sizeof(mbuf) - 2; + + /* Set the timestamp */ + pos = cf_sprintf_now(mbuf, limit); + + // If we're given a valid MEMORY POINTER for a binary value, then + // compute the string that corresponds to the bytes. + if (mem_ptr) { + switch (dt) { + case CF_DISPLAY_HEX_DIGEST: + labelp = "Digest"; + generate_packed_hex_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_HEX_SPACED: + labelp = "HexSpaced"; + generate_spaced_hex_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_HEX_PACKED: + labelp = "HexPacked"; + generate_packed_hex_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_HEX_COLUMNS: + labelp = "HexColumns"; + generate_column_hex_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_BASE64: + labelp = "Base64"; + generate_base64_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_BITS_SPACED: + labelp = "BitsSpaced"; + generate_4spaced_bits_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_BITS_COLUMNS: + labelp = "BitsColumns"; + generate_column_bits_string(mem_ptr, len, binary_buf); + break; + default: + labelp = "Unknown Format"; + binary_buf[0] = 0; // make sure it's null terminated. + break; + + } // end switch + } // if binary data is present + + /* Set the context/scope/severity tag */ + pos += snprintf(mbuf + pos, limit - pos, "%s (%s): ", + severity_tag(severity), + cf_fault_context_strings[context]); + + /* + * snprintf() and vsnprintf() will not write more than the size specified, + * but they return the size that would have been written without truncation.
+ * These checks make sure there's enough space for the final \n\0. + */ + if (pos > limit) { + pos = limit; + } + + /* Set the location: filename and line number */ + if (file_name) { + pos += snprintf(mbuf + pos, limit - pos, "(%s:%d) ", file_name, line); + } + + // Check for overflow (see above). + if (pos > limit) { + pos = limit; + } + + /* Append the message */ + va_start(argp, msg); + pos += vsnprintf(mbuf + pos, limit - pos, msg, argp); + va_end(argp); + + // Check for overflow (see above). + if (pos > limit) { + pos = limit; + } + + // Append our final BINARY string, if present (some might pass in NULL). + if ( mem_ptr ) { + pos += snprintf(mbuf + pos, limit - pos, "<%s>:%s", labelp, binary_buf); + } + + // Check for overflow (see above). + if (pos > limit) { + pos = limit; + } + + pos += snprintf(mbuf + pos, 2, "\n"); + + /* Route the message to the correct destinations */ + if (0 == cf_fault_sinks_inuse) { + /* If no fault sinks are defined, use stderr for critical messages */ + if (CF_CRITICAL == severity) + fprintf(stderr, "%s", mbuf); + } else { + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + if ((severity <= cf_fault_sinks[i].limit[context]) || (CF_CRITICAL == severity)) { + if (0 >= write(cf_fault_sinks[i].fd, mbuf, pos)) { + // this is OK for a bit in case of a HUP. It's even better to queue the buffers and apply them + // after the hup. TODO. + fprintf(stderr, "internal failure in fault message write: %s\n", cf_strerror(errno)); + } + } + } + } + + /* Critical errors */ + if (CF_CRITICAL == severity) { + fflush(NULL); + + // Our signal handler will log a stack trace. + raise(SIGUSR1); + } +} + + +void +cf_fault_event_nostack(const cf_fault_context context, + const cf_fault_severity severity, const char *fn, const int line, + const char *msg, ...) +{ + va_list argp; + char mbuf[1024]; + time_t now; + struct tm nowtm; + size_t pos; + + /* Make sure there's always enough space for the \n\0. */ + size_t limit = sizeof(mbuf) - 2; + + /* Set the timestamp */ + now = time(NULL); + + if (g_use_local_time) { + localtime_r(&now, &nowtm); + pos = strftime(mbuf, limit, "%b %d %Y %T GMT%z: ", &nowtm); + } + else { + gmtime_r(&now, &nowtm); + pos = strftime(mbuf, limit, "%b %d %Y %T %Z: ", &nowtm); + } + + /* Set the context/scope/severity tag */ + pos += snprintf(mbuf + pos, limit - pos, "%s (%s): ", severity_tag(severity), cf_fault_context_strings[context]); + + /* + * snprintf() and vsnprintf() will not write more than the size specified, + * but they return the size that would have been written without truncation. + * These checks make sure there's enough space for the final \n\0. + */ + if (pos > limit) { + pos = limit; + } + + /* Set the location */ + if (fn) + pos += snprintf(mbuf + pos, limit - pos, "(%s:%d) ", fn, line); + + if (pos > limit) { + pos = limit; + } + + /* Append the message */ + va_start(argp, msg); + pos += vsnprintf(mbuf + pos, limit - pos, msg, argp); + va_end(argp); + + if (pos > limit) { + pos = limit; + } + + pos += snprintf(mbuf + pos, 2, "\n"); + + /* Route the message to the correct destinations */ + if (0 == cf_fault_sinks_inuse) { + /* If no fault sinks are defined, use stderr for important messages */ + if (severity <= NO_SINKS_LIMIT) + fprintf(stderr, "%s", mbuf); + } else { + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + if ((severity <= cf_fault_sinks[i].limit[context]) || (CF_CRITICAL == severity)) { + if (0 >= write(cf_fault_sinks[i].fd, mbuf, pos)) { + // this is OK for a bit in case of a HUP. 
It's even better to queue the buffers and apply them + // after the hup. TODO. + fprintf(stderr, "internal failure in fault message write: %s\n", cf_strerror(errno)); + } + } + } + } + + /* Critical errors */ + if (CF_CRITICAL == severity) { + fflush(NULL); + + // these signals don't throw stack traces in our system + raise(SIGINT); + } +} + + +int +cf_fault_sink_strlist(cf_dyn_buf *db) +{ + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_dyn_buf_append_int(db, i); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, cf_fault_sinks[i].path); + cf_dyn_buf_append_char(db, ';'); + } + cf_dyn_buf_chomp(db); + return(0); +} + + +extern void +cf_fault_sink_logroll(void) +{ + fprintf(stderr, "cf_fault: rolling log files\n"); + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_fault_sink *s = &cf_fault_sinks[i]; + if ((0 != strncmp(s->path, "stderr", 6)) && (s->fd > 2)) { + int fd = s->fd; + s->fd = -1; + usleep(1); + + // hopefully, the file has been relinked elsewhere - or you're OK losing it + unlink(s->path); + close(fd); + + fd = open(s->path, SINK_OPEN_FLAGS, SINK_OPEN_MODE); + s->fd = fd; + } + } +} + + +cf_fault_sink *cf_fault_sink_get_id(int id) +{ + if (id >= cf_fault_sinks_inuse) return(0); + return ( &cf_fault_sinks[id] ); +} + +int +cf_fault_sink_context_all_strlist(int sink_id, cf_dyn_buf *db) +{ + // get the sink - valid ids are below the in-use count + if (sink_id >= cf_fault_sinks_inuse) return(-1); + cf_fault_sink *s = &cf_fault_sinks[sink_id]; + + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + cf_dyn_buf_append_string(db, cf_fault_context_strings[i]); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, cf_fault_severity_strings[s->limit[i]]); + cf_dyn_buf_append_char(db, ';'); + } + cf_dyn_buf_chomp(db); + return(0); +} + +int +cf_fault_sink_context_strlist(int sink_id, char *context, cf_dyn_buf *db) +{ + // get the sink - valid ids are below the in-use count + if (sink_id >= cf_fault_sinks_inuse) return(-1); + cf_fault_sink *s = &cf_fault_sinks[sink_id]; + + // get the severity + int i; + for (i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + if (0 == strcmp(cf_fault_context_strings[i],context)) + break; + } + if (i == CF_FAULT_CONTEXT_UNDEF) { + cf_dyn_buf_append_string(db, context); + cf_dyn_buf_append_string(db, ":unknown"); + return(0); + } + + // get the string + cf_dyn_buf_append_string(db, context); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, cf_fault_severity_strings[s->limit[i]]); + return(0); +} + + +static int +cf_fault_cache_reduce_fn(const void *key, void *data, void *udata) +{ + uint32_t *count = (uint32_t*)data; + + if (*count == 0) { + return CF_SHASH_REDUCE_DELETE; + } + + const cf_fault_cache_hkey *hkey = (const cf_fault_cache_hkey*)key; + + cf_fault_event(hkey->context, hkey->severity, hkey->file_name, hkey->line, + "(repeated:%u) %s", *count, hkey->msg); + + *count = 0; + + return CF_SHASH_OK; +} + + +// For now there's only one cache, dumped by the ticker. +void +cf_fault_dump_cache() +{ + cf_shash_reduce(g_ticker_hash, cf_fault_cache_reduce_fn, NULL); +} + + +// For now there's only one cache, dumped by the ticker. +void +cf_fault_cache_event(cf_fault_context context, cf_fault_severity severity, + const char *file_name, int line, char *msg, ...)
+{ + cf_fault_cache_hkey key = { + .line = line, + .context = context, + .file_name = file_name, + .severity = severity, + .msg = { 0 } // must pad hash keys + }; + + size_t limit = sizeof(key.msg) - 1; // truncate leaving null-terminator + + va_list argp; + + va_start(argp, msg); + vsnprintf(key.msg, limit, msg, argp); + va_end(argp); + + while (true) { + uint32_t *valp = NULL; + pthread_mutex_t *lockp = NULL; + + if (cf_shash_get_vlock(g_ticker_hash, &key, (void**)&valp, &lockp) == + CF_SHASH_OK) { + // Already in hash - increment count and don't log it. + (*valp)++; + pthread_mutex_unlock(lockp); + break; + } + // else - not found, add it to hash and log it. + + uint32_t initv = 1; + + if (cf_shash_put_unique(g_ticker_hash, &key, &initv) == + CF_SHASH_ERR_FOUND) { + continue; // other thread beat us to it - loop around and get it + } + + cf_fault_event(context, severity, file_name, line, "%s", key.msg); + break; + } +} + +void +cf_fault_hex_dump(const char *title, const void *data, size_t len) +{ + const uint8_t *data8 = data; + char line[8 + 3 * 16 + 17]; + size_t k; + + cf_info(CF_MISC, "hex dump - %s", title); + + for (size_t i = 0; i < len; i += k) { + sprintf(line, "%06zx: ", i); + + for (k = 0; i + k < len && k < 16; ++k) { + char num[3]; + uint8_t d = data8[i + k]; + sprintf(num, "%02x", d); + line[8 + 3 * k + 0] = num[0]; + line[8 + 3 * k + 1] = num[1]; + line[8 + 3 * 16 + k] = d >= 32 && d <= 126 ? d : '.'; + } + + cf_info(CF_MISC, "%s", line); + } +} diff --git a/cf/src/hardware.c b/cf/src/hardware.c new file mode 100644 index 00000000..e0358436 --- /dev/null +++ b/cf/src/hardware.c @@ -0,0 +1,1791 @@ +/* + * hardware.c + * + * Copyright (C) 2016-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "hardware.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "daemon.h" +#include "fault.h" +#include "socket.h" + +#include "citrusleaf/alloc.h" + +#include "warnings.h" + +// Only available in Linux kernel version 3.19 and later; but we'd like to +// allow compilation with older kernel headers. 
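+// As a sketch of the calls this file makes further down (cf_topo_socket_cpu()
+// reads the option, check_socket_cpu() probes for kernel support):
+//
+//   int cpu;
+//   socklen_t len = sizeof(cpu);
+//   getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
+//
+// fills in the OS CPU index on which the socket's traffic was last handled.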
+#if !defined SO_INCOMING_CPU +#define SO_INCOMING_CPU 49 +#endif + +#define INVALID_INDEX ((uint16_t)-1) +#define POLICY_SCRIPT "/etc/aerospike/irqbalance-ban.sh" + +#define MEM_PAGE_SIZE (4096L) + +typedef enum { + FILE_RES_OK, + FILE_RES_NOT_FOUND, + FILE_RES_ERROR +} file_res; + +typedef enum { + CHECK_PROC_PRESENT, + CHECK_PROC_PRESENT_NO_ARG, + CHECK_PROC_ABSENT +} check_proc_res; + +typedef uint16_t os_numa_node_index; +typedef uint16_t os_package_index; +typedef uint16_t os_core_index; + +typedef uint16_t irq_number; + +typedef struct { + uint16_t n_irqs; + irq_number irqs[CPU_SETSIZE]; + uint16_t per_cpu; +} irq_list; + +static cpu_set_t g_os_cpus_online; +static cpu_set_t g_numa_node_os_cpus_online[CPU_SETSIZE]; + +static uint16_t g_n_numa_nodes; +static uint16_t g_n_cores; +static uint16_t g_n_os_cpus; +static uint16_t g_n_cpus; +static uint16_t g_n_irq_cpus; + +static os_numa_node_index g_numa_node_index_to_os_numa_node_index[CPU_SETSIZE]; +static cf_topo_os_cpu_index g_core_index_to_os_cpu_index[CPU_SETSIZE]; +static cf_topo_os_cpu_index g_cpu_index_to_os_cpu_index[CPU_SETSIZE]; +static cf_topo_cpu_index g_os_cpu_index_to_cpu_index[CPU_SETSIZE]; + +static cf_topo_numa_node_index g_i_numa_node; + +static file_res +read_file(const char *path, void *buff, size_t *limit) +{ + cf_detail(CF_HARDWARE, "reading file %s with buffer size %zu", path, *limit); + int32_t fd = open(path, O_RDONLY); + + if (fd < 0) { + if (errno == ENOENT) { + cf_detail(CF_HARDWARE, "file %s not found", path); + return FILE_RES_NOT_FOUND; + } + + cf_warning(CF_HARDWARE, "error while opening file %s for reading: %d (%s)", + path, errno, cf_strerror(errno)); + return FILE_RES_ERROR; + } + + size_t total = 0; + + while (total < *limit) { + cf_detail(CF_HARDWARE, "reading %zd byte(s) at offset %zu", *limit - total, total); + ssize_t len = read(fd, (uint8_t *)buff + total, *limit - total); + CF_NEVER_FAILS(len); + + if (len == 0) { + cf_detail(CF_HARDWARE, "EOF"); + break; + } + + total += (size_t)len; + } + + cf_detail(CF_HARDWARE, "read %zu byte(s) from file %s", total, path); + file_res res; + + if (total == *limit) { + cf_warning(CF_HARDWARE, "read buffer too small for file %s", path); + res = FILE_RES_ERROR; + } + else { + res = FILE_RES_OK; + *limit = total; + } + + CF_NEVER_FAILS(close(fd)); + return res; +} + +static file_res +write_file(const char *path, const void *buff, size_t limit) +{ + cf_detail(CF_HARDWARE, "writing file %s with buffer size %zu", path, limit); + int32_t fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600); + + if (fd < 0) { + if (errno == ENOENT) { + cf_detail(CF_HARDWARE, "file %s not found", path); + return FILE_RES_NOT_FOUND; + } + + cf_warning(CF_HARDWARE, "error while opening file %s for writing: %d (%s)", + path, errno, cf_strerror(errno)); + return FILE_RES_ERROR; + } + + size_t total = 0; + + while (total < limit) { + cf_detail(CF_HARDWARE, "writing %zd byte(s) at offset %zu", limit - total, total); + ssize_t len = write(fd, (uint8_t *)buff + total, limit - total); + CF_NEVER_FAILS(len); + total += (size_t)len; + } + + cf_detail(CF_HARDWARE, "done writing"); + CF_NEVER_FAILS(close(fd)); + return FILE_RES_OK; +} + +static void +write_file_safe(const char *path, const void *buff, size_t limit) +{ + if (write_file(path, buff, limit) != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "write failed unexpectedly"); + } +} + +static DIR * +opendir_safe(const char *path) +{ + DIR *dir = opendir(path); + + if (dir == NULL) { + cf_crash(CF_HARDWARE, "error while opening directory: %d (%s)", + 
errno, cf_strerror(errno)); + } + + return dir; +} + +static int32_t +readdir_safe(DIR *dir, struct dirent *ent) +{ + while (true) { + errno = 0; + struct dirent *tmp = readdir(dir); + + if (tmp == NULL) { + if (errno != 0) { + cf_crash(CF_HARDWARE, "error while reading directory: %d (%s)", + errno, cf_strerror(errno)); + } + + return -1; + } + + if (strcmp(tmp->d_name, ".") == 0 || strcmp(tmp->d_name, "..") == 0) { + continue; + } + + memcpy(ent, tmp, sizeof(struct dirent)); + return 0; + } +} + +static void +closedir_safe(DIR *dir) +{ + if (closedir(dir) < 0) { + cf_crash(CF_HARDWARE, "error while closing PCI device directory: %d (%s)", + errno, cf_strerror(errno)); + } +} + +static bool +path_exists(const char *path) +{ + struct stat stat_info; + + if (stat(path, &stat_info) < 0) { + if (errno == ENOENT) { + return false; + } + + cf_crash(CF_HARDWARE, "error while checking for path %s: %d (%s)", + path, errno, cf_strerror(errno)); + } + + return true; +} + +static void +set_mempolicy_safe(uint32_t mode, uint64_t *node_mask, size_t max_node) +{ + if (syscall(__NR_set_mempolicy, mode, node_mask, max_node) < 0) { + cf_crash(CF_HARDWARE, "set_mempolicy() system call failed: %d (%s)", + errno, cf_strerror(errno)); + } +} + +static void +migrate_pages_safe(pid_t pid, size_t max_node, uint64_t *from_mask, uint64_t *to_mask) +{ + int64_t res = syscall(__NR_migrate_pages, pid, max_node, from_mask, to_mask); + + if (res < 0) { + cf_crash(CF_HARDWARE, "migrate_pages() syscall failed: %d (%s)", + errno, cf_strerror(errno)); + } + + if (res > 0) { + cf_warning(CF_HARDWARE, "could not NUMA-migrate %" PRId64 " page(s)", res); + } +} + +static void +mask_to_string(cpu_set_t *mask, char *buff, size_t limit) +{ + cf_topo_os_cpu_index max; + + for (max = CPU_SETSIZE - 1; max > 0; --max) { + if (CPU_ISSET(max, mask)) { + break; + } + } + + int32_t words = max / 32 + 1; + size_t size = (size_t)words * 9; + + if (size > limit) { + cf_crash(CF_HARDWARE, "CPU mask buffer overflow: %zu vs. 
%zu", size, limit); + } + + for (int32_t i = words - 1; i >= 0; --i) { + uint32_t val = 0; + + for (int32_t k = 0; k < 32; ++k) { + if (CPU_ISSET((size_t)(i * 32 + k), mask)) { + val |= 1u << k; + } + } + + snprintf(buff, limit, "%08x", val); + + if (i > 0) { + buff[8] = ','; + } + + buff += 9; + limit -= 9; + } +} + +static file_res +read_index(const char *path, uint16_t *val) +{ + cf_detail(CF_HARDWARE, "reading index from file %s", path); + char buff[100]; + size_t limit = sizeof(buff); + file_res res = read_file(path, buff, &limit); + + if (res != FILE_RES_OK) { + return res; + } + + buff[limit - 1] = '\0'; + cf_detail(CF_HARDWARE, "parsing index \"%s\"", buff); + + char *end; + uint64_t x = strtoul(buff, &end, 10); + + if (*end != '\0' || x >= CPU_SETSIZE) { + cf_warning(CF_HARDWARE, "invalid index \"%s\" in %s", buff, path); + return FILE_RES_ERROR; + } + + *val = (uint16_t)x; + return FILE_RES_OK; +} + +static file_res +read_list(const char *path, cpu_set_t *mask) +{ + cf_detail(CF_HARDWARE, "reading list from file %s", path); + char buff[1000]; + size_t limit = sizeof(buff); + file_res res = read_file(path, buff, &limit); + + if (res != FILE_RES_OK) { + return res; + } + + buff[limit - 1] = '\0'; + cf_detail(CF_HARDWARE, "parsing list \"%s\"", buff); + + CPU_ZERO(mask); + char *walker = buff; + + while (true) { + char *delim; + uint64_t from = strtoul(walker, &delim, 10); + uint64_t thru; + + if (*delim == ',' || *delim == '\0'){ + thru = from; + } + else if (*delim == '-') { + walker = delim + 1; + thru = strtoul(walker, &delim, 10); + } + else { + cf_warning(CF_HARDWARE, "invalid list \"%s\" in %s", buff, path); + return FILE_RES_ERROR; + } + + if (from >= CPU_SETSIZE || thru >= CPU_SETSIZE || from > thru) { + cf_warning(CF_HARDWARE, "invalid list \"%s\" in %s", buff, path); + return FILE_RES_ERROR; + } + + cf_detail(CF_HARDWARE, "marking %d through %d", (int32_t)from, (int32_t)thru); + + for (size_t i = from; i <= thru; ++i) { + CPU_SET(i, mask); + } + + if (*delim == '\0') { + break; + } + + walker = delim + 1; + } + + char buff2[1000]; + mask_to_string(mask, buff2, sizeof(buff2)); + cf_detail(CF_HARDWARE, "list \"%s\" -> mask %s", buff, buff2); + + return FILE_RES_OK; +} + +static void +detect(cf_topo_numa_node_index a_numa_node) +{ + if (a_numa_node == INVALID_INDEX) { + cf_detail(CF_HARDWARE, "detecting online CPUs"); + } + else { + cf_detail(CF_HARDWARE, "detecting online CPUs on NUMA node %hu", a_numa_node); + } + + if (read_list("/sys/devices/system/cpu/online", &g_os_cpus_online) != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "error while reading list of online CPUs"); + } + + cf_detail(CF_HARDWARE, "learning CPU topology"); + + cf_topo_numa_node_index os_numa_node_index_to_numa_node_index[CPU_SETSIZE]; + + for (int32_t i = 0; i < CPU_SETSIZE; ++i) { + CPU_ZERO(&g_numa_node_os_cpus_online[i]); + + g_core_index_to_os_cpu_index[i] = INVALID_INDEX; + g_cpu_index_to_os_cpu_index[i] = INVALID_INDEX; + g_os_cpu_index_to_cpu_index[i] = INVALID_INDEX; + + os_numa_node_index_to_numa_node_index[i] = INVALID_INDEX; + g_numa_node_index_to_os_numa_node_index[i] = INVALID_INDEX; + } + + cpu_set_t covered_numa_nodes; + cpu_set_t covered_cores[CPU_SETSIZE]; // One mask per package. + + CPU_ZERO(&covered_numa_nodes); + + for (int32_t i = 0; i < CPU_SETSIZE; ++i) { + CPU_ZERO(&covered_cores[i]); + } + + g_n_numa_nodes = 0; + g_n_cores = 0; + g_n_os_cpus = 0; + g_n_cpus = 0; + char path[1000]; + bool no_numa = false; + + // Loop through all CPUs in the system by looping through OS CPU indexes. 
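+	//
+	// Illustrative walk (hypothetical 2-socket box, 2 cores per socket, with
+	// Hyper Threading; real values come from /sys):
+	//
+	//   cpu0: physical_package_id 0, core_id 0  -> new core
+	//   cpu1: physical_package_id 0, core_id 1  -> new core
+	//   cpu4: physical_package_id 0, core_id 0  -> HT peer of cpu0, not a new core
+	//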
+ + for (g_n_os_cpus = 0; g_n_os_cpus < CPU_SETSIZE; ++g_n_os_cpus) { + cf_detail(CF_HARDWARE, "querying OS CPU index %hu", g_n_os_cpus); + + // Let's look at the CPU's package. + + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%hu/topology/physical_package_id", + g_n_os_cpus); + os_package_index i_os_package; + file_res res = read_index(path, &i_os_package); + + // The entry doesn't exist. We've processed all available CPUs. Stop + // looping through the CPUs. + + if (res == FILE_RES_NOT_FOUND) { + break; + } + + if (res != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "error while reading OS package index from %s", path); + } + + cf_detail(CF_HARDWARE, "OS package index is %hu", i_os_package); + + // Only consider CPUs that are actually in use. + + if (!CPU_ISSET(g_n_os_cpus, &g_os_cpus_online)) { + cf_detail(CF_HARDWARE, "OS CPU index %hu is offline", g_n_os_cpus); + continue; + } + + // Let's look at the CPU's underlying core. In Hyper Threading systems, + // two (logical) CPUs share one (physical) core. + + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%hu/topology/core_id", + g_n_os_cpus); + os_core_index i_os_core; + res = read_index(path, &i_os_core); + + if (res != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "error while reading OS core index from %s", path); + } + + cf_detail(CF_HARDWARE, "OS core index is %hu", i_os_core); + + // Consider a core when we see it for the first time. In other words, we + // consider the first Hyper Threading peer of each core to be that core. + + bool new_core; + + if (CPU_ISSET(i_os_core, &covered_cores[i_os_package])) { + cf_detail(CF_HARDWARE, "core (%hu, %hu) already covered", i_os_core, i_os_package); + new_core = false; + } + else { + cf_detail(CF_HARDWARE, "core (%hu, %hu) is new", i_os_core, i_os_package); + new_core = true; + CPU_SET(i_os_core, &covered_cores[i_os_package]); + } + + // Identify the NUMA node of the current CPU. We simply look for the + // current CPU's topology info subtree in each NUMA node's subtree. + // Specifically, we look for the current CPU's "core_id" entry. + + os_numa_node_index i_os_numa_node; + + for (i_os_numa_node = 0; i_os_numa_node < CPU_SETSIZE; ++i_os_numa_node) { + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%hu/node%hu/cpu%hu/topology/core_id", + g_n_os_cpus, i_os_numa_node, g_n_os_cpus); + uint16_t dummy; + res = read_index(path, &dummy); + + // We found the NUMA node that has the current CPU in its subtree. + + if (res == FILE_RES_OK) { + break; + } + + if (res != FILE_RES_NOT_FOUND) { + cf_crash(CF_HARDWARE, "error while reading core number from %s", path); + } + } + + // Some Docker installations seem to not have any NUMA information + // in /sys. In this case, assume a system with a single NUMA node. + + if (i_os_numa_node == CPU_SETSIZE) { + cf_detail(CF_HARDWARE, "OS CPU index %hu does not have a NUMA node", g_n_os_cpus); + no_numa = true; + i_os_numa_node = 0; + } + + cf_detail(CF_HARDWARE, "OS NUMA node index is %hu", i_os_numa_node); + + // Again, just like with cores, we consider a NUMA node when we encounter + // it for the first time. + + bool new_numa_node; + + if (CPU_ISSET(i_os_numa_node, &covered_numa_nodes)) { + cf_detail(CF_HARDWARE, "OS NUMA node index %hu already covered", i_os_numa_node); + new_numa_node = false; + } + else { + cf_detail(CF_HARDWARE, "OS NUMA node index %hu is new", i_os_numa_node); + new_numa_node = true; + CPU_SET(i_os_numa_node, &covered_numa_nodes); + + // For now, we only support a 64-bit bitmask (= one uint64_t). 
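+			//
+			// E.g., OS NUMA node 3 would later be addressed as bit 3, i.e. as
+			// mask 1UL << 3 == 0x8 - an index >= 64 has no bit to map to.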
+
+			if (i_os_numa_node >= 64) {
+				cf_crash(CF_HARDWARE, "OS NUMA node index %hu too high", i_os_numa_node);
+			}
+		}
+
+		// Now we know that the CPU is online, and whether it is in a newly
+		// seen core (new_core) and/or a newly seen NUMA node (new_numa_node).
+
+		cf_topo_numa_node_index i_numa_node;
+
+		if (new_numa_node) {
+			i_numa_node = g_n_numa_nodes;
+			++g_n_numa_nodes;
+			os_numa_node_index_to_numa_node_index[i_os_numa_node] = i_numa_node;
+			g_numa_node_index_to_os_numa_node_index[i_numa_node] = i_os_numa_node;
+			cf_detail(CF_HARDWARE, "OS NUMA node index %hu -> new NUMA node index %hu",
+					i_os_numa_node, i_numa_node);
+		}
+		else {
+			i_numa_node = os_numa_node_index_to_numa_node_index[i_os_numa_node];
+			cf_detail(CF_HARDWARE, "OS NUMA node index %hu -> existing NUMA node index %hu",
+					i_os_numa_node, i_numa_node);
+		}
+
+		cf_detail(CF_HARDWARE, "OS CPU index %hu on NUMA node index %hu", g_n_os_cpus, i_numa_node);
+		CPU_SET(g_n_os_cpus, &g_numa_node_os_cpus_online[i_numa_node]);
+
+		// If we're in NUMA mode and the CPU isn't on the NUMA node that we're
+		// running on, then ignore the CPU.
+
+		if (a_numa_node != INVALID_INDEX && a_numa_node != i_numa_node) {
+			cf_detail(CF_HARDWARE, "skipping unwanted NUMA node index %hu", i_numa_node);
+			continue;
+		}
+
+		// If the CPU is a new core, then map a new core index to the OS CPU index.
+
+		if (new_core) {
+			g_core_index_to_os_cpu_index[g_n_cores] = g_n_os_cpus;
+			cf_detail(CF_HARDWARE, "core index %hu -> OS CPU index %hu", g_n_cores, g_n_os_cpus);
+			++g_n_cores;
+		}
+
+		// Map the OS CPU index to a new CPU index and vice versa.
+
+		g_os_cpu_index_to_cpu_index[g_n_os_cpus] = g_n_cpus;
+		g_cpu_index_to_os_cpu_index[g_n_cpus] = g_n_os_cpus;
+
+		cf_detail(CF_HARDWARE, "OS CPU index %hu <-> CPU index %hu", g_n_os_cpus, g_n_cpus);
+		++g_n_cpus;
+	}
+
+	if (g_n_os_cpus == CPU_SETSIZE) {
+		cf_crash(CF_HARDWARE, "too many CPUs");
+	}
+
+	if (a_numa_node != INVALID_INDEX && no_numa) {
+		cf_warning(CF_HARDWARE, "no NUMA information found in /sys");
+	}
+
+	g_i_numa_node = a_numa_node;
+}
+
+static void
+pin_to_numa_node(cf_topo_numa_node_index a_numa_node)
+{
+	cf_info(CF_HARDWARE, "pinning to NUMA node %hu", a_numa_node);
+
+	// Move the current thread (and all of its future descendants) to the CPUs
+	// on the selected NUMA node.
+
+	cpu_set_t cpu_set;
+	CPU_ZERO(&cpu_set);
+
+	for (cf_topo_cpu_index i_cpu = 0; i_cpu < g_n_cpus; ++i_cpu) {
+		cf_topo_os_cpu_index i_os_cpu = g_cpu_index_to_os_cpu_index[i_cpu];
+		CPU_SET(i_os_cpu, &cpu_set);
+	}
+
+	char buff[1000];
+	mask_to_string(&cpu_set, buff, sizeof(buff));
+	cf_detail(CF_HARDWARE, "NUMA node %hu CPU mask: %s", a_numa_node, buff);
+
+	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
+		cf_crash(CF_HARDWARE, "error while pinning thread to NUMA node %hu: %d (%s)",
+				a_numa_node, errno, cf_strerror(errno));
+	}
+
+	// Force future memory allocations to the selected NUMA node.
+
+	os_numa_node_index i_os_numa_node = g_numa_node_index_to_os_numa_node_index[a_numa_node];
+	uint64_t to_mask = 1UL << i_os_numa_node;
+	cf_detail(CF_HARDWARE, "NUMA node mask (to): %016" PRIx64, to_mask);
+
+	// Unlike select(), we have to pass "number of valid bits + 1".
+	set_mempolicy_safe(MPOL_BIND, &to_mask, 65);
+
+	// Make sure we can migrate shared memory that we later attach and map.
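+	// (Worked example of the set_mempolicy() call above, with assumed values:
+	// binding to OS NUMA node 2 would pass to_mask = 1UL << 2 == 0x4 and
+	// maxnode = 65, so that all 64 bits of the mask are considered.)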
+ cf_process_holdcap(); +} + +static uint32_t +pick_random(uint32_t limit) +{ + static __thread uint64_t state = 0; + + if (state == 0) { + state = (uint64_t)syscall(SYS_gettid); + } + + state = state * 6364136223846793005 + 1; + + if (state == 0) { + state = 1; + } + + return (uint32_t)((state >> 32) % limit); +} + +uint16_t +cf_topo_count_cores(void) +{ + return g_n_cores; +} + +uint16_t +cf_topo_count_cpus(void) +{ + return g_n_cpus; +} + +static cf_topo_cpu_index +os_cpu_index_to_cpu_index(cf_topo_os_cpu_index i_os_cpu) +{ + cf_detail(CF_HARDWARE, "translating OS CPU index %hu", i_os_cpu); + + if (i_os_cpu >= g_n_os_cpus) { + cf_crash(CF_HARDWARE, "invalid OS CPU index %hu", i_os_cpu); + } + + cf_topo_cpu_index i_cpu = g_os_cpu_index_to_cpu_index[i_os_cpu]; + + if (i_cpu == INVALID_INDEX) { + cf_detail(CF_HARDWARE, "foreign OS CPU index %hu", i_os_cpu); + } + else { + cf_detail(CF_HARDWARE, "CPU index is %hu", i_cpu); + } + + return i_cpu; +} + +cf_topo_cpu_index +cf_topo_current_cpu(void) +{ + cf_detail(CF_HARDWARE, "getting current OS CPU index"); + int32_t os = sched_getcpu(); + + if (os < 0) { + cf_crash(CF_HARDWARE, "error while getting OS CPU index: %d (%s)", + errno, cf_strerror(errno)); + } + + return os_cpu_index_to_cpu_index((cf_topo_os_cpu_index)os); +} + +cf_topo_cpu_index +cf_topo_socket_cpu(const cf_socket *sock) +{ + cf_detail(CF_HARDWARE, "determining CPU index for socket FD %d", CSFD(sock)); + + int32_t os; + socklen_t len = sizeof(os); + + if (getsockopt(sock->fd, SOL_SOCKET, SO_INCOMING_CPU, &os, &len) < 0) { + cf_crash(CF_SOCKET, "error while determining incoming OS CPU index: %d (%s)", + errno, cf_strerror(errno)); + } + + cf_detail(CF_HARDWARE, "OS CPU index is %d", os); + cf_topo_cpu_index i_cpu = os_cpu_index_to_cpu_index((cf_topo_os_cpu_index)os); + + // 1. The incoming connection was handled on the wrong NUMA node. In this case, + // pick a random CPU on the correct NUMA node. + + if (i_cpu == INVALID_INDEX) { + i_cpu = (cf_topo_cpu_index)pick_random(g_n_cpus); + cf_detail(CF_HARDWARE, "picking random CPU index %hu", i_cpu); + return i_cpu; + } + + // 2. The incoming connection was handled on a CPU that doesn't get any NIC + // interrupts. This should not happen for connections from other machines, but + // it does happen for connections from the local machine, because they don't + // go through the NIC hardware. In this case, pick a random CPU. + + if (i_cpu >= g_n_irq_cpus) { + i_cpu = (cf_topo_cpu_index)pick_random(g_n_cpus); + cf_detail(CF_HARDWARE, "randomizing unexpected CPU index >%hu to %hu", + g_n_irq_cpus - 1, i_cpu); + return i_cpu; + } + + // 3. Otherwise, redistribute. The first g_n_irq_cpus CPUs out of a total of + // g_n_cpus CPUs get NIC interrupts. Suppose we have 2 NIC queues and 8 CPUs, + // i.e., that g_n_irq_cpus == 2 and g_n_cpus == 8. We want to redistribute + // evenly across the 8 CPUs, i.e., each CPU should be picked with a probability + // of 0.125. + + // We're currently running on one of the 2 CPUs that get NIC interrupts, on + // either with a probability of p1 = 0.5. We want to stay on the current CPU + // with a probability of p2 = g_n_irq_cpus / g_n_cpus == 2 / 8 == 0.25, which + // yields the desired total probability of p1 * p2 = 0.5 * 0.25 = 0.125. + + if (pick_random(100000) < g_n_irq_cpus * (uint32_t)100000 / g_n_cpus) { + cf_detail(CF_HARDWARE, "staying on CPU index %hu", i_cpu); + return i_cpu; + } + + // 4. 
Otherwise, if we switch CPUs, then we jump to a CPU that doesn't receive + // NIC interrupts, i.e., one of the remaining 6 CPUs [2 .. 8] in our example. + // This reaches each CPU with a probability of (1 - p2) / 6 = 0.125. + + i_cpu = (cf_topo_cpu_index)(g_n_irq_cpus + + pick_random((uint32_t)g_n_cpus - (uint32_t)g_n_irq_cpus)); + cf_detail(CF_HARDWARE, "redirecting to CPU index %hu", i_cpu); + return i_cpu; +} + +static void +pin_to_os_cpu(cf_topo_os_cpu_index i_os_cpu) +{ + cf_detail(CF_HARDWARE, "pinning to OS CPU index %hu", i_os_cpu); + + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + CPU_SET(i_os_cpu, &cpu_set); + + if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) { + cf_crash(CF_HARDWARE, "error while pinning thread to OS CPU %hu: %d (%s)", + i_os_cpu, errno, cf_strerror(errno)); + } +} + +void +cf_topo_pin_to_core(cf_topo_core_index i_core) +{ + cf_detail(CF_HARDWARE, "pinning to core index %hu", i_core); + + if (i_core >= g_n_cores) { + cf_crash(CF_HARDWARE, "invalid core index %hu", i_core); + } + + pin_to_os_cpu(g_core_index_to_os_cpu_index[i_core]); +} + +void +cf_topo_pin_to_cpu(cf_topo_cpu_index i_cpu) +{ + cf_detail(CF_HARDWARE, "pinning to CPU index %hu", i_cpu); + + if (i_cpu >= g_n_cpus) { + cf_crash(CF_HARDWARE, "invalid CPU index %hu", i_cpu); + } + + pin_to_os_cpu(g_cpu_index_to_os_cpu_index[i_cpu]); +} + +static check_proc_res +check_proc(const char *name, int32_t argc, const char *argv[]) +{ + cf_detail(CF_HARDWARE, "looking for process %s", name); + + for (int32_t i = 0; i < argc; ++i) { + cf_detail(CF_HARDWARE, "argv[%d]: %s", i, argv[i]); + } + + DIR *dir = opendir_safe("/proc"); + struct dirent ent; + char cmd[10000]; + size_t limit; + bool found = false; + + while (readdir_safe(dir, &ent) >= 0) { + bool numeric = true; + + for (int32_t i = 0; ent.d_name[i] != 0; ++i) { + if (!isascii(ent.d_name[i]) || !isdigit(ent.d_name[i])) { + numeric = false; + break; + } + } + + if (!numeric) { + continue; + } + + char path[100]; + snprintf(path, sizeof(path), "/proc/%s/cmdline", ent.d_name); + + limit = sizeof(cmd) - 1; + file_res rfr = read_file(path, cmd, &limit); + + // Can legitimately happen, if the process has exited in the meantime. 
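+		// (For reference: /proc/<pid>/cmdline is NUL-separated, e.g.
+		// "irqbalance\0--policyscript=...\0", which is the layout the
+		// strlen() + 1 argument walk below relies on.)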
+ if (rfr == FILE_RES_NOT_FOUND) { + continue; + } + + if (rfr == FILE_RES_ERROR) { + cf_crash(CF_HARDWARE, "error while reading file %s", path); + } + + if (limit > 0 && cmd[limit - 1] != 0) { + cmd[limit] = 0; + } + + const char *name2 = strrchr(cmd, '/'); + + if (name2 != NULL) { + ++name2; + } + else { + name2 = cmd; + } + + if (strcmp(name2, name) == 0) { + found = true; + break; + } + } + + closedir_safe(dir); + + if (!found) { + cf_detail(CF_HARDWARE, "process %s absent", name); + return CHECK_PROC_ABSENT; + } + + cf_detail(CF_HARDWARE, "process %s is %s", name, cmd); + + if (argc > 0) { + int32_t i_arg = 0; + + for (size_t off = strlen(cmd) + 1; off < limit; off += strlen(cmd + off) + 1) { + cf_detail(CF_HARDWARE, "checking argument %s against %s", cmd + off, argv[i_arg]); + + if (strcmp(cmd + off, argv[i_arg]) == 0) { + ++i_arg; + + if (i_arg >= argc) { + break; + } + } + else { + i_arg = 0; + } + } + + if (i_arg >= argc) { + cf_detail(CF_HARDWARE, "process %s present with argument", name); + return CHECK_PROC_PRESENT; + } + } + + cf_detail(CF_HARDWARE, "process %s present", name); + return CHECK_PROC_PRESENT_NO_ARG; +} + +static uint16_t +interface_queues(const char *if_name, const char *format) +{ + uint16_t n_queues = 0; + + while (true) { + char path[1000]; + snprintf(path, sizeof(path), format, if_name, n_queues); + cf_detail(CF_HARDWARE, "checking for path %s", path); + + if (!path_exists(path)) { + cf_detail(CF_HARDWARE, "path not found"); + break; + } + + ++n_queues; + } + + cf_assert(n_queues != 0, CF_HARDWARE, "interface %s has no queues", if_name); + + return n_queues; +} + +static uint16_t +interface_rx_queues(const char *if_name) +{ + cf_detail(CF_HARDWARE, "getting receive queues for interface %s", if_name); + return interface_queues(if_name, "/sys/class/net/%s/queues/rx-%hu"); +} + +static uint16_t +interface_tx_queues(const char *if_name) +{ + cf_detail(CF_HARDWARE, "getting transmit queues for interface %s", if_name); + return interface_queues(if_name, "/sys/class/net/%s/queues/tx-%hu"); +} + +static int +comp_irq_number(const void *lhs, const void *rhs) +{ + return *(irq_number *)lhs - *(irq_number *)rhs; +} + +static void +interface_irqs(const char *if_name, irq_list *irqs) +{ + cf_detail(CF_HARDWARE, "getting IRQs for interface %s", if_name); + + DIR *dir = opendir_safe("/sys/bus/pci/devices"); + struct dirent ent; + char path[PATH_MAX]; + bool found = false; + + while (readdir_safe(dir, &ent) >= 0) { + snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/net/%s/ifindex", + ent.d_name, if_name); + bool exists = path_exists(path); + + if (!exists) { + for (int32_t i = 0; i < 100; ++i) { + snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/virtio%d/net/%s/ifindex", + ent.d_name, i, if_name); + exists = path_exists(path); + + if (exists) { + break; + } + } + } + + if (!exists) { + continue; + } + + snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/msi_irqs", ent.d_name); + + if (!path_exists(path)) { + cf_crash(CF_HARDWARE, "interface %s does not support MSIs", if_name); + } + + cf_detail(CF_HARDWARE, "interface %s is %s", if_name, ent.d_name); + found = true; + break; + } + + closedir_safe(dir); + + if (!found) { + cf_crash(CF_HARDWARE, "interface %s does not have a PCI device entry", if_name); + } + + dir = opendir_safe(path); + int32_t count = 0; + irq_number irq_nums[CPU_SETSIZE]; + + while (readdir_safe(dir, &ent) >= 0) { + char *end; + uint64_t tmp = strtoul(ent.d_name, &end, 10); + + if (*end != 0 || tmp > 65535) { + cf_crash(CF_HARDWARE, "invalid 
IRQ number %s in %s", ent.d_name, path); + } + + if (count >= CPU_SETSIZE) { + cf_crash(CF_HARDWARE, "too many IRQs in %s", path); + } + + cf_detail(CF_HARDWARE, "interface %s has IRQ %hu", if_name, (irq_number)tmp); + irq_nums[count] = (irq_number)tmp; + ++count; + } + + closedir_safe(dir); + + // Sort IRQ numbers, so that RX and TX interrupts pair up nicely when + // populating irqs->irqs[]. + qsort(irq_nums, (size_t)count, sizeof(irq_number), comp_irq_number); + + char actions[count][100]; + memset(actions, 0, sizeof(actions)); + + FILE *fh = fopen("/proc/interrupts", "r"); + + if (fh == NULL) { + cf_crash(CF_HARDWARE, "error while opening /proc/interrupts"); + } + + int32_t line_no = 0; + char line[25000]; + + while (fgets(line, sizeof(line), fh) != NULL) { + ++line_no; + + if (line_no == 1) { + continue; + } + + int32_t i = 0; + + while (line[i] == ' ') { + ++i; + } + + irq_number irq_num = 0; + + while (line[i] >= '0' && line[i] <= '9') { + irq_num = (irq_number)(irq_num * 10 + line[i] - '0'); + ++i; + } + + if (line[i] != ':') { + continue; + } + + while (line[i] != 0 && line[i] != '\n') { + ++i; + } + + line[i] = 0; + + while (i >= 0 && line[i] != ' ') { + --i; + } + + char *action = line + i + 1; + + if (strlen(action) >= sizeof(actions[0])) { + cf_crash(CF_HARDWARE, "oversize action in line %d in /proc/interrupts: %s", + line_no, action); + } + + cf_detail(CF_HARDWARE, "IRQ %hu has action %s", irq_num, action); + + for (i = 0; i < count; ++i) { + if (irq_nums[i] == irq_num) { + int32_t m = 0; + + // Remove any digits, so that the queue index goes away and all queues + // look alike. Also, normalize to lower case. For example: + // + // "i40e-em1-TxRx-0" -> "ie-em-txrx-" + // "i40e-em1-TxRx-1" -> "ie-em-txrx-" + // ... + + for (int32_t k = 0; action[k] != 0; ++k) { + if (action[k] < '0' || action[k] > '9') { + actions[i][m] = (char)tolower((uint8_t)action[k]); + ++m; + } + } + + actions[i][m] = 0; + cf_detail(CF_HARDWARE, "action pattern is %s", actions[i]); + break; + } + } + } + + fclose(fh); + + int32_t n_groups = 0; + int32_t group_sizes[count]; + int32_t group_extra[count]; + int32_t action_groups[count]; + int32_t inactive_group = -1; + + for (int32_t i = 0; i < count; ++i) { + group_sizes[i] = 0; + group_extra[i] = 0; + action_groups[i] = -1; + } + + // Group by action pattern. + + for (int32_t i = 0; i < count; ++i) { + if (action_groups[i] >= 0) { + continue; + } + + action_groups[i] = n_groups; + ++group_sizes[n_groups]; + + if (actions[i][0] == 0) { + inactive_group = n_groups; + cf_detail(CF_HARDWARE, "inactive IRQs in new group %d", n_groups); + } + else { + cf_detail(CF_HARDWARE, "new group %d: %s", n_groups, actions[i]); + } + + for (int32_t k = i + 1; k < count; ++k) { + if (strcmp(actions[i], actions[k]) == 0) { + action_groups[k] = n_groups; + ++group_sizes[n_groups]; + } + } + + cf_detail(CF_HARDWARE, "group %d has %d member(s)", n_groups, group_sizes[n_groups]); + + // Prefer groups whose action patterns have "rx", "tx", "input", or "output" in them. + + if (strstr(actions[i], "rx") != NULL || strstr(actions[i], "tx") != NULL || + strstr(actions[i], "input") != NULL || strstr(actions[i], "output") != NULL) { + cf_detail(CF_HARDWARE, "preferring group %d", n_groups); + group_extra[n_groups] = 1; + } + + ++n_groups; + } + + // Find the two largest groups. 
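+
+	// Worked example (made-up IRQ counts): 8 IRQs whose actions collapse to
+	// "ie-em-txrx-" plus 1 IRQ with an empty action pattern would yield
+	// group 0 (size 8, preferred via group_extra) and the inactive group;
+	// below, a would be 0 and b would stay -1.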
+ + int32_t a = -1; + int32_t b = -1; + + for (int32_t i = 0; i < n_groups; ++i) { + if (i != inactive_group && + (a < 0 || group_sizes[i] + group_extra[i] > group_sizes[a] + group_extra[a])) { + a = i; + } + } + + if (a < 0) { + cf_crash(CF_HARDWARE, "no active interrupts for interface %s", if_name); + } + + for (int32_t i = 0; i < n_groups; ++i) { + if (i != inactive_group && i != a && + (b < 0 || group_sizes[i] + group_extra[i] > group_sizes[b] + group_extra[b])) { + b = i; + } + } + + cf_detail(CF_HARDWARE, "largest groups: %d, %d", a, b); + + // If the two largest groups have an equal number of members, then we assume + // that it's a NIC with separate RX and TX queue IRQs. + + if (b >= 0 && group_sizes[a] == group_sizes[b]) { + cf_detail(CF_HARDWARE, "assuming %d separate RX and TX queue IRQ(s)", + group_sizes[a] + group_sizes[b]); + int32_t ia = 0; + int32_t ib = 0; + + // Make RX and TX queue IRQs take turns in the IRQ list. + + for (int32_t k = 0; k < count; ++k) { + if (action_groups[k] == a) { + irqs->irqs[ia * 2] = irq_nums[k]; + cf_detail(CF_HARDWARE, "irqs[%d] = %hu", ia * 2, irq_nums[k]); + ++ia; + } + else if (action_groups[k] == b) { + irqs->irqs[ib * 2 + 1] = irq_nums[k]; + cf_detail(CF_HARDWARE, "irqs[%d] = %hu", ib * 2 + 1, irq_nums[k]); + ++ib; + } + } + + irqs->n_irqs = (uint16_t)(group_sizes[a] + group_sizes[b]); + + // Send pairs of two consecutive IRQs in the IRQ list (= the RX and the + // TX queue IRQ of a given NIC queue pair) to the same CPU. + + irqs->per_cpu = 2; + return; + } + + // Otherwise, we assume that it's a NIC with combined RX and TX queue IRQs + // and that the largest group contains these IRQs. + + cf_detail(CF_HARDWARE, "assuming %d combined RX and TX queue IRQ(s)", group_sizes[a]); + int32_t ia = 0; + + for (int32_t k = 0; k < count; ++k) { + if (action_groups[k] == a) { + irqs->irqs[ia] = irq_nums[k]; + cf_detail(CF_HARDWARE, "irqs[%d] = %hu", ia, irq_nums[k]); + ++ia; + } + } + + irqs->n_irqs = (uint16_t)group_sizes[a]; + + // Send each IRQ in the IRQ list to a different CPU. 
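+
+	// (E.g., with 4 combined-queue IRQs, per_cpu = 1 spreads them over 4
+	// CPUs; in the paired case above, 4 RX + 4 TX IRQs with per_cpu = 2
+	// land each RX/TX pair on one of 4 CPUs.)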
+ + irqs->per_cpu = 1; +} + +static void +pin_irq(irq_number i_irq, cf_topo_os_cpu_index i_os_cpu) +{ + cf_detail(CF_HARDWARE, "pinning IRQ number %hu to OS CPU index %hu", i_irq, i_os_cpu); + + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(i_os_cpu, &mask); + + char mask_str[200]; + mask_to_string(&mask, mask_str, sizeof(mask_str)); + cf_detail(CF_HARDWARE, "CPU mask is %s", mask_str); + + char path[1000]; + snprintf(path, sizeof(path), "/proc/irq/%hu/smp_affinity", i_irq); + + if (write_file(path, mask_str, strlen(mask_str)) != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "error while pinning IRQ, path %s", path); + } +} + +static cf_topo_os_cpu_index +fix_os_cpu_index(cf_topo_os_cpu_index i_os_cpu, const cpu_set_t *online) +{ + while (true) { + if (i_os_cpu >= g_n_os_cpus) { + i_os_cpu = 0; + } + + if (CPU_ISSET(i_os_cpu, online)) { + return i_os_cpu; + } + + ++i_os_cpu; + } +} + +static void +config_steering(const char *format, const char *if_name, uint16_t n_queues, bool enable) +{ + uint16_t i_queue; + cpu_set_t masks[n_queues]; + + for (i_queue = 0; i_queue < n_queues; ++i_queue) { + CPU_ZERO(&masks[i_queue]); + } + + if (enable) { + i_queue = 0; + + for (cf_topo_os_cpu_index i_os_cpu = 0; i_os_cpu < g_n_os_cpus; ++i_os_cpu) { + if (CPU_ISSET(i_os_cpu, &g_os_cpus_online)) { + CPU_SET(i_os_cpu, &masks[i_queue % n_queues]); + ++i_queue; + } + } + } + + for (i_queue = 0; i_queue < n_queues; ++i_queue) { + char path[1000]; + snprintf(path, sizeof(path), format, if_name, i_queue); + cf_detail(CF_HARDWARE, "path is %s", path); + + char mask_str[200]; + mask_to_string(&masks[i_queue], mask_str, sizeof(mask_str)); + cf_detail(CF_HARDWARE, "CPU mask is %s", mask_str); + + write_file_safe(path, mask_str, strlen(mask_str)); + } +} + +static void +enable_xps(const char *if_name) +{ + cf_detail(CF_HARDWARE, "enabling XPS for interface %s", if_name); + uint16_t n_queues = interface_tx_queues(if_name); + config_steering("/sys/class/net/%s/queues/tx-%hu/xps_cpus", if_name, n_queues, true); +} + +static void +disable_rps(const char *if_name) +{ + cf_detail(CF_HARDWARE, "disabling RPS for interface %s", if_name); + uint16_t n_queues = interface_rx_queues(if_name); + config_steering("/sys/class/net/%s/queues/rx-%hu/rps_cpus", if_name, n_queues, false); +} + +static void +config_rfs(const char *if_name, bool enable) +{ + cf_detail(CF_HARDWARE, "%s RFS for interface %s", enable ? "enabling" : "disabling", if_name); + + uint16_t n_queues = interface_rx_queues(if_name); + uint32_t sz_glob = enable ? 
1000000 : 0;
+	uint32_t sz_queue = sz_glob / n_queues;
+
+	cf_detail(CF_HARDWARE, "global size is %u, per-queue size is %u", sz_glob, sz_queue);
+
+	char string[50];
+	snprintf(string, sizeof(string), "%u", sz_glob);
+	write_file_safe("/proc/sys/net/core/rps_sock_flow_entries", string, strlen(string));
+
+	snprintf(string, sizeof(string), "%u", sz_queue);
+
+	for (uint16_t i_queue = 0; i_queue < n_queues; ++i_queue) {
+		char path[1000];
+		snprintf(path, sizeof(path), "/sys/class/net/%s/queues/rx-%hu/rps_flow_cnt",
+				if_name, i_queue);
+		write_file_safe(path, string, strlen(string));
+	}
+}
+
+static void
+enable_coalescing(const char *if_name)
+{
+	cf_detail(CF_HARDWARE, "enabling interrupt coalescing for interface %s", if_name);
+	int32_t sock = socket(AF_INET, SOCK_DGRAM, 0);
+
+	if (sock < 0) {
+		cf_crash(CF_HARDWARE, "error while creating ethtool socket: %d (%s)", errno, cf_strerror(errno));
+	}
+
+	struct ifreq req;
+	memset(&req, 0, sizeof(req));
+
+	if (strlen(if_name) > IFNAMSIZ - 1) {
+		cf_crash(CF_HARDWARE, "invalid interface name %s", if_name);
+	}
+
+	strcpy(req.ifr_name, if_name);
+	struct ethtool_coalesce coal = { .cmd = ETHTOOL_GCOALESCE };
+	req.ifr_data = &coal;
+
+	if (ioctl(sock, SIOCETHTOOL, &req) < 0) {
+		if (errno == EOPNOTSUPP) {
+			cf_detail(CF_HARDWARE, "interface %s does not support ETHTOOL_GCOALESCE", if_name);
+			goto cleanup1;
+		}
+
+		cf_crash(CF_HARDWARE, "error while getting interface settings: %d (%s)",
+				errno, cf_strerror(errno));
+	}
+
+	cf_detail(CF_HARDWARE, "current interface settings: adaptive = %u, usecs = %u",
+			coal.use_adaptive_rx_coalesce, coal.rx_coalesce_usecs);
+
+	if (coal.use_adaptive_rx_coalesce != 0 || coal.rx_coalesce_usecs >= 100) {
+		cf_detail(CF_HARDWARE, "leaving interface settings untouched");
+		goto cleanup1;
+	}
+
+	cf_detail(CF_HARDWARE, "adjusting interface settings");
+	coal = (struct ethtool_coalesce){
+		.cmd = ETHTOOL_SCOALESCE,
+		.rx_coalesce_usecs = 100 // 0.1 ms for now, which adds 0.05 ms to a request on average.
+ }; + + if (ioctl(sock, SIOCETHTOOL, &req) < 0) { + if (errno == EOPNOTSUPP) { + cf_detail(CF_HARDWARE, "interface %s does not support ETHTOOL_SCOALESCE", if_name); + goto cleanup1; + } + + cf_crash(CF_HARDWARE, "error while adjusting interface settings: %d (%s)", + errno, cf_strerror(errno)); + } + +cleanup1: + CF_NEVER_FAILS(close(sock)); +} + +static void +check_irqbalance(void) +{ + cf_detail(CF_HARDWARE, "checking irqbalance"); + + check_proc_res res = check_proc("irqbalance", 1, (const char *[]){ + "--policyscript=" POLICY_SCRIPT + }); + + if (res == CHECK_PROC_PRESENT_NO_ARG) { + res = check_proc("irqbalance", 2, (const char *[]){ + "--policyscript", + POLICY_SCRIPT + }); + } + + if (res == CHECK_PROC_PRESENT_NO_ARG) { + res = check_proc("irqbalance", 1, (const char *[]){ + "-l" POLICY_SCRIPT + }); + } + + if (res == CHECK_PROC_PRESENT_NO_ARG) { + res = check_proc("irqbalance", 2, (const char *[]){ + "-l", + POLICY_SCRIPT + }); + } + + if (res == CHECK_PROC_PRESENT_NO_ARG) { + cf_crash_nostack(CF_HARDWARE, "please disable irqbalance or run it with the Aerospike policy script, /etc/aerospike/irqbalance-ban.sh"); + } +} + +static void +config_interface(const char *if_name, bool rfs, irq_list *irqs) +{ + uint16_t n_irq_cpus = 0; + cf_topo_os_cpu_index i_os_cpu = fix_os_cpu_index(0, &g_os_cpus_online); + + for (uint16_t i = 0; i < irqs->n_irqs; ++i) { + pin_irq(irqs->irqs[i], i_os_cpu); + + if (i % irqs->per_cpu == irqs->per_cpu - 1) { + ++n_irq_cpus; + i_os_cpu = fix_os_cpu_index((cf_topo_os_cpu_index)(i_os_cpu + 1), &g_os_cpus_online); + } + } + + cf_detail(CF_HARDWARE, "interface %s with %hu RX interrupt(s)", if_name, n_irq_cpus); + + if (g_n_irq_cpus == 0) { + g_n_irq_cpus = n_irq_cpus; + } + else if (n_irq_cpus != g_n_irq_cpus) { + cf_crash(CF_HARDWARE, "interface %s with inconsistent number of RX interrupts: %hu vs. %hu", + if_name, n_irq_cpus, g_n_irq_cpus); + } + + disable_rps(if_name); + config_rfs(if_name, rfs); + enable_xps(if_name); + + // Redistributing packets with RFS causes inter-CPU interrupts, which increases + // the interrupt load on the machine. For low-end systems, make sure that + // interrupt coalescing is enabled. + // + // We consider a machine low-end, if we handle interrupts on 25% or less of the + // available CPUs (i.e., if the number of NIC queues is 25% or less of the number + // of available CPUs) and it has fewer than 4 NIC queues. + // + // Better (i.e., NUMA) machines typically come with adaptive interrupt coalescing + // enabled by default. That's why we only do this here and not in the NUMA case. + + if (rfs && n_irq_cpus <= g_n_cpus / 4 && n_irq_cpus < 4) { + enable_coalescing(if_name); + } +} + +static void +config_interface_numa(const char *if_name, irq_list *irqs) +{ + uint16_t n_irq_cpus = 0; + cf_topo_os_cpu_index i_os_cpu[g_n_numa_nodes]; + uint16_t i_numa_node; + + for (i_numa_node = 0; i_numa_node < g_n_numa_nodes; ++i_numa_node) { + i_os_cpu[i_numa_node] = fix_os_cpu_index(0, &g_numa_node_os_cpus_online[i_numa_node]); + } + + i_numa_node = 0; + + // This configures the IRQs for all NUMA nodes. If multiple asd processes are + // running, each process does this, but each does it identically. Hence there + // isn't any conflict. 
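+	//
+	// Illustrative distribution (assuming 8 IRQs, per_cpu = 1 and 2 NUMA
+	// nodes): IRQs 0, 2, 4, 6 go to consecutive online CPUs on node 0 and
+	// IRQs 1, 3, 5, 7 to consecutive online CPUs on node 1, leaving each
+	// node with n_irq_cpus = 4.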
+ + for (uint16_t i = 0; i < irqs->n_irqs; ++i) { + char mask_str[200]; + mask_to_string(&g_numa_node_os_cpus_online[i_numa_node], mask_str, sizeof(mask_str)); + cf_detail(CF_HARDWARE, "NUMA node index %hu CPU mask is %s", i_numa_node, mask_str); + + pin_irq(irqs->irqs[i], i_os_cpu[i_numa_node]); + + if (i % irqs->per_cpu == irqs->per_cpu - 1) { + // Only count CPUs on our NUMA node. + + if (i_numa_node == g_i_numa_node) { + ++n_irq_cpus; + } + + i_os_cpu[i_numa_node] = + fix_os_cpu_index((cf_topo_os_cpu_index)(i_os_cpu[i_numa_node] + 1), + &g_numa_node_os_cpus_online[i_numa_node]); + i_numa_node = (uint16_t)((i_numa_node + 1) % g_n_numa_nodes); + } + } + + cf_detail(CF_HARDWARE, "interface %s with %hu RX interrupt(s) on NUMA node %hu", + if_name, n_irq_cpus, g_i_numa_node); + + if (g_n_irq_cpus == 0) { + g_n_irq_cpus = n_irq_cpus; + } + else if (n_irq_cpus != g_n_irq_cpus) { + cf_crash(CF_HARDWARE, "interface %s with inconsistent number of RX interrupts: %hu vs. %hu", + if_name, n_irq_cpus, g_n_irq_cpus); + } + + disable_rps(if_name); + config_rfs(if_name, true); + enable_xps(if_name); +} + +static void +optimize_interface(const char *if_name) +{ + cf_detail(CF_HARDWARE, "optimizing interface %s", if_name); + uint16_t n_queues = interface_rx_queues(if_name); + irq_list irqs; + interface_irqs(if_name, &irqs); + + cf_info(CF_HARDWARE, "detected %hu NIC receive queue(s), %hu interrupt(s) for %s", + n_queues, irqs.n_irqs, if_name); + + // We either expect one interrupt per RX queue (shared with TX) or two + // interrupts per RX queue (one RX, one TX). + + uint16_t n_irq_cpus = irqs.n_irqs / irqs.per_cpu; + + if (n_irq_cpus != n_queues) { + cf_crash(CF_HARDWARE, "suspicious NIC interrupt count %hu with %hu NIC receive queue(s)", + irqs.n_irqs, n_queues); + } + + if (n_irq_cpus == g_n_cpus) { + if (g_i_numa_node != INVALID_INDEX) { + cf_detail(CF_HARDWARE, "setting up for a fancy interface with NUMA"); + config_interface_numa(if_name, &irqs); + } + else { + cf_detail(CF_HARDWARE, "setting up for a fancy interface, no NUMA"); + config_interface(if_name, false, &irqs); + } + } + else { + if (n_irq_cpus <= g_n_cpus / 4) { + cf_warning(CF_HARDWARE, "%s has very few NIC queues; only %hu out of %hu CPUs handle(s) NIC interrupts", + if_name, n_irq_cpus, g_n_cpus); + } + + if (g_i_numa_node != INVALID_INDEX) { + cf_detail(CF_HARDWARE, "setting up for a lame interface with NUMA"); + config_interface_numa(if_name, &irqs); + } + else { + cf_detail(CF_HARDWARE, "setting up for a lame interface, no NUMA"); + config_interface(if_name, true, &irqs); + } + } +} + +static void +check_socket_cpu(void) +{ + int32_t fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + + if (fd < 0) { + cf_crash(CF_SOCKET, "error while creating UDP test socket: %d (%s)", + errno, cf_strerror(errno)); + } + + int32_t val = -1; + + if (setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &val, sizeof(val)) < 0) { + if (errno == ENOPROTOOPT) { + cf_crash_nostack(CF_SOCKET, "CPU pinning requires Linux kernel 3.19 or later"); + } + + cf_crash(CF_SOCKET, "error while testing for SO_INCOMING_CPU: %d (%s)", + errno, cf_strerror(errno)); + } + + CF_NEVER_FAILS(close(fd)); +} + +void +cf_topo_config(cf_topo_auto_pin auto_pin, cf_topo_numa_node_index a_numa_node, + const cf_addr_list *addrs) +{ + // Detect the NUMA topology. + + switch (auto_pin) { + case CF_TOPO_AUTO_PIN_NONE: + case CF_TOPO_AUTO_PIN_CPU: + detect(INVALID_INDEX); + break; + + case CF_TOPO_AUTO_PIN_NUMA: + detect(a_numa_node); + + // Clamp the given NUMA node index to the valid range. 
We can only do this + // after we know what g_n_numa_nodes is, which is initialized by the above + // call to detect(). + + if (a_numa_node >= g_n_numa_nodes) { + cf_topo_numa_node_index orig = a_numa_node; + a_numa_node = (cf_topo_numa_node_index)(a_numa_node % g_n_numa_nodes); + cf_detail(CF_HARDWARE, "invalid NUMA node index: %hu, clamping to %hu", orig, a_numa_node); + detect(a_numa_node); + } + + break; + } + + // If we don't do any pinning, then we're done after NUMA topology detection. + + if (auto_pin == CF_TOPO_AUTO_PIN_NONE) { + return; + } + + // Make sure that we are running on Linux 3.19 or later. + + check_socket_cpu(); + + // Reconfigure the client-facing network interface(s). + + check_irqbalance(); + + if (addrs->n_addrs == 0) { + cf_crash_nostack(CF_HARDWARE, "auto-pinning requires binding the service to one or more network interfaces"); + } + + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + const char *if_name = addrs->addrs[i]; + + if (!cf_inter_is_inter_name(if_name)) { + cf_crash_nostack(CF_HARDWARE, "auto-pinning requires binding the service to network interfaces; \"%s\" isn't a network interface", + if_name); + } + + char *exp_names[100]; + uint32_t n_exp = sizeof(exp_names) / sizeof(exp_names[0]); + cf_inter_expand_bond(if_name, exp_names, &n_exp); + + for (uint32_t k = 0; k < n_exp; ++k) { + optimize_interface(exp_names[k]); + cf_free(exp_names[k]); + } + } + + // If we don't do NUMA pinning, then we're done after setting up the + // client-facing network interface(s). + + if (auto_pin == CF_TOPO_AUTO_PIN_CPU) { + return; + } + + // NUMA pinning. + + pin_to_numa_node(a_numa_node); +} + +void +cf_topo_force_map_memory(const uint8_t *from, size_t size) +{ + if (g_i_numa_node == INVALID_INDEX || size == 0) { + return; + } + + cf_assert(from, CF_HARDWARE, "invalid cf_topo_force_map_memory() call"); + + // Read one byte per memory page to force otherwise lazy mapping. + + const uint8_t *start = (const uint8_t *) + (((int64_t)from + (MEM_PAGE_SIZE - 1)) & -MEM_PAGE_SIZE); + const uint8_t *end = from + size; + const volatile uint8_t *p_byte; + + // In case 'from' was not page-aligned, take care of the partial page. + if (start > from) { + p_byte = from; + p_byte[0]; + } + + for (p_byte = start; p_byte < end; p_byte += MEM_PAGE_SIZE) { + p_byte[0]; + } +} + +void +cf_topo_migrate_memory(void) +{ + if (g_i_numa_node == INVALID_INDEX) { + return; + } + + // Migrate existing memory allocations to the selected NUMA node. + + os_numa_node_index i_os_numa_node = g_numa_node_index_to_os_numa_node_index[g_i_numa_node]; + uint64_t to_mask = 1UL << i_os_numa_node; + cf_detail(CF_HARDWARE, "NUMA node mask (to): %016" PRIx64, to_mask); + + uint64_t from_mask = 0; + + for (cf_topo_numa_node_index i_numa_node = 0; i_numa_node < g_n_numa_nodes; ++i_numa_node) { + i_os_numa_node = g_numa_node_index_to_os_numa_node_index[i_numa_node]; + from_mask |= 1u << i_os_numa_node; + } + + from_mask &= ~to_mask; + cf_detail(CF_HARDWARE, "NUMA node mask (from): %016" PRIx64, from_mask); + + if (from_mask != 0) { + cf_info(CF_HARDWARE, "migrating shared memory to local NUMA node - this may take a bit"); + // Unlike select(), we have to pass "number of valid bits + 1". + migrate_pages_safe(0, 65, &from_mask, &to_mask); + } + + // We had kept capabilities so we could do this migrate - revoke them now. 
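+	// (Worked example of the masks above, with assumed values: 2 NUMA nodes,
+	// pinned to OS node 1 - to_mask = 0x2, from_mask = 0x3 & ~0x2 = 0x1,
+	// i.e. migrate everything still on node 0 over to node 1.)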
+ cf_process_clearcap(); +} + +void +cf_topo_info(void) +{ + if (g_i_numa_node == INVALID_INDEX) { + cf_info(CF_HARDWARE, "detected %hu CPU(s), %hu core(s), %hu NUMA node(s)", + g_n_cpus, g_n_cores, g_n_numa_nodes); + } + else { + cf_info(CF_HARDWARE, "detected %hu CPU(s), %hu core(s) on NUMA node %hu of %hu", + g_n_cpus, g_n_cores, g_i_numa_node, g_n_numa_nodes); + } +} diff --git a/cf/src/hist.c b/cf/src/hist.c new file mode 100644 index 00000000..6e03563c --- /dev/null +++ b/cf/src/hist.c @@ -0,0 +1,305 @@ +/* + * hist.c + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "hist.h" + +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "dynbuf.h" +#include "fault.h" + + +//========================================================== +// Histogram with logarithmic buckets. +// + +//------------------------------------------------ +// Create a histogram. There's no destroy(), but +// you can just cf_free() the histogram. +// +histogram* +histogram_create(const char *name, histogram_scale scale) +{ + cf_assert(name, AS_INFO, "null histogram name"); + cf_assert(strlen(name) < HISTOGRAM_NAME_SIZE, AS_INFO, + "bad histogram name %s", name); + cf_assert(scale >= 0 && scale < HIST_SCALE_MAX_PLUS_1, AS_INFO, + "bad histogram scale %d", scale); + + histogram *h = cf_malloc(sizeof(histogram)); + + strcpy(h->name, name); + memset((void *)&h->counts, 0, sizeof(h->counts)); + + // If histogram_insert_data_point() is called for a size or count histogram, + // the divide by 0 will crash - consider that a high-performance assert. + + switch (scale) { + case HIST_MILLISECONDS: + h->scale_tag = HIST_TAG_MILLISECONDS; + h->time_div = 1000 * 1000; + break; + case HIST_MICROSECONDS: + h->scale_tag = HIST_TAG_MICROSECONDS; + h->time_div = 1000; + break; + case HIST_SIZE: + h->scale_tag = HIST_TAG_SIZE; + h->time_div = 0; + break; + case HIST_COUNT: + h->scale_tag = HIST_TAG_COUNT; + h->time_div = 0; + break; + default: + cf_crash(AS_INFO, "%s: unrecognized histogram scale %d", name, scale); + break; + } + + return h; +} + +//------------------------------------------------ +// Clear a histogram. +// +void +histogram_clear(histogram *h) +{ + for (int i = 0; i < N_BUCKETS; i++) { + cf_atomic64_set(&h->counts[i], 0); + } +} + +//------------------------------------------------ +// Dump a histogram to log. +// +// Note - DO NOT change the log output format in +// this method - tools such as as_log_latency +// assume this format. 
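+//
+// A dump consists of one header line followed by up to four "(bucket: count)"
+// columns per line - schematically, with made-up numbers:
+//
+//   histogram dump: {name} (10000 total) {scale tag}
+//    (00: 0000009000) (01: 0000000900) (02: 0000000090) (03: 0000000010)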
+// +void +histogram_dump(histogram *h) +{ + int b; + uint64_t counts[N_BUCKETS]; + + for (b = 0; b < N_BUCKETS; b++) { + counts[b] = cf_atomic64_get(h->counts[b]); + } + + int i = N_BUCKETS; + int j = 0; + uint64_t total_count = 0; + + for (b = 0; b < N_BUCKETS; b++) { + if (counts[b] != 0) { + if (i > b) { + i = b; + } + + j = b; + total_count += counts[b]; + } + } + + char buf[100]; + int pos = 0; + int k = 0; + + buf[0] = '\0'; + + cf_info(AS_INFO, "histogram dump: %s (%lu total) %s", h->name, total_count, + h->scale_tag); + + for ( ; i <= j; i++) { + if (counts[i] == 0) { // print only non-zero columns + continue; + } + + int bytes = sprintf(buf + pos, " (%02d: %010lu)", i, counts[i]); + + if (bytes <= 0) { + cf_info(AS_INFO, "histogram dump error"); + return; + } + + pos += bytes; + + if ((k & 3) == 3) { // maximum of 4 printed columns per log line + cf_info(AS_INFO, "%s", buf); + pos = 0; + buf[0] = '\0'; + } + + k++; + } + + if (pos > 0) { + cf_info(AS_INFO, "%s", buf); + } +} + +//------------------------------------------------ +// BYTE_MSB[n] returns the position of the most +// significant bit. If no bits are set (n = 0) it +// returns 0. Otherwise the positions are 1 ... 8 +// from low to high, so e.g. n = 13 returns 4: +// +// bits: 0 0 0 0 1 1 0 1 +// position: 8 7 6 5 [4] 3 2 1 +// +static const char BYTE_MSB[] = { + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 +}; + +//------------------------------------------------ +// Returns the position of the most significant +// bit of n. Positions are 1 ... 64 from low to +// high, so: +// +// n msb(n) +// -------- ------ +// 0 0 +// 1 1 +// 2 ... 3 2 +// 4 ... 7 3 +// 8 ... 15 4 +// etc. +// +static int +msb(uint64_t n) +{ + int shift = 0; + + while (true) { + uint64_t n_div_256 = n >> 8; + + if (n_div_256 == 0) { + return shift + (int)BYTE_MSB[n]; + } + + n = n_div_256; + shift += 8; + } + + // Should never get here. + cf_crash(AS_INFO, "end of msb()"); + return -1; +} + +//------------------------------------------------ +// Insert a time interval data point. The interval +// is time elapsed since start_ns, converted to +// milliseconds or microseconds as appropriate. +// Assumes start_ns was obtained via cf_getns() +// some time ago. Generates a histogram with +// either: +// +// bucket millisecond range +// ------ ----------------- +// 0 0 to 1 (more exactly, 0.999999) +// 1 1 to 2 (more exactly, 1.999999) +// 2 2 to 4 (more exactly, 3.999999) +// 3 4 to 8 (more exactly, 7.999999) +// 4 8 to 16 (more exactly, 15.999999) +// etc. +// +// or: +// +// bucket microsecond range +// ------ ----------------- +// 0 0 to 1 (more exactly, 0.999) +// 1 1 to 2 (more exactly, 1.999) +// 2 2 to 4 (more exactly, 3.999) +// 3 4 to 8 (more exactly, 7.999) +// 4 8 to 16 (more exactly, 15.999) +// etc. 
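+//
+// (E.g., a 6.2 ms delta on a millisecond-scale histogram gives
+// delta_t = 6, msb(6) = 3, so the point lands in bucket 3 - the
+// "4 ... 8" row above.)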
+// +uint64_t +histogram_insert_data_point(histogram *h, uint64_t start_ns) +{ + uint64_t end_ns = cf_getns(); + uint64_t delta_t = (end_ns - start_ns) / h->time_div; + + int bucket = 0; + + if (delta_t != 0) { + bucket = msb(delta_t); + + if (start_ns > end_ns) { + // Either the clock went backwards, or wrapped. (Assume the former, + // since it takes ~580 years from 0 to wrap.) + cf_warning(AS_INFO, "%s - clock went backwards: start %lu end %lu", + h->name, start_ns, end_ns); + bucket = 0; + } + } + + cf_atomic64_incr(&h->counts[bucket]); + + return end_ns; +} + +//------------------------------------------------ +// Insert a raw data point. Generates a histogram +// with: +// +// bucket value range +// ------ ----------- +// 0 0 +// 1 1 +// 2 2, 3 +// 3 4 to 7 +// 4 8 to 15 +// etc. +// +void +histogram_insert_raw(histogram *h, uint64_t value) +{ + cf_atomic64_incr(&h->counts[msb(value)]); +} diff --git a/cf/src/hist_track.c b/cf/src/hist_track.c new file mode 100644 index 00000000..16177fd8 --- /dev/null +++ b/cf/src/hist_track.c @@ -0,0 +1,732 @@ +/* + * hist_track.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * A histogram with cached data. + */ + + +//========================================================== +// Includes +// + +#include "hist_track.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "dynbuf.h" +#include "fault.h" +#include "hist.h" + + +//========================================================== +// Private "Class Members" +// + +//------------------------------------------------ +// Constants +// + +// More than one day of 10 second slices uses too much memory. +const uint32_t MAX_NUM_ROWS = (24 * 60 * 60) / 10; + +// Caching this few is legal but silly. +const uint32_t MIN_NUM_ROWS = 2; + +// Don't track/report thresholds with a larger bucket index than this. +// This corresponds to the 32 second threshold - that should be big enough. +#define MAX_BUCKET 15 + +// Don't track/report more than this many thresholds. +// This could in principle be less than (MAX_BUCKET + 1), e.g. it could be +// 4, and we could track buckets 0, 5, 10, 15. +#define MAX_NUM_COLS (MAX_BUCKET + 1) + +#define DEFAULT_NUM_COLS 3 +const uint32_t default_buckets[DEFAULT_NUM_COLS] = { 0, 3, 6 }; +// For our standard latency histograms, 0: >1ms, 3: >8ms, 6: >64ms. + +// No output line can be longer than this. 
+#define MAX_FORMATTED_ROW_SIZE 512 +#define MAX_FORMATTED_SETTINGS_SIZE 512 + +//------------------------------------------------ +// Data +// + +typedef struct row_s { + uint32_t timestamp; + uint64_t total; + uint64_t overs[]; +} row; + +struct cf_hist_track_s { + // Base Histogram (must be first) + histogram hist; + + // Tracking-related + row* rows; + size_t row_size; + uint32_t num_rows; + uint32_t write_row_n; + uint32_t oldest_row_n; + pthread_mutex_t rows_lock; + uint32_t slice_sec; + uint32_t buckets[MAX_NUM_COLS]; + uint32_t num_cols; +}; + +//------------------------------------------------ +// Function Declarations +// + +static inline row* get_row(cf_hist_track* this, uint32_t row_n); +static uint32_t get_start_row_n(cf_hist_track* this, uint32_t back_sec); +static void output_header(cf_hist_track* this, uint32_t start_ts, + uint32_t num_cols, cf_hist_track_info_format info_fmt, + cf_dyn_buf* db_p); +static void output_slice(cf_hist_track* this, row* prev_row_p, row* row_p, + uint32_t diff_sec, uint32_t num_cols, + cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p); +static int threshold_to_bucket(int threshold); +static int thresholds_to_buckets(const char* thresholds, uint32_t buckets[]); + + +//========================================================== +// Public API +// + +//------------------------------------------------ +// Create a cf_hist_track object. +// +cf_hist_track* +cf_hist_track_create(const char* name, histogram_scale scale) +{ + cf_assert(name, AS_INFO, "null histogram name"); + cf_assert(strlen(name) < HISTOGRAM_NAME_SIZE, AS_INFO, + "bad histogram name %s", name); + cf_assert(scale >= 0 && scale < HIST_SCALE_MAX_PLUS_1, AS_INFO, + "bad histogram scale %d", scale); + + cf_hist_track* this = (cf_hist_track*)cf_malloc(sizeof(cf_hist_track)); + + pthread_mutex_init(&this->rows_lock, NULL); + + // Base histogram setup, same as in histogram_create(): + strcpy(this->hist.name, name); + memset((void*)this->hist.counts, 0, sizeof(this->hist.counts)); + + // If cf_hist_track_insert_data_point() is called for a size or count + // histogram, the divide by 0 will crash - consider that a high-performance + // assert. + + switch (scale) { + case HIST_MILLISECONDS: + this->hist.scale_tag = HIST_TAG_MILLISECONDS; + this->hist.time_div = 1000 * 1000; + break; + case HIST_MICROSECONDS: + this->hist.scale_tag = HIST_TAG_MICROSECONDS; + this->hist.time_div = 1000; + break; + case HIST_SIZE: + this->hist.scale_tag = HIST_TAG_SIZE; + this->hist.time_div = 0; + break; + case HIST_COUNT: + this->hist.scale_tag = HIST_TAG_COUNT; + this->hist.time_div = 0; + break; + default: + cf_crash(AS_INFO, "%s: unrecognized histogram scale %d", name, scale); + break; + } + + // Start with tracking off. + this->rows = NULL; + + return this; +} + +//------------------------------------------------ +// Destroy a cf_hist_track object. +// +void +cf_hist_track_destroy(cf_hist_track* this) +{ + cf_hist_track_stop(this); + pthread_mutex_destroy(&this->rows_lock); + cf_free(this); +} + +//------------------------------------------------ +// Start tracking. May call this again without +// first calling cf_hist_track_disable() to use +// different caching parameters, but previous +// cache is lost. +// +// TODO - resolve errors ??? +bool +cf_hist_track_start(cf_hist_track* this, uint32_t back_sec, uint32_t slice_sec, + const char* thresholds) +{ + if (slice_sec == 0) { + return false; + } + + uint32_t num_rows = back_sec / slice_sec; + + // Check basic sanity of row-related parameters. 
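+	// E.g. back_sec 1800 with slice_sec 10 yields 180 rows - comfortably
+	// inside the allowed [MIN_NUM_ROWS, MAX_NUM_ROWS] range of [2, 8640].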
+ if (num_rows > MAX_NUM_ROWS || num_rows < MIN_NUM_ROWS) { + return false; + } + + // If thresholds aren't specified, use defaults. + uint32_t* buckets = (uint32_t*)default_buckets; + int num_cols = DEFAULT_NUM_COLS; + + // Parse non-default thresholds and check resulting buckets. + uint32_t parsed_buckets[MAX_NUM_COLS]; + + if (thresholds) { + buckets = parsed_buckets; + num_cols = thresholds_to_buckets(thresholds, buckets); + + if (num_cols < 0) { + return false; + } + } + + pthread_mutex_lock(&this->rows_lock); + + if (this->rows) { + cf_free(this->rows); + } + + this->row_size = sizeof(row) + (num_cols * sizeof(uint64_t)); + this->rows = (row*)cf_malloc(num_rows * this->row_size); + this->num_rows = num_rows; + this->write_row_n = 0; + this->oldest_row_n = 0; + this->slice_sec = slice_sec; + + for (int i = 0; i < num_cols; i++) { + this->buckets[i] = buckets[i]; + } + + this->num_cols = (uint32_t)num_cols; + + pthread_mutex_unlock(&this->rows_lock); + + return true; +} + +//------------------------------------------------ +// Stop tracking, freeing cache. +// +void +cf_hist_track_stop(cf_hist_track* this) +{ + pthread_mutex_lock(&this->rows_lock); + + if (this->rows) { + cf_free(this->rows); + this->rows = NULL; + } + + pthread_mutex_unlock(&this->rows_lock); +} + +//------------------------------------------------ +// Clear histogram buckets, and if tracking, stop. +// Must call cf_hist_track_enable() to start +// tracking again. +// +void +cf_hist_track_clear(cf_hist_track* this) +{ + cf_hist_track_stop(this); + histogram_clear((histogram*)this); +} + +//------------------------------------------------ +// Print all non-zero histogram buckets, and if +// tracking, cache timestamp, total data points, +// and threshold data. +// +void +cf_hist_track_dump(cf_hist_track* this) +{ + // Always print the histogram. + histogram_dump((histogram*)this); + + // If tracking is enabled, save a row in the cache. + pthread_mutex_lock(&this->rows_lock); + + if (! this->rows) { + pthread_mutex_unlock(&this->rows_lock); + return; + } + + uint32_t now_ts = (uint32_t)time(NULL); + + // But don't save row if slice_sec hasn't elapsed since last saved row. + if (this->write_row_n != 0 && + now_ts - get_row(this, this->write_row_n - 1)->timestamp < + this->slice_sec) { + pthread_mutex_unlock(&this->rows_lock); + return; + } + + row* row_p = get_row(this, this->write_row_n); + + // "Freeze" the histogram for consistency of total. + uint64_t counts[N_BUCKETS]; + uint64_t total_count = 0; + + for (int j = 0; j < N_BUCKETS; j++) { + counts[j] = cf_atomic64_get(this->hist.counts[j]); + total_count += counts[j]; + } + + uint64_t subtotal = 0; + + // b's "over" is total minus sum of values in all buckets 0 thru b. + for (int i = 0, b = 0; i < this->num_cols; b++) { + subtotal += counts[b]; + + if (this->buckets[i] == b) { + row_p->overs[i++] = total_count - subtotal; + } + } + + row_p->total = total_count; + row_p->timestamp = now_ts; + + // Increment the current and oldest row indexes. + this->write_row_n++; + + if (this->write_row_n > this->num_rows) { + this->oldest_row_n++; + } + + pthread_mutex_unlock(&this->rows_lock); +} + +//------------------------------------------------ +// Pass-through to base histogram. +// +uint64_t +cf_hist_track_insert_data_point(cf_hist_track* this, uint64_t start_ns) +{ + return histogram_insert_data_point((histogram*)this, start_ns); +} + +//------------------------------------------------ +// Pass-through to base histogram. 
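+// (E.g. for a size histogram, cf_hist_track_insert_raw(h, 6000) increments
+// bucket msb(6000) = 13, which covers raw values 4096 through 8191.)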
+// +void +cf_hist_track_insert_raw(cf_hist_track* this, uint64_t value) +{ + histogram_insert_raw((histogram*)this, value); +} + +//------------------------------------------------ +// Get time-sliced info from cache. +// +void +cf_hist_track_get_info(cf_hist_track* this, uint32_t back_sec, + uint32_t duration_sec, uint32_t slice_sec, bool throughput_only, + cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p) +{ + pthread_mutex_lock(&this->rows_lock); + + if (! this->rows) { + cf_dyn_buf_append_string(db_p, "error-not-tracking;"); + pthread_mutex_unlock(&this->rows_lock); + return; + } + + uint32_t start_row_n = get_start_row_n(this, back_sec); + + if (start_row_n == -1) { + cf_dyn_buf_append_string(db_p, "error-no-data-yet-or-back-too-small;"); + pthread_mutex_unlock(&this->rows_lock); + return; + } + + uint32_t num_cols = throughput_only ? 0 : this->num_cols; + row* prev_row_p = get_row(this, start_row_n); + + output_header(this, prev_row_p->timestamp, num_cols, info_fmt, db_p); + + if (slice_sec == 0) { + row* row_p = get_row(this, this->write_row_n - 1); + uint32_t diff_sec = row_p->timestamp - prev_row_p->timestamp; + + output_slice(this, prev_row_p, row_p, diff_sec, num_cols, info_fmt, + db_p); + + pthread_mutex_unlock(&this->rows_lock); + return; + } + + uint32_t start_ts = prev_row_p->timestamp; + bool no_slices = true; + + for (uint32_t row_n = start_row_n + 1; row_n < this->write_row_n; row_n++) { + row* row_p = get_row(this, row_n); + + uint32_t diff_sec = row_p->timestamp - prev_row_p->timestamp; + + if (diff_sec < slice_sec) { + continue; + } + + output_slice(this, prev_row_p, row_p, diff_sec, num_cols, info_fmt, + db_p); + no_slices = false; + + // Doing this at the end guarantees we get at least one slice. + if (duration_sec != 0 && row_p->timestamp - start_ts > duration_sec) { + break; + } + + prev_row_p = row_p; + } + + if (no_slices) { + cf_dyn_buf_append_string(db_p, + "error-slice-too-big-or-back-too-small;"); + } + + pthread_mutex_unlock(&this->rows_lock); +} + +//------------------------------------------------ +// Get current settings which were passed into +// cf_hist_track_start(), in format suitable for +// info_command_config_get(). +// +void +cf_hist_track_get_settings(cf_hist_track* this, cf_dyn_buf* db_p) +{ + pthread_mutex_lock(&this->rows_lock); + + if (! this->rows) { + pthread_mutex_unlock(&this->rows_lock); + return; + } + + const char* name = ((histogram*)this)->name; + char output[MAX_FORMATTED_SETTINGS_SIZE]; + char* write_p = output; + char* end_p = output + MAX_FORMATTED_SETTINGS_SIZE - 2; + + write_p += snprintf(output, MAX_FORMATTED_SETTINGS_SIZE - 2, + "%s-hist-track-back=%u;" + "%s-hist-track-slice=%u;" + "%s-hist-track-thresholds=", + name, this->num_rows * this->slice_sec, + name, this->slice_sec, + name); + + for (int i = 0; i < this->num_cols; i++) { + write_p += snprintf(write_p, end_p - write_p, "%u,", + (uint32_t)1 << this->buckets[i]); + } + + if (this->num_cols > 0) { + write_p--; + } + + *write_p++ = ';'; + *write_p = 0; + + cf_dyn_buf_append_string(db_p, output); + + pthread_mutex_unlock(&this->rows_lock); +} + + +//========================================================== +// Private Functions +// + +//------------------------------------------------ +// Get row pointer for specified row count. Note +// that row_size is determined dynamically, so we +// can't just do rows[i]. 
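+// (E.g. with num_rows 180, num_cols 3, and sizeof(row) padding to 16 bytes,
+// row_size is 16 + 3 * 8 = 40, and row_n 207 wraps to slot 207 % 180 = 27,
+// at byte offset 27 * 40.)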
+// +static inline row* +get_row(cf_hist_track* this, uint32_t row_n) +{ + return (row*)((uint8_t*)this->rows + + ((row_n % this->num_rows) * this->row_size)); +} + +//------------------------------------------------ +// Find row at or after timestamp specified by +// back_sec. +// +static uint32_t +get_start_row_n(cf_hist_track* this, uint32_t back_sec) +{ + // Must be at least two rows to get a slice. + if (this->write_row_n < 2) { + return -1; + } + + uint32_t now_ts = (uint32_t)time(NULL); + + // In case we call this with default back_sec (0) or back_sec more than UTC + // epoch to now - start from the beginning. + if (back_sec == 0 || back_sec >= now_ts) { + return this->oldest_row_n; + } + + uint32_t start_ts = now_ts - back_sec; + + // Find the most recent slice interval. + uint32_t last_row_n = this->write_row_n - 1; + uint32_t slice_sec = get_row(this, last_row_n)->timestamp - + get_row(this, last_row_n - 1)->timestamp; + + // Use recent slice interval to guess how many rows back to look. + uint32_t back_row_n = back_sec / slice_sec; + uint32_t guess_row_n = last_row_n > back_row_n ? + last_row_n - back_row_n : 0; + + if (guess_row_n < this->oldest_row_n) { + guess_row_n = this->oldest_row_n; + } + + // Begin at guessed row, and iterate to find exact row to start at. + uint32_t guess_ts = get_row(this, guess_row_n)->timestamp; + uint32_t start_row_n; + + if (guess_ts < start_ts) { + for (start_row_n = guess_row_n + 1; start_row_n < last_row_n; + start_row_n++) { + if (get_row(this, start_row_n)->timestamp >= start_ts) { + break; + } + } + } + else if (guess_ts > start_ts) { + for (start_row_n = guess_row_n; start_row_n > this->oldest_row_n; + start_row_n--) { + if (get_row(this, start_row_n - 1)->timestamp < start_ts) { + break; + } + } + } + else { + start_row_n = guess_row_n; + } + + // Make sure when default query is run (e.g. latency:), we return at least + // valid last data instead of returning an error. This case happens when the + // query is timed such that it's right when histogram is being dumped. + if (start_row_n == last_row_n) { + start_row_n = last_row_n - 1; + } + + // Can't get a slice if there isn't at least one row after the start row. + return start_row_n < last_row_n ? start_row_n : -1; +} + +//------------------------------------------------ +// Make info "header" and append it to db_p. 
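+// (Sample packed-format header for a histogram named "reads" with the
+// default buckets {0, 3, 6}: reads:07:12:25-GMT,ops/sec,>1ms,>8ms,>64ms;)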
+// +static void +output_header(cf_hist_track* this, uint32_t start_ts, uint32_t num_cols, + cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p) +{ + cf_dyn_buf_append_string(db_p, ((histogram*)this)->name); + + const char* time_fmt; + const char* rate_fmt; + const char* pcts_fmt; + char line_sep; + + switch (info_fmt) { + case CF_HIST_TRACK_FMT_PACKED: + default: + time_fmt = ":%T-GMT"; + rate_fmt = ",ops/sec"; + pcts_fmt = ",>%ums"; + line_sep = ';'; + break; + case CF_HIST_TRACK_FMT_TABLE: + time_fmt = ":\n%T GMT % > (ms)"; + rate_fmt = "\n to ops/sec"; + pcts_fmt = " %6u"; + line_sep = '\n'; + break; + } + + char output[MAX_FORMATTED_ROW_SIZE]; + char* write_p = output; + char* end_p = output + MAX_FORMATTED_ROW_SIZE - 2; + time_t start_ts_time_t = (time_t)start_ts; + struct tm start_tm; + + gmtime_r(&start_ts_time_t, &start_tm); + write_p += strftime(output, MAX_FORMATTED_ROW_SIZE - 2, time_fmt, &start_tm); + write_p += snprintf(write_p, end_p - write_p, "%s", rate_fmt); + + for (int i = 0; i < num_cols; i++) { + write_p += snprintf(write_p, end_p - write_p, pcts_fmt, + (uint32_t)(1 << this->buckets[i])); + } + + *write_p++ = line_sep; + *write_p = 0; + + cf_dyn_buf_append_string(db_p, output); +} + +//------------------------------------------------ +// Calculate output info for slice defined by two +// rows, and append to db_p. +// +static void +output_slice(cf_hist_track* this, row* prev_row_p, row* row_p, + uint32_t diff_sec, uint32_t num_cols, + cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p) +{ + const char* time_fmt; + const char* rate_fmt; + const char* pcts_fmt; + char line_sep; + + switch (info_fmt) { + case CF_HIST_TRACK_FMT_PACKED: + default: + time_fmt = "%T"; + rate_fmt = ",%.1f"; + pcts_fmt = ",%.2f"; + line_sep = ';'; + break; + case CF_HIST_TRACK_FMT_TABLE: + time_fmt = "%T"; + rate_fmt = " %9.1f"; + pcts_fmt = " %6.2f"; + line_sep = '\n'; + break; + } + + char output[MAX_FORMATTED_ROW_SIZE]; + char* write_p = output; + char* end_p = output + MAX_FORMATTED_ROW_SIZE - 2; + time_t row_ts_time_t = (time_t)row_p->timestamp; + struct tm row_tm; + + gmtime_r(&row_ts_time_t, &row_tm); + write_p += strftime(output, MAX_FORMATTED_ROW_SIZE - 2, time_fmt, &row_tm); + + uint64_t diff_total = row_p->total - prev_row_p->total; + double ops_per_sec = (double)(diff_total) / diff_sec; + + write_p += snprintf(write_p, end_p - write_p, rate_fmt, ops_per_sec); + + for (int i = 0; i < num_cols; i++) { + // We "freeze" the histogram to calculate "overs", so it shouldn't be + // possible for an "over" to be less than the one in the previous row. + uint64_t diff_overs = row_p->overs[i] - prev_row_p->overs[i]; + double pcts_over_i = diff_total != 0 ? + (double)(diff_overs * 100) / diff_total : 0; + + write_p += snprintf(write_p, end_p - write_p, pcts_fmt, pcts_over_i); + } + + *write_p++ = line_sep; + *write_p = 0; + + cf_dyn_buf_append_string(db_p, output); +} + +//------------------------------------------------ +// Convert threshold milliseconds to bucket index. +// +static int +threshold_to_bucket(int threshold) +{ + if (threshold < 1) { + return -1; + } + + int n = threshold; + int b = 0; + + while (n > 1) { + n >>= 1; + b++; + } + + // Check that threshold is an exact power of 2. + return (1 << b) == threshold ? b : -1; +} + +//------------------------------------------------ +// Convert thresholds string to buckets array. +// +static int +thresholds_to_buckets(const char* thresholds, uint32_t buckets[]) +{ + // Copy since strtok() is destructive. 
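+	// E.g. "1,8,64" parses to buckets {0, 3, 6}; "1,5,64" fails because 5
+	// is not a power of 2, and "8,1" fails because it isn't rising.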
+	char toks[strlen(thresholds) + 1];
+
+	strcpy(toks, thresholds);
+
+	int i = 0;
+	char* tok = strtok(toks, ",");
+
+	while (tok) {
+		if (i == MAX_NUM_COLS) {
+			return -1;
+		}
+
+		int b = threshold_to_bucket(atoi(tok));
+
+		// Make sure it's a rising sequence of valid bucket indexes.
+		if (b < 0 || b > MAX_BUCKET || (i > 0 && b <= buckets[i - 1])) {
+			return -1;
+		}
+
+		buckets[i++] = (uint32_t)b;
+
+		tok = strtok(NULL, ",");
+	}
+
+	return i;
+}
diff --git a/cf/src/linear_hist.c b/cf/src/linear_hist.c
new file mode 100644
index 00000000..14f233bb
--- /dev/null
+++ b/cf/src/linear_hist.c
@@ -0,0 +1,366 @@
+/*
+ * linear_hist.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+
+//==========================================================
+// Includes.
+//
+
+#include "linear_hist.h"
+
+#include <pthread.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+
+
+//==========================================================
+// Private class data.
+//
+
+#define LINEAR_HIST_NAME_SIZE 512
+#define INFO_SNAPSHOT_SIZE 2048
+
+struct linear_hist_s {
+	char name[LINEAR_HIST_NAME_SIZE];
+
+	pthread_mutex_t info_lock;
+	char info_snapshot[INFO_SNAPSHOT_SIZE];
+
+	uint32_t num_buckets;
+	uint64_t *counts;
+
+	uint32_t start;
+	uint32_t bucket_width;
+};
+
+
+//==========================================================
+// Public API.
+//
+
+//------------------------------------------------
+// Create a linear histogram.
+//
+linear_hist*
+linear_hist_create(const char *name, uint32_t start, uint32_t max_offset,
+		uint32_t num_buckets)
+{
+	if (! (name && strlen(name) < LINEAR_HIST_NAME_SIZE)) {
+		cf_crash(AS_INFO, "linear_hist_create - bad name %s",
+				name ? name : "");
+	}
+
+	if (start + max_offset < start) {
+		cf_crash(AS_INFO, "linear_hist_create - max_offset overflow");
+	}
+
+	if (num_buckets == 0) {
+		cf_crash(AS_INFO, "linear_hist_create - 0 num_buckets");
+	}
+
+	linear_hist *h = cf_malloc(sizeof(linear_hist));
+
+	strcpy(h->name, name);
+
+	if (0 != pthread_mutex_init(&h->info_lock, NULL)) {
+		cf_crash(AS_INFO, "linear_hist_create - mutex init failed");
+	}
+
+	h->info_snapshot[0] = 0;
+
+	h->num_buckets = num_buckets;
+	h->counts = cf_malloc(sizeof(uint64_t) * num_buckets);
+
+	linear_hist_clear(h, start, max_offset);
+
+	return h;
+}
+
+//------------------------------------------------
+// Destroy a linear histogram.
+//
+void
+linear_hist_destroy(linear_hist *h)
+{
+	pthread_mutex_destroy(&h->info_lock);
+	cf_free(h->counts);
+	cf_free(h);
+}
+
+//------------------------------------------------
+// Clear, re-scale/re-size a linear histogram.
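+// (E.g. num_buckets 100 with max_offset 3600 gives bucket_width 36, so a
+// point at offset 75 from start lands in bucket 75 / 36 = 2.)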
+// +void +linear_hist_reset(linear_hist *h, uint32_t start, uint32_t max_offset, + uint32_t num_buckets) +{ + if (h->num_buckets == num_buckets) { + linear_hist_clear(h, start, max_offset); + return; + } + + h->num_buckets = num_buckets; + h->counts = cf_realloc(h->counts, sizeof(uint64_t) * num_buckets); + linear_hist_clear(h, start, max_offset); +} + +//------------------------------------------------ +// Clear and (re-)scale a linear histogram. +// +void +linear_hist_clear(linear_hist *h, uint32_t start, uint32_t max_offset) +{ + h->start = start; + h->bucket_width = (max_offset + (h->num_buckets - 1)) / h->num_buckets; + + // Only needed to protect against max_offset 0. + if (h->bucket_width == 0) { + h->bucket_width = 1; + } + + memset((void *)h->counts, 0, sizeof(uint64_t) * h->num_buckets); +} + +//------------------------------------------------ +// Access method for total count. +// +uint64_t +linear_hist_get_total(linear_hist *h) +{ + uint64_t total_count = 0; + + for (uint32_t i = 0; i < h->num_buckets; i++) { + total_count += h->counts[i]; + } + + return total_count; +} + +//------------------------------------------------ +// Merge h2 into h1. +// +void +linear_hist_merge(linear_hist *h1, linear_hist *h2) +{ + if (! (h1->num_buckets == h2->num_buckets && h1->start == h2->start && + h1->bucket_width == h2->bucket_width)) { + cf_crash(AS_INFO, "linear_hist_merge - dissimilar histograms"); + } + + for (uint32_t i = 0; i < h1->num_buckets; i++) { + h1->counts[i] += h2->counts[i]; + } +} + +//------------------------------------------------ +// Insert a data point. Points out of range will +// end up in the bucket at the appropriate end. +// +void +linear_hist_insert_data_point(linear_hist *h, uint32_t point) +{ + int32_t offset = (int32_t)(point - h->start); + int32_t bucket = 0; + + if (offset > 0) { + bucket = offset / h->bucket_width; + + if (bucket >= (int32_t)h->num_buckets) { + bucket = h->num_buckets - 1; + } + } + + h->counts[bucket]++; +} + +//------------------------------------------------ +// Get the low edge of the "threshold" bucket - +// the bucket in which the specified percentage of +// total count is exceeded (accumulating from low +// bucket). +// +uint64_t +linear_hist_get_threshold_for_fraction(linear_hist *h, uint32_t tenths_pct, + linear_hist_threshold *p_threshold) +{ + return linear_hist_get_threshold_for_subtotal(h, + (linear_hist_get_total(h) * (uint64_t)tenths_pct) / 1000, + p_threshold); +} + +//------------------------------------------------ +// Get the low edge of the "threshold" bucket - +// the bucket in which the specified subtotal +// count is exceeded (accumulating from low +// bucket). +// +uint64_t +linear_hist_get_threshold_for_subtotal(linear_hist *h, uint64_t subtotal, + linear_hist_threshold *p_threshold) +{ + p_threshold->bucket_width = h->bucket_width; + p_threshold->target_count = subtotal; + + uint64_t count = 0; + uint32_t i; + + for (i = 0; i < h->num_buckets; i++) { + count += h->counts[i]; + + if (count > subtotal) { + break; + } + } + + if (i == h->num_buckets) { + // This means subtotal >= h->total_count. + p_threshold->value = 0xFFFFffff; + p_threshold->bucket_index = 0; // irrelevant + p_threshold->bucket_count = 0; // irrelevant + return count; + } + + p_threshold->value = h->start + (i * h->bucket_width); + p_threshold->bucket_index = i; + p_threshold->bucket_count = h->counts[i]; + + // Return subtotal of everything below "threshold" bucket. 
+ return count - h->counts[i]; +} + +//------------------------------------------------ +// Dump a linear histogram to log. +// +// Note - DO NOT change the log output format in +// this method - public documentation assumes this +// format. +// +void +linear_hist_dump(linear_hist *h) +{ + uint32_t i = h->num_buckets; + uint32_t j = 0; + uint32_t k = 0; + uint64_t total_count = 0; + + for (uint32_t b = 0; b < h->num_buckets; b++) { + if (h->counts[b] != 0) { + if (i > b) { + i = b; + } + + j = b; + k++; + total_count += h->counts[b]; + } + } + + char buf[100]; + int pos = 0; + int n = 0; + + buf[0] = '\0'; + + cf_debug(AS_NSUP, "linear histogram dump: %s [%u %u]/[%u] (%lu total)", + h->name, h->start, h->start + (h->num_buckets * h->bucket_width), + h->bucket_width, total_count); + + if (k > 100) { + // For now, just don't bother if there's too much to dump. + cf_debug(AS_NSUP, "... (%u buckets with non-zero count)", k); + return; + } + + for ( ; i <= j; i++) { + if (h->counts[i] == 0) { // print only non-zero columns + continue; + } + + int bytes = sprintf(buf + pos, " (%02u: %010lu)", i, h->counts[i]); + + if (bytes <= 0) { + cf_debug(AS_NSUP, "linear histogram dump error"); + return; + } + + pos += bytes; + + if ((n & 3) == 3) { // maximum of 4 printed columns per log line + cf_debug(AS_NSUP, "%s", buf); + pos = 0; + buf[0] = '\0'; + } + + n++; + } + + if (pos > 0) { + cf_debug(AS_NSUP, "%s", buf); + } +} + +//------------------------------------------------ +// Save a linear histogram "snapshot". +// +void +linear_hist_save_info(linear_hist *h) +{ + pthread_mutex_lock(&h->info_lock); + + if (h->num_buckets > 100) { + // For now, just don't bother if there's too much to save. + sprintf(h->info_snapshot, "%u,%u ...", h->num_buckets, h->bucket_width); + + pthread_mutex_unlock(&h->info_lock); + return; + } + + // Write num_buckets, the bucket width, and the first bucket's count. + int i = 0; + int pos = snprintf(h->info_snapshot, INFO_SNAPSHOT_SIZE, "%u,%u,%lu", + h->num_buckets, h->bucket_width, h->counts[i++]); + + while (pos < INFO_SNAPSHOT_SIZE && i < h->num_buckets) { + pos += snprintf(h->info_snapshot + pos, INFO_SNAPSHOT_SIZE - pos, + ",%lu", h->counts[i++]); + } + + pthread_mutex_unlock(&h->info_lock); +} + +//------------------------------------------------ +// Append a linear histogram "snapshot" to db. +// +void +linear_hist_get_info(linear_hist *h, cf_dyn_buf *db) +{ + pthread_mutex_lock(&h->info_lock); + cf_dyn_buf_append_string(db, h->info_snapshot); + pthread_mutex_unlock(&h->info_lock); +} diff --git a/cf/src/meminfo.c b/cf/src/meminfo.c new file mode 100644 index 00000000..12f1fdb4 --- /dev/null +++ b/cf/src/meminfo.c @@ -0,0 +1,152 @@ +/* + * meminfo.c + * + * Copyright (C) 2008 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/
+ */
+
+#include "meminfo.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+
+int
+cf_meminfo(uint64_t *physmem, uint64_t *freemem, int *freepct, bool *swapping)
+{
+	// do this without a malloc, because we might be in trouble, malloc-wise
+	char buf[4096];
+	memset(buf, 0, sizeof(buf)); // makes valgrind happy?
+
+	// be a little oversafe
+	if (physmem) *physmem = 0;
+	if (freemem) *freemem = 0;
+	if (freepct) *freepct = 0;
+	if (swapping) *swapping = 0;
+
+	// open /proc/meminfo
+	int fd = open("/proc/meminfo", O_RDONLY , 0 /*mask not used if not creating*/ );
+	if (fd < 0) {
+		fprintf(stderr, "meminfo failed: can't open proc file\n");
+		return(-1);
+	}
+
+	// this loop is overkill. proc read won't block, realistically
+	int pos = 0, lim = sizeof(buf);
+	int rv = 0;
+	do {
+
+		rv = read(fd, &buf[pos], lim - pos);
+		if (rv > 0)
+			pos += rv;
+		else if (rv < 0) {
+			fprintf(stderr, "meminfo failed: read returned %d errno %d pos %d\n",rv,errno,pos);
+			close(fd);
+			return(-1);
+		}
+
+	} while ((rv > 0) && (pos < lim));
+
+	close(fd);
+
+	char *physMemStr = "MemTotal";		uint64_t physMem = 0;
+	char *freeMemStr = "MemFree";		uint64_t freeMem = 0;
+	char *activeMemStr = "Active";		uint64_t activeMem = 0;
+	char *inactiveMemStr = "Inactive";	uint64_t inactiveMem = 0;
+	char *cachedMemStr = "Cached";		uint64_t cachedMem = 0;
+	char *buffersMemStr = "Buffers";	uint64_t buffersMem = 0;
+	char *swapTotalStr = "SwapTotal";	uint64_t swapTotal = 0;
+	char *swapFreeStr = "SwapFree";		uint64_t swapFree = 0;
+	char *sharedMemStr = "Shmem";		uint64_t sharedMem = 0;
+
+	// parse each line - always three tokens, the name, the integer, and 'kb'
+	char *cur = buf;
+	char *saveptr = 0, *tok1, *tok2, *tok3;
+	do {
+		tok1 = tok2 = tok3 = 0;
+		tok1 = strtok_r(cur,": \r\n" , &saveptr);
+		cur = 0;
+		tok2 = strtok_r(cur,": \r\n" , &saveptr);
+		tok3 = strtok_r(cur,": \r\n" , &saveptr);
+
+		if (tok1 && tok3) {
+			if (strcmp(tok1, physMemStr) == 0)
+				physMem = atoi(tok2);
+			else if (strcmp(tok1, freeMemStr) == 0)
+				freeMem = atoi(tok2);
+			else if (strcmp(tok1, swapTotalStr) == 0)
+				swapTotal = atoi(tok2);
+			else if (strcmp(tok1, swapFreeStr) == 0)
+				swapFree = atoi(tok2);
+			else if (strcmp(tok1, activeMemStr) == 0)
+				activeMem = atoi(tok2);
+			else if (strcmp(tok1, inactiveMemStr) == 0)
+				inactiveMem = atoi(tok2);
+			else if (strcmp(tok1, cachedMemStr) == 0)
+				cachedMem = atoi(tok2);
+			else if (strcmp(tok1, buffersMemStr) == 0)
+				buffersMem = atoi(tok2);
+			else if (strcmp(tok1, sharedMemStr) == 0)
+				sharedMem = atoi(tok2);
+		}
+
+	} while(tok1 && tok2 && tok3);
+
+	//
+	// Calculate available memory:
+	//   Start with the total physical memory in the system.
+	//   Next, subtract out the total of the active and inactive VM.
+	//   Finally, add back in the cached memory and buffers, which are effectively available if & when needed.
+	//   Caution: Subtract the shared memory, which is included in the cached memory, but is not available.
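+	//   E.g. (all in KiB): 16000000 total - 6000000 active - 4000000 inactive
+	//   + 5000000 cached + 400000 buffers - 800000 shmem = 10600000 available.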
+	//
+	uint64_t availableMem = physMem - activeMem - inactiveMem + cachedMem + buffersMem - sharedMem;
+
+	if (physmem) *physmem = physMem * 1024L;
+	if (freemem) *freemem = availableMem * 1024L;
+
+	// just easier to do this kind of thing in one place
+	if (freepct) *freepct = (100L * availableMem) / physMem;
+
+	if (swapping) {
+		*swapping = false;
+#if 0
+		uint64_t swapUsedPct = ((swapTotal - swapFree)*100)/swapTotal;
+		if (swapUsedPct > 10) {
+			*swapping = true;
+			fprintf(stderr, " SWAPPING: %"PRIu64" %"PRIu64" %"PRIu64,
+					swapUsedPct, swapTotal, swapFree);
+		}
+#else
+		// Silence compiler warnings.
+		(void) swapFree;
+		(void) swapTotal;
+		(void) freeMem;
+#endif
+	}
+
+//	fprintf(stderr, "%u swapTotal %u swapFree %u swapFreePct ::: swapping %d\n",
+//			(unsigned int) swapTotal,(unsigned int)swapFree,(int)swapUsedPct,(int) *swapping);
+
+	return(0);
+}
diff --git a/cf/src/msg.c b/cf/src/msg.c
new file mode 100644
index 00000000..e68dd808
--- /dev/null
+++ b/cf/src/msg.c
@@ -0,0 +1,1205 @@
+/*
+ * msg.c
+ *
+ * Copyright (C) 2008-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+
+//==========================================================
+// Includes.
+//
+
+#include "msg.h"
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "aerospike/as_msgpack.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_byte_order.h"
+#include "citrusleaf/cf_vector.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+typedef struct msg_type_entry_s {
+	const msg_template *mt;
+	uint16_t entry_count;
+	uint32_t scratch_sz;
+} msg_type_entry;
+
+// msg field header on wire.
+typedef struct msg_field_hdr_s {
+	uint16_t id;
+	uint8_t type;
+	uint8_t content[];
+} __attribute__ ((__packed__)) msg_field_hdr;
+
+
+//==========================================================
+// Globals.
+//
+
+// Total number of "msg" objects allocated:
+cf_atomic_int g_num_msgs = 0;
+
+// Total number of "msg" objects allocated per type:
+cf_atomic_int g_num_msgs_by_type[M_TYPE_MAX] = { 0 };
+
+static msg_type_entry g_mte[M_TYPE_MAX];
+
+
+//==========================================================
+// Forward declarations.
+//
+
+static size_t msg_get_field_wire_size(msg_field_type type, size_t field_sz);
+static uint32_t msg_field_stamp(const msg_field *mf, msg_type mtype, uint8_t *buf);
+static void msg_field_save(msg *m, msg_field *mf);
+
+
+//==========================================================
+// Inlines.
+// + +static inline msg_field_type +mf_type(const msg_field *mf, msg_type type) +{ + return g_mte[type].mt[mf->id].type; +} + +static inline void +mf_destroy(msg_field *mf) +{ + if (mf->is_set) { + if (mf->is_free) { + cf_free(mf->u.any_buf); + mf->is_free = false; + } + + mf->is_set = false; + } +} + + +//========================================================== +// Public API - object accounting. +// + +// Call this instead of freeing msg directly, to keep track of all msgs. +void +msg_put(msg *m) +{ + cf_atomic_int_decr(&g_num_msgs); + cf_atomic_int_decr(&g_num_msgs_by_type[m->type]); + cf_rc_free(m); +} + + +//========================================================== +// Public API - lifecycle. +// + +void +msg_type_register(msg_type type, const msg_template *mt, size_t mt_sz, + size_t scratch_sz) +{ + cf_assert(type >= 0 && type < M_TYPE_MAX, CF_MSG, "invalid type %d", type); + + msg_type_entry *mte = &g_mte[type]; + uint16_t mt_count = (uint16_t)(mt_sz / sizeof(msg_template)); + + if (mte->mt) { + // This happens on the heartbeat version jump - handle gently for now. + cf_info(CF_MSG, "msg_type_register() type %d already registered", type); + return; + } + + cf_assert(mt_count != 0, CF_MSG, "msg_type_register() empty template"); + + uint16_t max_id = 0; + + for (uint16_t i = 0; i < mt_count; i++) { + if (mt[i].id >= max_id) { + max_id = mt[i].id; + } + } + + mte->entry_count = max_id + 1; + + msg_template *table = cf_calloc(mte->entry_count, sizeof(msg_template)); + + for (uint16_t i = 0; i < mt_count; i++) { + table[mt[i].id] = mt[i]; + } + + mte->mt = table; + mte->scratch_sz = (uint32_t)scratch_sz; +} + +msg * +msg_create(msg_type type) +{ + // Caller validates type is in range - this validates it's not unused. + if (! g_mte[type].mt) { + return NULL; + } + + const msg_type_entry *mte = &g_mte[type]; + uint16_t mt_count = mte->entry_count; + size_t u_sz = sizeof(msg) + (sizeof(msg_field) * mt_count); + size_t a_sz = u_sz + (size_t)mte->scratch_sz; + msg *m = cf_rc_alloc(a_sz); + + m->n_fields = mt_count; + m->bytes_used = (uint32_t)u_sz; + m->bytes_alloc = (uint32_t)a_sz; + m->just_parsed = false; + m->type = type; + + for (uint16_t i = 0; i < mt_count; i++) { + msg_field *mf = &m->f[i]; + + mf->id = i; + mf->is_set = false; + mf->is_free = false; + } + + // Keep track of allocated msgs. + cf_atomic_int_incr(&g_num_msgs); + cf_atomic_int_incr(&g_num_msgs_by_type[type]); + + return m; +} + +void +msg_destroy(msg *m) +{ + int cnt = cf_rc_release(m); + + if (cnt == 0) { + for (uint32_t i = 0; i < m->n_fields; i++) { + mf_destroy(&m->f[i]); + } + + msg_put(m); + } + else { + cf_assert(cnt > 0, CF_MSG, "msg_destroy(%p) extra call", m); + } +} + +void +msg_incr_ref(msg *m) +{ + cf_rc_reserve(m); +} + + +//========================================================== +// Public API - pack messages into flattened data. 
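+// (Wire sketch, assuming msg_hdr packs a 4-byte size and a 2-byte type, both
+// big-endian: a 6-byte header, then per set field a 2-byte BE id and 1-byte
+// type, followed by either a 4- or 8-byte BE scalar, or a 4-byte BE size
+// plus content bytes for buffer-like types.)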
+// + +size_t +msg_get_wire_size(const msg *m) +{ + size_t sz = sizeof(msg_hdr); + + for (uint16_t i = 0; i < m->n_fields; i++) { + const msg_field *mf = &m->f[i]; + + if (mf->is_set) { + sz += msg_get_field_wire_size(mf_type(mf, m->type), mf->field_sz); + } + } + + return sz; +} + +size_t +msg_get_template_fixed_sz(const msg_template *mt, size_t mt_count) +{ + size_t sz = sizeof(msg_hdr); + + for (size_t i = 0; i < mt_count; i++) { + sz += msg_get_field_wire_size(mt[i].type, 0); + } + + return sz; +} + +size_t +msg_to_wire(const msg *m, uint8_t *buf) +{ + msg_hdr *hdr = (msg_hdr *)buf; + + hdr->type = cf_swap_to_be16(m->type); + + buf += sizeof(msg_hdr); + + const uint8_t *body = buf; + + for (uint16_t i = 0; i < m->n_fields; i++) { + const msg_field *mf = &m->f[i]; + + if (mf->is_set) { + buf += msg_field_stamp(mf, m->type, buf); + } + } + + uint32_t body_sz = (uint32_t)(buf - body); + + hdr->size = cf_swap_to_be32(body_sz); + + return sizeof(msg_hdr) + body_sz; +} + + +//========================================================== +// Public API - parse flattened data into messages. +// + +int +msg_parse(msg *m, const uint8_t *buf, size_t bufsz) +{ + if (bufsz < sizeof(msg_hdr)) { + return -1; + } + + const msg_hdr *hdr = (const msg_hdr *)buf; + buf += sizeof(msg_hdr); + + uint32_t sz = cf_swap_from_be32(hdr->size); + uint16_t type = cf_swap_from_be16(hdr->type); + + if (bufsz < sz + sizeof(msg_hdr)) { + return -2; + } + + if (m->type != type) { + cf_ticker_warning(CF_MSG, "parsed type %d for msg type %d", type, m->type); + return -3; + } + + const uint8_t *eob = buf + sz; + size_t left = sz; + + while (left != 0) { + if (left < sizeof(msg_field_hdr) + sizeof(uint32_t)) { + return -4; + } + + const msg_field_hdr *fhdr = (const msg_field_hdr *)buf; + buf += sizeof(msg_field_hdr); + + uint32_t id = (uint32_t)cf_swap_from_be16(fhdr->id); + msg_field_type ft = (msg_field_type)fhdr->type; + size_t fsz; + uint32_t size = 0; + + switch (ft) { + case M_FT_UINT32: + fsz = sizeof(uint32_t); + break; + case M_FT_UINT64: + fsz = sizeof(uint64_t); + break; + default: + size = sizeof(uint32_t); + fsz = cf_swap_from_be32(*(const uint32_t *)buf); + buf += sizeof(uint32_t); + break; + } + + if (left < sizeof(msg_field_hdr) + size + fsz) { + return -5; + } + + msg_field *mf; + + if (id >= m->n_fields) { + mf = NULL; + } + else { + mf = &m->f[id]; + } + + if (mf && ft != mf_type(mf, m->type)) { + cf_ticker_warning(CF_MSG, "msg type %d: parsed type %d for field type %d", m->type, ft, mf_type(mf, m->type)); + mf = NULL; + } + + if (mf) { + mf->is_set = true; + + switch (mf_type(mf, m->type)) { + case M_FT_UINT32: + mf->u.ui32 = cf_swap_from_be32(*(uint32_t *)buf); + break; + case M_FT_UINT64: + mf->u.ui64 = cf_swap_from_be64(*(uint64_t *)buf); + break; + case M_FT_STR: + case M_FT_BUF: + case M_FT_ARRAY_UINT32: + case M_FT_ARRAY_UINT64: + case M_FT_ARRAY_STR: + case M_FT_ARRAY_BUF: + case M_FT_MSGPACK: + mf->field_sz = (uint32_t)fsz; + mf->u.any_buf = (void *)buf; + mf->is_free = false; + break; + default: + cf_ticker_detail(CF_MSG, "msg_parse: field type %d not supported - skipping", mf_type(mf, m->type)); + mf->is_set = false; + break; + } + } + + if (eob < buf) { + break; + } + + buf += fsz; + left = (size_t)(eob - buf); + } + + m->just_parsed = true; + + return 0; +} + +int +msg_get_initial(uint32_t *size_r, msg_type *type_r, const uint8_t *buf, + uint32_t bufsz) +{ + if (bufsz < sizeof(msg_hdr)) { + return -1; + } + + const msg_hdr *hdr = (const msg_hdr *)buf; + + *size_r = cf_swap_from_be32(hdr->size) + 
(uint32_t)sizeof(msg_hdr); + *type_r = (msg_type)cf_swap_from_be16(hdr->type); + + return 0; +} + +void +msg_reset(msg *m) +{ + m->bytes_used = (uint32_t)((m->n_fields * sizeof(msg_field)) + sizeof(msg)); + m->just_parsed = false; + + for (uint16_t i = 0; i < m->n_fields; i++) { + mf_destroy(&m->f[i]); + } +} + +void +msg_preserve_fields(msg *m, uint32_t n_field_ids, ...) +{ + bool reflect[m->n_fields]; + + for (uint16_t i = 0; i < m->n_fields; i++) { + reflect[i] = false; + } + + va_list argp; + va_start(argp, n_field_ids); + + for (uint32_t n = 0; n < n_field_ids; n++) { + reflect[va_arg(argp, int)] = true; + } + + va_end(argp); + + for (uint32_t i = 0; i < m->n_fields; i++) { + msg_field *mf = &m->f[i]; + + if (mf->is_set) { + if (reflect[i]) { + if (m->just_parsed) { + msg_field_save(m, mf); + } + } + else { + mf->is_set = false; + } + } + } + + m->just_parsed = false; +} + +void +msg_preserve_all_fields(msg *m) +{ + if (! m->just_parsed) { + return; + } + + for (uint32_t i = 0; i < m->n_fields; i++) { + msg_field *mf = &m->f[i]; + + if (mf->is_set) { + msg_field_save(m, mf); + } + } + + m->just_parsed = false; +} + + +//========================================================== +// Public API - set fields in messages. +// + +int +msg_set_uint32(msg *m, int field_id, uint32_t v) +{ + m->f[field_id].is_set = true; + m->f[field_id].u.ui32 = v; + + return 0; +} + +int +msg_set_uint64(msg *m, int field_id, uint64_t v) +{ + m->f[field_id].is_set = true; + m->f[field_id].u.ui64 = v; + + return 0; +} + +int +msg_set_str(msg *m, int field_id, const char *v, msg_set_type type) +{ + msg_field *mf = &m->f[field_id]; + + mf_destroy(mf); + + mf->field_sz = (uint32_t)strlen(v) + 1; + + if (type == MSG_SET_COPY) { + uint32_t fsz = mf->field_sz; + + if (m->bytes_alloc - m->bytes_used >= fsz) { + mf->u.str = (char *)m + m->bytes_used; + m->bytes_used += fsz; + mf->is_free = false; + memcpy(mf->u.str, v, fsz); + } + else { + mf->u.str = cf_strdup(v); + mf->is_free = true; + } + } + else if (type == MSG_SET_HANDOFF_MALLOC) { + mf->u.str = (char *)v; + mf->is_free = (v != NULL); + + if (! v) { + cf_warning(CF_MSG, "handoff malloc with null pointer"); + } + } + + mf->is_set = true; + + return 0; +} + +int +msg_set_buf(msg *m, int field_id, const uint8_t *v, size_t sz, + msg_set_type type) +{ + msg_field *mf = &m->f[field_id]; + + mf_destroy(mf); + + mf->field_sz = (uint32_t)sz; + + if (type == MSG_SET_COPY) { + if (m->bytes_alloc - m->bytes_used >= sz) { + mf->u.buf = (uint8_t *)m + m->bytes_used; + m->bytes_used += (uint32_t)sz; + mf->is_free = false; + } + else { + mf->u.buf = cf_malloc(sz); + mf->is_free = true; + } + + memcpy(mf->u.buf, v, sz); + + } + else if (type == MSG_SET_HANDOFF_MALLOC) { + mf->u.buf = (void *)v; + mf->is_free = (v != NULL); + + if (! v) { + cf_warning(CF_MSG, "handoff malloc with null pointer"); + } + } + + mf->is_set = true; + + return 0; +} + +int +msg_set_uint32_array_size(msg *m, int field_id, uint32_t count) +{ + msg_field *mf = &m->f[field_id]; + + cf_assert(! 
mf->is_set, CF_MSG, "msg_set_uint32_array_size() field already set"); + + mf->field_sz = (uint32_t)(count * sizeof(uint32_t)); + mf->u.ui32_a = cf_malloc(mf->field_sz); + mf->is_set = true; + mf->is_free = true; + + return 0; +} + +int +msg_set_uint32_array(msg *m, int field_id, uint32_t idx, uint32_t v) +{ + msg_field *mf = &m->f[field_id]; + + cf_assert(mf->is_set, CF_MSG, "msg_set_uint32_array() field not set"); + cf_assert(idx < (mf->field_sz >> 2), CF_MSG, "msg_set_uint32_array() idx out of bounds"); + + mf->u.ui32_a[idx] = cf_swap_to_be32(v); + + return 0; +} + +int +msg_set_uint64_array_size(msg *m, int field_id, uint32_t count) +{ + msg_field *mf = &m->f[field_id]; + + cf_assert(! mf->is_set, CF_MSG, "msg_set_uint64_array_size() field already set"); + + mf->field_sz = (uint32_t)(count * sizeof(uint64_t)); + mf->u.ui64_a = cf_malloc(mf->field_sz); + mf->is_set = true; + mf->is_free = true; + + return 0; +} + +int +msg_set_uint64_array(msg *m, int field_id, uint32_t idx, uint64_t v) +{ + msg_field *mf = &m->f[field_id]; + + cf_assert(mf->is_set, CF_MSG, "msg_set_uint64_array() field not set"); + cf_assert(idx < (mf->field_sz >> 3), CF_MSG, "msg_set_uint64_array() idx out of bounds"); + + mf->u.ui64_a[idx] = cf_swap_to_be64(v); + + return 0; +} + +void +msg_msgpack_list_set_uint32(msg *m, int field_id, const uint32_t *buf, + uint32_t count) +{ + msg_field *mf = &m->f[field_id]; + uint32_t a_sz = as_pack_list_header_get_size(count); + + mf_destroy(mf); + + for (uint32_t i = 0; i < count; i++) { + a_sz += as_pack_uint64_size((uint64_t)buf[i]); + } + + mf->field_sz = a_sz; + mf->u.any_buf = cf_malloc(a_sz); + + as_packer pk = { + .buffer = mf->u.any_buf, + .offset = 0, + .capacity = (int)a_sz, + }; + + int e = as_pack_list_header(&pk, count); + + cf_assert(e == 0, CF_MSG, "as_pack_list_header failed"); + + for (uint32_t i = 0; i < count; i++) { + e = as_pack_uint64(&pk, (uint64_t)buf[i]); + cf_assert(e == 0, CF_MSG, "as_pack_str failed"); + } + + mf->is_free = true; + mf->is_set = true; +} + +void +msg_msgpack_list_set_buf(msg *m, int field_id, const cf_vector *v) +{ + msg_field *mf = &m->f[field_id]; + uint32_t count = cf_vector_size(v); + uint32_t a_sz = as_pack_list_header_get_size(count); + + mf_destroy(mf); + + for (uint32_t i = 0; i < count; i++) { + const msg_buf_ele *ele = cf_vector_getp((cf_vector *)v, i); + + if (! ele->ptr) { + a_sz++; // TODO - add to common later + } + else { + a_sz += as_pack_str_size(ele->sz); + } + } + + mf->field_sz = a_sz; + mf->u.any_buf = cf_malloc(a_sz); + + as_packer pk = { + .buffer = mf->u.any_buf, + .offset = 0, + .capacity = (int)a_sz, + }; + + int e = as_pack_list_header(&pk, count); + + cf_assert(e == 0, CF_MSG, "as_pack_list_header failed"); + + for (uint32_t i = 0; i < count; i++) { + const msg_buf_ele *ele = cf_vector_getp((cf_vector *)v, i); + + if (! ele->ptr) { + pk.buffer[pk.offset++] = 0xc0; // TODO - add to common later + } + else { + e = as_pack_str(&pk, ele->ptr, ele->sz); + cf_assert(e == 0, CF_MSG, "as_pack_str failed"); + } + } + + mf->is_free = true; + mf->is_set = true; +} + + +//========================================================== +// Public API - get fields from messages. 
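+// (Usage sketch - MY_FIELD_ID is a placeholder: the getters return 0 on
+// success and -1 if the field is not set, e.g.
+//
+//     uint32_t val;
+//
+//     if (msg_get_uint32(m, MY_FIELD_ID, &val) != 0) {
+//         // field absent - fall back to a default
+//     }
+// )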
+// + +msg_field_type +msg_field_get_type(const msg *m, int field_id) +{ + return mf_type(&m->f[field_id], m->type); +} + +bool +msg_is_set(const msg *m, int field_id) +{ + cf_assert(field_id >= 0 && field_id < (int)m->n_fields, CF_MSG, "invalid field_id %d", field_id); + + return m->f[field_id].is_set; +} + +int +msg_get_uint32(const msg *m, int field_id, uint32_t *val_r) +{ + if (! m->f[field_id].is_set) { + return -1; + } + + *val_r = m->f[field_id].u.ui32; + + return 0; +} + +int +msg_get_uint64(const msg *m, int field_id, uint64_t *val_r) +{ + if (! m->f[field_id].is_set) { + return -1; + } + + *val_r = m->f[field_id].u.ui64; + + return 0; +} + +int +msg_get_str(const msg *m, int field_id, char **str_r, size_t *sz_r, + msg_get_type type) +{ + if (! m->f[field_id].is_set) { + return -1; + } + + if (type == MSG_GET_DIRECT) { + *str_r = m->f[field_id].u.str; + } + else if (type == MSG_GET_COPY_MALLOC) { + *str_r = cf_strdup(m->f[field_id].u.str); + } + else { + cf_crash(CF_MSG, "msg_get_str: illegal msg_get_type"); + } + + if (sz_r) { + *sz_r = m->f[field_id].field_sz; + } + + return 0; +} + +int +msg_get_buf(const msg *m, int field_id, uint8_t **buf_r, size_t *sz_r, + msg_get_type type) +{ + if (! m->f[field_id].is_set) { + return -1; + } + + if (type == MSG_GET_DIRECT) { + *buf_r = m->f[field_id].u.buf; + } + else if (type == MSG_GET_COPY_MALLOC) { + *buf_r = cf_malloc(m->f[field_id].field_sz); + memcpy(*buf_r, m->f[field_id].u.buf, m->f[field_id].field_sz); + } + else { + cf_crash(CF_MSG, "msg_get_buf: illegal msg_get_type"); + } + + if (sz_r) { + *sz_r = m->f[field_id].field_sz; + } + + return 0; +} + +int +msg_get_uint32_array(const msg *m, int field_id, uint32_t index, + uint32_t *val_r) +{ + const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return -1; + } + + *val_r = cf_swap_from_be32(mf->u.ui32_a[index]); + + return 0; +} + +int +msg_get_uint64_array_count(const msg *m, int field_id, uint32_t *count_r) +{ + const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return -1; + } + + *count_r = mf->field_sz >> 3; + + return 0; +} + +int +msg_get_uint64_array(const msg *m, int field_id, uint32_t index, + uint64_t *val_r) +{ + const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return -1; + } + + *val_r = cf_swap_from_be64(mf->u.ui64_a[index]); + + return 0; +} + +bool +msg_msgpack_container_get_count(const msg *m, int field_id, uint32_t *count_r) +{ + const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return false; + } + + as_unpacker pk = { + .buffer = (const uint8_t *)mf->u.any_buf, + .offset = 0, + .length = (int)mf->field_sz + }; + + as_val_t type = as_unpack_peek_type(&pk); + int64_t count; + + switch (type) { + case AS_LIST: + count = as_unpack_list_header_element_count(&pk); + break; + case AS_MAP: + count = as_unpack_map_header_element_count(&pk); + break; + default: + cf_ticker_warning(CF_MSG, "type %d not a packed container", type); + return false; + } + + if (count < 0) { + cf_ticker_warning(CF_MSG, "invalid packed container type %d", type); + return false; + } + + *count_r = (uint32_t)count; + + return true; +} + +bool +msg_msgpack_list_get_uint32_array(const msg *m, int field_id, uint32_t *buf_r, + uint32_t *count_r) +{ + cf_assert(buf_r, CF_MSG, "buf_r is null"); + + const msg_field *mf = &m->f[field_id]; + + if (! 
mf->is_set) { + return false; + } + + as_unpacker pk = { + .buffer = (const uint8_t *)mf->u.any_buf, + .offset = 0, + .length = (int)mf->field_sz + }; + + as_val_t type = as_unpack_peek_type(&pk); + int64_t count; + + switch (type) { + case AS_LIST: + count = as_unpack_list_header_element_count(&pk); + break; + default: + cf_ticker_warning(CF_MSG, "msg_msgpack_array_get_uint32_array() type %d but expected list", type); + return false; + } + + if (count < 0) { + cf_ticker_warning(CF_MSG, "invalid packed list type %d", type); + return false; + } + + if (*count_r < (uint32_t)count) { + cf_warning(CF_MSG, "count_r %u < %ld too small", *count_r, count); + return false; + } + + for (int64_t i = 0; i < count; i++) { + uint64_t val; + int ret = as_unpack_uint64(&pk, &val); + + if (ret != 0 || (val & (0xFFFFffffUL << 32)) != 0) { + cf_warning(CF_MSG, "i %ld/%ld invalid packed uint32 ret %d val 0x%lx", i, count, ret, val); + return false; + } + + buf_r[i] = (uint32_t)val; + } + + *count_r = (uint32_t)count; + + return true; +} + +bool +msg_msgpack_list_get_buf_array(const msg *m, int field_id, cf_vector *v_r, + bool init_vec) +{ + const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return false; + } + + as_unpacker pk = { + .buffer = (const uint8_t *)mf->u.any_buf, + .offset = 0, + .length = (int)mf->field_sz + }; + + as_val_t type = as_unpack_peek_type(&pk); + int64_t count; + + switch (type) { + case AS_LIST: + count = as_unpack_list_header_element_count(&pk); + break; + default: + cf_ticker_warning(CF_MSG, "msg_msgpack_array_get_buf_vec_with_init() type %d but expected list", type); + return false; + } + + if (count < 0) { + cf_ticker_warning(CF_MSG, "invalid packed list type %d", type); + return false; + } + + if (init_vec) { + if (cf_vector_init(v_r, sizeof(msg_buf_ele), (uint32_t)count, 0) != 0) { + cf_warning(CF_MSG, "vector malloc failed - count %ld", count); + return false; + } + } + else if ((uint32_t)count > v_r->capacity) { // TODO - wrap to avoid access of private members? + cf_warning(CF_MSG, "count %ld > vector cap %u", count, v_r->capacity); + return false; + } + + for (int64_t i = 0; i < count; i++) { + msg_buf_ele ele; + int saved_offset = pk.offset; + + ele.ptr = (uint8_t *)as_unpack_str(&pk, &ele.sz); + + if (! ele.ptr) { + pk.offset = saved_offset; + ele.sz = 0; + + if (as_unpack_size(&pk) <= 0) { + if (init_vec) { + cf_vector_destroy(v_r); + } + + cf_warning(CF_MSG, "i %ld/%ld invalid msgpack element with type %d", i, count, type); + + return false; + } + } + + cf_vector_append(v_r, &ele); + } + + return true; +} + + +//========================================================== +// Public API - debugging only. +// + +void +msg_dump(const msg *m, const char *info) +{ + cf_info(CF_MSG, "msg_dump: %s: msg %p rc %d n-fields %u bytes-used %u bytes-alloc'd %u type %d", + info, m, (int)cf_rc_count((void*)m), m->n_fields, m->bytes_used, + m->bytes_alloc, m->type); + + for (uint32_t i = 0; i < m->n_fields; i++) { + const msg_field *mf = &m->f[i]; + + cf_info(CF_MSG, "mf %02u: id %u is-set %d", i, mf->id, mf->is_set); + + if (mf->is_set) { + switch (mf_type(mf, m->type)) { + case M_FT_UINT32: + cf_info(CF_MSG, " type UINT32 value %u", mf->u.ui32); + break; + case M_FT_UINT64: + cf_info(CF_MSG, " type UINT64 value %lu", mf->u.ui64); + break; + case M_FT_STR: + cf_info(CF_MSG, " type STR sz %u free %c value %s", + mf->field_sz, mf->is_free ? 
't' : 'f', mf->u.str); + break; + case M_FT_BUF: + cf_info_binary(CF_MSG, mf->u.buf, mf->field_sz, + CF_DISPLAY_HEX_COLUMNS, + " type BUF sz %u free %c value ", + mf->field_sz, mf->is_free ? 't' : 'f'); + break; + case M_FT_ARRAY_UINT32: + cf_info(CF_MSG, " type ARRAY_UINT32: count %u n-uint32 %u free %c", + mf->field_sz, mf->field_sz >> 2, + mf->is_free ? 't' : 'f'); + { + uint32_t n_ints = mf->field_sz >> 2; + for (uint32_t j = 0; j < n_ints; j++) { + cf_info(CF_MSG, " idx %u value %u", + j, ntohl(mf->u.ui32_a[j])); + } + } + break; + case M_FT_ARRAY_UINT64: + cf_info(CF_MSG, " type ARRAY_UINT64: count %u n-uint64 %u free %c", + mf->field_sz, mf->field_sz >> 3, + mf->is_free ? 't' : 'f'); + { + uint32_t n_ints = mf->field_sz >> 3; + for (uint32_t j = 0; j < n_ints; j++) { + cf_info(CF_MSG, " idx %u value %lu", + j, __bswap_64(mf->u.ui64_a[j])); + } + } + break; + default: + cf_info(CF_MSG, " type %d unknown", mf_type(mf, m->type)); + break; + } + } + } +} + + +//========================================================== +// Local helpers. +// + +static size_t +msg_get_field_wire_size(msg_field_type type, size_t field_sz) +{ + switch (type) { + case M_FT_UINT32: + return sizeof(msg_field_hdr) + sizeof(uint32_t); + case M_FT_UINT64: + return sizeof(msg_field_hdr) + sizeof(uint64_t); + case M_FT_STR: + case M_FT_BUF: + case M_FT_ARRAY_UINT32: + case M_FT_ARRAY_UINT64: + case M_FT_ARRAY_STR: + case M_FT_ARRAY_BUF: + case M_FT_MSGPACK: + break; + default: + cf_crash(CF_MSG, "unexpected field type %d", type); + break; + } + + return sizeof(msg_field_hdr) + sizeof(uint32_t) + field_sz; +} + +// Returns the number of bytes written. +static uint32_t +msg_field_stamp(const msg_field *mf, msg_type mtype, uint8_t *buf) +{ + msg_field_hdr *hdr = (msg_field_hdr *)buf; + msg_field_type type = mf_type(mf, mtype); + + buf += sizeof(msg_field_hdr); + + hdr->id = cf_swap_to_be16((uint16_t)mf->id); + hdr->type = (uint8_t)type; + + switch (type) { + case M_FT_UINT32: + *(uint32_t *)buf = cf_swap_to_be32(mf->u.ui32); + return sizeof(msg_field_hdr) + sizeof(uint32_t); + case M_FT_UINT64: + *(uint64_t *)buf = cf_swap_to_be64(mf->u.ui64); + return sizeof(msg_field_hdr) + sizeof(uint64_t); + default: + break; + } + + uint32_t fsz; + uint32_t *p_fsz = (uint32_t *)buf; + + buf += sizeof(uint32_t); + + switch (type) { + case M_FT_STR: + case M_FT_BUF: + case M_FT_ARRAY_UINT32: + case M_FT_ARRAY_UINT64: + case M_FT_ARRAY_STR: + case M_FT_ARRAY_BUF: + case M_FT_MSGPACK: + fsz = mf->field_sz; + memcpy(buf, mf->u.any_buf, fsz); + break; + default: + cf_crash(CF_MSG, "unexpected field type %d", type); + return 0; + } + + *p_fsz = cf_swap_to_be32(fsz); + + return (uint32_t)(sizeof(msg_field_hdr) + sizeof(uint32_t) + fsz); +} + +static void +msg_field_save(msg *m, msg_field *mf) +{ + switch (mf_type(mf, m->type)) { + case M_FT_UINT32: + case M_FT_UINT64: + break; + case M_FT_STR: + case M_FT_BUF: + case M_FT_ARRAY_UINT32: + case M_FT_ARRAY_UINT64: + case M_FT_ARRAY_STR: + case M_FT_ARRAY_BUF: + case M_FT_MSGPACK: + // Should only preserve received messages where buffer pointers point + // directly into a fabric buffer. + cf_assert(! 
mf->is_free, CF_MSG, "invalid msg preserve");
+
+		if (m->bytes_alloc - m->bytes_used >= mf->field_sz) {
+			void *buf = ((uint8_t *)m) + m->bytes_used;
+
+			memcpy(buf, mf->u.any_buf, mf->field_sz);
+			mf->u.any_buf = buf;
+			m->bytes_used += mf->field_sz;
+			mf->is_free = false;
+		}
+		else {
+			void *buf = cf_malloc(mf->field_sz);
+
+			memcpy(buf, mf->u.any_buf, mf->field_sz);
+			mf->u.any_buf = buf;
+			mf->is_free = true;
+		}
+		break;
+	default:
+		break;
+	}
+}
diff --git a/cf/src/node.c b/cf/src/node.c
new file mode 100644
index 00000000..5dedc489
--- /dev/null
+++ b/cf/src/node.c
@@ -0,0 +1,67 @@
+/*
+ * node.c
+ *
+ * Copyright (C) 2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "node.h"
+
+#include <errno.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include "citrusleaf/alloc.h"
+
+#include "fault.h"
+
+
+uint32_t
+cf_nodeid_shash_fn(const void *key)
+{
+	cf_node id = *(const cf_node *)key;
+
+	return (uint32_t)(id >> 32) | (uint32_t)id;
+}
+
+uint32_t
+cf_nodeid_rchash_fn(const void *key, uint32_t key_size)
+{
+	(void)key_size;
+
+	return cf_nodeid_shash_fn(key);
+}
+
+char *
+cf_node_name()
+{
+	char buffer[1024];
+	int res = gethostname(buffer, sizeof(buffer));
+
+	if (res == (int)sizeof(buffer) || (res < 0 && errno == ENAMETOOLONG)) {
+		cf_crash(CF_MISC, "host name too long");
+	}
+
+	if (res < 0) {
+		cf_warning(CF_MISC, "error while determining host name: %d (%s)",
+				errno, cf_strerror(errno));
+		buffer[0] = 0;
+	}
+
+	return cf_strdup(buffer);
+}
diff --git a/cf/src/olock.c b/cf/src/olock.c
new file mode 100644
index 00000000..65ea3282
--- /dev/null
+++ b/cf/src/olock.c
@@ -0,0 +1,114 @@
+/*
+ * olock.c
+ *
+ * Copyright (C) 2008-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * The object lock system gives a list
+ *
+ */
+
+#include "olock.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <citrusleaf/alloc.h>
+#include <citrusleaf/cf_digest.h>
+
+#include <cf_mutex.h>
+
+
+// This ruins the notion that olocks are a generic class, but...
+// (Perhaps better in index.c or record.c, if we ever make a record.h?)
+olock *g_record_locks;
+
+
+// An interesting detail: since the digest is also used to choose among
+// servers, the olock hash must use different digest bits than the
+// server-selection hash.
+
+//
+// ASSUMES d is DIGEST and ol is OLOCK *
+//
+
+#define OLOCK_HASH(__ol, __d) ( ( (__d->digest[2] << 8) | (__d->digest[3]) ) & __ol->mask )
+
+void
+olock_lock(olock *ol, cf_digest *d)
+{
+	uint32_t n = OLOCK_HASH(ol, d);
+
+	cf_mutex_lock(&ol->locks[n]);
+}
+
+void
+olock_vlock(olock *ol, cf_digest *d, cf_mutex **vlock)
+{
+	uint32_t n = OLOCK_HASH(ol, d);
+
+	*vlock = &ol->locks[n];
+
+	cf_mutex_lock(*vlock);
+}
+
+void
+olock_unlock(olock *ol, cf_digest *d)
+{
+	uint32_t n = OLOCK_HASH(ol, d);
+
+	cf_mutex_unlock(&ol->locks[n]);
+}
+
+olock *
+olock_create(uint32_t n_locks, bool mutex)
+{
+	uint32_t mask = n_locks - 1;
+
+	// Validate before allocating, so a bad count doesn't leak the allocation.
+	if ((mask & n_locks) != 0) {
+		fprintf(stderr, "olock: number of locks must be a power of 2\n");
+		return NULL;
+	}
+
+	olock *ol = cf_malloc(sizeof(olock) + (sizeof(cf_mutex) * n_locks));
+
+	ol->n_locks = n_locks;
+	ol->mask = mask;
+
+	if (mutex) {
+		memset(ol->locks, 0, sizeof(cf_mutex) * n_locks);
+	}
+	else {
+		fprintf(stderr, "olock: TODO - add reader-writer locks\n");
+	}
+
+	return ol;
+}
+
+void
+olock_destroy(olock *ol)
+{
+	for (uint32_t i = 0; i < ol->n_locks; i++) {
+		cf_mutex_destroy(&ol->locks[i]);
+	}
+
+	cf_free(ol);
+}
diff --git a/cf/src/shash.c b/cf/src/shash.c
new file mode 100644
index 00000000..df5cbf62
--- /dev/null
+++ b/cf/src/shash.c
@@ -0,0 +1,712 @@
+/*
+ * shash.c
+ *
+ * Copyright (C) 2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "shash.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_hash_math.h"
+
+#include "fault.h"
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+// TODO - in_use is wasteful, especially when not first in bucket.
+typedef struct cf_shash_ele_s {
+	struct cf_shash_ele_s *next;
+	bool in_use;
+	uint8_t data[];
+} cf_shash_ele;
+
+
+//==========================================================
+// Forward declarations.
+// + +static inline void cf_shash_clear_table(cf_shash *h); +static inline void cf_shash_destroy_elements(cf_shash *h); +static inline uint32_t cf_shash_calculate_hash(cf_shash *h, const void *key); +static inline pthread_mutex_t *cf_shash_lock(cf_shash *h, uint32_t i); +static inline void cf_shash_unlock(pthread_mutex_t *l); +static inline cf_shash_ele *cf_shash_get_bucket(cf_shash *h, uint32_t i); +static inline void cf_shash_fill_element(cf_shash_ele *e, cf_shash *h, const void *key, const void *value); +static inline void cf_shash_size_incr(cf_shash *h); +static inline void cf_shash_size_decr(cf_shash *h); +int cf_shash_delete_or_pop(cf_shash *h, const void *key, void *value); + + +//========================================================== +// Inlines & macros. +// + +#define ELE_KEY(_h, _e) ((void *)_e->data) +#define ELE_VALUE(_h, _e) ((void *)(_e->data + _h->key_size)) + + +//========================================================== +// Public API - useful hash functions. +// + +// Interpret first 4 bytes of key as (host-ordered) uint32_t. (Note - caller +// is responsible for ensuring key size is at least 4 bytes.) +uint32_t +cf_shash_fn_u32(const void *key) +{ + return *(const uint32_t *)key; +} + +// Useful if key is a pointer. +uint32_t +cf_shash_fn_ptr(const void *key) +{ + return cf_hash_ptr32(key); +} + +// Useful if key is a null-terminated string. (Note - using fixed-size keys, so +// key must still be padded to correctly compare keys in a bucket.) +uint32_t +cf_shash_fn_zstr(const void *key) +{ + return cf_hash_fnv32((const uint8_t *)key, strlen(key)); +} + + +//========================================================== +// Public API. +// + +cf_shash * +cf_shash_create(cf_shash_hash_fn h_fn, uint32_t key_size, uint32_t value_size, + uint32_t n_buckets, uint32_t flags) +{ + cf_assert(h_fn && key_size != 0 && n_buckets != 0, CF_MISC, "bad param"); + // Note - value_size 0 works, and is used. + + cf_shash *h = cf_malloc(sizeof(cf_shash)); + + h->h_fn = h_fn; + h->key_size = key_size; + h->value_size = value_size; + h->ele_size = sizeof(cf_shash_ele) + key_size + value_size; + h->n_buckets = n_buckets; + h->flags = flags; + h->n_elements = 0; + + // Can't have both lock options, but can opt for no locks at all. + cf_assert((flags & CF_SHASH_BIG_LOCK) == 0 || + (flags & CF_SHASH_MANY_LOCK) == 0, CF_MISC, "bad flags param"); + + h->table = (cf_shash_ele *)cf_malloc(n_buckets * h->ele_size); + + cf_shash_clear_table(h); + + if ((flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_init(&h->big_lock, NULL); + } + else if ((flags & CF_SHASH_MANY_LOCK) != 0) { + h->bucket_locks = cf_malloc(sizeof(pthread_mutex_t) * n_buckets); + + for (uint32_t i = 0; i < n_buckets; i++) { + pthread_mutex_init(&h->bucket_locks[i], NULL); + } + } + + return h; +} + +void +cf_shash_destroy(cf_shash *h) +{ + if (! h) { + return; + } + + cf_shash_destroy_elements(h); + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_destroy(&h->big_lock); + } + else if ((h->flags & CF_SHASH_MANY_LOCK) != 0) { + for (uint32_t i = 0; i < h->n_buckets; i++) { + pthread_mutex_destroy(&h->bucket_locks[i]); + } + + cf_free(h->bucket_locks); + } + + cf_free(h->table); + cf_free(h); +} + +uint32_t +cf_shash_get_size(cf_shash *h) +{ + cf_assert(h, CF_MISC, "bad param"); + + // For now, not bothering with different methods per lock mode. 
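+	// n_elements is maintained with atomic ops, so the count can be read
+	// without taking any bucket lock - though the value may be momentarily
+	// stale while concurrent writers are active.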
+ return cf_atomic32_get(h->n_elements); +} + +void +cf_shash_put(cf_shash *h, const void *key, const void *value) +{ + cf_assert(h && key && value, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + // Most common case should be insert into empty bucket. + if (! e->in_use) { + cf_shash_fill_element(e, h, key, value); + cf_shash_unlock(l); + return; + } + + cf_shash_ele *e_head = e; + + while (e) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + // Replace the previous value with the new value. + memcpy(ELE_VALUE(h, e), value, h->value_size); + cf_shash_unlock(l); + return; + } + + e = e->next; + } + + e = (cf_shash_ele *)cf_malloc(h->ele_size); + + cf_shash_fill_element(e, h, key, value); + + // Insert just after head. + e->next = e_head->next; + e_head->next = e; + + cf_shash_unlock(l); +} + +int +cf_shash_put_unique(cf_shash *h, const void *key, const void *value) +{ + cf_assert(h && key && value, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + // Most common case should be insert into empty bucket. + if (! e->in_use) { + cf_shash_fill_element(e, h, key, value); + cf_shash_unlock(l); + return CF_SHASH_OK; + } + + cf_shash_ele *e_head = e; + + while (e) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + cf_shash_unlock(l); + return CF_SHASH_ERR_FOUND; + } + + e = e->next; + } + + e = (cf_shash_ele *)cf_malloc(h->ele_size); + + cf_shash_fill_element(e, h, key, value); + + // Insert just after head. + e->next = e_head->next; + e_head->next = e; + + cf_shash_unlock(l); + + return CF_SHASH_OK; +} + +// FIXME - replace with cf_shash_put_unique_or_get_vlock()? +void +cf_shash_update(cf_shash *h, const void *key, void *value_old, void *value_new, + cf_shash_update_fn update_fn, void *udata) +{ + cf_assert(h && key && update_fn, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + // Insert new value into empty bucket. + if (! e->in_use) { + (update_fn)(key, NULL, value_new, udata); + cf_shash_fill_element(e, h, key, value_new); + cf_shash_unlock(l); + return; + } + + cf_shash_ele *e_head = e; + + while (e) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + if (value_old) { + memcpy(value_old, ELE_VALUE(h, e), h->value_size); + } + + (update_fn)(key, value_old, value_new, udata); + + memcpy(ELE_VALUE(h, e), value_new, h->value_size); + cf_shash_unlock(l); + + return; + } + + e = e->next; + } + + (update_fn)(key, NULL, value_new, udata); + + e = (cf_shash_ele *)cf_malloc(h->ele_size); + + cf_shash_fill_element(e, h, key, value_new); + + // Insert just after head. 
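+	// The head element is embedded in the table and never moves, so linking
+	// the new element directly behind it is O(1) - no need to re-walk the
+	// chain for a tail.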
+ e->next = e_head->next; + e_head->next = e; + + cf_shash_unlock(l); +} + +int +cf_shash_get(cf_shash *h, const void *key, void *value) +{ + cf_assert(h && key, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + while (e && e->in_use) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + if (value) { + memcpy(value, ELE_VALUE(h, e), h->value_size); + } + + cf_shash_unlock(l); + return CF_SHASH_OK; + } + + e = e->next; + } + + cf_shash_unlock(l); + + return CF_SHASH_ERR_NOT_FOUND; +} + +int +cf_shash_get_vlock(cf_shash *h, const void *key, void **value_r, + pthread_mutex_t **vlock_r) +{ + cf_assert(h && key && value_r && vlock_r, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + while (e && e->in_use) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + *value_r = ELE_VALUE(h, e); + *vlock_r = l; + return CF_SHASH_OK; + } + + e = e->next; + } + + cf_shash_unlock(l); + + return CF_SHASH_ERR_NOT_FOUND; +} + +int +cf_shash_delete(cf_shash *h, const void *key) +{ + return cf_shash_delete_or_pop(h, key, NULL); +} + +int +cf_shash_delete_lockfree(cf_shash *h, const void *key) +{ + cf_assert(h && key, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + cf_shash_ele *e_prev = NULL; + + // Look for the element, remove and release if found. + while (e && e->in_use) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) != 0) { + e_prev = e; + e = e->next; + continue; + } + // else - found it, remove from hash, free (if needed). + + // If not at head, patch pointers and free element. + if (e_prev) { + e_prev->next = e->next; + cf_free(e); + } + // If at head with no next, empty head. + else if (! e->next) { + e->in_use = false; + } + // If at head with a next, copy next into head and free next. + else { + cf_shash_ele *free_e = e->next; + + memcpy(e, e->next, h->ele_size); + cf_free(free_e); + } + + cf_shash_size_decr(h); + + return CF_SHASH_OK; + } + + return CF_SHASH_ERR_NOT_FOUND; +} + +// TODO - Rename to cf_shash_pop()? 
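+//
+// Usage sketch (illustrative, not part of this patch): remove a key and
+// retrieve its value in a single locked pass:
+//
+//     uint32_t value;
+//
+//     if (cf_shash_get_and_delete(h, &key, &value) == CF_SHASH_OK) {
+//         // use value - the element is already unlinked from the hash
+//     }
+//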
+int +cf_shash_get_and_delete(cf_shash *h, const void *key, void *value) +{ + cf_assert(value, CF_MISC, "bad param"); + + return cf_shash_delete_or_pop(h, key, value); +} + +void +cf_shash_delete_all(cf_shash *h) +{ + cf_assert(h, CF_MISC, "bad param"); + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_lock(&h->big_lock); + } + + uint8_t *bucket = (uint8_t*)h->table; + + for (uint32_t i = 0; i < h->n_buckets; i++) { + pthread_mutex_t *bucket_lock = NULL; + + if ((h->flags & CF_SHASH_MANY_LOCK) != 0) { + bucket_lock = &h->bucket_locks[i]; + pthread_mutex_lock(bucket_lock); + } + + cf_shash_ele *e = ((cf_shash_ele *)bucket)->next; + + while (e) { + cf_shash_ele *temp = e->next; + + cf_free(e); + e = temp; + + cf_shash_size_decr(h); + } + + if (((cf_shash_ele *)bucket)->in_use) { + ((cf_shash_ele *)bucket)->in_use = false; + ((cf_shash_ele *)bucket)->next = NULL; + + cf_shash_size_decr(h); + } + + if (bucket_lock) { + pthread_mutex_unlock(bucket_lock); + } + + bucket += h->ele_size; + } + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_unlock(&h->big_lock); + } +} + +int +cf_shash_reduce(cf_shash *h, cf_shash_reduce_fn reduce_fn, void *udata) +{ + cf_assert(h && reduce_fn, CF_MISC, "bad param"); + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_lock(&h->big_lock); + } + + uint8_t *bucket = (uint8_t*)h->table; + + for (uint32_t i = 0; i < h->n_buckets; i++) { + pthread_mutex_t *bucket_lock = NULL; + + if ((h->flags & CF_SHASH_MANY_LOCK) != 0) { + bucket_lock = &h->bucket_locks[i]; + pthread_mutex_lock(bucket_lock); + } + + cf_shash_ele *e = (cf_shash_ele *)bucket; + cf_shash_ele *e_prev = NULL; + + while (e && e->in_use) { + int rv = reduce_fn(ELE_KEY(h, e), ELE_VALUE(h, e), udata); + + if (rv == CF_SHASH_OK) { + // Caller says keep going - most common case. + + e_prev = e; + e = e->next; + } + else if (rv == CF_SHASH_REDUCE_DELETE) { + // Caller says delete this element and keep going. + + // If not at head, patch pointers and free element. + if (e_prev) { + e_prev->next = e->next; + cf_free(e); + e = e_prev->next; + } + // If at head with no next, empty head. + else if (! e->next) { + e->in_use = false; + } + // If at head with a next, copy next into head and free next. + else { + cf_shash_ele *free_e = e->next; + + memcpy(e, e->next, h->ele_size); + cf_free(free_e); + } + + cf_shash_size_decr(h); + } + else { + // Caller says stop iterating. + + if (bucket_lock) { + pthread_mutex_unlock(bucket_lock); + } + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_unlock(&h->big_lock); + } + + return rv; + } + } + + if (bucket_lock) { + pthread_mutex_unlock(bucket_lock); + } + + bucket += h->ele_size; + } + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_unlock(&h->big_lock); + } + + return CF_SHASH_OK; +} + + +//========================================================== +// Local helpers. 
+// + +static inline void +cf_shash_clear_table(cf_shash *h) +{ + uint8_t *bucket = (uint8_t*)h->table; + uint8_t *end = bucket + (h->n_buckets * h->ele_size); + + while (bucket < end) { + ((cf_shash_ele *)bucket)->next = NULL; + ((cf_shash_ele *)bucket)->in_use = false; + bucket += h->ele_size; + } +} + +static inline void +cf_shash_destroy_elements(cf_shash *h) +{ + uint8_t *bucket = (uint8_t*)h->table; + uint8_t *end = bucket + (h->n_buckets * h->ele_size); + + while (bucket < end) { + cf_shash_ele *e = ((cf_shash_ele *)bucket)->next; + + while (e) { + cf_shash_ele *temp = e->next; + + cf_free(e); + e = temp; + } + + bucket += h->ele_size; + } +} + +static inline uint32_t +cf_shash_calculate_hash(cf_shash *h, const void *key) +{ + return h->h_fn(key) % h->n_buckets; +} + +static inline pthread_mutex_t * +cf_shash_lock(cf_shash *h, uint32_t i) +{ + pthread_mutex_t *l = NULL; + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + l = &h->big_lock; + } + else if ((h->flags & CF_SHASH_MANY_LOCK) != 0) { + l = &h->bucket_locks[i]; + } + + if (l) { + pthread_mutex_lock(l); + } + + return l; +} + +static inline void +cf_shash_unlock(pthread_mutex_t *l) +{ + if (l) { + pthread_mutex_unlock(l); + } +} + +static inline cf_shash_ele * +cf_shash_get_bucket(cf_shash *h, uint32_t i) +{ + return (cf_shash_ele *)((uint8_t *)h->table + (h->ele_size * i)); +} + +static inline void +cf_shash_fill_element(cf_shash_ele *e, cf_shash *h, const void *key, + const void *value) +{ + memcpy(ELE_KEY(h, e), key, h->key_size); + memcpy(ELE_VALUE(h, e), value, h->value_size); + e->in_use = true; + cf_shash_size_incr(h); +} + +static inline void +cf_shash_size_incr(cf_shash *h) +{ + // For now, not bothering with different methods per lock mode. + cf_atomic32_incr(&h->n_elements); +} + +static inline void +cf_shash_size_decr(cf_shash *h) +{ + // For now, not bothering with different methods per lock mode. + cf_atomic32_decr(&h->n_elements); +} + +int +cf_shash_delete_or_pop(cf_shash *h, const void *key, void *value) +{ + cf_assert(h && key, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + cf_shash_ele *e_prev = NULL; + + // Look for the element, remove and release if found. + while (e && e->in_use) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) != 0) { + e_prev = e; + e = e->next; + continue; + } + // else - found it, remove from hash, free (if needed) outside lock. + + // Return value. + if (value) { + memcpy(value, ELE_VALUE(h, e), h->value_size); + } + + // Save pointer to free. + cf_shash_ele *free_e = NULL; + + // If not at head, patch pointers and free element. + if (e_prev) { + e_prev->next = e->next; + free_e = e; + } + // If at head with no next, empty head. + else if (! e->next) { + e->in_use = false; + } + // If at head with a next, copy next into head and free next. + else { + free_e = e->next; + memcpy(e, e->next, h->ele_size); + } + + cf_shash_size_decr(h); + cf_shash_unlock(l); + + if (free_e) { + cf_free(free_e); + } + + return CF_SHASH_OK; + } + + cf_shash_unlock(l); + + return CF_SHASH_ERR_NOT_FOUND; +} diff --git a/cf/src/socket.c b/cf/src/socket.c new file mode 100644 index 00000000..b48564f8 --- /dev/null +++ b/cf/src/socket.c @@ -0,0 +1,2551 @@ +/* + * socket.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#define CF_SOCKET_PRIVATE +#include "socket.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fault.h" +#include "tls.h" + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_digest.h" + +void +cf_ip_addr_to_string_safe(const cf_ip_addr *addr, char *string, size_t size) +{ + if (cf_ip_addr_to_string(addr, string, size) < 0) { + cf_crash(CF_SOCKET, "String buffer overflow"); + } +} + +int32_t +cf_ip_addr_to_string_multi(const cf_ip_addr *addrs, uint32_t n_addrs, char *string, size_t size) +{ + size_t off = 0; + + for (uint32_t i = 0; i < n_addrs; ++i) { + if (i > 0) { + if (off >= size) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + string[off] = ','; + ++off; + } + + int32_t len = cf_ip_addr_to_string(&addrs[i], string + off, size - off); + + if (len < 0) { + return -1; + } + + off += len; + } + + if (off >= size) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + string[off] = 0; + return off; +} + +void +cf_ip_addr_to_string_multi_safe(const cf_ip_addr *addrs, uint32_t n_addrs, char *string, + size_t size) +{ + if (cf_ip_addr_to_string_multi(addrs, n_addrs, string, size) < 0) { + cf_crash(CF_SOCKET, "String buffer overflow"); + } +} + +int32_t +cf_ip_addr_from_string(const char *string, cf_ip_addr *addr) +{ + cf_ip_addr addrs[CF_SOCK_CFG_MAX]; + uint32_t n_addrs = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(string, addrs, &n_addrs) < 0) { + return -1; + } + + cf_ip_addr_copy(&addrs[0], addr); + return 0; +} + +void +cf_ip_addr_sort(cf_ip_addr *addrs, uint32_t n_addrs) +{ + int32_t n = n_addrs; + bool swapped; + + do { + swapped = false; + + for (int32_t i = 0; i < n - 1; ++i) { + if (cf_ip_addr_compare(&addrs[i], &addrs[i + 1]) < 0) { + cf_ip_addr tmp; + cf_ip_addr_copy(&addrs[i], &tmp); + cf_ip_addr_copy(&addrs[i + 1], &addrs[i]); + cf_ip_addr_copy(&tmp, &addrs[i + 1]); + swapped = true; + } + } + + --n; + } + while (swapped); +} + +static int32_t +validate_dns_label(const char *label) +{ + int32_t i; + + for (i = 0; label[i] != 0 && label[i] != '.'; ++i) { + bool ok = (label[i] >= '0' && label[i] <= '9') || + (label[i] >= 'a' && label[i] <= 'z') || + (label[i] >= 'A' && label[i] <= 'Z') || + label[i] == '-'; + + if (!ok) { + return -1; + } + } + + if (i == 0) { + return -1; + } + + return i; +} + +bool +cf_ip_addr_is_dns_name(const char *string) +{ + if (cf_inter_is_inter_name(string)) { + return false; + } + + if (string[0] >= '0' && string[0] <= '9') { + return false; + } + + int32_t n_labels = 0; + int32_t i = 0; + + while (string[i] != 0) { + int32_t len = validate_dns_label(string + i); + + if (len < 0) { + return false; + } + + i += len; + ++n_labels; + + if 
(string[i] == '.') { + ++i; + } + } + + return n_labels > 1; +} + +int32_t +cf_ip_port_from_string(const char *string, cf_ip_port *port) +{ + char *end; + uint64_t tmp = strtoul(string, &end, 10); + + if (*end != 0 || tmp > 65535) { + cf_warning(CF_SOCKET, "Invalid port '%s'", string); + return -1; + } + + *port = (cf_ip_port)tmp; + return 0; +} + +int32_t +cf_ip_port_to_string(cf_ip_port port, char *string, size_t size) +{ + int32_t count = snprintf(string, size, "%hu", port); + + if ((size_t)count >= size) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + return count; +} + +void +cf_ip_port_to_string_safe(cf_ip_port port, char *string, size_t size) +{ + if (cf_ip_port_to_string(port, string, size) < 0) { + cf_crash(CF_SOCKET, "String buffer overflow"); + } +} + +int32_t +cf_ip_port_from_binary(const uint8_t *binary, size_t size, cf_ip_port *port) +{ + if (size < 2) { + cf_warning(CF_SOCKET, "Input buffer underflow"); + return -1; + } + + *port = (binary[0] << 8) | binary[1]; + return 2; +} + +int32_t +cf_ip_port_to_binary(cf_ip_port port, uint8_t *binary, size_t size) +{ + if (size < 2) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + binary[0] = port >> 8; + binary[1] = port & 255; + return 2; +} + +void +cf_ip_port_from_node_id(cf_node id, cf_ip_port *port) +{ + uint8_t *buff = (uint8_t *)&id; + memcpy(port, buff + 6, 2); +} + +void +cf_sock_addr_to_string_safe(const cf_sock_addr *addr, char *string, size_t size) +{ + if (cf_sock_addr_to_string(addr, string, size) < 0) { + cf_crash(CF_SOCKET, "String buffer overflow"); + } +} + +int32_t +cf_sock_addr_from_binary(const uint8_t *binary, size_t size, cf_sock_addr *addr) +{ + int32_t total = 0; + int32_t count = cf_ip_addr_from_binary(binary, size, &addr->addr); + + if (count < 0) { + return -1; + } + + total += count; + count = cf_ip_port_from_binary(binary + total, size - total, &addr->port); + + if (count < 0) { + return -1; + } + + total += count; + return total; +} + +int32_t +cf_sock_addr_to_binary(const cf_sock_addr *addr, uint8_t *binary, size_t size) +{ + int32_t total = 0; + int32_t count = cf_ip_addr_to_binary(&addr->addr, binary, size); + + if (count < 0) { + return -1; + } + + total += count; + count = cf_ip_port_to_binary(addr->port, binary + total, size - total); + + if (count < 0) { + return -1; + } + + total += count; + return total; +} + +int32_t +cf_sock_addr_from_host_port(const char *host, cf_ip_port port, cf_sock_addr *addr) +{ + if (cf_ip_addr_from_string(host, &addr->addr) < 0) { + cf_warning(CF_SOCKET, "Invalid host address '%s'", host); + return -1; + } + + addr->port = port; + return 0; +} + +void +cf_sock_addr_from_addr_port(const cf_ip_addr *ip_addr, cf_ip_port port, cf_sock_addr *addr) +{ + cf_ip_addr_copy(ip_addr, &addr->addr); + addr->port = port; +} + +int32_t +cf_sock_addr_compare(const cf_sock_addr *lhs, const cf_sock_addr *rhs) +{ + int32_t res = cf_ip_addr_compare(&lhs->addr, &rhs->addr); + + if (res != 0) { + return res; + } + + if (lhs->port == rhs->port) { + return 0; + } + + return (int32_t)lhs->port - (int32_t)rhs->port; +} + +void +cf_sock_addr_copy(const cf_sock_addr *from, cf_sock_addr *to) +{ + cf_ip_addr_copy(&from->addr, &to->addr); + to->port = from->port; +} + +void +cf_sock_addr_set_any(cf_sock_addr *addr) +{ + cf_ip_addr_set_any(&addr->addr); + addr->port = 0; +} + +bool +cf_sock_addr_is_any(const cf_sock_addr *addr) +{ + return cf_ip_addr_is_any(&addr->addr) && addr->port == 0; +} + +void +cf_sock_cfg_init(cf_sock_cfg *cfg, cf_sock_owner 
owner)
+{
+	cfg->owner = owner;
+	cfg->port = 0;
+	cf_ip_addr_set_any(&cfg->addr);
+}
+
+void
+cf_sock_cfg_copy(const cf_sock_cfg *from, cf_sock_cfg *to)
+{
+	to->owner = from->owner;
+	to->port = from->port;
+	cf_ip_addr_copy(&from->addr, &to->addr);
+}
+
+void
+cf_serv_cfg_init(cf_serv_cfg *cfg)
+{
+	cfg->n_cfgs = 0;
+}
+
+int32_t
+cf_serv_cfg_add_sock_cfg(cf_serv_cfg *serv_cfg, const cf_sock_cfg *sock_cfg)
+{
+	if (serv_cfg->n_cfgs >= CF_SOCK_CFG_MAX) {
+		cf_warning(CF_SOCKET, "Too many socket configurations in server configuration");
+		return -1;
+	}
+
+	uint32_t n = serv_cfg->n_cfgs;
+
+	for (uint32_t i = 0; i < n; ++i) {
+		cf_sock_cfg *walker = &serv_cfg->cfgs[i];
+
+		if (walker->owner == sock_cfg->owner && walker->port == sock_cfg->port &&
+				cf_ip_addr_compare(&walker->addr, &sock_cfg->addr) == 0) {
+			return 0;
+		}
+	}
+
+	cf_sock_cfg_copy(sock_cfg, &serv_cfg->cfgs[n]);
+	serv_cfg->n_cfgs = ++n;
+	return 0;
+}
+
+void
+cf_sockets_init(cf_sockets *socks)
+{
+	socks->n_socks = 0;
+}
+
+bool
+cf_sockets_has_socket(const cf_sockets *socks, const cf_socket *sock)
+{
+	return socks != NULL && sock >= &socks->socks[0] && sock < &socks->socks[socks->n_socks];
+}
+
+void
+cf_sockets_close(cf_sockets *socks)
+{
+	for (uint32_t i = 0; i < socks->n_socks; ++i) {
+		cf_socket_close(&socks->socks[i]);
+		cf_socket_term(&socks->socks[i]);
+	}
+}
+
+static int32_t
+safe_fcntl(int32_t fd, int32_t cmd, int32_t arg)
+{
+	int32_t res = fcntl(fd, cmd, arg);
+
+	if (res < 0) {
+		cf_crash(CF_SOCKET, "fcntl(%d) failed on FD %d: %d (%s)",
+				cmd, fd, errno, cf_strerror(errno));
+	}
+
+	return res;
+}
+
+static int32_t
+safe_ioctl(int32_t fd, int32_t req, int32_t *arg)
+{
+	int32_t res = ioctl(fd, req, arg);
+
+	if (res < 0) {
+		cf_crash(CF_SOCKET, "ioctl(%d) failed on FD %d: %d (%s)",
+				req, fd, errno, cf_strerror(errno));
+	}
+
+	return res;
+}
+
+static void
+safe_setsockopt(int32_t fd, int32_t level, int32_t name, const void *val, socklen_t len)
+{
+	if (setsockopt(fd, level, name, val, len) < 0) {
+		cf_crash(CF_SOCKET, "setsockopt(%d, %d) failed on FD %d: %d (%s)",
+				level, name, fd, errno, cf_strerror(errno));
+	}
+}
+
+static void
+safe_getsockopt(int32_t fd, int32_t level, int32_t name, void *val, socklen_t *len)
+{
+	if (getsockopt(fd, level, name, val, len) < 0) {
+		cf_crash(CF_SOCKET, "getsockopt(%d, %d) failed on FD %d: %d (%s)",
+				level, name, fd, errno, cf_strerror(errno));
+	}
+}
+
+static int32_t
+safe_wait(int32_t efd, struct epoll_event *events, int32_t max, int32_t timeout)
+{
+	while (true) {
+		cf_debug(CF_SOCKET, "Waiting on epoll FD %d", efd);
+		int32_t count = epoll_wait(efd, events, max, timeout);
+
+		if (count < 0) {
+			if (errno == EINTR) {
+				cf_debug(CF_SOCKET, "Interrupted");
+				continue;
+			}
+
+			cf_crash(CF_SOCKET, "epoll_wait() failed on epoll FD %d: %d (%s)",
+					efd, errno, cf_strerror(errno));
+		}
+
+		return count;
+	}
+}
+
+static void
+safe_close(int32_t fd)
+{
+	if (close(fd) < 0) {
+		cf_crash(CF_SOCKET, "Error while closing FD %d: %d (%s)",
+				fd, errno, cf_strerror(errno));
+	}
+}
+
+void
+cf_fd_disable_blocking(int32_t fd)
+{
+	int32_t flags = safe_fcntl(fd, F_GETFL, 0);
+	safe_fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+}
+
+void
+cf_socket_disable_blocking(cf_socket *sock)
+{
+	cf_fd_disable_blocking(sock->fd);
+}
+
+void
+cf_socket_enable_blocking(cf_socket *sock)
+{
+	int32_t flags = safe_fcntl(sock->fd, F_GETFL, 0);
+	safe_fcntl(sock->fd, F_SETFL, flags & ~O_NONBLOCK);
+}
+
+void
+cf_socket_disable_nagle(cf_socket *sock)
+{
+	static const int32_t flag = 1;
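+	// TCP_NODELAY = 1 switches off Nagle's algorithm, so small writes go out
+	// immediately instead of being coalesced while waiting for ACKs.
+	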
safe_setsockopt(sock->fd, SOL_TCP, TCP_NODELAY, &flag, sizeof(flag)); +} + +void +cf_socket_enable_nagle(cf_socket *sock) +{ + static const int32_t flag = 0; + safe_setsockopt(sock->fd, SOL_TCP, TCP_NODELAY, &flag, sizeof(flag)); +} + +void +cf_socket_keep_alive(cf_socket *sock, int32_t idle, int32_t interval, int32_t count) +{ + static const int32_t flag = 1; + safe_setsockopt(sock->fd, SOL_SOCKET, SO_KEEPALIVE, &flag, sizeof(flag)); + + if (idle > 0) { + safe_setsockopt(sock->fd, SOL_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)); + } + + if (interval > 0) { + safe_setsockopt(sock->fd, SOL_TCP, TCP_KEEPINTVL, &interval, sizeof(interval)); + } + + if (count > 0) { + safe_setsockopt(sock->fd, SOL_TCP, TCP_KEEPCNT, &count, sizeof(count)); + } +} + +void +cf_socket_set_send_buffer(cf_socket *sock, int32_t size) +{ + safe_setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)); +} + +void +cf_socket_set_receive_buffer(cf_socket *sock, int32_t size) +{ + safe_setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)); +} + +void +cf_socket_set_window(cf_socket *sock, int32_t size) +{ + safe_setsockopt(sock->fd, SOL_TCP, TCP_WINDOW_CLAMP, &size, sizeof(size)); +} + +void +cf_socket_init(cf_socket *sock) +{ + sock->fd = -1; + sock->state = CF_SOCKET_STATE_NON_TLS; + sock->cfg = NULL; + tls_socket_init(sock); +} + +bool +cf_socket_exists(cf_socket *sock) +{ + return sock->fd >= 0; +} + +int32_t +cf_socket_init_server(cf_serv_cfg *cfg, cf_sockets *socks) +{ + int32_t res = -1; + + if (cfg->n_cfgs < 1) { + cf_warning(CF_SOCKET, "Missing service socket configuration"); + goto cleanup0; + } + + cf_socket_fix_bind(cfg); + + cf_debug(CF_SOCKET, "Initializing %u server socket(s)", cfg->n_cfgs); + uint32_t n; + cf_socket *sock; + + for (n = 0; n < cfg->n_cfgs; ++n) { + sock = &socks->socks[n]; + + if (cfg->cfgs[n].port == 0) { + cf_warning(CF_SOCKET, "Missing service port"); + goto cleanup1; + } + + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&cfg->cfgs[n].addr, cfg->cfgs[n].port, &addr); + + struct sockaddr_storage sas; + cf_sock_addr_to_native(&addr, (struct sockaddr *)&sas); + + cf_debug(CF_SOCKET, "Initializing server for %s", cf_sock_addr_print(&addr)); + int32_t fd = socket(sas.ss_family, SOCK_STREAM, 0); + + if (fd < 0) { + cf_warning(CF_SOCKET, "Error while creating socket for %s: %d (%s)", + cf_sock_addr_print(&addr), errno, cf_strerror(errno)); + goto cleanup1; + } + + cf_socket_init(sock); + sock->fd = fd; + fd = -1; + + cf_socket_fix_server(sock); + cf_socket_disable_blocking(sock); + + // No Nagle here. It will be disabled for the accepted connections. 
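+	// SO_REUSEADDR lets the listener rebind its port while connections from a
+	// previous incarnation still sit in TIME_WAIT - e.g. across a fast
+	// restart of the daemon.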
+ + static const int32_t flag = 1; + safe_setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &flag, sizeof(flag)); + + while (bind(sock->fd, (struct sockaddr *)&sas, + cf_socket_addr_len((struct sockaddr *)&sas)) < 0) { + if (errno != EADDRINUSE) { + cf_warning(CF_SOCKET, "Error while binding to %s: %d (%s)", + cf_sock_addr_print(&addr), errno, cf_strerror(errno)); + goto cleanup2; + } + + cf_warning(CF_SOCKET, "Socket %s in use, waiting", cf_sock_addr_print(&addr)); + usleep(5 * 1000 * 1000); + } + + if (listen(sock->fd, 512) < 0) { + cf_warning(CF_SOCKET, "Error while listening on %s: %d (%s)", + cf_sock_addr_print(&addr), errno, cf_strerror(errno)); + goto cleanup2; + } + + sock->cfg = &cfg->cfgs[n]; + } + + socks->n_socks = n; + res = 0; + goto cleanup0; + +cleanup2: + cf_socket_close(sock); + cf_socket_term(sock); + +cleanup1: + for (uint32_t i = 0; i < n; ++i) { + cf_socket_close(&socks->socks[i]); + cf_socket_term(&socks->socks[i]); + } + +cleanup0: + return res; +} + +void +cf_socket_show_server(cf_fault_context cont, const char *tag, const cf_sockets *socks) +{ + for (uint32_t i = 0; i < socks->n_socks; ++i) { + cf_sock_cfg *cfg = socks->socks[i].cfg; + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&cfg->addr, cfg->port, &addr); + cf_info(cont, "Started %s endpoint %s", tag, cf_sock_addr_print(&addr)); + } +} + +static int32_t +connect_socket(const cf_socket *sock, struct sockaddr *sa, int32_t timeout) +{ + cf_debug(CF_SOCKET, "Connecting FD %d", sock->fd); + int32_t res = -1; + int32_t rv = connect(sock->fd, sa, cf_socket_addr_len(sa)); + + if (rv == 0) { + cf_debug(CF_SOCKET, "FD %d connected [1]", sock->fd); + res = 0; + goto cleanup0; + } + + if (errno != EINPROGRESS) { + cf_ticker_warning(CF_SOCKET, "Error while connecting: %d (%s)", errno, cf_strerror(errno)); + goto cleanup0; + } + + if (timeout == 0) { + cf_debug(CF_SOCKET, "FD %d still connecting, but no timeout", sock->fd); + res = 0; + goto cleanup0; + } + + int32_t efd = epoll_create(1); + + if (efd < 0) { + cf_crash(CF_SOCKET, "epoll_create() failed: %d (%s)", errno, cf_strerror(errno)); + } + + struct epoll_event event = { .data.fd = sock->fd, .events = EPOLLOUT }; + + if (epoll_ctl(efd, EPOLL_CTL_ADD, sock->fd, &event) < 0) { + cf_crash(CF_SOCKET, "epoll_ctl() failed for FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + int32_t count = safe_wait(efd, &event, 1, timeout); + + if (count == 0) { + cf_ticker_warning(CF_SOCKET, "Timeout while connecting"); + goto cleanup1; + } + + int32_t err; + socklen_t err_len = sizeof(err); + safe_getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, &err, &err_len); + + if (err != 0) { + cf_ticker_warning(CF_SOCKET, "Error while connecting: %d (%s)", err, cf_strerror(err)); + goto cleanup1; + } + + cf_debug(CF_SOCKET, "FD %d connected [2]", sock->fd); + res = 0; + +cleanup1: + if (epoll_ctl(efd, EPOLL_CTL_DEL, sock->fd, NULL) < 0) { + cf_crash(CF_SOCKET, "epoll_ctl() failed for FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + safe_close(efd); + +cleanup0: + return res; +} + +int32_t +cf_socket_init_client(cf_sock_cfg *cfg, int32_t timeout, cf_socket *sock) +{ + int32_t res = -1; + + if (cf_ip_addr_is_any(&cfg->addr)) { + cf_warning(CF_SOCKET, "Missing IP address"); + goto cleanup0; + } + + if (cfg->port == 0) { + cf_warning(CF_SOCKET, "Missing port"); + goto cleanup0; + } + + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&cfg->addr, cfg->port, &addr); + + struct sockaddr_storage sas; + cf_sock_addr_to_native(&addr, (struct sockaddr *)&sas); + + 
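+	// cf_sock_addr_to_native() has filled in sas.ss_family, so the socket()
+	// call below creates an IPv4 or IPv6 socket as appropriate.
+	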
cf_debug(CF_SOCKET, "Initializing client for %s", cf_sock_addr_print(&addr)); + int32_t fd = socket(sas.ss_family, SOCK_STREAM, 0); + + if (fd < 0) { + cf_warning(CF_SOCKET, "Error while creating socket for %s: %d (%s)", + cf_sock_addr_print(&addr), errno, cf_strerror(errno)); + goto cleanup0; + } + + cf_socket_init(sock); + sock->fd = fd; + fd = -1; + + cf_socket_fix_client(sock); + cf_socket_disable_blocking(sock); + cf_socket_disable_nagle(sock); + + if (connect_socket(sock, (struct sockaddr *)&sas, timeout) < 0) { + cf_ticker_warning(CF_SOCKET, "Error while connecting socket to %s", + cf_sock_addr_print(&addr)); + goto cleanup1; + } + + sock->cfg = cfg; + res = 0; + goto cleanup0; + +cleanup1: + cf_socket_close(sock); + cf_socket_term(sock); + +cleanup0: + return res; +} + +int32_t +cf_socket_accept(cf_socket *lsock, cf_socket *sock, cf_sock_addr *addr) +{ + int32_t res = -1; + + struct sockaddr_storage sas; + struct sockaddr *sa = NULL; + socklen_t sa_len = 0; + + if (addr != NULL) { + sa = (struct sockaddr *)&sas; + sa_len = sizeof(sas); + } + + int32_t fd = accept(lsock->fd, sa, &sa_len); + + if (fd < 0) { + cf_debug(CF_SOCKET, "Error while accepting from FD %d: %d (%s)", + lsock->fd, errno, cf_strerror(errno)); + goto cleanup0; + } + + if (addr != NULL) { + cf_sock_addr_from_native(sa, addr); + } + + cf_socket_init(sock); + sock->fd = fd; + fd = -1; + + cf_socket_disable_blocking(sock); + cf_socket_disable_nagle(sock); + + sock->cfg = lsock->cfg; + res = 0; + +cleanup0: + return res; +} + +typedef int32_t (*name_func)(int32_t fd, struct sockaddr *sa, socklen_t *sa_len); + +static int32_t +x_name(name_func func, const char *which, int32_t fd, cf_sock_addr *addr) +{ + struct sockaddr_storage sas; + socklen_t sas_len = sizeof(sas); + + if (func(fd, (struct sockaddr *)&sas, &sas_len) < 0) { + cf_warning(CF_SOCKET, "Error while getting %s name: %d (%s)", + which, errno, cf_strerror(errno)); + return -1; + } + + cf_sock_addr_from_native((struct sockaddr *)&sas, addr); + return 0; +} + +int32_t +cf_socket_remote_name(const cf_socket *sock, cf_sock_addr *addr) +{ + return x_name(getpeername, "remote", sock->fd, addr); +} + +int32_t +cf_socket_local_name(const cf_socket *sock, cf_sock_addr *addr) +{ + return x_name(getsockname, "local", sock->fd, addr); +} + +int32_t +cf_socket_available(cf_socket *sock) +{ + int32_t size; + safe_ioctl(sock->fd, FIONREAD, &size); + + size += tls_socket_pending(sock); + + return size; +} + +int32_t +cf_socket_send_to(cf_socket *sock, const void *buff, size_t size, int32_t flags, const cf_sock_addr *addr) +{ + cf_assert(sock->ssl == NULL, CF_SOCKET, "cannot use cf_socket_send_to() with TLS"); + + struct sockaddr_storage sas; + struct sockaddr *sa = NULL; + socklen_t sa_len = 0; + + if (addr != NULL) { + cf_sock_addr_to_native(addr, (struct sockaddr *)&sas); + sa = (struct sockaddr *)&sas; + sa_len = cf_socket_addr_len((struct sockaddr *)&sas); + } + + int32_t res = sendto(sock->fd, buff, size, flags | MSG_NOSIGNAL, sa, sa_len); + + if (res < 0) { + cf_debug(CF_SOCKET, "Error while sending on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + return res; +} + +int32_t +cf_socket_send(cf_socket *sock, const void *buff, size_t size, int32_t flags) +{ + if (sock->ssl) { + ssize_t rv = tls_socket_send(sock, buff, size, flags, 0); + if (rv < 0) { + // errno is set by tls_socket_send. + if (errno == ETIMEDOUT) { + errno = EAGAIN; + } + return -1; + } + else { + // This might be a partial return. 
+			return rv;
+		}
+	}
+	else {
+		return cf_socket_send_to(sock, buff, size, flags, NULL);
+	}
+}
+
+int32_t
+cf_socket_recv_from(cf_socket *sock, void *buff, size_t size, int32_t flags, cf_sock_addr *addr)
+{
+	cf_assert(sock->ssl == NULL, CF_SOCKET, "cannot use cf_socket_recv_from() with TLS");
+
+	struct sockaddr_storage sas;
+	struct sockaddr *sa = NULL;
+	socklen_t sa_len = 0;
+
+	if (addr != NULL) {
+		sa = (struct sockaddr *)&sas;
+		sa_len = sizeof(sas);
+	}
+
+	int32_t res = recvfrom(sock->fd, buff, size, flags, sa, &sa_len);
+
+	if (res < 0) {
+		cf_debug(CF_SOCKET, "Error while receiving on FD %d: %d (%s)",
+				sock->fd, errno, cf_strerror(errno));
+	}
+	else if (addr != NULL) {
+		cf_sock_addr_from_native(sa, addr);
+	}
+
+	return res;
+}
+
+int32_t
+cf_socket_recv(cf_socket *sock, void *buff, size_t size, int32_t flags)
+{
+	if (sock->ssl) {
+		ssize_t rv = tls_socket_recv(sock, buff, size, flags, 0);
+		if (rv < 0) {
+			// errno is set by tls_socket_recv.
+			if (errno == ETIMEDOUT) {
+				errno = EAGAIN;
+			}
+			return -1;
+		}
+		else {
+			// This might be a partial return.
+			return rv;
+		}
+	}
+	else {
+		return cf_socket_recv_from(sock, buff, size, flags, NULL);
+	}
+}
+
+static bool
+socket_wait(const cf_socket *sock, uint16_t events, int32_t timeout)
+{
+	cf_detail(CF_SOCKET, "Waiting for events 0x%x on FD %d with timeout %d",
+			events, sock->fd, timeout);
+
+	struct pollfd pfd = { .fd = sock->fd, .events = events | POLLRDHUP };
+
+	while (true) {
+		int32_t count = poll(&pfd, 1, timeout);
+
+		if (count < 0) {
+			if (errno == EINTR) {
+				continue;
+			}
+
+			cf_crash(CF_SOCKET, "Error while polling FD %d: %d (%s)",
+					pfd.fd, errno, cf_strerror(errno));
+		}
+
+		if (count > 1) {
+			cf_crash(CF_SOCKET, "Unexpected number of events on FD %d: %d", sock->fd, count);
+		}
+
+		if (count == 0) {
+			cf_detail(CF_SOCKET, "Timeout while waiting on FD %d", sock->fd);
+			return false;
+		}
+
+		cf_detail(CF_SOCKET, "Got events 0x%x on FD %d", pfd.revents, sock->fd);
+		return true;
+	}
+}
+
+int32_t
+cf_socket_send_to_all(cf_socket *sock, const void *buffp, size_t size, int32_t flags,
+		const cf_sock_addr *addr, int32_t timeout)
+{
+	cf_assert(sock->ssl == NULL, CF_SOCKET, "cannot use cf_socket_send_to_all() with TLS");
+
+	uint8_t *buff = (uint8_t *) buffp;
+	cf_detail(CF_SOCKET, "Blocking send on FD %d, size = %zu", sock->fd, size);
+	size_t off = 0;
+
+	while (off < size) {
+		ssize_t count = cf_socket_send_to(sock, buff + off, size - off, flags, addr);
+
+		if (count < 0) {
+			if (errno == EAGAIN) {
+				cf_debug(CF_SOCKET, "FD %d is blocking", sock->fd);
+
+				if (socket_wait(sock, POLLOUT, timeout)) {
+					continue;
+				}
+
+				cf_debug(CF_SOCKET, "Timeout during blocking send on FD %d", sock->fd);
+				errno = ETIMEDOUT;
+				return -1;
+			}
+
+			return -1;
+		}
+
+		if (count == 0) {
+			// TODO - remove warning if this turns out to be normal.
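+			// A zero-byte send on a stream socket is unexpected - treat it
+			// like a dead connection so callers see ENOTCONN.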
+ cf_warning(CF_SOCKET, "Sent 0 bytes on FD %d", sock->fd); + errno = ENOTCONN; + return -1; + } + + off += count; + } + + cf_detail(CF_SOCKET, "Blocking send on FD %d complete", sock->fd); + return 0; +} + +int32_t +cf_socket_send_all(cf_socket *sock, const void *buff, size_t size, int32_t flags, + int32_t timeout) +{ + if (sock->ssl) { + return tls_socket_send(sock, buff, size, flags, timeout); + } + else { + return cf_socket_send_to_all(sock, buff, size, flags, NULL, timeout); + } +} + +int32_t +cf_socket_recv_from_all(cf_socket *sock, void *buffp, size_t size, int32_t flags, + cf_sock_addr *addr, int32_t timeout) +{ + cf_assert(sock->ssl == NULL, CF_SOCKET, "cannot use cf_socket_recv_from_all() with TLS"); + + uint8_t *buff = (uint8_t *) buffp; + cf_detail(CF_SOCKET, "Blocking receive on FD %d, size = %zu", sock->fd, size); + size_t off = 0; + + while (off < size) { + ssize_t count = cf_socket_recv_from(sock, buff + off, size - off, flags, addr); + + if (count < 0) { + if (errno == EAGAIN) { + cf_debug(CF_SOCKET, "FD %d is blocking", sock->fd); + + if (socket_wait(sock, POLLIN, timeout)) { + continue; + } + + cf_debug(CF_SOCKET, "Timeout during blocking receive on FD %d", sock->fd); + errno = ETIMEDOUT; + return -1; + } + + return -1; + } + + if (count == 0) { + errno = ENOTCONN; + return -1; + } + + off += count; + } + + cf_detail(CF_SOCKET, "Blocking receive on FD %d complete", sock->fd); + return 0; +} + +int32_t +cf_socket_recv_all(cf_socket *sock, void *buff, size_t size, int32_t flags, int32_t timeout) +{ + if (sock->ssl) { + return tls_socket_recv(sock, buff, size, flags, timeout); + } + else { + return cf_socket_recv_from_all(sock, buff, size, flags, NULL, timeout); + } +} + +static void +x_shutdown(cf_socket *sock, int32_t how) +{ + if (sock->ssl) { + tls_socket_shutdown(sock); + } + + if (shutdown(sock->fd, how) < 0) { + if (errno != ENOTCONN) { + cf_crash(CF_SOCKET, "shutdown() failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + else { + cf_debug(CF_SOCKET, "shutdown() on disconnected FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + } +} + +void +cf_socket_write_shutdown(cf_socket *sock) +{ + cf_debug(CF_SOCKET, "Shutting down write channel of FD %d", sock->fd); + x_shutdown(sock, SHUT_WR); +} + +void +cf_socket_shutdown(cf_socket *sock) +{ + cf_debug(CF_SOCKET, "Shutting down FD %d", sock->fd); + x_shutdown(sock, SHUT_RDWR); +} + +void +cf_socket_close(cf_socket *sock) +{ + cf_debug(CF_SOCKET, "Closing FD %d", sock->fd); + tls_socket_close(sock); + safe_close(sock->fd); + sock->fd = -1; +} + +void +cf_socket_drain_close(cf_socket *sock) +{ + cf_debug(CF_SOCKET, "Draining and closing FD %d", sock->fd); + int32_t efd = epoll_create(1); + + if (efd < 0) { + cf_crash(CF_SOCKET, "epoll_create() failed: %d (%s)", errno, cf_strerror(errno)); + } + + struct epoll_event event = { .data.fd = sock->fd, .events = EPOLLRDHUP }; + + if (epoll_ctl(efd, EPOLL_CTL_ADD, sock->fd, &event) < 0) { + cf_crash(CF_SOCKET, "epoll_ctl() failed for FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + cf_socket_shutdown(sock); + int32_t count = safe_wait(efd, &event, 1, 5000); + + if (count == 0) { + cf_warning(CF_SOCKET, "Timeout while waiting for FD %d to drain", sock->fd); + goto cleanup1; + } + + cf_debug(CF_SOCKET, "FD %d drained", sock->fd); + +cleanup1: + if (epoll_ctl(efd, EPOLL_CTL_DEL, sock->fd, NULL) < 0) { + cf_crash(CF_SOCKET, "epoll_ctl() failed for FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + safe_close(efd); + 
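+	// Whether the peer acknowledged the shutdown (EPOLLRDHUP) or we gave up
+	// after the timeout, proceed with the actual close.
+	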
cf_socket_close(sock); + cf_socket_term(sock); +} + +void +cf_socket_term(cf_socket *sock) +{ + tls_socket_term(sock); + sock->fd = -1; +} + +void +cf_msock_cfg_init(cf_msock_cfg *cfg, cf_sock_owner owner) +{ + cfg->owner = owner; + cfg->port = 0; + cf_ip_addr_set_any(&cfg->addr); + cf_ip_addr_set_any(&cfg->if_addr); + cfg->ttl = 0; +} + +void +cf_msock_cfg_copy(const cf_msock_cfg *from, cf_msock_cfg *to) +{ + to->owner = from->owner; + to->port = from->port; + cf_ip_addr_copy(&from->addr, &to->addr); + cf_ip_addr_copy(&from->if_addr, &to->if_addr); + to->ttl = from->ttl; +} + +void +cf_mserv_cfg_init(cf_mserv_cfg *cfg) +{ + cfg->n_cfgs = 0; +} + +int32_t +cf_mserv_cfg_add_msock_cfg(cf_mserv_cfg *serv_cfg, const cf_msock_cfg *sock_cfg) +{ + if (serv_cfg->n_cfgs >= CF_SOCK_CFG_MAX) { + cf_warning(CF_SOCKET, "Too many socket configurations in server configuration"); + return -1; + } + + uint32_t n = serv_cfg->n_cfgs; + + for (uint32_t i = 0; i < n; ++i) { + cf_msock_cfg *walker = &serv_cfg->cfgs[i]; + + if (walker->owner == sock_cfg->owner && walker->port == sock_cfg->port && + cf_ip_addr_compare(&walker->addr, &sock_cfg->addr) == 0 && + cf_ip_addr_compare(&walker->if_addr, &sock_cfg->if_addr) == 0 && + walker->ttl == sock_cfg->ttl) { + return 0; + } + } + + cf_msock_cfg_copy(sock_cfg, &serv_cfg->cfgs[n]); + serv_cfg->n_cfgs = ++n; + return 0; +} + +int32_t +cf_socket_mcast_init(cf_mserv_cfg *cfg, cf_sockets *socks) +{ + int32_t res = -1; + + if (cfg->n_cfgs < 1) { + cf_warning(CF_SOCKET, "Missing multicast socket configuration"); + goto cleanup0; + } + + cf_debug(CF_SOCKET, "Initializing %u multicast socket(s)", cfg->n_cfgs); + uint32_t n; + cf_socket *sock; + + for (n = 0; n < cfg->n_cfgs; ++n) { + sock = &socks->socks[n]; + + if (cfg->cfgs[n].port == 0) { + cf_warning(CF_SOCKET, "Missing multicast port"); + goto cleanup1; + } + + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&cfg->cfgs[n].addr, cfg->cfgs[n].port, &addr); + + struct sockaddr_storage sas; + cf_sock_addr_to_native(&addr, (struct sockaddr *)&sas); + + cf_debug(CF_SOCKET, "Initializing multicast socket for %s", cf_sock_addr_print(&addr)); + int32_t fd = socket(sas.ss_family, SOCK_DGRAM, 0); + + if (fd < 0) { + cf_warning(CF_SOCKET, "Error while creating socket for %s: %d (%s)", + cf_sock_addr_print(&addr), errno, cf_strerror(errno)); + goto cleanup1; + } + + cf_socket_init(sock); + sock->fd = fd; + fd = -1; + + cf_socket_fix_client(sock); + cf_socket_fix_server(sock); + + static const int32_t yes = 1; + safe_setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)); + + if (!cf_ip_addr_is_any(&cfg->cfgs[n].if_addr)) { + cf_info(CF_SOCKET, "Setting multicast interface address: %s", + cf_ip_addr_print(&cfg->cfgs[n].if_addr)); + + if (cf_socket_mcast_set_inter(sock, &cfg->cfgs[n].if_addr) < 0) { + cf_warning(CF_SOCKET, "Error while binding to interface %s", + cf_ip_addr_print(&cfg->cfgs[n].if_addr)); + goto cleanup2; + } + } + + uint8_t ttl = cfg->cfgs[n].ttl; + + if (ttl > 0) { + cf_info(CF_SOCKET, "Setting multicast TTL: %d", ttl); + + if (cf_socket_mcast_set_ttl(sock, ttl) < 0) { + cf_warning(CF_SOCKET, "Error while setting multicast TTL"); + goto cleanup2; + } + } + + while (bind(sock->fd, (struct sockaddr *)&sas, + cf_socket_addr_len((struct sockaddr *)&sas)) < 0) { + if (errno != EADDRINUSE) { + cf_warning(CF_SOCKET, "Error while binding to %s: %d (%s)", + cf_sock_addr_print(&addr), errno, cf_strerror(errno)); + goto cleanup2; + } + + cf_warning(CF_SOCKET, "Socket %s in use, waiting", cf_sock_addr_print(&addr)); 
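+			// EADDRINUSE should be transient - e.g. a socket from a previous
+			// incarnation still draining - so retry the bind every 5 seconds.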
+ usleep(5 * 1000 * 1000); + } + + cf_info(CF_SOCKET, "Joining multicast group: %s", cf_ip_addr_print(&addr.addr)); + + if (cf_socket_mcast_join_group(sock, &cfg->cfgs[n].if_addr, &addr.addr) < 0) { + cf_warning(CF_SOCKET, "Error while joining multicast group %s", + cf_ip_addr_print(&addr.addr)); + goto cleanup2; + } + + sock->cfg = &cfg->cfgs[n]; + } + + socks->n_socks = n; + res = 0; + goto cleanup0; + +cleanup2: + cf_socket_close(sock); + cf_socket_term(sock); + +cleanup1: + for (uint32_t i = 0; i < n; ++i) { + cf_socket_close(&socks->socks[i]); + cf_socket_term(&socks->socks[i]); + } + +cleanup0: + return res; +} + +void +cf_socket_mcast_show(cf_fault_context cont, const char *tag, const cf_sockets *socks) +{ + for (uint32_t i = 0; i < socks->n_socks; ++i) { + cf_msock_cfg *cfg = socks->socks[i].cfg; + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&cfg->if_addr, cfg->port, &addr); + cf_info(cont, "Started %s endpoint %s", tag, cf_sock_addr_print(&addr)); + } +} + +// #define VERY_CHATTY + +void +cf_poll_create(cf_poll *poll) +{ + int32_t fd = epoll_create(1); + + if (fd < 0) { + cf_crash(CF_SOCKET, "Error while creating epoll instance: %d (%s)", + errno, cf_strerror(errno)); + } + + *poll = (cf_poll){ .fd = fd }; + cf_debug(CF_SOCKET, "Created new epoll instance with FD %d", fd); +} + +void +cf_poll_add_fd(cf_poll poll, int32_t fd, uint32_t events, void *data) +{ + cf_debug(CF_SOCKET, + "Adding FD %d to epoll instance with FD %d, events = 0x%x", + fd, poll.fd, events); + struct epoll_event ev = { .events = events, .data.ptr = data }; + + if (epoll_ctl(poll.fd, EPOLL_CTL_ADD, fd, &ev) < 0) { + cf_crash(CF_SOCKET, + "Error while adding FD %d to epoll instance %d: %d (%s)", + fd, poll.fd, errno, cf_strerror(errno)); + } +} + +void +cf_poll_add_socket(cf_poll poll, const cf_socket *sock, uint32_t events, void *data) +{ + cf_poll_add_fd(poll, sock->fd, events, data); +} + +int32_t +cf_poll_modify_socket_forgiving(cf_poll poll, const cf_socket *sock, uint32_t events, void *data, + uint32_t n_err_ok, int32_t *err_ok) +{ +#if defined VERY_CHATTY + cf_detail(CF_SOCKET, "Modifying FD %d in epoll instance with FD %d, events = 0x%x", + sock->fd, poll.fd, events); +#endif + + struct epoll_event ev = { .events = events, .data.ptr = data }; + + if (epoll_ctl(poll.fd, EPOLL_CTL_MOD, sock->fd, &ev) < 0) { + for (uint32_t i = 0; i < n_err_ok; ++i) { + if (errno == err_ok[i]) { + return errno; + } + } + + cf_crash(CF_SOCKET, "Error while modifying FD %d in epoll instance %d: %d (%s)", + sock->fd, poll.fd, errno, cf_strerror(errno)); + } + + return 0; +} + +int32_t +cf_poll_delete_socket_forgiving(cf_poll poll, const cf_socket *sock, uint32_t n_err_ok, + int32_t *err_ok) +{ + cf_detail(CF_SOCKET, "Deleting FD %d from epoll instance with FD %d", sock->fd, poll.fd); + + if (epoll_ctl(poll.fd, EPOLL_CTL_DEL, sock->fd, NULL) < 0) { + for (uint32_t i = 0; i < n_err_ok; ++i) { + if (errno == err_ok[i]) { + return errno; + } + } + + cf_crash(CF_SOCKET, "Error while deleting FD %d from epoll instance %d: %d (%s)", + sock->fd, poll.fd, errno, cf_strerror(errno)); + } + + return 0; +} + +void +cf_poll_add_sockets(cf_poll poll, cf_sockets *socks, uint32_t events) +{ + for (uint32_t i = 0; i < socks->n_socks; ++i) { + cf_poll_add_socket(poll, &socks->socks[i], events, &socks->socks[i]); + } +} + +void +cf_poll_delete_sockets(cf_poll poll, cf_sockets *socks) +{ + for (uint32_t i = 0; i < socks->n_socks; ++i) { + cf_poll_delete_socket(poll, &socks->socks[i]); + } +} + +int32_t +cf_poll_wait(cf_poll poll, 
cf_poll_event *events, int32_t limit, int32_t timeout) +{ +#if defined VERY_CHATTY + cf_detail(CF_SOCKET, "Waiting on epoll instance with FD %d", poll.fd); +#endif + + while (true) { + int32_t res = epoll_wait(poll.fd, (struct epoll_event *)events, limit, timeout); + + if (res >= 0) { +#if defined VERY_CHATTY + if (cf_fault_filter[CF_SOCKET] >= CF_DETAIL) { + cf_detail(CF_SOCKET, "Epoll instance with FD %d reports %d event(s)", poll.fd, res); + + for (int32_t i = 0; i < res; ++i) { + cf_detail(CF_SOCKET, "Event #%d: 0x%x, %p", + i, events[i].events, events[i].data); + } + } +#endif + + return res; + } + + if (errno != EINTR) { + cf_crash(CF_SOCKET, "Error while waiting for events on epoll instance %d: %d (%s)", + poll.fd, errno, cf_strerror(errno)); + } + } +} + +void +cf_poll_destroy(cf_poll poll) +{ + cf_debug(CF_SOCKET, "Destroying epoll instance with FD %d", poll.fd); + + if (close(poll.fd) < 0) { + cf_crash(CF_SOCKET, "Error while closing epoll instance: %d (%s)", + errno, cf_strerror(errno)); + } +} + +#define RESP_SIZE (2 * 1024 * 1024) +#define MAX_INTERS 500 +#define MAX_ADDRS 20 + +typedef struct inter_entry_s { + uint32_t index; + char name[50]; + bool def_route; + bool up; + uint32_t mtu; + uint32_t mac_addr_len; + uint8_t mac_addr[50]; + uint32_t n_addrs; + cf_ip_addr addrs[MAX_ADDRS]; + + union { + struct inter_entry_s *entry; + uint32_t index; + } master; +} inter_entry; + +typedef struct inter_info_s { + uint32_t n_inters; + inter_entry inters[MAX_INTERS]; +} inter_info; + +typedef struct inter_filter_s { + bool allow_v6; + bool def_route; + bool up; + const char *if_name; +} inter_filter; + +typedef struct cb_context_s { + bool has_label; + bool has_address; + bool has_local; + bool has_index; + bool has_priority; + char curr_label[50]; + cf_ip_addr curr_address; + uint32_t curr_index; + uint32_t curr_priority; + bool allow_v6; + inter_info *inter; +} cb_context; + +typedef void (*reset_cb)(cb_context *cont); +typedef void (*data_cb)(cb_context *cont, void *info, int32_t type, void *data, size_t len); +typedef void (*post_cb)(cb_context *cont); + +static int32_t +netlink_dump(int32_t type, int32_t filter1, int32_t filter2a, int32_t filter2b, int32_t filter2c, + int32_t filter2d, size_t size, reset_cb reset_fn, data_cb data_fn, post_cb post_fn, + cb_context *cont) +{ + int32_t res = -1; + int32_t nls = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + + if (nls < 0) { + cf_warning(CF_SOCKET, "Error while creating netlink socket: %d (%s)", + errno, cf_strerror(errno)); + goto cleanup0; + } + + struct sockaddr_nl loc; + memset(&loc, 0, sizeof(loc)); + loc.nl_family = AF_NETLINK; + + if (bind(nls, (struct sockaddr *)&loc, sizeof(loc)) < 0) { + cf_warning(CF_SOCKET, "Error while binding netlink socket: %d (%s)", + errno, cf_strerror(errno)); + goto cleanup1; + } + + static cf_atomic32 seq = 0; + struct { + struct nlmsghdr h; + struct rtgenmsg m; + } req; + + memset(&req, 0, sizeof(req)); + req.h.nlmsg_len = NLMSG_LENGTH(sizeof(req.m)); + req.h.nlmsg_type = type; + req.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT; + req.h.nlmsg_seq = cf_atomic32_add(&seq, 1); + req.m.rtgen_family = PF_UNSPEC; + + struct sockaddr_nl rem; + memset(&rem, 0, sizeof(rem)); + rem.nl_family = AF_NETLINK; + + struct iovec iov; + memset(&iov, 0, sizeof(iov)); + iov.iov_base = &req; + iov.iov_len = req.h.nlmsg_len; + + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_name = &rem; + msg.msg_namelen = sizeof(rem); + + if (sendmsg(nls, &msg, 0) < 0) { + 
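+		// The request never left the socket - there is nothing to receive,
+		// so bail out before entering the response loop.
+		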
cf_warning(CF_SOCKET, "Error while sending netlink request: %d (%s)", + errno, cf_strerror(errno)); + goto cleanup1; + } + + uint8_t *resp = cf_malloc(RESP_SIZE); + + memset(resp, 0, RESP_SIZE); + bool done = false; + + while (!done) { + memset(&rem, 0, sizeof(rem)); + memset(&iov, 0, sizeof(iov)); + iov.iov_base = resp; + iov.iov_len = RESP_SIZE; + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_name = &rem; + msg.msg_namelen = sizeof(rem); + + ssize_t len = recvmsg(nls, &msg, 0); + + if (len < 0) { + cf_warning(CF_SOCKET, "Error while receiving netlink response: %d (%s)", + errno, cf_strerror(errno)); + goto cleanup2; + } + + if ((msg.msg_flags & MSG_TRUNC) != 0) { + cf_warning(CF_SOCKET, "Received truncated netlink message"); + goto cleanup2; + } + + struct nlmsghdr *h = (struct nlmsghdr *)resp; + + while (NLMSG_OK(h, len)) { + if (h->nlmsg_type == NLMSG_NOOP) { + h = NLMSG_NEXT(h, len); + continue; + } + + if (h->nlmsg_type == NLMSG_ERROR) { + int32_t *err = NLMSG_DATA(h); + cf_warning(CF_SOCKET, "Received netlink error message: %d (%s)", + -*err, cf_strerror(-*err)); + goto cleanup2; + } + + if (h->nlmsg_type == NLMSG_DONE) { + done = true; + break; + } + + if (h->nlmsg_type == NLMSG_OVERRUN) { + cf_warning(CF_SOCKET, "Received netlink overrun message"); + goto cleanup2; + } + + if (h->nlmsg_type == filter1) { + if (reset_fn != NULL) { + reset_fn(cont); + } + + void *info = NLMSG_DATA(h); + uint32_t a_len = h->nlmsg_len - NLMSG_LENGTH(size); + struct rtattr *a = (struct rtattr *)((uint8_t *)info + NLMSG_ALIGN(size)); + + while (RTA_OK(a, a_len)) { + if (a->rta_type == filter2a || a->rta_type == filter2b || + a->rta_type == filter2c || a->rta_type == filter2d) { + data_fn(cont, info, a->rta_type, RTA_DATA(a), RTA_PAYLOAD(a)); + } + + a = RTA_NEXT(a, a_len); + } + + if (post_fn != NULL) { + post_fn(cont); + } + } + + if ((h->nlmsg_flags & NLM_F_MULTI) == 0) { + done = true; + break; + } + + h = NLMSG_NEXT(h, len); + } + } + + res = 0; + +cleanup2: + cf_free(resp); + +cleanup1: + close(nls); + +cleanup0: + return res; +} + +static void +reset_fn(cb_context *cont) +{ + cont->has_label = false; + cont->has_address = false; + cont->has_local = false; + cont->has_index = false; + cont->has_priority = false; + memset(&cont->curr_label, 0, sizeof(cont->curr_label)); + cf_ip_addr_set_any(&cont->curr_address); + cont->curr_index = 0; + cont->curr_priority = 0; +} + +static void +link_fn(cb_context *cont, void *info_, int32_t type, void *data, size_t len) +{ + struct ifinfomsg *info = info_; + inter_info *inter = cont->inter; + inter_entry *entry = NULL; + + for (uint32_t i = 0; i < inter->n_inters; ++i) { + if (inter->inters[i].index == info->ifi_index) { + entry = &inter->inters[i]; + break; + } + } + + if (entry == NULL) { + uint32_t i = inter->n_inters; + + if (i >= MAX_INTERS) { + cf_crash(CF_SOCKET, "Too many interfaces"); + } + + entry = &inter->inters[i]; + ++inter->n_inters; + + entry->index = info->ifi_index; + entry->up = (info->ifi_flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING); + } + + if (type == IFLA_IFNAME) { + if (len > sizeof(entry->name)) { + cf_crash(CF_SOCKET, "Interface name too long: %s", (char *)data); + } + + // Length includes terminating NUL. 
+ memcpy(entry->name, data, len); + cf_detail(CF_SOCKET, "Collected interface name %s", entry->name); + } + else if (type == IFLA_ADDRESS) { + if (len > sizeof(entry->mac_addr)) { + cf_crash(CF_SOCKET, "MAC address too long"); + } + + entry->mac_addr_len = (uint32_t)len; + memcpy(entry->mac_addr, data, len); + } + else if (type == IFLA_MTU) { + if (len != 4) { + cf_crash(CF_SOCKET, "MTU value has invalid length: %zu", len); + } + + memcpy(&entry->mtu, data, len); + cf_detail(CF_SOCKET, "Collected interface MTU %s -> %u", entry->name, entry->mtu); + } + else if (type == IFLA_MASTER) { + if (len != 4) { + cf_crash(CF_SOCKET, "Master index has invalid length: %zu", len); + } + + memcpy(&entry->master.index, data, len); + cf_detail(CF_SOCKET, "Collected interface master index %s -> %u", + entry->name, entry->master.index); + } +} + +static void +addr_fn(cb_context *cont, void *info_, int32_t type, void *data, size_t len) +{ + struct ifaddrmsg *info = info_; + + if (cont->curr_index == 0) { + cont->curr_index = info->ifa_index; + } + + if (type == IFA_LABEL) { + if (len > sizeof(cont->curr_label)) { + cf_crash(CF_SOCKET, "Interface label too long: %s", (char *)data); + } + + // Length includes terminating NUL. + memcpy(cont->curr_label, data, len); + cont->has_label = true; + cf_detail(CF_SOCKET, "Collected interface label %s", cont->curr_label); + } + else if (type == IFA_ADDRESS) { + // IFA_LOCAL takes precedence over IFA_ADDRESS. + if (cont->has_local) { + cf_detail(CF_SOCKET, "Prioritizing local address"); + return; + } + + if (cf_socket_parse_netlink(cont->allow_v6, info->ifa_family, info->ifa_flags, + data, len, &cont->curr_address) < 0) { + return; + } + + cont->has_address = true; + cf_detail(CF_SOCKET, "Considering interface address %s", + cf_ip_addr_print(&cont->curr_address)); + } + else if (type == IFA_LOCAL) { + if (cf_socket_parse_netlink(cont->allow_v6, info->ifa_family, info->ifa_flags, + data, len, &cont->curr_address) < 0) { + return; + } + + cont->has_local = true; + cf_detail(CF_SOCKET, "Considering local interface address %s", + cf_ip_addr_print(&cont->curr_address)); + } +} + +static void +addr_fix_fn(cb_context *cont) +{ + if (!cont->has_address && !cont->has_local) { + return; + } + + inter_info *inter = cont->inter; + inter_entry *by_index = NULL; + inter_entry *by_label = NULL; + + for (uint32_t i = 0; i < inter->n_inters; ++i) { + if (inter->inters[i].index == cont->curr_index) { + by_index = &inter->inters[i]; + break; + } + } + + if (by_index == NULL) { + cf_crash(CF_SOCKET, "Invalid interface index: %u", cont->curr_index); + } + + if (cont->has_label) { + for (uint32_t i = 0; i < inter->n_inters; ++i) { + if (strcmp(inter->inters[i].name, cont->curr_label) == 0) { + by_label = &inter->inters[i]; + break; + } + } + + if (by_label == NULL) { + cf_detail(CF_SOCKET, "New interface for label %s", cont->curr_label); + uint32_t i = inter->n_inters; + + if (i >= MAX_INTERS) { + cf_crash(CF_SOCKET, "Too many interfaces"); + } + + by_label = &inter->inters[i]; + ++inter->n_inters; + + by_label->index = by_index->index; + by_label->up = by_index->up; + memcpy(&by_label->mac_addr, &by_index->mac_addr, sizeof(by_label->mac_addr)); + by_label->mac_addr_len = by_index->mac_addr_len; + by_label->mtu = by_index->mtu; + + memcpy(&by_label->name, cont->curr_label, sizeof(by_label->name)); + } + } + + inter_entry *entry = by_label != NULL ? 
by_label : by_index;
+	uint32_t i = entry->n_addrs;
+
+	if (i >= MAX_ADDRS) {
+		cf_crash(CF_SOCKET, "Too many addresses for interface %s", entry->name);
+	}
+
+	cf_ip_addr *addr = &entry->addrs[i];
+	cf_ip_addr_copy(&cont->curr_address, addr);
+
+	++entry->n_addrs;
+	cf_detail(CF_SOCKET, "Collected interface address %s -> %s",
+			entry->name, cf_ip_addr_print(addr));
+}
+
+static void
+route_fn(cb_context *cont, void *info_, int32_t type, void *data, size_t len)
+{
+	struct rtmsg *info = info_;
+
+	// Ignore entries with RTM_F_CLONED, because they are route cache entries.
+	if ((info->rtm_flags & RTM_F_CLONED) != 0) {
+		return;
+	}
+
+	if (type == RTA_DST) {
+		if (cf_socket_parse_netlink(cont->allow_v6, info->rtm_family, 0,
+				data, len, &cont->curr_address) < 0) {
+			// If the address is not allowed, set to a non-zero address, because
+			// zero means default route.
+			cf_ip_addr_set_local(&cont->curr_address);
+		}
+
+		cont->has_address = true;
+	}
+	else if (type == RTA_OIF) {
+		if (len != 4) {
+			cf_detail(CF_SOCKET, "Invalid interface index");
+			return;
+		}
+
+		cont->curr_index = *(uint32_t *)data;
+		cont->has_index = true;
+	}
+	else if (type == RTA_PRIORITY) {
+		if (len != 4) {
+			cf_detail(CF_SOCKET, "Invalid route priority");
+			return;
+		}
+
+		cont->curr_priority = *(uint32_t *)data;
+		cont->has_priority = true;
+	}
+}
+
+static void
+route_fix_fn(cb_context *cont)
+{
+	// It's not a default route if it has an address and the address isn't zero.
+	if (cont->has_address && !cf_ip_addr_is_any(&cont->curr_address)) {
+		return;
+	}
+
+	// It's one of the catch-all entries.
+	if (cont->has_priority && cont->curr_priority == UINT32_MAX) {
+		return;
+	}
+
+	// It doesn't have an interface index.
+	if (!cont->has_index) {
+		return;
+	}
+
+	inter_info *inter = cont->inter;
+	bool found = false;
+
+	for (uint32_t i = 0; i < inter->n_inters; ++i) {
+		inter_entry *entry = &inter->inters[i];
+
+		if (inter->inters[i].index == cont->curr_index) {
+			found = true;
+			entry->def_route = true;
+			cf_detail(CF_SOCKET, "Collected default route %s -> %s",
+					entry->name, cf_ip_addr_print(&cont->curr_address));
+			// Don't stop after the first match. Aliases share the same index.
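+			// (For example, eth0 and its alias eth0:1 both carry eth0's index.)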
+
+		}
+	}
+
+	if (!found) {
+		cf_crash(CF_SOCKET, "Invalid interface index: %u", cont->curr_index);
+	}
+}
+
+static void
+enumerate_inter(inter_info *inter, bool allow_v6)
+{
+	cb_context cont;
+	memset(&cont, 0, sizeof(cont));
+	cont.inter = inter;
+	cont.allow_v6 = allow_v6;
+
+	reset_fn(&cont);
+
+	if (netlink_dump(RTM_GETLINK, RTM_NEWLINK, IFLA_IFNAME, IFLA_ADDRESS, IFLA_MTU, IFLA_MASTER,
+			sizeof(struct ifinfomsg), NULL, link_fn, NULL, &cont) < 0) {
+		cf_crash(CF_SOCKET, "Error while enumerating network links");
+	}
+
+	if (netlink_dump(RTM_GETADDR, RTM_NEWADDR, IFA_LABEL, IFA_ADDRESS, IFA_LOCAL, -1,
+			sizeof(struct ifaddrmsg), reset_fn, addr_fn, addr_fix_fn, &cont) < 0) {
+		cf_crash(CF_SOCKET, "Error while enumerating network addresses");
+	}
+
+	if (netlink_dump(RTM_GETROUTE, RTM_NEWROUTE, RTA_DST, RTA_OIF, RTA_PRIORITY, -1,
+			sizeof(struct rtmsg), reset_fn, route_fn, route_fix_fn, &cont) < 0) {
+		cf_crash(CF_SOCKET, "Error while enumerating network routes");
+	}
+
+	for (uint32_t i = 0; i < inter->n_inters; ++i) {
+		inter_entry *entry = &inter->inters[i];
+		cf_ip_addr_sort(entry->addrs, entry->n_addrs);
+
+		if (entry->master.index == 0) {
+			entry->master.entry = NULL;
+			continue;
+		}
+
+		inter_entry *master = NULL;
+
+		for (uint32_t k = 0; k < inter->n_inters; ++k) {
+			inter_entry *cand = &inter->inters[k];
+
+			if (cand->index == entry->master.index) {
+				master = cand;
+				break;
+			}
+		}
+
+		if (master == NULL) {
+			cf_crash(CF_SOCKET, "Invalid master index: %u", entry->master.index);
+		}
+
+		entry->master.entry = master;
+	}
+
+	if (cf_fault_filter[CF_SOCKET] >= CF_DETAIL) {
+		cf_detail(CF_SOCKET, "%u interface(s)", inter->n_inters);
+
+		for (uint32_t i = 0; i < inter->n_inters; ++i) {
+			inter_entry *entry = &inter->inters[i];
+			cf_detail(CF_SOCKET, "Name = %s", entry->name);
+			cf_detail(CF_SOCKET, "MAC address = %02x:%02x:%02x:%02x:%02x:%02x",
+					entry->mac_addr[0], entry->mac_addr[1], entry->mac_addr[2],
+					entry->mac_addr[3], entry->mac_addr[4], entry->mac_addr[5]);
+			cf_detail(CF_SOCKET, "Default route = %d", (int32_t)entry->def_route);
+			cf_detail(CF_SOCKET, "Up = %d", (int32_t)entry->up);
+			cf_detail(CF_SOCKET, "MTU = %u", entry->mtu);
+
+			for (uint32_t k = 0; k < entry->n_addrs; ++k) {
+				cf_ip_addr *addr = &entry->addrs[k];
+				cf_detail(CF_SOCKET, "Address = %s", cf_ip_addr_print(addr));
+			}
+
+			cf_detail(CF_SOCKET, "Master = %s",
+					entry->master.entry != NULL ?
+							
entry->master.entry->name : "(none)"); + } + } +} + +static int32_t +inter_get_addr(cf_ip_addr *addrs, uint32_t *n_addrs, inter_filter *filter) +{ + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, filter->allow_v6); + + uint32_t count = 0; + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + if (filter->def_route && !entry->def_route) { + continue; + } + + if (filter->up && !entry->up) { + continue; + } + + if (filter->if_name != NULL && strcmp(filter->if_name, entry->name) != 0) { + continue; + } + + for (uint32_t k = 0; k < entry->n_addrs; ++k) { + cf_ip_addr *addr = &entry->addrs[k]; + + if (count >= *n_addrs) { + cf_warning(CF_SOCKET, "Buffer overflow while enumerating interface addresses"); + return -1; + } + + cf_ip_addr_copy(addr, &addrs[count]); + ++count; + } + } + + *n_addrs = count; + return 0; +} + +int32_t +cf_inter_get_addr_all(cf_ip_addr *addrs, uint32_t *n_addrs) +{ + static inter_filter filter = { + .allow_v6 = true, .def_route = false, .up = true, .if_name = NULL + }; + + return inter_get_addr(addrs, n_addrs, &filter); +} + +int32_t +cf_inter_get_addr_all_legacy(cf_ip_addr *addrs, uint32_t *n_addrs) +{ + static inter_filter filter = { + .allow_v6 = false, .def_route = false, .up = true, .if_name = NULL + }; + + return inter_get_addr(addrs, n_addrs, &filter); +} + +int32_t +cf_inter_get_addr_def(cf_ip_addr *addrs, uint32_t *n_addrs) +{ + static inter_filter filter = { + .allow_v6 = true, .def_route = true, .up = true, .if_name = NULL + }; + + return inter_get_addr(addrs, n_addrs, &filter); +} + +int32_t +cf_inter_get_addr_def_legacy(cf_ip_addr *addrs, uint32_t *n_addrs) +{ + static inter_filter filter = { + .allow_v6 = false, .def_route = true, .up = true, .if_name = NULL + }; + + return inter_get_addr(addrs, n_addrs, &filter); +} + +int32_t +cf_inter_get_addr_name(cf_ip_addr *addrs, uint32_t *n_addrs, const char *if_name) +{ + inter_filter filter = { + .allow_v6 = true, .def_route = false, .up = false, .if_name = if_name + }; + + return inter_get_addr(addrs, n_addrs, &filter); +} + +bool +cf_inter_is_inter_name(const char *if_name) +{ + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + if (strcmp(inter.inters[i].name, if_name) == 0) { + return true; + } + } + + return false; +} + +int32_t +cf_inter_addr_to_index_and_name(const cf_ip_addr *addr, int32_t *index, char **name) +{ + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + for (uint32_t k = 0; k < entry->n_addrs; ++k) { + if (cf_ip_addr_compare(&entry->addrs[k], addr) == 0) { + if (name != NULL) { + *name = cf_strdup(entry->name); + } + + if (index != NULL) { + *index = (int32_t)entry->index; + } + + return 0; + } + } + } + + return -1; +} + +void +cf_inter_expand_bond(const char *if_name, char **out_names, uint32_t *n_out) +{ + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + uint32_t n = 0; + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + if (entry->master.entry == NULL || strcmp(entry->master.entry->name, if_name) != 0) { + continue; + } + + if (n >= *n_out) { + cf_crash(CF_SOCKET, "Output buffer overflow"); + } + + out_names[n] = cf_strdup(entry->name); + ++n; + } + + if (n == 0) { + out_names[0] = cf_strdup(if_name); + n = 1; + } 
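+	// n now counts either the bond's slaves or the single fallback name.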
+ + *n_out = n; +} + +int32_t +cf_inter_mtu(const cf_ip_addr *inter_addr) +{ + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + for (uint32_t k = 0; k < entry->n_addrs; ++k) { + cf_ip_addr *entry_addr = &entry->addrs[k]; + + if (cf_ip_addr_compare(inter_addr, entry_addr) == 0) { + return entry->mtu; + } + } + } + + return -1; +} + +int32_t +cf_inter_min_mtu(void) +{ + uint32_t min = UINT32_MAX; + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + if (entry->up && entry->mtu < min) { + min = entry->mtu; + } + } + + return (int32_t)min; +} + +static bool +detect_changes(bool legacy, cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit) +{ + cf_ip_addr curr[CF_SOCK_CFG_MAX]; + uint32_t n_curr = CF_SOCK_CFG_MAX; + int32_t res; + + if (legacy) { + res = cf_inter_get_addr_all_legacy(curr, &n_curr); + } + else { + res = cf_inter_get_addr_all(curr, &n_curr); + } + + if (res < 0) { + cf_crash(AS_INFO, "Error while getting interface addresses"); + } + + if (n_curr > limit) { + cf_crash(AS_INFO, "Too many network interface addresses: %d", n_curr); + } + + cf_ip_addr_sort(curr, n_curr); + uint32_t n_filter = 0; + + for (uint32_t i = 0; i < n_curr; ++i) { + if (cf_ip_addr_is_local(&curr[i])) { + continue; + } + + if (i > n_filter) { + cf_ip_addr_copy(&curr[i], &curr[n_filter]); + } + + ++n_filter; + } + + n_curr = n_filter; + bool change = false; + + if (n_curr != *n_addrs) { + change = true; + } + else { + for (uint32_t i = 0; i < n_curr; ++i) { + if (cf_ip_addr_compare(&addrs[i], &curr[i]) != 0) { + change = true; + break; + } + } + } + + if (change) { + for (uint32_t i = 0; i < n_curr; ++i) { + cf_ip_addr_copy(&curr[i], &addrs[i]); + } + + *n_addrs = n_curr; + } + + return change; +} + +bool +cf_inter_detect_changes(cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit) +{ + return detect_changes(false, addrs, n_addrs, limit); +} + +bool +cf_inter_detect_changes_legacy(cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit) +{ + return detect_changes(true, addrs, n_addrs, limit); +} + +static const char *if_in_order[] = { + "eth", "bond", "wlan", + NULL +}; + +static const char *if_default[] = { + "^eth[[:digit:]]+$", "^bond[[:digit:]]+$", "^wlan[[:digit:]]+$", + "^em[[:digit:]]+_[[:digit:]]+$", "^p[[:digit:]]+p[[:digit:]]+_[[:digit:]]+$", + NULL +}; + +static const char *if_default2[] = { + "^em[[:digit:]]+$", "^p[[:digit:]]+p[[:digit:]]+$", NULL +}; + +static const char *if_any[] = { + "^.*$", + NULL +}; + +static bool +validate_inter(inter_entry *entry) +{ + cf_debug(CF_SOCKET, "Validating interface %s", entry->name); + + if (entry->n_addrs == 0) { + cf_debug(CF_SOCKET, "No IP addresses"); + return false; + } + + if (entry->mac_addr_len < 6) { + cf_debug(CF_SOCKET, "Invalid MAC address length: %d", entry->mac_addr_len); + return false; + } + + static const uint8_t all0[6] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + static const uint8_t all1[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + + if (memcmp(entry->mac_addr, all0, 6) == 0 || memcmp(entry->mac_addr, all1, 6) == 0) { + cf_debug(CF_SOCKET, "Invalid MAC address: %02x:%02x:%02x:%02x:%02x:%02x", + entry->mac_addr[0], entry->mac_addr[1], entry->mac_addr[2], + entry->mac_addr[3], entry->mac_addr[4], entry->mac_addr[5]); + return false; + } + + cf_debug(CF_SOCKET, "Interface OK"); + 
return true; +} + +static inter_entry * +find_inter(inter_info *inter, const char *name, bool validate) +{ + cf_debug(CF_SOCKET, "Looking for %s", name); + + for (uint32_t i = 0; i < inter->n_inters; ++i) { + inter_entry *entry = &inter->inters[i]; + cf_debug(CF_SOCKET, "Checking %s", entry->name); + + if (strcmp(entry->name, name) == 0 && (!validate || validate_inter(entry))) { + return entry; + } + } + + return NULL; +} + +static inter_entry * +match_inter(inter_info *inter, const char **patterns) +{ + for (uint32_t i = 0; i < inter->n_inters; ++i) { + inter_entry *entry = &inter->inters[i]; + cf_debug(CF_SOCKET, "Matching %s", entry->name); + + for (uint32_t k = 0; patterns[k] != NULL; ++k) { + cf_debug(CF_SOCKET, "Matching with %s", patterns[k]); + regex_t rex; + + if (regcomp(&rex, patterns[k], REG_EXTENDED | REG_NOSUB) != 0) { + cf_crash(CF_SOCKET, "Error while compiling regular expression %s", patterns[k]); + } + + bool ok = regexec(&rex, entry->name, 0, NULL, 0) == 0 && validate_inter(entry); + regfree(&rex); + + if (ok) { + return entry; + } + } + } + + return NULL; +} + +int32_t +cf_node_id_get(cf_ip_port port, const char *if_hint, cf_node *id) +{ + cf_debug(CF_SOCKET, "Getting node ID"); + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + inter_entry *entry; + + if (if_hint != NULL) { + cf_debug(CF_SOCKET, "Checking user-specified interface %s", if_hint); + entry = find_inter(&inter, if_hint, false); + + if (entry != NULL) { + goto success; + } + + cf_warning(CF_SOCKET, "Unable to find interface %s specified in configuration file", + if_hint); + return -1; + } + + cf_debug(CF_SOCKET, "Trying default interfaces in order"); + + for (int32_t i = 0; if_in_order[i] != NULL; ++i) { + for (int32_t k = 0; k < 11; ++k) { + char tmp[100]; + snprintf(tmp, sizeof(tmp), "%s%d", if_in_order[i], k); + entry = find_inter(&inter, tmp, true); + + if (entry != NULL) { + goto success; + } + } + } + + cf_debug(CF_SOCKET, "Trying default interfaces"); + entry = match_inter(&inter, if_default); + + if (entry != NULL) { + goto success; + } + + cf_debug(CF_SOCKET, "Trying secondary default interfaces"); + entry = match_inter(&inter, if_default2); + + if (entry != NULL) { + goto success; + } + + cf_debug(CF_SOCKET, "Trying any interface"); + entry = match_inter(&inter, if_any); + + if (entry != NULL) { + goto success; + } + + cf_warning(CF_SOCKET, "Unable to find any suitable network device for node ID"); + return -1; + +success: + ; + uint8_t *buff = (uint8_t *)id; + + if (entry->mac_addr_len == 6) { + memcpy(buff, entry->mac_addr, 6); + } + else { + cf_digest dig; + cf_digest_compute(entry->mac_addr, entry->mac_addr_len, &dig); + memcpy(buff, dig.digest, 6); + } + + memcpy(buff + 6, &port, 2); + + cf_info(CF_SOCKET, "Node port %d, node ID %" PRIx64, port, *id); + return 0; +} diff --git a/cf/src/socket_ce.c b/cf/src/socket_ce.c new file mode 100644 index 00000000..0fa6c72f --- /dev/null +++ b/cf/src/socket_ce.c @@ -0,0 +1,459 @@ +/* + * socket_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#define CF_SOCKET_PRIVATE
+#include "socket.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+
+#include "fault.h"
+
+#include "citrusleaf/alloc.h"
+
+static char *
+safe_strndup(const char *string, size_t length)
+{
+	char *res = cf_strndup(string, length);
+
+	if (res == NULL) {
+		cf_crash(CF_SOCKET, "Out of memory");
+	}
+
+	return res;
+}
+
+void
+cf_socket_set_advertise_ipv6(bool advertise)
+{
+	cf_warning(CF_SOCKET, "'advertise-ipv6' is relevant for enterprise only");
+}
+
+bool
+cf_socket_advertises_ipv6(void)
+{
+	return false;
+}
+
+int32_t
+cf_ip_addr_from_string_multi(const char *string, cf_ip_addr *addrs, uint32_t *n_addrs)
+{
+	if (strcmp(string, "any") == 0) {
+		if (*n_addrs < 1) {
+			cf_warning(CF_SOCKET, "Too many IP addresses");
+			return -1;
+		}
+
+		cf_ip_addr_set_any(&addrs[0]);
+		*n_addrs = 1;
+		return 0;
+	}
+
+	if (strcmp(string, "local") == 0) {
+		if (*n_addrs < 1) {
+			cf_warning(CF_SOCKET, "Too many IP addresses");
+			return -1;
+		}
+
+		cf_ip_addr_set_local(&addrs[0]);
+		*n_addrs = 1;
+		return 0;
+	}
+
+	if (cf_inter_is_inter_name(string)) {
+		cf_ip_addr if_addrs[CF_SOCK_CFG_MAX];
+		uint32_t n_if_addrs = CF_SOCK_CFG_MAX;
+
+		if (cf_inter_get_addr_name(if_addrs, &n_if_addrs, string) < 0) {
+			cf_warning(CF_SOCKET, "Error while getting interface addresses for '%s'", string);
+			return -1;
+		}
+
+		if (n_if_addrs == 0) {
+			cf_warning(CF_SOCKET, "Interface %s does not have any IP addresses", string);
+			return -1;
+		}
+
+		if (n_if_addrs > *n_addrs) {
+			cf_warning(CF_SOCKET, "Too many IP addresses");
+			return -1;
+		}
+
+		for (uint32_t i = 0; i < n_if_addrs; ++i) {
+			cf_ip_addr_copy(&if_addrs[i], &addrs[i]);
+		}
+
+		*n_addrs = n_if_addrs;
+		return 0;
+	}
+
+	int32_t res = -1;
+	struct addrinfo *info = NULL;
+	static struct addrinfo hints = {
+		.ai_flags = 0,
+		.ai_family = AF_INET
+	};
+
+	int32_t x = getaddrinfo(string, NULL, &hints, &info);
+
+	if (x != 0) {
+		cf_warning(CF_SOCKET, "Error while converting address '%s': %s", string, gai_strerror(x));
+		goto cleanup0;
+	}
+
+	uint32_t i = 0;
+
+	for (struct addrinfo *walker = info; walker != NULL; walker = walker->ai_next) {
+		if (walker->ai_socktype == SOCK_STREAM) {
+			if (i >= *n_addrs) {
+				cf_warning(CF_SOCKET, "Too many IP addresses");
+				goto cleanup1;
+			}
+
+			struct sockaddr_in *sai = (struct sockaddr_in *)walker->ai_addr;
+			addrs[i].v4 = sai->sin_addr;
+			++i;
+		}
+	}
+
+	cf_ip_addr_sort(addrs, i);
+	*n_addrs = i;
+	res = 0;
+
+cleanup1:
+	freeaddrinfo(info);
+
+cleanup0:
+	return res;
+}
+
+bool
+cf_ip_addr_str_is_legacy(const char *string)
+{
+	(void)string;
+	return true;
+}
+
+bool
+cf_ip_addr_is_legacy(const cf_ip_addr* addr)
+{
+	(void)addr;
+	return true;
+}
+
+bool
+cf_ip_addr_legacy_only(void)
+{
+	return true;
+}
+
+int32_t
+cf_ip_addr_to_string(const cf_ip_addr *addr, char *string, size_t size)
+{
+	if (inet_ntop(AF_INET, &addr->v4, string, size) == NULL) {
+		cf_warning(CF_SOCKET, "Output buffer overflow");
+		return -1;
+	}
+
+	return strlen(string);
+}
+
+int32_t
+cf_ip_addr_from_binary(const uint8_t *binary, size_t size, cf_ip_addr *addr)
+{
+	
if (size != 4) { + cf_debug(CF_SOCKET, "Input buffer size incorrect."); + return -1; + } + + memcpy(&addr->v4, binary, 4); + return 4; +} + +int32_t +cf_ip_addr_to_binary(const cf_ip_addr *addr, uint8_t *binary, size_t size) +{ + if (size < 4) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + memcpy(binary, &addr->v4, 4); + return 4; +} + +void +cf_ip_addr_to_rack_aware_id(const cf_ip_addr *addr, uint32_t *id) +{ + *id = ntohl(addr->v4.s_addr); +} + +int32_t +cf_ip_addr_compare(const cf_ip_addr *lhs, const cf_ip_addr *rhs) +{ + return memcmp(&lhs->v4, &rhs->v4, 4); +} + +void +cf_ip_addr_copy(const cf_ip_addr *from, cf_ip_addr *to) +{ + to->v4 = from->v4; +} + +void +cf_ip_addr_set_local(cf_ip_addr *addr) +{ + addr->v4.s_addr = htonl(0x7f000001); +} + +bool +cf_ip_addr_is_local(const cf_ip_addr *addr) +{ + return (ntohl(addr->v4.s_addr) & 0xff000000) == 0x7f000000; +} + +void +cf_ip_addr_set_any(cf_ip_addr *addr) +{ + addr->v4.s_addr = 0; +} + +bool +cf_ip_addr_is_any(const cf_ip_addr *addr) +{ + return addr->v4.s_addr == 0; +} + +int32_t +cf_sock_addr_to_string(const cf_sock_addr *addr, char *string, size_t size) +{ + int32_t total = 0; + int32_t count = cf_ip_addr_to_string(&addr->addr, string, size); + + if (count < 0) { + return -1; + } + + total += count; + + if (size - total < 2) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + string[total++] = ':'; + string[total] = 0; + + count = cf_ip_port_to_string(addr->port, string + total, size - total); + + if (count < 0) { + return -1; + } + + total += count; + return total; +} + +int32_t +cf_sock_addr_from_string(const char *string, cf_sock_addr *addr) +{ + int32_t res = -1; + const char *colon = strchr(string, ':'); + + if (colon == NULL) { + cf_warning(CF_SOCKET, "Missing ':' in socket address '%s'", string); + goto cleanup0; + } + + const char *host = safe_strndup(string, colon - string); + + if (cf_ip_addr_from_string(host, &addr->addr) < 0) { + cf_warning(CF_SOCKET, "Invalid host address '%s' in socket address '%s'", host, string); + goto cleanup1; + } + + if (cf_ip_port_from_string(colon + 1, &addr->port) < 0) { + cf_warning(CF_SOCKET, "Invalid port '%s' in socket address '%s'", colon + 1, string); + goto cleanup1; + } + + res = 0; + +cleanup1: + cf_free((void *)host); + +cleanup0: + return res; +} + +void +cf_sock_addr_from_native(const struct sockaddr *native, cf_sock_addr *addr) +{ + if (native->sa_family != AF_INET) { + cf_crash(CF_SOCKET, "Invalid address family: %d", native->sa_family); + } + + struct sockaddr_in *sai = (struct sockaddr_in *)native; + addr->addr.v4 = sai->sin_addr; + addr->port = ntohs(sai->sin_port); +} + +void +cf_sock_addr_to_native(const cf_sock_addr *addr, struct sockaddr *native) +{ + struct sockaddr_in *sai = (struct sockaddr_in *)native; + memset(sai, 0, sizeof(struct sockaddr_in)); + sai->sin_family = AF_INET; + sai->sin_addr = addr->addr.v4; + sai->sin_port = htons(addr->port); +} + +int32_t +cf_mserv_cfg_add_combo(cf_mserv_cfg *serv_cfg, cf_sock_owner owner, cf_ip_port port, + cf_ip_addr *addr, cf_ip_addr *if_addr, uint8_t ttl) +{ + cf_msock_cfg sock_cfg; + cf_msock_cfg_init(&sock_cfg, owner); + sock_cfg.port = port; + cf_ip_addr_copy(addr, &sock_cfg.addr); + cf_ip_addr_copy(if_addr, &sock_cfg.if_addr); + sock_cfg.ttl = ttl; + + return cf_mserv_cfg_add_msock_cfg(serv_cfg, &sock_cfg); +} + +int32_t +cf_socket_mcast_set_inter(cf_socket *sock, const cf_ip_addr *iaddr) +{ + struct ip_mreqn mr; + memset(&mr, 0, sizeof(mr)); + mr.imr_address = iaddr->v4; + + 
if (setsockopt(sock->fd, IPPROTO_IP, IP_MULTICAST_IF, &mr, sizeof(mr)) < 0) { + cf_warning(CF_SOCKET, "setsockopt(IP_MULTICAST_IF) failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + return -1; + } + + return 0; +} + +int32_t +cf_socket_mcast_set_ttl(cf_socket *sock, int32_t ttl) +{ + if (setsockopt(sock->fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)) < 0) { + cf_warning(CF_SOCKET, "setsockopt(IP_MULTICAST_TTL) failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + return -1; + } + + return 0; +} + +int32_t +cf_socket_mcast_join_group(cf_socket *sock, const cf_ip_addr *iaddr, const cf_ip_addr *gaddr) +{ + struct ip_mreqn mr; + memset(&mr, 0, sizeof(mr)); + + if (!cf_ip_addr_is_any(iaddr)) { + mr.imr_address = iaddr->v4; + } + + mr.imr_multiaddr = gaddr->v4; + + if (setsockopt(sock->fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr)) < 0) { + cf_warning(CF_SOCKET, "setsockopt(IP_ADD_MEMBERSHIP) failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + return -1; + } + +#ifdef IP_MULTICAST_ALL + // Only receive traffic from multicast groups this socket actually joins. + // Note: Bind address filtering takes precedence, so this is simply an extra level of + // restriction. + static const int32_t no = 0; + + if (setsockopt(sock->fd, IPPROTO_IP, IP_MULTICAST_ALL, &no, sizeof(no)) < 0) { + cf_warning(CF_SOCKET, "setsockopt(IP_MULTICAST_ALL) failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + return -1; + } +#endif + + return 0; +} + +size_t +cf_socket_addr_len(const struct sockaddr *sa) +{ + switch (sa->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + + default: + cf_crash(CF_SOCKET, "Invalid address family: %d", sa->sa_family); + return 0; + } +} + +int32_t +cf_socket_parse_netlink(bool allow_ipv6, uint32_t family, uint32_t flags, + const void *data, size_t len, cf_ip_addr *addr) +{ + (void)allow_ipv6; + (void)flags; + + if (family != AF_INET || len != 4) { + return -1; + } + + memcpy(&addr->v4, data, 4); + return 0; +} + +void +cf_socket_fix_client(cf_socket *sock) +{ + (void)sock; +} + +void +cf_socket_fix_bind(cf_serv_cfg *serv_cfg) +{ + (void)serv_cfg; +} + +void +cf_socket_fix_server(cf_socket *sock) +{ + (void)sock; +} diff --git a/cf/src/tls_ce.c b/cf/src/tls_ce.c new file mode 100644 index 00000000..c86438cf --- /dev/null +++ b/cf/src/tls_ce.c @@ -0,0 +1,159 @@ +/* + * tls.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ +#include + +#include "fault.h" +#include "socket.h" +#include "tls.h" + +void +tls_check_init() +{ +} + +void +tls_cleanup() +{ +} + +void +tls_thread_cleanup() +{ +} + +void +tls_socket_init(cf_socket *sock) +{ + sock->ssl = NULL; +} + +void +tls_socket_term(cf_socket *sock) +{ + if (sock->ssl) { + cf_crash(CF_TLS, "unexpected TLS state"); + } +} + +int +tls_socket_shutdown(cf_socket *sock) +{ + if (sock->ssl) { + cf_crash(CF_TLS, "unexpected TLS state"); + } + return -1; +} + +void +tls_socket_close(cf_socket *sock) +{ + if (sock->ssl) { + cf_crash(CF_TLS, "unexpected TLS state"); + } +} + +cf_tls_info * +tls_config_server_context(cf_tls_spec *tspec, bool auth_client, uint32_t n_peer_names, char **peer_names) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return NULL; +} + +cf_tls_info * +tls_config_intra_context(cf_tls_spec *tspec, const char *which) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return NULL; +} + +void +tls_socket_prepare_server(cf_tls_info *info, cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); +} + +void +tls_socket_prepare_client(cf_tls_info *info, cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); +} + +void +tls_socket_must_not_have_data(cf_socket *sock, const char *caller) +{ + if (sock->state == CF_SOCKET_STATE_NON_TLS) { + return; + } + + cf_crash(CF_TLS, "unexpected TLS state"); +} + +int +tls_socket_accept(cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_connect(cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_accept_block(cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_connect_block(cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_recv(cf_socket *sock, void *buf, size_t sz, int32_t flags, + uint64_t deadline_msec) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_send(cf_socket *sock, void const *buf, size_t sz, int32_t flags, + uint64_t deadline_msec) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_pending(cf_socket *sock) +{ + return 0; +} + diff --git a/cf/src/vmapx.c b/cf/src/vmapx.c new file mode 100644 index 00000000..c2ad4a9b --- /dev/null +++ b/cf/src/vmapx.c @@ -0,0 +1,398 @@ +/* + * vmapx.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+//
+
+#include "vmapx.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "fault.h"
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_hash_math.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+bool vhash_get(const vhash* h, const char* key, size_t key_len, uint32_t* p_value);
+
+
+//==========================================================
+// Public API.
+//
+
+// Return memory size needed - includes cf_vmapx struct plus values vector.
+size_t
+cf_vmapx_sizeof(uint32_t value_size, uint32_t max_count)
+{
+	return sizeof(cf_vmapx) + ((size_t)value_size * (size_t)max_count);
+}
+
+// Initialize an already allocated cf_vmapx object.
+void
+cf_vmapx_init(cf_vmapx* vmap, uint32_t value_size, uint32_t max_count,
+		uint32_t hash_size, uint32_t max_name_size)
+{
+	cf_assert(vmap, CF_VMAPX, "null vmap pointer");
+	cf_assert((value_size & 3) == 0, CF_VMAPX, "bad value_size");
+	cf_assert(max_count != 0, CF_VMAPX, "bad max_count");
+	cf_assert(hash_size != 0, CF_VMAPX, "bad hash_size");
+	cf_assert(max_name_size != 0 && max_name_size <= value_size, CF_VMAPX,
+			"bad max_name_size");
+
+	vmap->value_size = value_size;
+	vmap->max_count = max_count;
+	vmap->count = 0;
+
+	vmap->key_size = max_name_size;
+	vmap->hash = vhash_create(max_name_size, hash_size);
+
+	pthread_mutex_init(&vmap->write_lock, 0);
+}
+
+// Don't call after failed cf_vmapx_create() or cf_vmapx_resume() call - those
+// functions clean up on failure.
+void
+cf_vmapx_release(cf_vmapx* vmap)
+{
+	// Helps in handling bins vmap, which doesn't exist in single-bin mode.
+	if (! vmap) {
+		return;
+	}
+
+	pthread_mutex_destroy(&vmap->write_lock);
+
+	vhash_destroy(vmap->hash);
+}
+
+// Return count.
+uint32_t
+cf_vmapx_count(const cf_vmapx* vmap)
+{
+	return vmap->count;
+}
+
+// Get value by index.
+cf_vmapx_err
+cf_vmapx_get_by_index(const cf_vmapx* vmap, uint32_t index, void** pp_value)
+{
+	// This check is commented out for now to avoid the volatile access.
+	// TODO - ultimately, caller code can be simplified. (Especially if this
+	// just returned the value pointer.) And if necessary, we could make a
+	// "safe" version of this that does the check.
+
+//	if (index >= vmap->count) {
+//		return CF_VMAPX_ERR_BAD_PARAM;
+//	}
+
+	*pp_value = vmapx_value_ptr(vmap, index);
+
+	return CF_VMAPX_OK;
+}
+
+// Get value by null-terminated name.
+cf_vmapx_err
+cf_vmapx_get_by_name(const cf_vmapx* vmap, const char* name, void** pp_value)
+{
+	size_t name_len = strlen(name);
+
+	if (name_len >= vmap->key_size) {
+		return CF_VMAPX_ERR_NAME_NOT_FOUND;
+	}
+
+	uint32_t index;
+
+	if (! vhash_get(vmap->hash, name, name_len, &index)) {
+		return CF_VMAPX_ERR_NAME_NOT_FOUND;
+	}
+
+	*pp_value = vmapx_value_ptr(vmap, index);
+
+	return CF_VMAPX_OK;
+}
+
+// Get index by null-terminated name. May pass null p_index to check existence.
+cf_vmapx_err
+cf_vmapx_get_index(const cf_vmapx* vmap, const char* name, uint32_t* p_index)
+{
+	size_t name_len = strlen(name);
+
+	if (name_len >= vmap->key_size) {
+		return CF_VMAPX_ERR_NAME_NOT_FOUND;
+	}
+
+	return vhash_get(vmap->hash, name, name_len, p_index) ?
+			CF_VMAPX_OK : CF_VMAPX_ERR_NAME_NOT_FOUND;
+}
+
+// Same as above, but non-null-terminated name.
+cf_vmapx_err
+cf_vmapx_get_index_w_len(const cf_vmapx* vmap, const char* name,
+		size_t name_len, uint32_t* p_index)
+{
+	if (name_len >= vmap->key_size) {
+		return CF_VMAPX_ERR_NAME_NOT_FOUND;
+	}
+
+	return vhash_get(vmap->hash, name, name_len, p_index) ?
+			CF_VMAPX_OK : CF_VMAPX_ERR_NAME_NOT_FOUND;
+}
+
+// The value must begin with a string which is its name. (The hash map is not
+// stored in persistent memory, so names must be in the vector to enable us to
+// rebuild the hash map on warm or cool restart.)
+//
+// If name is not found, add new name, clear rest of value in vector, and return
+// newly assigned index (and CF_VMAPX_OK). If name is found, return index for
+// existing value (with CF_VMAPX_ERR_NAME_EXISTS). May pass null p_index.
+cf_vmapx_err
+cf_vmapx_put_unique(cf_vmapx* vmap, const char* name, uint32_t* p_index)
+{
+	return cf_vmapx_put_unique_w_len(vmap, name, strlen(name), p_index);
+}
+
+// Same as above, but with known name length.
+cf_vmapx_err
+cf_vmapx_put_unique_w_len(cf_vmapx* vmap, const char* name, size_t name_len,
+		uint32_t* p_index)
+{
+	// Make sure name fits in key's allocated size.
+	if (name_len >= vmap->key_size) {
+		return CF_VMAPX_ERR_BAD_PARAM;
+	}
+
+	pthread_mutex_lock(&vmap->write_lock);
+
+	// If name is found, return the existing name's index.
+	if (vhash_get(vmap->hash, name, name_len, p_index)) {
+		pthread_mutex_unlock(&vmap->write_lock);
+		return CF_VMAPX_ERR_NAME_EXISTS;
+	}
+
+	// Make sure name has no illegal premature null-terminator.
+	for (uint32_t i = 0; i < name_len; i++) {
+		if (name[i] == 0) {
+			pthread_mutex_unlock(&vmap->write_lock);
+			return CF_VMAPX_ERR_BAD_PARAM;
+		}
+	}
+
+	uint32_t count = vmap->count;
+
+	// If vmap is full, can't add more.
+	if (count >= vmap->max_count) {
+		pthread_mutex_unlock(&vmap->write_lock);
+		return CF_VMAPX_ERR_FULL;
+	}
+
+	// Add name to vector (and clear rest of value).
+	char* value_ptr = (char*)vmapx_value_ptr(vmap, count);
+
+	memset((void*)value_ptr, 0, vmap->value_size);
+	memcpy((void*)value_ptr, name, name_len);
+
+	// Increment count here so indexes returned by other public API calls (just
+	// after adding to hash below) are guaranteed to be valid.
+	vmap->count++;
+
+	// Add to hash.
+	vhash_put(vmap->hash, value_ptr, name_len, count);
+
+	pthread_mutex_unlock(&vmap->write_lock);
+
+	if (p_index) {
+		*p_index = count;
+	}
+
+	return CF_VMAPX_OK;
+}
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+//
+
+// Return value pointer at trusted index.
+void*
+vmapx_value_ptr(const cf_vmapx* vmap, uint32_t index)
+{
+	return (void*)(vmap->values + (vmap->value_size * index));
+}
+
+
+//==========================================================
+// vhash "scoped class".
+//
+
+// Custom hashmap for cf_vmapx usage.
+// - Elements are added but never removed.
+// - It's thread safe yet lockless. (Relies on cf_vmapx's write_lock.)
+// - Element keys are null-terminated strings.
+// - Element values are uint32_t's.
+
+struct vhash_s {
+	uint32_t key_size;
+	uint32_t ele_size;
+	uint32_t n_rows;
+	uint8_t* table;
+	bool row_usage[];
+};
+
+typedef struct vhash_ele_s {
+	struct vhash_ele_s* next;
+	uint8_t data[]; // key_size bytes of key, 4 bytes of value
+} vhash_ele;
+
+#define VHASH_ELE_KEY_PTR(_e) ((char*)_e->data)
+#define VHASH_ELE_VALUE_PTR(_h, _e) ((uint32_t*)(_e->data + _h->key_size))
+
+// Copy null-terminated key into hash, then pad with non-null characters.
+// Padding ensures quicker compare in vhash_get() when key in hash is shorter,
+// and prevents accidental match if key param has illegal null character(s).
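+// (For example, with key_size 8, "ns1" is stored as 'n' 's' '1' '\0' 'x' 'x' 'x' 'x'.)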
+static inline void
+vhash_set_ele_key(char* ele_key, size_t key_size, const char* zkey,
+		size_t zkey_size)
+{
+	memcpy((void*)ele_key, (const void*)zkey, zkey_size);
+	memset((void*)(ele_key + zkey_size), 'x', key_size - zkey_size);
+}
+
+// Create vhash with specified key size (max) and number of rows.
+vhash*
+vhash_create(uint32_t key_size, uint32_t n_rows)
+{
+	size_t row_usage_size = n_rows * sizeof(bool);
+	vhash* h = (vhash*)cf_malloc(sizeof(vhash) + row_usage_size);
+
+	h->key_size = key_size;
+	h->ele_size = sizeof(vhash_ele) + key_size + sizeof(uint32_t);
+	h->n_rows = n_rows;
+
+	size_t table_size = n_rows * h->ele_size;
+
+	h->table = (uint8_t*)cf_malloc(table_size);
+
+	memset((void*)h->row_usage, 0, row_usage_size);
+	memset((void*)h->table, 0, table_size);
+
+	return h;
+}
+
+// Destroy vhash. (Assumes it was fully created.)
+void
+vhash_destroy(vhash* h)
+{
+	vhash_ele* e_table = (vhash_ele*)h->table;
+
+	for (uint32_t i = 0; i < h->n_rows; i++) {
+		if (e_table->next) {
+			vhash_ele* e = e_table->next;
+
+			while (e) {
+				vhash_ele* t = e->next;
+
+				cf_free(e);
+				e = t;
+			}
+		}
+
+		e_table = (vhash_ele*)((uint8_t*)e_table + h->ele_size);
+	}
+
+	cf_free(h->table);
+	cf_free(h);
+}
+
+// Add element. Key must be null-terminated, although its length is known.
+void
+vhash_put(vhash* h, const char* zkey, size_t key_len, uint32_t value)
+{
+	uint64_t hashed_key = cf_hash_fnv32((const uint8_t*)zkey, key_len);
+	uint32_t row_i = (uint32_t)(hashed_key % h->n_rows);
+
+	vhash_ele* e = (vhash_ele*)(h->table + (h->ele_size * row_i));
+
+	if (! h->row_usage[row_i]) {
+		vhash_set_ele_key(VHASH_ELE_KEY_PTR(e), h->key_size, zkey, key_len + 1);
+		*VHASH_ELE_VALUE_PTR(h, e) = value;
+		// TODO - need barrier?
+		h->row_usage[row_i] = true;
+
+		return;
+	}
+
+	vhash_ele* e_head = e;
+
+	// This function is always called under write lock, after get, so we'll
+	// never encounter the key - don't bother checking it.
+	while (e) {
+		e = e->next;
+	}
+
+	e = (vhash_ele*)cf_malloc(h->ele_size);
+
+	vhash_set_ele_key(VHASH_ELE_KEY_PTR(e), h->key_size, zkey, key_len + 1);
+	*VHASH_ELE_VALUE_PTR(h, e) = value;
+
+	e->next = e_head->next;
+	// TODO - need barrier?
+	e_head->next = e;
+}
+
+// Get element value. Key may or may not be null-terminated.
+bool
+vhash_get(const vhash* h, const char* key, size_t key_len, uint32_t* p_value)
+{
+	uint64_t hashed_key = cf_hash_fnv32((const uint8_t*)key, key_len);
+	uint32_t row_i = (uint32_t)(hashed_key % h->n_rows);
+
+	if (! h->row_usage[row_i]) {
+		return false;
+	}
+
+	// TODO - need barrier?
+	vhash_ele* e = (vhash_ele*)(h->table + (h->ele_size * row_i));
+
+	while (e) {
+		if (VHASH_ELE_KEY_PTR(e)[key_len] == 0 &&
+				memcmp(VHASH_ELE_KEY_PTR(e), key, key_len) == 0) {
+			if (p_value) {
+				*p_value = *VHASH_ELE_VALUE_PTR(h, e);
+			}
+
+			return true;
+		}
+
+		e = e->next;
+	}
+
+	return false;
+}
diff --git a/make_in/Makefile.in b/make_in/Makefile.in
new file mode 100644
index 00000000..de078017
--- /dev/null
+++ b/make_in/Makefile.in
@@ -0,0 +1,97 @@
+# Aerospike Server
+# Makefile.in
+#
+# Define project global settings for compiler options.
+#
+
+# [Note: "DEPTH" must be defined by the "include"ing Makefile.]
+
+# Common variable definitions:
+include $(DEPTH)/make_in/Makefile.vars
+
+CF_INCLUDE_DIR = $(DEPTH)/common/src/include
+CF_LIB_DIR = $(DEPTH)/common/target/$(PLATFORM)/lib
+
+AS_CFLAGS += -D_FILE_OFFSET_BITS=64 -std=gnu99 -D_REENTRANT -D_GNU_SOURCE
+
+# Use the enhanced memory allocator (rather than the default version in the Common module).
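+# (The enhanced allocator pairs with the bundled JEMalloc library, which is
+# linked statically further down in this file.)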
+AS_CFLAGS += -DENHANCED_ALLOC
+
+LIBRARIES += -lcrypto
+
+LIBRARIES += -lpthread -lrt -ldl -lz -lm
+
+# Location of source files being compiled:
+# "" (the default) means "src". Automatically set to "../m4/" during non-zero macro expansion build phases.
+SRCDIR =
+
+INCLUDES += -I$(JEMALLOC)/include
+LIBRARIES := $(JEMALLOC)/lib/libjemalloc.a $(LIBRARIES)
+
+# Popular values:
+#   x86_64 for 64-bit Intel
+#   i686 for 32-bit Intel
+MARCH_NATIVE = $(shell uname -m)
+
+# If GCC v4.4.7 or later, use DWARF version 4, otherwise use version 2:
+ifeq ($(shell $(DEPTH)/build/VersionCheck.py 'gcc -dumpversion' 4.4.7), 1)
+  DWARF_VERSION=4
+else
+  DWARF_VERSION=2
+endif
+
+# Override optimizations via: make O=n
+O = 3
+
+ifeq ($(DOPROFILE),1)
+  OPTFLAGS = -O$(O) -fPIE -pg -fprofile-arcs -ftest-coverage -DDOPROFILE
+else
+  OPTFLAGS = -O$(O)
+endif
+
+# Strict aliasing is really interesting. -fno-strict-aliasing relaxes the coding rules, but makes
+# some code run a little slower. I'm not seeing a real difference at the moment, so it's turned off.
+# I think to get the real speed, you turn on -fstrict-aliasing, and the appropriate -W, which
+# generates a few warnings in our code where type punning is used for printf sanity. This
+# also interacts with the restrict keyword, which I understand in theory, but attempts to use
+# it throw errors out of the compiler so I don't understand it yet.
+#
+# Removing the frame pointers does add a few percent in speed, too, but we need better debugging
+# at this point...
+#
+# And the jury's a little out on -msse3 and -msse4. They aren't turned on by -march=native,
+# even though native should understand that those are the preferred types, and available.
+#
+# Tree vectorization is turned on in the O3 mechanism. It's fascinating to turn on the tree vectorize
+# debugs. Very rarely do we have loops that vectorize, because we often use functions in our loops.
+# And sometimes loops are vectorized but need lengths greater than 20 or 30 to show speed improvements;
+# loops of this size are unlikely in our code.
+#
+# O3 also enables -finline-functions, among other things.
+COMMON_CFLAGS = -gdwarf-$(DWARF_VERSION) -g3 $(OPTFLAGS) -fno-common -fno-strict-aliasing -Wall $(AS_CFLAGS) $(AS_EE_CFLAGS)
+
+# Code generated for the "nocona" architecture has been determined to run well on a wide variety of current machines.
+ifneq ($(ARCH),$(filter $(ARCH),ppc64 ppc64le))
+  COMMON_CFLAGS += -march=nocona
+endif
+
+# Generate dependency files.
+COMMON_CFLAGS += -MMD
+
+# Require strict warning-free compilation.
+COMMON_CFLAGS += -Werror
+
+CFLAGS = $(COMMON_CFLAGS) -DMARCH_$(MARCH_NATIVE)
+
+# Define a macro for the base source file name.
+DEF_FN += -D__FILENAME__=\"$(notdir $<)\"
+
+# Alternative Compiler Flags Settings:
+# Note: "native" is optimized for the build environment, which might not be the same as the deployment environment:
+#CFLAGS_NATIVE = $(COMMON_CFLAGS) -march=native
+#CFLAGS_64 = $(COMMON_CFLAGS) -DMARCH_x86_64
+#CFLAGS_32 = $(COMMON_CFLAGS) -DMARCH_i686
+
+LDFLAGS = -rdynamic -L$(CF_LIB_DIR)
+
+STRIP = strip -p -v -s
diff --git a/make_in/Makefile.targets b/make_in/Makefile.targets
new file mode 100644
index 00000000..92f0904f
--- /dev/null
+++ b/make_in/Makefile.targets
@@ -0,0 +1,16 @@
+# Aerospike Server
+# Makefile.targets
+#
+# Common Makefile targets, dependencies, and pattern-matching rules.
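+# Dependency tracking: the -MMD flag from Makefile.in emits .d files, which
+# the "-include $(DEPENDENCIES)" line below pulls back in.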
+#
+
+strip: $(SERVER)
+	$(STRIP) $(SERVER) -o $(SERVER).stripped
+
+-include $(DEPENDENCIES)
+
+$(OBJECT_DIR)/%.o: %.c
+	$(CC) $(CFLAGS) $(DEF_FN) -o $@$(SUFFIX) -c $(INCLUDES) $(SRCDIR)$<
+
+$(OBJECT_DIR)/%.o: %.cc
+	$(CXX) $(CXXFLAGS) $(CFLAGS) $(DEF_FN) -o $@$(SUFFIX) -c $(INCLUDES) $(SRCDIR)$<
diff --git a/make_in/Makefile.vars b/make_in/Makefile.vars
new file mode 100644
index 00000000..54296e80
--- /dev/null
+++ b/make_in/Makefile.vars
@@ -0,0 +1,98 @@
+# Aerospike Server
+# Makefile.vars
+#
+# Common Makefile variables.
+#
+# To enable or disable the following features, add <FEATURE>=(1|0) to the "make" command line.
+# E.g., to build without JEMalloc support, use:
+#
+#   prompt% make USE_JEM=0
+#
+# To link with the static or dynamic version of a library, add "LD_<LIB>=(static|dynamic)",
+# where <LIB> is "CRYPTO", "LUA", "LUAJIT", "JANSSON" or "JEM", to the "make" command line.
+# E.g., to build with JEMalloc dynamically linked, use:
+#
+#   prompt% make LD_JEM=dynamic
+#
+# [Note: "EXT_CFLAGS" contains "external" CFLAGS passed to sub-module builds.]
+#
+
+ifneq ($(EEREPO),)
+  include $(EEREPO)/make_in/Makefile.vars
+endif
+
+# By default, build the community edition.
+EDITION = community
+
+# Build host machine architecture.
+ARCH = $(shell uname -m)
+
+# Use LuaJIT instead of Lua? [By default, yes.]
+USE_LUAJIT = 1
+ifeq ($(ARCH),$(filter $(ARCH),ppc64 ppc64le))
+  USE_LUAJIT = 0
+endif
+
+# Default mode used for linking the Jansson JSON API Library:
+LD_JANSSON = static
+
+# Default mode used for linking the LuaJIT library:
+LD_LUAJIT = static
+
+# Default mode used for linking the Lua library:
+LD_LUA = static
+
+# Options to pass to Jansson's "configure" script.
+JANSSON_CONFIG_OPT =
+
+# Options to pass to JEMalloc's "configure" script.
+JEM_CONFIG_OPT = "EXTRA_CFLAGS=-I/opt/valgrind/include -I/usr/local/include" --with-jemalloc-prefix=jem_
+
+EXT_CFLAGS += -DENHANCED_ALLOC
+
+# Set the default depth to the top level unless overridden:
+DEPTH ?= .
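+# (Sub-makefiles override this; e.g. pkg/deb/Makefile sets DEPTH = ../..
+# before including this file.)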
+ +# Directory structure for build products: + +TARGET_DIR = $(DEPTH)/target + +PLATFORM = $(shell uname)-$(ARCH) +BUILD_DIR = $(TARGET_DIR)/$(PLATFORM) + +GEN_DIR = $(BUILD_DIR)/gen +INCLUDE_DIR = ../include $(GEN_DIR) +OBJECT_DIR = $(BUILD_DIR)/obj +LIBRARY_DIR = $(BUILD_DIR)/lib +BIN_DIR = $(BUILD_DIR)/bin + +# Auto-generated version files: +VERSION_SRC = $(GEN_DIR)/version.c +VERSION_OBJ = $(VERSION_SRC:$(GEN_DIR)/%.c=$(OBJECT_DIR)/%.o) + +# Paths to the submodules: +AI_PATH := $(realpath $(DEPTH)/ai) +AS_PATH := $(realpath $(DEPTH)/as) +CF_PATH := $(realpath $(DEPTH)/cf) +COMMON_PATH := $(realpath $(DEPTH)/modules/common) +JANSSON_PATH := $(realpath $(DEPTH)/modules/jansson) +MOD_LUA_PATH := $(realpath $(DEPTH)/modules/mod-lua) +LUA_CORE_PATH := $(realpath $(DEPTH)/modules/lua-core) +JEMALLOC_PATH := $(realpath $(DEPTH)/modules/jemalloc) +LUAJIT_PATH := $(realpath $(DEPTH)/modules/luajit) +S2_PATH := $(realpath $(DEPTH)/modules/s2-geometry-library/geometry) + +# Overridable values used by sub-makefiles: +AI = $(AI_PATH) +AS = $(AS_PATH) +CF = $(CF_PATH) +COMMON = $(COMMON_PATH) +JANSSON = $(JANSSON_PATH) +MOD_LUA = $(MOD_LUA_PATH) +LUA_CORE = $(LUA_CORE_PATH) +JEMALLOC = $(JEMALLOC_PATH) +LUAJIT = $(LUAJIT_PATH) +S2 = $(S2_PATH) + +# Programs, for which GNU Make doesn't define implicit variables: +OBJCOPY := objcopy diff --git a/modules/common b/modules/common new file mode 160000 index 00000000..fc2dd1df --- /dev/null +++ b/modules/common @@ -0,0 +1 @@ +Subproject commit fc2dd1df4268f15674752440d568918a58b40eb7 diff --git a/modules/jansson b/modules/jansson new file mode 160000 index 00000000..5cc594c9 --- /dev/null +++ b/modules/jansson @@ -0,0 +1 @@ +Subproject commit 5cc594c9e8bc01f9531f80aba82c9775bba94c18 diff --git a/modules/jemalloc b/modules/jemalloc new file mode 160000 index 00000000..92192432 --- /dev/null +++ b/modules/jemalloc @@ -0,0 +1 @@ +Subproject commit 921924328a0bf4204feb2a315415170b8370223c diff --git a/modules/lua-core b/modules/lua-core new file mode 160000 index 00000000..acb9eb1e --- /dev/null +++ b/modules/lua-core @@ -0,0 +1 @@ +Subproject commit acb9eb1ec2dda2c64375b28c7fb08a0518aadd27 diff --git a/modules/luajit b/modules/luajit new file mode 160000 index 00000000..6c4a1825 --- /dev/null +++ b/modules/luajit @@ -0,0 +1 @@ +Subproject commit 6c4a18258631ff01f963e9a1e64df57d7a453fd6 diff --git a/modules/mod-lua b/modules/mod-lua new file mode 160000 index 00000000..5293a5c1 --- /dev/null +++ b/modules/mod-lua @@ -0,0 +1 @@ +Subproject commit 5293a5c10567269ac194c705a6910277aff1d2a1 diff --git a/modules/s2-geometry-library b/modules/s2-geometry-library new file mode 160000 index 00000000..97562341 --- /dev/null +++ b/modules/s2-geometry-library @@ -0,0 +1 @@ +Subproject commit 975623412e292079b962bf73983bfb6ac63f3ba9 diff --git a/modules/telemetry b/modules/telemetry new file mode 160000 index 00000000..611e169a --- /dev/null +++ b/modules/telemetry @@ -0,0 +1 @@ +Subproject commit 611e169a7d60d803e0de0fc92c35364a3a94f33c diff --git a/pkg/deb/Makefile b/pkg/deb/Makefile new file mode 100644 index 00000000..45d327fb --- /dev/null +++ b/pkg/deb/Makefile @@ -0,0 +1,148 @@ +# Build Aerospike Server ".deb" Distribution. + +DEPTH = ../.. 
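+# Makefile.vars supplies EDITION, PLATFORM, BIN_DIR and the other shared
+# build variables used below.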
+include $(DEPTH)/make_in/Makefile.vars + +PKG = $(realpath $(DEPTH)/pkg) +SOURCE_ROOT = $(PKG)/dist +BUILD_ROOT = $(SOURCE_ROOT)/BUILD +OPT_AS = $(BUILD_ROOT)/opt/aerospike + +REV = $(shell $(DEPTH)/build/version) +OS = $(shell $(DEPTH)/build/os_version) +SIZE = $(shell du -k $(BIN_DIR)/asd | cut -f1) +DEPS = + +comma:= , +empty:= +space:= $(empty) $(empty) + +ifeq ($(OS),$(filter $(OS),debian8 ubuntu16.04)) + USE_SYSTEMD = 1 + CONF_VERSION = _systemd + DEPS = +else + DEPS = logrotate +endif + +ifeq ($(EDITION),community) + DEPS += python +endif + +DEB = $(PKG)/packages/aerospike-server-$(EDITION)-$(REV).$(OS).x86_64.deb + +ifeq ($(USE_EE),1) +all: dist-xdr package clean +else +all: dist package clean +endif + +.PHONY: dist +dist: + install -d $(BUILD_ROOT)/DEBIAN + install -d $(BUILD_ROOT)/etc/aerospike + install -d $(BUILD_ROOT)/etc/aerospike/sample +ifeq ($(USE_SYSTEMD),1) + install -d $(BUILD_ROOT)/usr/lib/systemd/system + install -d $(BUILD_ROOT)/etc/systemd/system/aerospike.service.d +else + install -d $(BUILD_ROOT)/etc/init.d + install -d $(BUILD_ROOT)/etc/logrotate.d + install -d $(BUILD_ROOT)/var/log/aerospike + install -d $(BUILD_ROOT)/var/run/aerospike +endif + install -d $(BUILD_ROOT)/usr/bin + + install -pm 644 $(PKG)/deb/conffiles $(BUILD_ROOT)/DEBIAN +ifeq ($(EDITION),community) + cat $(PKG)/deb/conffiles.telemetry >> $(BUILD_ROOT)/DEBIAN/conffiles +endif + install -pm 755 $(PKG)/deb/postinst.server $(BUILD_ROOT)/DEBIAN/postinst + + install -pm 755 $(BIN_DIR)/asd $(BUILD_ROOT)/usr/bin/asd +ifeq ($(USE_SYSTEMD),1) + install -pm 755 $(DEPTH)/tools/bin/asd-coldstart $(BUILD_ROOT)/usr/bin/asd-coldstart +endif + install -pm 755 $(DEPTH)/tools/citrus2aero/upgrade2to3 $(BUILD_ROOT)/usr/bin/asmigrate2to3 + install -pm 755 $(DEPTH)/tools/fixownership/fixownership.py $(BUILD_ROOT)/usr/bin/asfixownership + install -pm 755 $(DEPTH)/as/etc/irqbalance-ban.sh $(BUILD_ROOT)/etc/aerospike/irqbalance-ban.sh + install -pm 644 $(DEPTH)/as/etc/aerospike$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/aerospike.conf + cat $(DEPTH)/as/etc/README.sample.conf.md > $(BUILD_ROOT)/etc/aerospike/sample/README.md + install -pm 644 $(DEPTH)/as/etc/aerospike_ssd$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/sample/aerospike_ssd.conf + install -pm 644 $(DEPTH)/as/etc/aerospike_mesh$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/sample/aerospike_mesh.conf + +ifeq ($(USE_SYSTEMD),1) + cat $(DEPTH)/as/etc/aerospike.service.head >> $(PKG)/deb/aerospike.service + ifeq ($(EDITION),community) + cat $(DEPTH)/as/etc/aerospike.service.telemetry >> $(PKG)/deb/aerospike.service + endif + cat $(DEPTH)/as/etc/aerospike.service.tail >> $(PKG)/deb/aerospike.service + install -p -D -m 644 $(PKG)/deb/aerospike.service $(BUILD_ROOT)/usr/lib/systemd/system/aerospike.service + install -p -D -m 644 $(DEPTH)/as/etc/aerospike-server.tmpfiles $(BUILD_ROOT)/etc/tmpfiles.d/aerospike.conf + install -p -D -m 644 $(DEPTH)/as/etc/aerospike-server.sysconfig $(BUILD_ROOT)/etc/sysconfig/aerospike + install -p -D -m 755 $(DEPTH)/as/etc/asd-systemd-helper $(BUILD_ROOT)/usr/bin/asd-systemd-helper + install -p -D -m 644 $(DEPTH)/as/etc/aerospike.service.d/* $(BUILD_ROOT)/etc/systemd/system/aerospike.service.d +else + install -pm 755 $(DEPTH)/as/etc/init-script.deb $(BUILD_ROOT)/etc/init.d/aerospike + sed -i 's/@EDITION@/$(EDITION)/g' $(BUILD_ROOT)/etc/init.d/aerospike + install -pm 644 $(DEPTH)/as/etc/logrotate_asd $(BUILD_ROOT)/etc/logrotate.d/aerospike +endif + + install -d $(OPT_AS)/doc +ifeq ($(EDITION),community) + install -pm 644 
$(DEPTH)/LICENSE.CE $(OPT_AS)/doc/LICENSE
+	install -pm 644 $(DEPTH)/LICENSE-AGPL $(OPT_AS)/doc
+	install -pm 644 $(DEPTH)/LICENSE-APACHE $(OPT_AS)/doc
+else
+	install -pm 644 $(EEREPO)/LICENSE.EE $(OPT_AS)/doc/LICENSE
+endif
+	cat $(DEPTH)/LICENSE.3rdParty >> $(OPT_AS)/doc/LICENSE
+
+ifeq ($(EDITION),community)
+  ifeq ($(USE_SYSTEMD),1)
+	install -pm 755 $(DEPTH)/as/etc/aerospike_telemetry.service $(BUILD_ROOT)/usr/lib/systemd/system/aerospike_telemetry.service
+	install -pm 644 $(DEPTH)/as/etc/aerospike_telemetry.sysconfig $(BUILD_ROOT)/etc/sysconfig/aerospike_telemetry
+  else
+	install -pm 755 $(DEPTH)/as/etc/init-telemetry-script.deb $(BUILD_ROOT)/etc/init.d/aerospike_telemetry
+	install -pm 644 $(DEPTH)/as/etc/logrotate_telemetry $(BUILD_ROOT)/etc/logrotate.d/aerospike_telemetry
+  endif
+	install -d $(OPT_AS)/telemetry
+	install -d $(OPT_AS)/telemetry/phonehome
+	install -d $(OPT_AS)/telemetry/daemon
+	install -pm 644 $(DEPTH)/as/etc/telemetry.conf $(BUILD_ROOT)/etc/aerospike
+	install -pm 644 $(DEPTH)/modules/telemetry/README.md $(OPT_AS)/doc/TELEMETRY.md
+	install -pm 755 $(DEPTH)/modules/telemetry/telemetry.py $(OPT_AS)/telemetry
+	install -pm 755 $(DEPTH)/modules/telemetry/phonehome/*.py $(OPT_AS)/telemetry/phonehome
+	install -pm 755 $(DEPTH)/modules/telemetry/daemon/*.py $(OPT_AS)/telemetry/daemon
+endif
+
+	install -d $(OPT_AS)/data
+	install -d $(OPT_AS)/smd
+	install -d $(OPT_AS)/sys/udf/lua/external
+	install -d $(OPT_AS)/usr/udf/lua
+	install -pm 644 $(DEPTH)/modules/lua-core/src/*.lua $(OPT_AS)/sys/udf/lua
+	for FILE in `find $(DEPTH)/modules/lua-core/src/external -type f` ; do \
+		install -pm 644 $$FILE $(OPT_AS)/sys/udf/lua/external ; \
+	done
+
+	install -d $(OPT_AS)/bin
+	install -pm 755 $(DEPTH)/tools/memacct/asparsemem $(OPT_AS)/bin
+
+	sed 's/@VERSION@/'$(REV)'/g' < $(PKG)/deb/server-64 > $(BUILD_ROOT)/DEBIAN/control
+	sed -i 's/@EDITION@/'$(EDITION)'/g' $(BUILD_ROOT)/DEBIAN/control
+	sed -i 's/@SIZE@/'$(SIZE)'/g' $(BUILD_ROOT)/DEBIAN/control
+	sed -i 's/@DEPS@/$(addprefix $(comma), $(subst $(space),$(comma),$(strip $(DEPS))))/g' $(BUILD_ROOT)/DEBIAN/control
+
+package:
+	install -pm 644 $(OPT_AS)/doc/LICENSE $(PKG)/packages
+	fakeroot dpkg-deb --build $(BUILD_ROOT) $(DEB)
+
+clean:
+	rm -rf $(SOURCE_ROOT)/*
+ifeq ($(USE_SYSTEMD),1)
+	rm -rf $(PKG)/deb/aerospike.service
+endif
+
+ifeq ($(USE_EE),1)
+  include $(XDR)/make_in/Makefile.deb.in
+endif
diff --git a/pkg/deb/asinstall b/pkg/deb/asinstall
new file mode 100755
index 00000000..2fe45fec
--- /dev/null
+++ b/pkg/deb/asinstall
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Install Aerospike server and tools on Debian6/Debian7/Ubuntu12.
+# This script must be run as root or via sudo.
+
+#------------------
+# Verify User
+#------------------
+
+if [ $EUID -ne 0 ]
+then
+	echo "This script requires root or sudo privileges."
+	exit 1
+fi
+
+#---------------
+# Check argparse
+#---------------
+fn=/tmp/pkgexists
+
+cat <<EOF >$fn
+try:
+    import argparse
+    print(1)
+except:
+    print(0)
+EOF
+
+has_argparse=`python $fn`
+rm $fn
+
+if [ "$has_argparse" = "0" ]
+then
+	echo Installing python-argparse
+	apt-get -y install python-argparse
+fi
+
+#---------------
+# Install tools
+#---------------
+echo Installing tools
+
+# Use default arguments if none passed in.
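+# (Any arguments given to this script are forwarded verbatim to dpkg, e.g.
+# "--force-confold".)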
+# Use default arguments if none passed in.
+if [ $# -eq 0 ]
+then
+    echo dpkg -i aerospike-tools-*.deb
+    dpkg -i aerospike-tools-*.deb
+else
+    echo dpkg "$@" aerospike-tools-*.deb
+    dpkg "$@" aerospike-tools-*.deb
+fi
+
+#---------------
+# Install server
+#---------------
+echo Installing server
+
+# Use default arguments if none passed in.
+if [ $# -eq 0 ]
+then
+    echo dpkg -i aerospike-server-*.deb
+    dpkg -i aerospike-server-*.deb
+else
+    echo dpkg "$@" aerospike-server-*.deb
+    dpkg "$@" aerospike-server-*.deb
+fi
+
diff --git a/pkg/deb/conffiles b/pkg/deb/conffiles
new file mode 100644
index 00000000..897d3824
--- /dev/null
+++ b/pkg/deb/conffiles
@@ -0,0 +1 @@
+/etc/aerospike/aerospike.conf
diff --git a/pkg/deb/conffiles.telemetry b/pkg/deb/conffiles.telemetry
new file mode 100644
index 00000000..d34ff127
--- /dev/null
+++ b/pkg/deb/conffiles.telemetry
@@ -0,0 +1 @@
+/etc/aerospike/telemetry.conf
diff --git a/pkg/deb/copyright b/pkg/deb/copyright
new file mode 100644
index 00000000..2cc8da93
--- /dev/null
+++ b/pkg/deb/copyright
@@ -0,0 +1,9 @@
+Aerospike Server
+
+Copyright: Aerospike, Inc
+
+These files are owned by Aerospike, Inc.
+
+Permission to use is covered by customer agreements signed by Aerospike.
+Please see your customer agreements or non-disclosure agreements
+for distribution and re-distribution rights.
diff --git a/pkg/deb/postinst.server b/pkg/deb/postinst.server
new file mode 100755
index 00000000..70847520
--- /dev/null
+++ b/pkg/deb/postinst.server
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+set -e
+
+case "$1" in
+    configure)
+
+        # create aerospike group if it isn't already there
+        if ! getent group aerospike >/dev/null; then
+            groupadd -r aerospike
+        fi
+
+        # create aerospike user if it isn't already there
+        if ! getent passwd aerospike >/dev/null; then
+            useradd -r -d /opt/aerospike -c 'Aerospike server' -g aerospike -s /sbin/nologin aerospike
+        fi
+
+        for dir in /opt/aerospike /var/log/aerospike /var/run/aerospike ; do
+            if [ -d $dir ]; then
+                chown -R aerospike:aerospike $dir
+            fi
+        done
+
+        if [ -d /run/systemd/system ]; then
+            systemctl --system daemon-reload >/dev/null 2>&1 || true
+        fi
+
+        ;;
+esac
+
+exit 0
diff --git a/pkg/deb/server-64 b/pkg/deb/server-64
new file mode 100644
index 00000000..3ec899b5
--- /dev/null
+++ b/pkg/deb/server-64
@@ -0,0 +1,9 @@
+Package: aerospike-server-@EDITION@
+Version: @VERSION@-1
+Section: Databases
+Priority: optional
+Architecture: amd64
+Depends: libc6 (>= 2.7)@DEPS@
+Maintainer: Aerospike, Inc.
+Description: The Aerospike distributed datastore allows fully scalable and reliable data storage with elastic server properties.
+Installed-Size: @SIZE@
diff --git a/pkg/dist/.gitignore b/pkg/dist/.gitignore
new file mode 100644
index 00000000..13e4d83e
--- /dev/null
+++ b/pkg/dist/.gitignore
@@ -0,0 +1 @@
+[^.]*
diff --git a/pkg/packages/.gitignore b/pkg/packages/.gitignore
new file mode 100644
index 00000000..13e4d83e
--- /dev/null
+++ b/pkg/packages/.gitignore
@@ -0,0 +1 @@
+[^.]*
diff --git a/pkg/rpm/Makefile b/pkg/rpm/Makefile
new file mode 100644
index 00000000..9d7cc6e0
--- /dev/null
+++ b/pkg/rpm/Makefile
@@ -0,0 +1,158 @@
+# Build Aerospike Server RPM Distribution.
+
+DEPTH = ../..
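+
+# The spec file fed to rpmbuild is assembled in the package target below by
+# concatenating the server-spec-* fragments (base, logrotate, files,
+# systemd/sysv, config, telemetry, scripts) that match the target init
+# system and edition.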
+include $(DEPTH)/make_in/Makefile.vars
+
+PKG = $(realpath $(DEPTH)/pkg)
+SOURCE_ROOT = $(PKG)/dist
+BUILD_ROOT = $(SOURCE_ROOT)/BUILD
+OPT_AS = $(BUILD_ROOT)/opt/aerospike
+
+REV = $(shell $(DEPTH)/build/version | sed 's/-/_/g')
+OS = $(shell $(DEPTH)/build/os_version)
+
+ifeq ($(OS),el7)
+  USE_SYSTEMD = 1
+  CONF_VERSION = _systemd
+endif
+
+ifeq ($(USE_EE),1)
+all: dist-xdr package clean
+else
+all: dist package clean
+endif
+
+.PHONY: dist
+dist:
+	install -d $(BUILD_ROOT)/etc/aerospike
+	install -d $(BUILD_ROOT)/etc/aerospike/sample
+
+ifeq ($(USE_SYSTEMD),1)
+	install -d $(BUILD_ROOT)/usr/lib/systemd/system
+	install -d $(BUILD_ROOT)/etc/systemd/system/aerospike.service.d
+else
+	install -d $(BUILD_ROOT)/var/log/aerospike
+	install -d $(BUILD_ROOT)/var/run/aerospike
+	install -d $(BUILD_ROOT)/etc/init.d
+	install -d $(BUILD_ROOT)/etc/logrotate.d
+endif
+	install -d $(BUILD_ROOT)/usr/bin
+
+	install -pm 755 $(BIN_DIR)/asd $(BUILD_ROOT)/usr/bin/asd
+ifeq ($(USE_SYSTEMD),1)
+	install -pm 755 $(DEPTH)/tools/bin/asd-coldstart $(BUILD_ROOT)/usr/bin/asd-coldstart
+endif
+	install -pm 755 $(DEPTH)/tools/citrus2aero/upgrade2to3 $(BUILD_ROOT)/usr/bin/asmigrate2to3
+	install -pm 755 $(DEPTH)/tools/fixownership/fixownership.py $(BUILD_ROOT)/usr/bin/asfixownership
+	install -pm 755 $(DEPTH)/as/etc/irqbalance-ban.sh $(BUILD_ROOT)/etc/aerospike/irqbalance-ban.sh
+	install -pm 644 $(DEPTH)/as/etc/aerospike$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/aerospike.conf
+	cat $(DEPTH)/as/etc/README.sample.conf.md > $(BUILD_ROOT)/etc/aerospike/sample/README.md
+	install -pm 644 $(DEPTH)/as/etc/aerospike_ssd$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/sample/aerospike_ssd.conf
+	install -pm 644 $(DEPTH)/as/etc/aerospike_mesh$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/sample/aerospike_mesh.conf
+
+ifeq ($(USE_SYSTEMD),1)
+	cat $(DEPTH)/as/etc/aerospike.service.head >> $(PKG)/rpm/aerospike.service
+  ifeq ($(EDITION),community)
+	cat $(DEPTH)/as/etc/aerospike.service.telemetry >> $(PKG)/rpm/aerospike.service
+  endif
+	cat $(DEPTH)/as/etc/aerospike.service.tail >> $(PKG)/rpm/aerospike.service
+	install -p -D -m 644 $(PKG)/rpm/aerospike.service $(BUILD_ROOT)/usr/lib/systemd/system/aerospike.service
+	install -p -D -m 644 $(DEPTH)/as/etc/aerospike-server.tmpfiles $(BUILD_ROOT)/etc/tmpfiles.d/aerospike.conf
+	install -p -D -m 644 $(DEPTH)/as/etc/aerospike-server.sysconfig $(BUILD_ROOT)/etc/sysconfig/aerospike
+	install -p -D -m 755 $(DEPTH)/as/etc/asd-systemd-helper $(BUILD_ROOT)/usr/bin/asd-systemd-helper
+	install -p -D -m 644 $(DEPTH)/as/etc/aerospike.service.d/* $(BUILD_ROOT)/etc/systemd/system/aerospike.service.d
+else
+	install -pm 755 $(DEPTH)/as/etc/init-script $(BUILD_ROOT)/etc/init.d/aerospike
+	sed -i 's/@EDITION@/$(EDITION)/g' $(BUILD_ROOT)/etc/init.d/aerospike
+	install -pm 644 $(DEPTH)/as/etc/logrotate_asd $(BUILD_ROOT)/etc/logrotate.d/aerospike
+endif
+
+	install -d $(OPT_AS)/doc
+ifeq ($(EDITION),community)
+	install -pm 644 $(DEPTH)/LICENSE.CE $(OPT_AS)/doc/LICENSE
+	install -pm 644 $(DEPTH)/LICENSE-AGPL $(OPT_AS)/doc
+	install -pm 644 $(DEPTH)/LICENSE-APACHE $(OPT_AS)/doc
+else
+	install -pm 644 $(EEREPO)/LICENSE.EE $(OPT_AS)/doc/LICENSE
+endif
+	cat $(DEPTH)/LICENSE.3rdParty >> $(OPT_AS)/doc/LICENSE
+
+ifeq ($(EDITION),community)
+  ifeq ($(USE_SYSTEMD),1)
+	install -pm 755 $(DEPTH)/as/etc/aerospike_telemetry.service $(BUILD_ROOT)/usr/lib/systemd/system/aerospike_telemetry.service
+	install -pm 644 $(DEPTH)/as/etc/aerospike_telemetry.sysconfig $(BUILD_ROOT)/etc/sysconfig/aerospike_telemetry
+  else
+	install -pm 755 $(DEPTH)/as/etc/init-telemetry-script $(BUILD_ROOT)/etc/init.d/aerospike_telemetry
+	install -pm 644 $(DEPTH)/as/etc/logrotate_telemetry $(BUILD_ROOT)/etc/logrotate.d/aerospike_telemetry
+  endif
+	install -d $(OPT_AS)/telemetry
+	install -d $(OPT_AS)/telemetry/phonehome
+	install -d $(OPT_AS)/telemetry/daemon
+	install -pm 644 $(DEPTH)/as/etc/telemetry.conf $(BUILD_ROOT)/etc/aerospike
+	install -pm 644 $(DEPTH)/modules/telemetry/README.md $(OPT_AS)/doc/TELEMETRY.md
+	install -pm 755 $(DEPTH)/modules/telemetry/telemetry.py $(OPT_AS)/telemetry
+	install -pm 755 $(DEPTH)/modules/telemetry/phonehome/*.py $(OPT_AS)/telemetry/phonehome
+	install -pm 755 $(DEPTH)/modules/telemetry/daemon/*.py $(OPT_AS)/telemetry/daemon
+endif
+
+	install -d $(OPT_AS)/data
+	install -d $(OPT_AS)/smd
+	install -d $(OPT_AS)/sys/udf/lua/external
+	install -d $(OPT_AS)/usr/udf/lua
+	install -pm 644 $(DEPTH)/modules/lua-core/src/*.lua $(OPT_AS)/sys/udf/lua
+	for FILE in `find $(DEPTH)/modules/lua-core/src/external -type f` ; do \
+		install -pm 644 $$FILE $(OPT_AS)/sys/udf/lua/external ; \
+	done
+
+	install -d $(OPT_AS)/bin
+	install -pm 755 $(DEPTH)/tools/memacct/asparsemem $(OPT_AS)/bin
+
+package:
+	install -pm 644 $(OPT_AS)/doc/LICENSE $(PKG)/packages
+	install -d $(SOURCE_ROOT)/RPMS/x86_64
+
+	sed 's/@VERSION@/'$(REV)'/g' < $(PKG)/rpm/server-spec-base > $(PKG)/rpm/aerospike.spec
+ifneq ($(USE_SYSTEMD),1)
+	cat $(PKG)/rpm/server-spec-logrotate >> $(PKG)/rpm/aerospike.spec
+endif
+	cat $(PKG)/rpm/server-spec-files >> $(PKG)/rpm/aerospike.spec
+ifeq ($(USE_SYSTEMD),1)
+	cat $(PKG)/rpm/server-spec-systemd >> $(PKG)/rpm/aerospike.spec
+else
+	cat $(PKG)/rpm/server-spec-sysv >> $(PKG)/rpm/aerospike.spec
+endif
+
+ifeq ($(USE_EE),1)
+	cat $(EEREPO)/pkg/rpm/xdr-files >> $(PKG)/rpm/aerospike.spec
+endif
+	cat $(PKG)/rpm/server-spec-config >> $(PKG)/rpm/aerospike.spec
+ifeq ($(EDITION),community)
+	cat $(PKG)/rpm/server-spec-telemetry >> $(PKG)/rpm/aerospike.spec
+  ifeq ($(USE_SYSTEMD),1)
+	cat $(PKG)/rpm/server-spec-telemetry-systemd >> $(PKG)/rpm/aerospike.spec
+  else
+	cat $(PKG)/rpm/server-spec-telemetry-sysv >> $(PKG)/rpm/aerospike.spec
+  endif
+endif
+	cat $(PKG)/rpm/server-spec-scripts >> $(PKG)/rpm/aerospike.spec
+ifeq ($(USE_SYSTEMD),1)
+	cat $(PKG)/rpm/server-spec-scripts-systemd >> $(PKG)/rpm/aerospike.spec
+endif
+
+	sed -i 's/@RELEASE@/'$(OS)'/g' $(PKG)/rpm/aerospike.spec
+	sed -i 's/@EDITION@/'$(EDITION)'/g' $(PKG)/rpm/aerospike.spec
+
+	cd $(DEPTH); rpmbuild -bb -vv --define "dist .$(OS)" --buildroot $(BUILD_ROOT) $(PKG)/rpm/aerospike.spec
+
+	find $(SOURCE_ROOT)/RPMS -type f -exec mv {} $(PKG)/packages \;
+
+clean:
+	rm -rf $(PKG)/rpm/aerospike.spec
+ifeq ($(USE_SYSTEMD),1)
+	rm -rf $(PKG)/rpm/aerospike.service
+endif
+	rm -rf $(SOURCE_ROOT)/*
+
+ifeq ($(USE_EE),1)
+  include $(EEREPO)/xdr/make_in/Makefile.rpm.in
+endif
diff --git a/pkg/rpm/asinstall b/pkg/rpm/asinstall
new file mode 100755
index 00000000..ab592763
--- /dev/null
+++ b/pkg/rpm/asinstall
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Install Aerospike server and tools on RHEL/Fedora/CentOS 6.
+# This script must be run as root or with sudo.
+
+#------------------
+# Verify User
+#------------------
+
+if [ $EUID -ne 0 ]
+then
+    echo "This script requires root or sudo privileges."
+    exit 1
+fi
+
+#---------------
+# Check argparse
+#---------------
+fn=/tmp/pkgexists
+
+cat <<EOF >$fn
+try:
+    import argparse
+    print(1)
+except:
+    print(0)
+EOF
+
+has_argparse=`python $fn`
+rm $fn
+
+if [ "$has_argparse" = "0" ]
+then
+    echo Installing python-argparse
+    rpm -Uvh python-argparse-1.2.1-2.el6.noarch.rpm
+fi
+
+#---------------
+# Install tools
+#---------------
+echo Installing tools
+
+# Use default arguments if none passed in.
+if [ $# -eq 0 ]
+then
+    echo rpm -Uvh aerospike-tools-*.rpm
+    rpm -Uvh aerospike-tools-*.rpm
+else
+    echo rpm "$@" aerospike-tools-*.rpm
+    rpm "$@" aerospike-tools-*.rpm
+fi
+
+#---------------
+# Install server
+#---------------
+echo Installing server
+
+# Use default arguments if none passed in.
+if [ $# -eq 0 ]
+then
+    echo rpm -Uvh aerospike-server-*.rpm
+    rpm -Uvh aerospike-server-*.rpm
+else
+    echo rpm "$@" aerospike-server-*.rpm
+    rpm "$@" aerospike-server-*.rpm
+fi
+
diff --git a/pkg/rpm/server-spec-base b/pkg/rpm/server-spec-base
new file mode 100644
index 00000000..ed89d8eb
--- /dev/null
+++ b/pkg/rpm/server-spec-base
@@ -0,0 +1,20 @@
+Name: aerospike
+Version: @VERSION@
+Release: 1%{?dist}
+Summary: The Aerospike Database
+License: Proprietary
+Group: Application
+BuildArch: x86_64
+Vendor: Aerospike, Inc.
+
+%description
+The Aerospike distributed datastore allows fully scalable
+and reliable data storage with elastic server properties.
+
+%define _topdir pkg/dist
+%define __spec_install_post /usr/lib/rpm/brp-compress
+%package server-@EDITION@
+Summary: Aerospike server
+Group: Applications
+%description server-@EDITION@
+This package contains all of the code for running the Aerospike server.
diff --git a/pkg/rpm/server-spec-config b/pkg/rpm/server-spec-config
new file mode 100644
index 00000000..8aa3e1b6
--- /dev/null
+++ b/pkg/rpm/server-spec-config
@@ -0,0 +1,10 @@
+%defattr(-,aerospike,aerospike)
+/opt/aerospike
+%defattr(-,root,root)
+%config(noreplace) /etc/aerospike/aerospike.conf
+%dir /etc/aerospike
+/etc/aerospike/irqbalance-ban.sh
+%dir /etc/aerospike/sample
+/etc/aerospike/sample/README.md
+/etc/aerospike/sample/aerospike_ssd.conf
+/etc/aerospike/sample/aerospike_mesh.conf
diff --git a/pkg/rpm/server-spec-files b/pkg/rpm/server-spec-files
new file mode 100644
index 00000000..3ed2b03a
--- /dev/null
+++ b/pkg/rpm/server-spec-files
@@ -0,0 +1,5 @@
+%files server-@EDITION@
+%defattr(-,root,root)
+/usr/bin/asd
+/usr/bin/asmigrate2to3
+/usr/bin/asfixownership
diff --git a/pkg/rpm/server-spec-logrotate b/pkg/rpm/server-spec-logrotate
new file mode 100644
index 00000000..5e0079e4
--- /dev/null
+++ b/pkg/rpm/server-spec-logrotate
@@ -0,0 +1 @@
+Requires: logrotate
diff --git a/pkg/rpm/server-spec-scripts b/pkg/rpm/server-spec-scripts
new file mode 100644
index 00000000..31475f10
--- /dev/null
+++ b/pkg/rpm/server-spec-scripts
@@ -0,0 +1,9 @@
+%pre server-@EDITION@
+if ! id -g aerospike >/dev/null 2>&1; then
+    echo "Adding group aerospike"
+    /usr/sbin/groupadd -r aerospike
+fi
+if ! id -u aerospike >/dev/null 2>&1; then
+    echo "Adding user aerospike"
+    /usr/sbin/useradd -r -d /opt/aerospike -c 'Aerospike server' -g aerospike -s /sbin/nologin aerospike
+fi
diff --git a/pkg/rpm/server-spec-scripts-systemd b/pkg/rpm/server-spec-scripts-systemd
new file mode 100644
index 00000000..78f1cd53
--- /dev/null
+++ b/pkg/rpm/server-spec-scripts-systemd
@@ -0,0 +1,4 @@
+%post server-@EDITION@
+/bin/systemctl --system daemon-reload &> /dev/null || :
+%postun server-@EDITION@
+/bin/systemctl --system daemon-reload &> /dev/null || :
diff --git a/pkg/rpm/server-spec-systemd b/pkg/rpm/server-spec-systemd
new file mode 100644
index 00000000..f5921d39
--- /dev/null
+++ b/pkg/rpm/server-spec-systemd
@@ -0,0 +1,8 @@
+/etc/systemd/system/aerospike.service.d/aerospike.conf
+/etc/systemd/system/aerospike.service.d/aerospike.conf.coldstart
+/etc/systemd/system/aerospike.service.d/aerospike.conf.default
+/usr/bin/asd-coldstart
+/usr/bin/asd-systemd-helper
+/usr/lib/systemd/system/aerospike.service
+%config /etc/tmpfiles.d/aerospike.conf
+%config(noreplace) /etc/sysconfig/aerospike
diff --git a/pkg/rpm/server-spec-sysv b/pkg/rpm/server-spec-sysv
new file mode 100644
index 00000000..107f2443
--- /dev/null
+++ b/pkg/rpm/server-spec-sysv
@@ -0,0 +1,4 @@
+%config(noreplace) /etc/logrotate.d/aerospike
+/etc/init.d/aerospike
+%dir /var/log/aerospike
+%dir /var/run/aerospike
diff --git a/pkg/rpm/server-spec-telemetry b/pkg/rpm/server-spec-telemetry
new file mode 100644
index 00000000..d5ab2650
--- /dev/null
+++ b/pkg/rpm/server-spec-telemetry
@@ -0,0 +1 @@
+%config(noreplace) /etc/aerospike/telemetry.conf
diff --git a/pkg/rpm/server-spec-telemetry-systemd b/pkg/rpm/server-spec-telemetry-systemd
new file mode 100644
index 00000000..061fa986
--- /dev/null
+++ b/pkg/rpm/server-spec-telemetry-systemd
@@ -0,0 +1,2 @@
+/usr/lib/systemd/system/aerospike_telemetry.service
+%config(noreplace) /etc/sysconfig/aerospike_telemetry
diff --git a/pkg/rpm/server-spec-telemetry-sysv b/pkg/rpm/server-spec-telemetry-sysv
new file mode 100644
index 00000000..c08cb9ad
--- /dev/null
+++ b/pkg/rpm/server-spec-telemetry-sysv
@@ -0,0 +1,2 @@
+/etc/init.d/aerospike_telemetry
+%config(noreplace) /etc/logrotate.d/aerospike_telemetry
diff --git a/pkg/src/Makefile b/pkg/src/Makefile
new file mode 100644
index 00000000..99262638
--- /dev/null
+++ b/pkg/src/Makefile
@@ -0,0 +1,29 @@
+# Build Aerospike source distribution.
+
+DEPTH = ../..
+include $(DEPTH)/make_in/Makefile.vars
+
+REPO = $(realpath $(DEPTH))
+PKG = $(REPO)/pkg
+DIST = $(PKG)/dist
+SOURCE = $(DIST)/SOURCE
+SOURCE_ASD = $(SOURCE)/aerospike-server
+
+REV = $(shell $(DEPTH)/build/version)
+
+# Name of the source package:
+ARCHIVE = $(DEPTH)/pkg/packages/aerospike-server-$(EDITION)-$(REV).src.tar.bz2
+
+all: dist package clean
+
+.PHONY: dist
+dist:
+	bash git-cp-files.sh $(REPO) $(SOURCE_ASD)
+
+.PHONY: package
+package: dist
+	tar cvfj $(ARCHIVE) -C $(SOURCE) aerospike-server
+
+clean:
+	rm -rf $(SOURCE)
\ No newline at end of file
diff --git a/pkg/src/git-cp-files.sh b/pkg/src/git-cp-files.sh
new file mode 100755
index 00000000..f5516d66
--- /dev/null
+++ b/pkg/src/git-cp-files.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+CWD=$(pwd)
+SCRIPT=${BASH_SOURCE[0]}
+SCRIPT_PATH=$( cd "$( dirname "${SCRIPT}" )" && pwd )
+SCRIPT_BASE=$( basename "${SCRIPT}" )
+
+SOURCE=${1}
+TARGET=${2}
+DEPTH=${3}
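+
+# Illustrative usage (paths hypothetical):
+#     bash git-cp-files.sh /path/to/aerospike-server /tmp/dist/aerospike-server 2
+# copies every git-tracked file from SOURCE into TARGET, then recurses into
+# submodules, decrementing DEPTH at each level; omit DEPTH to recurse without
+# limit.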
+
+if [ -z "${SOURCE}" ]; then
+    echo "ERROR: Missing SOURCE argument." >&2
+    exit 1
+fi
+if [ -z "${TARGET}" ]; then
+    echo "ERROR: Missing TARGET argument." >&2
+    exit 1
+fi
+if [ ! -d ${SOURCE} ]; then
+    echo "ERROR: SOURCE not found: ${SOURCE}" >&2
+    exit 1
+fi
+
+if [ -n "${DEPTH}" ]; then
+    if [ ${DEPTH} -eq 0 ]; then
+        exit 0
+    fi
+    DEPTH=$((DEPTH - 1))
+fi
+
+mkdir -p ${TARGET}
+
+IFS=$'\n'
+for file in $(cd ${SOURCE} && git ls-files --abbrev); do
+    if [ -f ${SOURCE}/${file} ]; then
+        dir=$(dirname ${file})
+        if [ ! -z "${dir}" ] && [ ! -d ${TARGET}/${dir} ]; then
+            mkdir -p "${TARGET}/${dir}"
+        fi
+        cp -a "${SOURCE}/${file}" "${TARGET}/${file}"
+    fi
+done
+
+for module in $(cd ${SOURCE} && git submodule status | awk '{print $2}'); do
+    bash ${SCRIPT_PATH}/${SCRIPT_BASE} ${SOURCE}/${module} ${TARGET}/${module} ${DEPTH}
+done
diff --git a/pkg/tar/Makefile b/pkg/tar/Makefile
new file mode 100644
index 00000000..260838d4
--- /dev/null
+++ b/pkg/tar/Makefile
@@ -0,0 +1,71 @@
+# Build Aerospike TAR distribution.
+
+DEPTH = ../..
+include $(DEPTH)/make_in/Makefile.vars
+
+PKG = $(realpath $(DEPTH)/pkg)
+SOURCE = $(PKG)/tar
+SOURCE_ROOT = $(PKG)/dist
+BUILD_ROOT = $(SOURCE_ROOT)/BUILD
+TARGET = $(BUILD_ROOT)/aerospike-server
+
+REV = $(shell $(DEPTH)/build/version)
+
+ARCHIVE = $(PKG)/packages/aerospike-server-$(EDITION)-$(REV).tar
+
+all: dist package clean
+
+.PHONY: dist
+dist:
+	@# create directory
+	install -pm 755 -Dd $(TARGET)
+
+	@# docs
+	install -pm 644 -D $(SOURCE)/README $(TARGET)/README
+	install -pm 644 -D $(DEPTH)/LICENSE $(TARGET)/LICENSE
+
+	@# binaries
+	install -pm 755 -Dd $(TARGET)/bin
+	install -pm 755 -D $(BIN_DIR)/asd $(TARGET)/bin/asd
+	install -pm 755 -D $(SOURCE)/bin/aerospike $(TARGET)/bin/aerospike
+
+	@# share binaries
+	install -pm 755 -Dd $(TARGET)/share/bin
+	install -pm 755 -D $(SOURCE)/share/bin/aerospike $(TARGET)/share/bin/aerospike
+
+	@# share libraries
+	install -pm 755 -Dd $(TARGET)/share/lib
+	install -pm 644 -D $(SOURCE)/share/lib/* $(TARGET)/share/lib/.
+
+	@# share libexecs
+	install -pm 755 -Dd $(TARGET)/share/libexec
+	install -pm 644 -D $(SOURCE)/share/libexec/* $(TARGET)/share/libexec/.
+
+	@# share manpages
+	install -pm 755 -Dd $(TARGET)/share/man
+	install -pm 644 -D $(SOURCE)/share/man/* $(TARGET)/share/man/.
+
+	@# share configs
+	install -pm 755 -Dd $(TARGET)/share/etc
+	install -pm 644 -D $(SOURCE)/share/etc/aerospike.conf $(TARGET)/share/etc/aerospike.conf
+	install -pm 755 -D $(DEPTH)/as/etc/irqbalance-ban.sh $(TARGET)/share/etc/irqbalance-ban.sh
+
+	@# lua files
+	install -pm 755 -Dd $(TARGET)/share/udf/lua
+	install -pm 644 -D $(DEPTH)/modules/lua-core/src/*.lua $(TARGET)/share/udf/lua/.
+
+	install -pm 755 -Dd $(TARGET)/share/udf/lua/external
+	for FILE in `find $(DEPTH)/modules/lua-core/src/external -type f` ; do \
+		install -pm 644 $$FILE $(TARGET)/share/udf/lua/external ; \
+	done
+
+tar:
+	tar cvf $(ARCHIVE) -C $(BUILD_ROOT) aerospike-server
+
+gzip: tar
+	gzip -f $(ARCHIVE)
+
+package: gzip
+
+clean:
+	rm -rf $(SOURCE_ROOT)/*
diff --git a/pkg/tar/README b/pkg/tar/README
new file mode 100644
index 00000000..3110049a
--- /dev/null
+++ b/pkg/tar/README
@@ -0,0 +1,101 @@
+AEROSPIKE README
+================
+
+SYNOPSIS
+
+    ./bin/aerospike init
+    sudo ./bin/aerospike start
+    ./bin/aerospike status
+    ./bin/aerospike stop
+
+DESCRIPTION
+
+    This package contains the Aerospike Server Daemon (asd), scripts,
+    configuration files, and other resources.
+
+QUICK START
+
+    The `aerospike` script, located in the `bin` directory, provides the
+    ability to initialize a directory for running `asd` and to manage an
+    instance of `asd` from that directory.
+
+    The following is a quick walk-through to help you get started:
+
+    1. Initialize a directory for hosting your aerospike instance.
+
+        ./bin/aerospike init
+
+    2. Start the aerospike server.
+
+        sudo ./bin/aerospike start
+
+       Superuser privileges are required because the script attempts to set
+       upper limits on system resources.
+
+    3. Check the status of the aerospike server.
+
+        ./bin/aerospike status
+
+    4. Stop the aerospike server.
+
+        sudo ./bin/aerospike stop
+
+AEROSPIKE SCRIPT
+
+    The `aerospike` script, located in the `bin` directory, provides the
+    ability to manage an instance of `asd`.
+
+    For help with the script, use the `--help` option:
+
+        ./bin/aerospike --help
+
+AEROSPIKE INSTANCE DIRECTORY
+
+    The directory created by the `aerospike init` command will contain:
+
+    bin/aerospike      - The management script to manage this instance.
+    bin/asd            - The aerospike server daemon.
+    etc/aerospike.conf - The configuration file used by this instance.
+    share/             - Contains read-only files used by this instance.
+    var/               - Contains runtime files generated by `asd`, including
+                         logs and data files.
+
+NOTES
+
+    1. The `aerospike init` command can be used to initialize any directory to
+       host an aerospike instance by specifying the `--home <PATH>` option:
+
+       ./bin/aerospike init --home ~/myaerospike
+
+       For running multiple instances, see below.
+
+    2. Running Multiple Instances
+
+       NOTE: Running multiple instances on a single host is not recommended
+       for production environments, since you would ideally allocate as many
+       resources as possible to a single production instance.
+
+       If you want to run multiple instances of aerospike on a single machine,
+       then each instance should be initialized with a different instance id
+       and port number. The default instance id is "1" and the default port
+       is "3000".
+
+       To initialize two instances, you can use:
+
+       ./bin/aerospike init --home ~/a --instance 1 -p 3000
+       ./bin/aerospike init --home ~/b --instance 2 -p 3010
+
+       The aerospike.conf files for the instances cannot share resources. If
+       you define storage engines other than in-memory, then each instance
+       must have its own dedicated resources (file, device, etc.). At most 15
+       instances can be created.
+
+    3. Running as non-root Users
+
+       If you want to run instances of aerospike as non-root users, you can
+       provide the user and group ids at initialization time:
+
+       ./bin/aerospike init --home ~/aerobob --user bob --group bobs
+
+       This will set up the home directory to be owned by "bob", and the
+       server will run as the user "bob".
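+
+       As an illustration (using the pid file the instance writes under its
+       home directory), once the server is running,
+
+       ps -o user= -p $(cat ~/aerobob/var/run/aerospike.pid)
+
+       should print "bob".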
diff --git a/pkg/tar/bin/aerospike b/pkg/tar/bin/aerospike
new file mode 100644
index 00000000..78432537
--- /dev/null
+++ b/pkg/tar/bin/aerospike
@@ -0,0 +1,4 @@
+#!/bin/bash
+SCRIPT_PATH=$0
+SCRIPT_HOME=$(cd $(dirname ${SCRIPT_PATH})/..; pwd)
+AEROSPIKE_DAEMON=${SCRIPT_HOME}/bin/asd ${SCRIPT_HOME}/share/bin/aerospike "$@"
diff --git a/pkg/tar/share/bin/aerospike b/pkg/tar/share/bin/aerospike
new file mode 100755
index 00000000..8cd08880
--- /dev/null
+++ b/pkg/tar/share/bin/aerospike
@@ -0,0 +1,224 @@
+#!/bin/bash
+################################################################################
+#
+# Run Script for Aerospike
+#
+################################################################################
+
+SCRIPT_PATH=$0
+SCRIPT_NAME=$(basename $SCRIPT_PATH)
+SCRIPT_HOME=$(cd $(dirname $SCRIPT_PATH)/..; pwd)
+SCRIPT_BIN=${SCRIPT_HOME}/bin
+SCRIPT_LIB=${SCRIPT_HOME}/lib
+SCRIPT_LIBEXEC=${SCRIPT_HOME}/libexec
+SCRIPT_MAN=${SCRIPT_HOME}/man
+
+# Fall back to the bundled asd if the caller did not set AEROSPIKE_DAEMON.
+if [ -z "${AEROSPIKE_DAEMON}" ]; then
+    AEROSPIKE_DAEMON=${SCRIPT_BIN}/asd
+fi
+AEROSPIKE_HOME=$(pwd)
+
+DEBUG=0
+
+################################################################################
+#
+# FUNCTIONS
+#
+################################################################################
+
+print() {
+    printf "$1\n"
+}
+
+debug() {
+    [ $DEBUG == 1 ] && print "$(tput setaf 0)$(tput bold)debug:$(tput sgr0) $1"
+}
+
+info() {
+    print "$(tput setaf 4)$(tput bold)info:$(tput sgr0) $1"
+}
+
+warning() {
+    print "$(tput setaf 3)$(tput bold)warning:$(tput sgr0) $1" >&2
+}
+
+error() {
+    print "$(tput setaf 1)$(tput bold)error:$(tput sgr0) $1"
+}
+
+# try an operation and log the result
+try() {
+    local cmd="$1"
+    local msg="$2"
+
+    if [[ -z $msg ]]; then
+        msg="$cmd"
+    fi
+
+    rc=0
+    debug "$msg"
+    if [ $DEBUG ] && [ $DEBUG -gt 0 ]; then
+        eval "$cmd"
+        rc=$?
+    else
+        eval "$cmd" &>/dev/null
+        rc=$?
+    fi
+
+    if [[ $rc -eq 0 ]]; then
+        debug "$msg"
+    else
+        error "$msg"
+    fi
+
+    return $rc
+}
+
+default_parseopt() {
+    case $1 in
+        "--debug" )
+            DEBUG=1
+            debug "DEBUG MODE"
+            return 1
+            ;;
+        "--help" )
+            help
+            exit 0
+            ;;
+        "--home" )
+            if [ -z $2 ]; then
+                error "--home requires a PATH."
+                exit 1
+            fi
+            if [ ! -d $2 ]; then
+                error "--home requires a valid PATH: $2"
+                exit 1
+            fi
+            AEROSPIKE_HOME=$(cd $2; pwd)
+            return 2
+            ;;
+        * )
+            error "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+}
+
+parseopt() {
+    default_parseopt $*
+    return $?
+}
+
+parseopts() {
+    while (( "$#" )); do
+        case $1 in
+            "--"* | "-"* )
+                parseopt $*
+                shift $?
+                ;;
+
+            * )
+                COMMAND=$1
+                debug "running ${COMMAND}"
+                if [ -z "${COMMAND}" ]; then
+                    error "Command is not specified."
+                    usage >&2
+                    exit 1
+                elif [ ! -f ${SCRIPT_LIBEXEC}/aerospike-${COMMAND} ]; then
+                    error "'$COMMAND' is not a valid command."
+                    usage >&2
+                    exit 1
+                else
+                    source ${SCRIPT_LIBEXEC}/aerospike-${COMMAND}
+                fi
+                shift
+                ;;
+
+        esac
+    done
+}
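+
+# Note: each option handler reports how many positional parameters it consumed
+# via its return code, which parseopts() turns into "shift $?". For example,
+# "--home /tmp/aero" (hypothetical) returns 2, shifting both words away.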
+
+process_running() {
+    debug "process running"
+    return 0
+}
+
+process_stopped() {
+    debug "process stopped"
+    return 0
+}
+
+process_died() {
+    debug "process died"
+    return 0
+}
+
+process_check() {
+    local pid_file=${AEROSPIKE_HOME}/var/run/aerospike.pid
+    if [ -f ${pid_file} ]; then
+        debug "${pid_file} exists."
+        pid=$(cat ${pid_file})
+        debug "${pid_file} => ${pid}"
+        pline=$(ps -p $pid -o "command=")
+        if [ $? -eq 0 ]; then
+            debug "pid ${pid} found: ${pline}"
+            if [[ "${pline}" == *${AEROSPIKE_HOME}/etc/aerospike.conf* ]]; then
+                debug "aerospike is running as ${pid}"
+                process_running ${pid} ${pid_file}
+                return $?
+            else
+                debug "pid ${pid} does not match expected command."
+                process_died ${pid} ${pid_file}
+                return $?
+            fi
+        else
+            debug "pid ${pid} not found"
+            process_died ${pid} ${pid_file}
+            return $?
+        fi
+    else
+        process_stopped
+        return $?
+    fi
+}
+
+usage() {
+    # |--------------------------------------------------------------------------------|
+    print ""
+    print "usage: aerospike COMMAND [OPTIONS]"
+    print
+    print "$(tput bold)COMMANDS$(tput sgr0)"
+    for script in ${SCRIPT_LIBEXEC}/aerospike-*; do
+        s=$(basename ${script})
+        s=${s#aerospike-}
+        s=${s%.sh}
+        print "  $s"
+    done
+    print
+    print "Use 'aerospike COMMAND --help' for command specific help."
+    print
+    # |--------------------------------------------------------------------------------|
+}
+
+help() {
+    if [ -f ${SCRIPT_MAN}/aerospike-${COMMAND}.man ]; then
+        man ${SCRIPT_MAN}/aerospike-${COMMAND}.man
+    elif [ -z ${COMMAND} ]; then
+        usage
+    else
+        print "No help available for '${COMMAND}'"
+    fi
+}
+
+################################################################################
+#
+# MAIN
+#
+################################################################################
+
+parseopts $*
+if [ -z "${COMMAND}" ]; then
+    usage
+else
+    main
+fi
diff --git a/pkg/tar/share/etc/aerospike.conf b/pkg/tar/share/etc/aerospike.conf
new file mode 100644
index 00000000..e79555c0
--- /dev/null
+++ b/pkg/tar/share/etc/aerospike.conf
@@ -0,0 +1,54 @@
+# Aerospike database configuration file.
+
+# This stanza must come first.
+service {
+	user ${user}
+	group ${group}
+	paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1.
+	pidfile ${home}/var/run/aerospike.pid
+	proto-fd-max 15000
+	work-directory ${home}/var
+}
+
+logging {
+	# Log file must be an absolute path.
+	file ${home}/var/log/aerospike.log {
+		context any info
+	}
+}
+
+mod-lua {
+	system-path ${home}/share/udf/lua
+	user-path ${home}/var/udf/lua
+}
+
+network {
+	service {
+		address ${service_addr}
+		port ${service_port}
+	}
+
+	heartbeat {
+		mode multicast
+		multicast-group ${multicast_addr}
+		port ${multicast_port}
+
+		interval 150
+		timeout 10
+	}
+
+	fabric {
+		port ${fabric_port}
+	}
+
+	info {
+		port ${info_port}
+	}
+}
+
+namespace test {
+	replication-factor 2
+	memory-size 4G
+	default-ttl 30d # 30 days, use 0 to never expire/evict.
+	storage-engine memory
+}
diff --git a/pkg/tar/share/lib/aerospike-render.py b/pkg/tar/share/lib/aerospike-render.py
new file mode 100644
index 00000000..2c4a2044
--- /dev/null
+++ b/pkg/tar/share/lib/aerospike-render.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+'''
+SYNOPSIS
+
+    python aerospike-render.py