Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] Integrate SOMAColumn: Arrow adapter methods, part 1 #3405

Merged
merged 26 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
a8ed930
Remove fmt::format
XanthosXanthopoulos Nov 20, 2024
abd41cc
Remove unneeded methods and member variables
XanthosXanthopoulos Nov 21, 2024
b25b983
Add minimal testing for dimensions
XanthosXanthopoulos Nov 21, 2024
a0e0409
Replace string_view with string when returning column name, add curre…
XanthosXanthopoulos Dec 6, 2024
56466a4
Remove current_domain flag
XanthosXanthopoulos Nov 24, 2024
e3c3e59
Replace string_view with string when returning column name, add curre…
XanthosXanthopoulos Dec 6, 2024
f181c11
Update CMake files
XanthosXanthopoulos Dec 12, 2024
d376fba
Add minimal testing for dimensions
XanthosXanthopoulos Nov 21, 2024
3e777ab
Misc fixes
XanthosXanthopoulos Nov 21, 2024
00f31e7
Add read test case
XanthosXanthopoulos Nov 22, 2024
6928bc5
Remove current_domain flag
XanthosXanthopoulos Nov 24, 2024
7a97246
Do not export soma column [skip ci]
XanthosXanthopoulos Nov 25, 2024
56ad949
Replace string_view with string when returning column name, add curre…
XanthosXanthopoulos Dec 6, 2024
6ab035f
Add function to extract data from ArrowTable into std::array
XanthosXanthopoulos Dec 6, 2024
04c8ac0
Migrate array creation to SOMAColumn
XanthosXanthopoulos Dec 6, 2024
37e21ef
Fix string current domain in unit dataframe tests
XanthosXanthopoulos Dec 6, 2024
12d2fde
Fix current domain unit test on string dimension
XanthosXanthopoulos Dec 6, 2024
d376d20
Remove unused methods
XanthosXanthopoulos Dec 6, 2024
5128195
Misc fixes
XanthosXanthopoulos Jan 14, 2025
275f5c8
Address review comments
XanthosXanthopoulos Jan 17, 2025
c580bcc
Replace Skip template parameter with a function parameter
XanthosXanthopoulos Jan 21, 2025
312a5ef
Address review comment about unit test
XanthosXanthopoulos Jan 21, 2025
cb52fa9
[c++] SOMAColumn serialization/deserialization (#3599)
XanthosXanthopoulos Jan 22, 2025
b631e87
[c++] Make `SOMAColumn` metadata required only for `GeometryDataframe…
XanthosXanthopoulos Jan 23, 2025
1cbd46d
Apply suggestions from code review
XanthosXanthopoulos Jan 23, 2025
533247c
Rename constants
XanthosXanthopoulos Jan 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apis/r/tests/testthat/test-11-shape.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ test_that("SOMADataFrame shape", {
list(soma_joinid = c(0, 999), int_column = c(-10000, 10000)),
list(soma_joinid = c(0, 999), string_column = NULL),
list(string_column = NULL, int_column = c(-10000, 10000)),
list(string_column = c("apple", "zebra"), int_column = c(-10000, 10000))
list(string_column = c("", ""), int_column = c(-10000, 10000))
)

# Check the test configs themselves to make sure someone (ahem, me)
Expand Down
57 changes: 57 additions & 0 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ void SOMAArray::create(
std::string_view uri,
ArraySchema schema,
std::string_view soma_type,
std::optional<std::string_view> soma_schema,
std::optional<TimestampRange> timestamp) {
Array::create(std::string(uri), schema);

Expand Down Expand Up @@ -54,6 +55,14 @@ void SOMAArray::create(
TILEDB_STRING_UTF8,
static_cast<uint32_t>(ENCODING_VERSION_VAL.length()),
ENCODING_VERSION_VAL.c_str());

if (soma_schema.has_value()) {
array->put_metadata(
TILEDB_SOMA_SCHEMA_KEY,
TILEDB_STRING_UTF8,
static_cast<uint32_t>(soma_schema->length()),
soma_schema->data());
}
}

std::unique_ptr<SOMAArray> SOMAArray::open(
Expand Down Expand Up @@ -120,6 +129,7 @@ SOMAArray::SOMAArray(
validate(mode, name, timestamp);
reset(column_names, batch_size, result_order);
fill_metadata_cache(timestamp);
fill_columns();
}

SOMAArray::SOMAArray(
Expand All @@ -138,6 +148,7 @@ SOMAArray::SOMAArray(
validate(mode, name, timestamp);
reset(column_names, batch_size, result_order);
fill_metadata_cache(timestamp);
fill_columns();
}

SOMAArray::SOMAArray(
Expand All @@ -154,6 +165,7 @@ SOMAArray::SOMAArray(
, schema_(std::make_shared<ArraySchema>(arr->schema())) {
reset({}, batch_size_, result_order_);
fill_metadata_cache(timestamp);
fill_columns();
}

void SOMAArray::fill_metadata_cache(std::optional<TimestampRange> timestamp) {
Expand Down Expand Up @@ -188,6 +200,51 @@ void SOMAArray::fill_metadata_cache(std::optional<TimestampRange> timestamp) {
}
}

/**
 * Rebuild the cached SOMAColumn list for this array.
 *
 * For a ``SOMAGeometryDataFrame`` the column descriptions are read from the
 * ``TILEDB_SOMA_SCHEMA_KEY`` metadata entry; every other array type is
 * reconstructed from an empty column description.
 *
 * @throws TileDBSOMAError if a ``SOMAGeometryDataFrame`` lacks the schema
 * metadata entry, or the entry lacks the ``TILEDB_SOMA_SCHEMA_COL_KEY`` field.
 */
void SOMAArray::fill_columns() {
    // Drop any previously built columns; this also runs on reopen.
    columns_.clear();

    const bool is_geometry_dataframe = type().value_or("") ==
                                       "SOMAGeometryDataFrame";

    if (!is_geometry_dataframe) {
        // Non-geometry dataframes have trivially constructible columns and
        // do not require a schema.
        columns_ = SOMAColumn::deserialize(
            nlohmann::json::array(), *ctx_->tiledb_ctx(), *arr_);
        return;
    }

    if (!has_metadata(TILEDB_SOMA_SCHEMA_KEY)) {
        throw TileDBSOMAError(std::format(
            "[SOMAArray][fill_columns] Missing required metadata key '{}' "
            "from SOMAGeometryDataFrame '{}'",
            TILEDB_SOMA_SCHEMA_KEY,
            uri()));
    }

    // The metadata value carries the element count in slot 1 and the raw
    // bytes in slot 2.
    const auto raw_schema = get_metadata(TILEDB_SOMA_SCHEMA_KEY).value();
    const auto* bytes = static_cast<const char*>(std::get<2>(raw_schema));

    // A null payload is treated as an empty JSON object, which then fails
    // the column-key check below.
    nlohmann::json schema_extension = nlohmann::json::object();
    if (bytes != nullptr) {
        schema_extension = nlohmann::json::parse(
            std::string(bytes, std::get<1>(raw_schema)));
    }

    if (!schema_extension.contains(TILEDB_SOMA_SCHEMA_COL_KEY)) {
        throw TileDBSOMAError(std::format(
            "[SOMAArray][fill_columns] Missing '{}' key from '{}'",
            TILEDB_SOMA_SCHEMA_COL_KEY,
            TILEDB_SOMA_SCHEMA_KEY));
    }

    const auto column_descriptions = schema_extension.value(
        TILEDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array());

    columns_ = SOMAColumn::deserialize(
        column_descriptions, *ctx_->tiledb_ctx(), *arr_);
}

/**
 * @return A copy of the URI stored for this array.
 */
const std::string SOMAArray::uri() const {
    return uri_;
}
Expand Down
8 changes: 8 additions & 0 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "enums.h"
#include "logger_public.h"
#include "managed_query.h"
#include "soma_column.h"
#include "soma_object.h"

// ================================================================
Expand Down Expand Up @@ -93,6 +94,7 @@ class SOMAArray : public SOMAObject {
std::string_view uri,
ArraySchema schema,
std::string_view soma_type,
std::optional<std::string_view> soma_schema = std::nullopt,
std::optional<TimestampRange> timestamp = std::nullopt);

/**
Expand Down Expand Up @@ -208,6 +210,7 @@ class SOMAArray : public SOMAObject {
, first_read_next_(other.first_read_next_)
, submitted_(other.submitted_) {
fill_metadata_cache(timestamp_);
fill_columns();
}

SOMAArray(
Expand Down Expand Up @@ -1519,6 +1522,8 @@ class SOMAArray : public SOMAObject {

void fill_metadata_cache(std::optional<TimestampRange> timestamp);

void fill_columns();

// SOMAArray URI
std::string uri_;

Expand All @@ -1541,6 +1546,9 @@ class SOMAArray : public SOMAObject {
// Metadata cache
std::map<std::string, MetadataValue> metadata_;

// SOMAColumn list
std::vector<std::shared_ptr<SOMAColumn>> columns_;

// Read timestamp range (start, end)
std::optional<TimestampRange> timestamp_;

Expand Down
49 changes: 48 additions & 1 deletion libtiledbsoma/src/soma/soma_attribute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,43 @@
#include "soma_attribute.h"

namespace tiledbsoma {
/**
 * Rebuild a ``SOMAAttribute`` column from its serialized JSON description.
 *
 * @param soma_schema JSON description of a single column. It must contain
 * the ``TILEDB_SOMA_SCHEMA_COL_ATTR_KEY`` field listing exactly one attribute
 * name.
 * @param ctx TileDB context used to look up the attribute's enumeration.
 * @param array Array whose schema is queried for the named attribute.
 * @return The reconstructed column, or ``nullptr`` when the array schema no
 * longer contains the named attribute.
 * @throws TileDBSOMAError if the attribute field is missing or does not list
 * exactly one attribute name.
 */
std::shared_ptr<SOMAColumn> SOMAAttribute::deserialize(
    const nlohmann::json& soma_schema, const Context& ctx, const Array& array) {
    if (!soma_schema.contains(TILEDB_SOMA_SCHEMA_COL_ATTR_KEY)) {
        throw TileDBSOMAError(
            "[SOMAAttribute][deserialize] Missing required field "
            "'tiledb_attributes'");
    }

    const std::vector<std::string>
        attribute_names = soma_schema[TILEDB_SOMA_SCHEMA_COL_ATTR_KEY]
                              .template get<std::vector<std::string>>();

    if (attribute_names.size() != 1) {
        throw TileDBSOMAError(std::format(
            "[SOMAAttribute][deserialize] Invalid number of attributes. "
            "Expected 1, got {}",
            attribute_names.size()));
    }

    const auto schema = array.schema();
    if (!schema.has_attribute(attribute_names[0])) {
        // Attribute probably dropped so skip column reconstruction.
        return nullptr;
    }

    auto attribute = schema.attribute(attribute_names[0]);
    const auto enumeration_name = AttributeExperimental::get_enumeration_name(
        ctx, attribute);

    // Only fetch the enumeration when the attribute actually has one.
    std::optional<Enumeration>
        enumeration = enumeration_name ?
                          std::make_optional(ArrayExperimental::get_enumeration(
                              ctx, array, attribute.name())) :
                          std::nullopt;

    return std::make_shared<SOMAAttribute>(attribute, enumeration);
}

std::shared_ptr<SOMAAttribute> SOMAAttribute::create(
std::shared_ptr<Context> ctx,
ArrowSchema* schema,
Expand Down Expand Up @@ -105,4 +142,14 @@ ArrowSchema* SOMAAttribute::arrow_schema_slot(
attribute, *ctx.tiledb_ctx(), array)
.release();
}
} // namespace tiledbsoma

/**
 * Append this column's JSON description to ``columns_schema``.
 *
 * The appended entry holds the attribute column type tag and a one-element
 * list containing the name of the wrapped TileDB attribute.
 */
void SOMAAttribute::serialize(nlohmann::json& columns_schema) const {
    const auto type_tag = static_cast<uint32_t>(
        soma_column_datatype_t::SOMA_COLUMN_ATTRIBUTE);

    nlohmann::json entry;
    entry[TILEDB_SOMA_SCHEMA_COL_TYPE_KEY] = type_tag;
    entry[TILEDB_SOMA_SCHEMA_COL_ATTR_KEY] = nlohmann::json::array(
        {attribute.name()});

    columns_schema.push_back(entry);
}
} // namespace tiledbsoma
11 changes: 11 additions & 0 deletions libtiledbsoma/src/soma/soma_attribute.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ using namespace tiledb;

class SOMAAttribute : public SOMAColumn {
public:
//===================================================================
//= public static
//===================================================================

static std::shared_ptr<SOMAColumn> deserialize(
const nlohmann::json& soma_schema,
const Context& ctx,
const Array& array);

/**
* Create a ``SOMAAttribute`` shared pointer from an Arrow schema
*/
Expand Down Expand Up @@ -95,6 +104,8 @@ class SOMAAttribute : public SOMAColumn {
ArrowSchema* arrow_schema_slot(
const SOMAContext& ctx, Array& array) override;

void serialize(nlohmann::json&) const override;

private:
void _set_dim_points(
const std::unique_ptr<ManagedQuery>& query,
Expand Down
92 changes: 91 additions & 1 deletion libtiledbsoma/src/soma/soma_column.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,98 @@

#include "soma_column.h"

#include "soma_attribute.h"
#include "soma_dimension.h"
#include "soma_geometry_column.h"

namespace tiledbsoma {

// Dispatch table used by SOMAColumn::deserialize: maps the numeric column
// type tag stored in a serialized column entry to the factory function that
// rebuilds that kind of column.
std::map<uint32_t, SOMAColumn::Factory> SOMAColumn::deserialiser_map = {
{soma_column_datatype_t::SOMA_COLUMN_ATTRIBUTE, SOMAAttribute::deserialize},
{soma_column_datatype_t::SOMA_COLUMN_DIMENSION, SOMADimension::deserialize},
{soma_column_datatype_t::SOMA_COLUMN_GEOMETRY,
SOMAGeometryColumn::deserialize}};

/**
 * Rebuild the list of SOMA columns for an array.
 *
 * When ``soma_schema_columns`` is non-empty, each entry is dispatched to the
 * factory registered for its column type; entries whose factory returns
 * ``nullptr`` are skipped. Any array attribute not claimed by a reconstructed
 * column is then appended as a plain ``SOMAAttribute``.
 *
 * When ``soma_schema_columns`` is empty, one ``SOMADimension`` is created per
 * array dimension followed by one ``SOMAAttribute`` per array attribute.
 *
 * @param soma_schema_columns JSON list of serialized column descriptions
 * (may be empty).
 * @param ctx TileDB context used for enumeration lookups.
 * @param array Array whose schema backs the columns.
 * @return The reconstructed columns.
 * @throws TileDBSOMAError if an entry has no column type field or names a
 * column type with no registered factory.
 */
std::vector<std::shared_ptr<SOMAColumn>> SOMAColumn::deserialize(
    const nlohmann::json& soma_schema_columns,
    const Context& ctx,
    const Array& array) {
    std::vector<std::shared_ptr<SOMAColumn>> columns;

    // Fetch the schema once instead of on every loop iteration.
    const auto schema = array.schema();

    // Wraps a TileDB attribute (plus its enumeration, if any) in a
    // SOMAAttribute column.
    const auto make_attribute_column = [&ctx, &array](const auto& attribute) {
        const auto
            enumeration_name = AttributeExperimental::get_enumeration_name(
                ctx, attribute);
        auto enumeration = enumeration_name.has_value() ?
                               std::make_optional(
                                   ArrayExperimental::get_enumeration(
                                       ctx, array, attribute.name())) :
                               std::nullopt;

        return std::make_shared<SOMAAttribute>(attribute, enumeration);
    };

    if (!soma_schema_columns.empty()) {
        for (const auto& column : soma_schema_columns) {
            // operator[] on a const json requires the key to exist, so
            // check explicitly and report a proper error.
            if (!column.contains(TILEDB_SOMA_SCHEMA_COL_TYPE_KEY)) {
                throw TileDBSOMAError(std::format(
                    "[SOMAColumn][deserialize] Missing required field '{}'",
                    TILEDB_SOMA_SCHEMA_COL_TYPE_KEY));
            }

            const auto type = column[TILEDB_SOMA_SCHEMA_COL_TYPE_KEY]
                                  .template get<uint32_t>();

            // Use find() rather than operator[]: the latter would insert a
            // null factory for an unknown type and then call through it.
            const auto factory = deserialiser_map.find(type);
            if (factory == deserialiser_map.end()) {
                throw TileDBSOMAError(std::format(
                    "[SOMAColumn][deserialize] Unknown column type {}", type));
            }

            auto col = factory->second(column, ctx, array);

            if (col) {
                // Deserialized column can be null in case the array is
                // modified and the column no longer exists.
                columns.push_back(std::move(col));
            }
        }

        // Check for any newly added attributes
        std::unordered_set<std::string> used_attribute_names;

        for (const auto& col : columns) {
            const auto attributes = col->tiledb_attributes();
            if (!attributes.has_value()) {
                continue;
            }

            for (const auto& attribute : *attributes) {
                used_attribute_names.insert(attribute.name());
            }
        }

        for (size_t i = 0; i < schema.attribute_num(); ++i) {
            const auto attribute = schema.attribute(i);

            // Attribute is already used by another column so we skip
            if (used_attribute_names.contains(attribute.name())) {
                continue;
            }

            columns.push_back(make_attribute_column(attribute));
        }
    } else {
        // All arrays before the introduction of SOMAColumn do not have
        // composite columns, thus the metadata are trivially constructible
        for (const auto& dimension : schema.domain().dimensions()) {
            columns.push_back(std::make_shared<SOMADimension>(dimension));
        }

        for (const auto& attribute : schema.attributes()) {
            columns.push_back(make_attribute_column(attribute.second));
        }
    }

    return columns;
}

template <>
std::pair<std::string, std::string> SOMAColumn::core_domain_slot<std::string>()
const {
Expand Down Expand Up @@ -59,4 +149,4 @@ SOMAColumn::core_current_domain_slot<std::string>(
throw TileDBSOMAError(e.what());
}
}
} // namespace tiledbsoma
} // namespace tiledbsoma
18 changes: 18 additions & 0 deletions libtiledbsoma/src/soma/soma_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <any>
#include <format>
#include <map>
#include <optional>
#include <span>
#include <string>
Expand All @@ -34,6 +35,15 @@ using namespace tiledb;

class SOMAColumn {
public:
//===================================================================
//= public static
//===================================================================

static std::vector<std::shared_ptr<SOMAColumn>> deserialize(
const nlohmann::json& soma_schema,
const Context& ctx,
const Array& array);

//===================================================================
//= public non-static
//===================================================================
Expand Down Expand Up @@ -455,6 +465,8 @@ class SOMAColumn {
}
}

virtual void serialize(nlohmann::json&) const = 0;

protected:
virtual void _set_dim_points(
const std::unique_ptr<ManagedQuery>& query,
Expand All @@ -481,6 +493,12 @@ class SOMAColumn {
const SOMAContext& ctx, Array& array) const = 0;

virtual std::any _core_current_domain_slot(NDRectangle& ndrect) const = 0;

private:
typedef std::shared_ptr<SOMAColumn> (*Factory)(
const nlohmann::json&, const Context&, const Array&);

static std::map<uint32_t, Factory> deserialiser_map;
};

template <>
Expand Down
Loading