-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Enhancement] Filter row group using runtime filter before prepare next row group in FileReader #54868
[Enhancement] Filter row group using runtime filter before prepare next row group in FileReader #54868
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,23 +16,18 @@ | |
|
||
#include <glog/logging.h> | ||
|
||
#include <algorithm> | ||
#include <atomic> | ||
#include <cstring> | ||
#include <iterator> | ||
#include <map> | ||
#include <sstream> | ||
#include <unordered_map> | ||
#include <unordered_set> | ||
#include <utility> | ||
#include <vector> | ||
|
||
#include "block_cache/kv_cache.h" | ||
#include "column/chunk.h" | ||
#include "column/column.h" | ||
#include "column/column_helper.h" | ||
#include "column/const_column.h" | ||
#include "column/datum.h" | ||
#include "column/vectorized_fwd.h" | ||
#include "common/compiler_util.h" | ||
#include "common/config.h" | ||
|
@@ -43,23 +38,19 @@ | |
#include "exprs/expr_context.h" | ||
#include "exprs/runtime_filter.h" | ||
#include "exprs/runtime_filter_bank.h" | ||
#include "formats/parquet/column_converter.h" | ||
#include "formats/parquet/metadata.h" | ||
#include "formats/parquet/scalar_column_reader.h" | ||
#include "formats/parquet/schema.h" | ||
#include "formats/parquet/statistics_helper.h" | ||
#include "formats/parquet/utils.h" | ||
#include "formats/parquet/zone_map_filter_evaluator.h" | ||
#include "fs/fs.h" | ||
#include "gen_cpp/PlanNodes_types.h" | ||
#include "gen_cpp/parquet_types.h" | ||
#include "gutil/casts.h" | ||
#include "gutil/strings/substitute.h" | ||
#include "io/shared_buffered_input_stream.h" | ||
#include "runtime/descriptors.h" | ||
#include "runtime/types.h" | ||
#include "storage/chunk_helper.h" | ||
#include "util/thrift_util.h" | ||
|
||
namespace starrocks::parquet { | ||
|
||
|
@@ -358,6 +349,9 @@ bool FileReader::_filter_group_with_more_filter(const GroupReaderPtr& group_read | |
// status and lead to the query failed. | ||
bool FileReader::_filter_group(const GroupReaderPtr& group_reader) { | ||
if (config::parquet_advance_zonemap_filter) { | ||
if (_scanner_ctx->rf_scan_range_pruner != nullptr) { | ||
_rf_scan_range_pruner = std::make_shared<OlapRuntimeScanRangePruner>(*_scanner_ctx->rf_scan_range_pruner); | ||
} | ||
auto res = _scanner_ctx->predicate_tree.visit( | ||
ZoneMapEvaluator<FilterLevel::ROW_GROUP>{_scanner_ctx->predicate_tree, group_reader.get()}); | ||
if (!res.ok()) { | ||
|
@@ -386,6 +380,29 @@ bool FileReader::_filter_group(const GroupReaderPtr& group_reader) { | |
} | ||
} | ||
|
||
// Checks whether `group_reader`'s row group can be skipped using the predicates handed out by
// `_rf_scan_range_pruner`.
//
// Returns:
//   - true  if, for at least one predicate list passed to the callback, the row-group-level
//           zone map evaluation succeeded and produced a result whose span_size() is 0;
//   - false otherwise, including when config::parquet_advance_zonemap_filter is disabled or
//           `_rf_scan_range_pruner` is null (no evaluation is attempted in that case);
//   - a non-OK status if update_range_if_arrived() itself fails (propagated by RETURN_IF_ERROR).
//
// NOTE(review): presumably update_range_if_arrived() only invokes the callback for runtime
// filters that have newly arrived, and reports end-of-file when a filter is always false;
// that is what the caller in get_next() assumes — confirm against the pruner implementation.
StatusOr<bool> FileReader::_update_rf_and_filter_group(const GroupReaderPtr& group_reader) { | ||
bool filter = false; | ||
if (config::parquet_advance_zonemap_filter && _rf_scan_range_pruner != nullptr) { | ||
RETURN_IF_ERROR(_rf_scan_range_pruner->update_range_if_arrived( | ||
&EMPTY_GLOBAL_DICTMAPS, | ||
// Callback: receives a column id (unused here) and that column's predicate list.
[&filter, &group_reader](auto cid, const PredicateList& predicates) { | ||
// Combine the predicates into a single AND node so they are evaluated together.
PredicateCompoundNode<CompoundNodeType::AND> pred_tree; | ||
for (const auto& pred : predicates) { | ||
pred_tree.add_child(PredicateColumnNode{pred}); | ||
} | ||
auto real_tree = PredicateTree::create(std::move(pred_tree)); | ||
|
||
// Evaluate the temporary tree against this row group at ROW_GROUP filter level.
auto res = real_tree.visit(ZoneMapEvaluator<FilterLevel::ROW_GROUP>{real_tree, group_reader.get()}); | ||
// Only a successful evaluation that holds a value with span_size() == 0 marks the group
// as filtered; a failed or valueless evaluation leaves `filter` unchanged.
if (res.ok() && res->has_value() && res->value().span_size() == 0) { | ||
filter = true; | ||
} | ||
// Evaluation errors are deliberately not surfaced: the callback always reports OK.
return Status::OK(); | ||
}, | ||
// NOTE(review): the meaning of the trailing `true, 0` arguments is defined by
// update_range_if_arrived() and is not visible here — confirm before changing.
true, 0)); | ||
} | ||
return filter; | ||
} | ||
|
||
Status FileReader::_read_has_nulls(const GroupReaderPtr& group_reader, const std::vector<SlotDescriptor*>& slots, | ||
std::vector<bool>* has_nulls) { | ||
const HdfsScannerContext& ctx = *_scanner_ctx; | ||
|
@@ -609,12 +626,31 @@ Status FileReader::get_next(ChunkPtr* chunk) { | |
} | ||
if (status.is_end_of_file()) { | ||
// release previous RowGroupReader | ||
_row_group_readers[_cur_row_group_idx] = nullptr; | ||
_cur_row_group_idx++; | ||
if (_cur_row_group_idx < _row_group_size) { | ||
// prepare new group | ||
RETURN_IF_ERROR(_row_group_readers[_cur_row_group_idx]->prepare()); | ||
} | ||
do { | ||
_row_group_readers[_cur_row_group_idx] = nullptr; | ||
_cur_row_group_idx++; | ||
if (_cur_row_group_idx < _row_group_size) { | ||
const auto& cur_row_group = _row_group_readers[_cur_row_group_idx]; | ||
auto ret = _update_rf_and_filter_group(cur_row_group); | ||
if (ret.ok() && ret.value()) { | ||
// row group is filtered by runtime filter | ||
_group_reader_param.stats->parquet_filtered_row_groups += 1; | ||
continue; | ||
} else if (ret.status().is_end_of_file()) { | ||
// If rf is always false, will return eof | ||
_group_reader_param.stats->parquet_filtered_row_groups += | ||
(_row_group_size - _cur_row_group_idx); | ||
_row_group_readers.assign(_row_group_readers.size(), nullptr); | ||
_cur_row_group_idx = _row_group_size; | ||
break; | ||
} else { | ||
// do nothing, ignore the error code | ||
} | ||
|
||
RETURN_IF_ERROR(cur_row_group->prepare()); | ||
} | ||
break; | ||
} while (true); | ||
|
||
return Status::OK(); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The most risky bug in this code is: You can modify the code like this: Status FileReader::get_next(ChunkPtr* chunk) {
while (_cur_row_group_idx < _row_group_size) { // Add condition check here
const auto& cur_row_group = _row_group_readers[_cur_row_group_idx];
auto ret = _update_rf_and_filter_group(cur_row_group);
if (ret.ok() && ret.value()) {
_group_reader_param.stats->parquet_filtered_row_groups += 1;
_row_group_readers[_cur_row_group_idx] = nullptr; // Ensure the current reader is nullified after filtering
_cur_row_group_idx++;
continue;
} else if (ret.status().is_end_of_file()) {
_group_reader_param.stats->parquet_filtered_row_groups +=
(_row_group_size - _cur_row_group_idx);
_row_group_readers.assign(_row_group_readers.size(), nullptr);
_cur_row_group_idx = _row_group_size;
break;
} else {
// do nothing, ignore the error code
}
RETURN_IF_ERROR(cur_row_group->prepare());
break;
}
return Status::OK();
} There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not a good suggestion. |
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We definitely have to remove "Olap".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good suggestion; I will remove it in the next PR.