Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/issue 470 porting short but excellent pr from clickhouse #472

Draft
wants to merge 9 commits into
base: develop
Choose a base branch
from
11 changes: 10 additions & 1 deletion base/poco/Foundation/include/Poco/UTF32Encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,16 @@ class Foundation_API UTF32Encoding: public TextEncoding
int convert(int ch, unsigned char* bytes, int length) const;
int queryConvert(const unsigned char* bytes, int length) const;
int sequenceLength(const unsigned char* bytes, int length) const;


protected:
/// Converts a 32-bit value to int for use as a character code.
/// Values up to and including 0x10FFFF are returned unchanged;
/// anything larger is reported as -1.
static int safeToInt(Poco::UInt32 value)
{
	const bool inRange = (value <= 0x10FFFF);
	return inRange ? static_cast<int>(value) : -1;
}

private:
bool _flipBytes;
static const char* _names[];
Expand Down
42 changes: 21 additions & 21 deletions base/poco/Foundation/src/UTF32Encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,22 @@ const char* UTF32Encoding::_names[] =

const TextEncoding::CharacterMap UTF32Encoding::_charMap =
{
/* 00 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 10 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 20 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 30 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 40 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 50 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 60 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 70 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 80 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 90 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* a0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* b0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* c0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* d0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* e0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* f0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* 00 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* 10 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* 20 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* 30 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* 40 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* 50 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* 60 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* 70 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* 80 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* 90 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* a0 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* b0 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* c0 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* d0 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* e0 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
/* f0 */ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
};


Expand Down Expand Up @@ -118,7 +118,7 @@ const TextEncoding::CharacterMap& UTF32Encoding::characterMap() const
int UTF32Encoding::convert(const unsigned char* bytes) const
{
UInt32 uc;
unsigned char* p = (unsigned char*) &uc;
unsigned char* p = reinterpret_cast<unsigned char*>(&uc);
*p++ = *bytes++;
*p++ = *bytes++;
*p++ = *bytes++;
Expand All @@ -129,7 +129,7 @@ int UTF32Encoding::convert(const unsigned char* bytes) const
ByteOrder::flipBytes(uc);
}

return uc;
return safeToInt(uc);
}


Expand All @@ -138,7 +138,7 @@ int UTF32Encoding::convert(int ch, unsigned char* bytes, int length) const
if (bytes && length >= 4)
{
UInt32 ch1 = _flipBytes ? ByteOrder::flipBytes((UInt32) ch) : (UInt32) ch;
unsigned char* p = (unsigned char*) &ch1;
unsigned char* p = reinterpret_cast<unsigned char*>(&ch1);
*bytes++ = *p++;
*bytes++ = *p++;
*bytes++ = *p++;
Expand All @@ -155,14 +155,14 @@ int UTF32Encoding::queryConvert(const unsigned char* bytes, int length) const
if (length >= 4)
{
UInt32 uc;
unsigned char* p = (unsigned char*) &uc;
unsigned char* p = reinterpret_cast<unsigned char*>(&uc);
*p++ = *bytes++;
*p++ = *bytes++;
*p++ = *bytes++;
*p++ = *bytes++;
if (_flipBytes)
ByteOrder::flipBytes(uc);
return uc;
ret = safeToInt(uc);
}

return ret;
Expand Down
11 changes: 11 additions & 0 deletions src/Columns/ColumnVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,17 @@ class ColumnVector final : public COWHelper<ColumnVectorHelper, ColumnVector<T>>
data.push_back(assert_cast<const Self &>(src).getData()[n]);
}

void insertManyFrom(const IColumn & src, size_t position, size_t length) override
{
ValueType v = assert_cast<const Self &>(src).getData()[position];
data.resize_fill(data.size() + length, v);
}

void insertMany(const Field & field, size_t length) override
{
data.resize_fill(data.size() + length, static_cast<T>(field.get<T>()));
}

void insertData(const char * pos, size_t) override
{
data.emplace_back(unalignedLoad<T>(pos));
Expand Down
7 changes: 6 additions & 1 deletion src/Columns/FilterDescription.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,12 @@ FilterDescription::FilterDescription(const IColumn & column_)

size_t size = res.size();
for (size_t i = 0; i < size; ++i)
res[i] = res[i] && !null_map[i];
{
auto has_val = static_cast<UInt8>(!!res[i]);
auto not_null = static_cast<UInt8>(!null_map[i]);
/// Use the bitwise AND (&) rather than the logical AND (&&) so that the compiler can auto-vectorize this loop.
res[i] = has_val & not_null;
}

data = &res;
data_holder = std::move(mutable_holder);
Expand Down
19 changes: 16 additions & 3 deletions src/Common/HashTable/HashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int NO_AVAILABLE_DATA;
extern const int CANNOT_ALLOCATE_MEMORY;
}
}

Expand Down Expand Up @@ -501,9 +502,21 @@ class HashTable : private boost::noncopyable,
return place_value;
}

/// Computes the number of bytes needed for `buffer_size` cells.
/// Throws DB::Exception with CANNOT_ALLOCATE_MEMORY when common::mulOverflow
/// reports that buffer_size * sizeof(Cell) overflowed.
static size_t allocCheckOverflow(size_t buffer_size)
{
    size_t bytes = 0;
    const bool overflowed = common::mulOverflow(buffer_size, sizeof(Cell), bytes);

    if (!overflowed)
        return bytes;

    throw DB::Exception(
        DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY,
        "Integer overflow trying to allocate memory for HashTable. Trying to allocate {} cells of {} bytes each",
        buffer_size, sizeof(Cell));
}

void alloc(const Grower & new_grower)
{
buf = reinterpret_cast<Cell *>(Allocator::alloc(new_grower.bufSize() * sizeof(Cell)));
buf = reinterpret_cast<Cell *>(Allocator::alloc(allocCheckOverflow(new_grower.bufSize())));
grower = new_grower;
}

Expand Down Expand Up @@ -560,11 +573,11 @@ class HashTable : private boost::noncopyable,

if constexpr (Cell::need_to_notify_cell_during_move)
{
buf = reinterpret_cast<Cell *>(Allocator::alloc(new_grower.bufSize() * sizeof(Cell)));
buf = reinterpret_cast<Cell *>(Allocator::alloc(allocCheckOverflow(new_grower.bufSize())));
memcpy(reinterpret_cast<void *>(buf), reinterpret_cast<const void *>(old_buffer.get()), old_buffer_size);
}
else
buf = reinterpret_cast<Cell *>(Allocator::realloc(buf, old_buffer_size, new_grower.bufSize() * sizeof(Cell)));
buf = reinterpret_cast<Cell *>(Allocator::realloc(buf, old_buffer_size, allocCheckOverflow(new_grower.bufSize())));

grower = new_grower;

Expand Down
6 changes: 3 additions & 3 deletions src/Common/SipHash.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class SipHash
current_word = 0;
}

void update(const char * data, UInt64 size)
ALWAYS_INLINE void update(const char * data, UInt64 size)
{
const char * end = data + size;

Expand Down Expand Up @@ -137,12 +137,12 @@ class SipHash
}

template <typename T>
void update(const T & x)
ALWAYS_INLINE void update(const T & x)
{
update(reinterpret_cast<const char *>(&x), sizeof(x));
}

void update(const std::string & x)
ALWAYS_INLINE void update(const std::string & x)
{
update(x.data(), x.length());
}
Expand Down
8 changes: 7 additions & 1 deletion src/DataTypes/Serializations/SerializationString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ template <int UNROLL_TIMES>
static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit)
{
size_t offset = data.size();
/// Avoiding calling resize in a loop improves the performance.
data.resize(std::max(data.capacity(), static_cast<size_t>(4096)));

for (size_t i = 0; i < limit; ++i)
{
if (istr.eof())
Expand All @@ -161,7 +164,8 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnSt
offset += size + 1;
offsets.push_back(offset);

data.resize(offset);
if (unlikely(offset > data.size()))
data.resize_exact(roundUpToPowerOfTwoOrZero(std::max(offset, data.size() * 2)));

if (size)
{
Expand Down Expand Up @@ -193,6 +197,8 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnSt

data[offset - 1] = 0;
}

data.resize(offset);
}


Expand Down
6 changes: 3 additions & 3 deletions src/IO/BitHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class BitReader
{}

// reads bits_to_read high-bits from bits_buffer
inline UInt64 readBits(UInt8 bits_to_read)
ALWAYS_INLINE inline UInt64 readBits(UInt8 bits_to_read)
{
if (bits_to_read > bits_count)
fillBitBuffer();
Expand All @@ -72,7 +72,7 @@ class BitReader
return getBitsFromBitBuffer<PEEK>(8);
}

inline UInt8 readBit()
ALWAYS_INLINE inline UInt8 readBit()
{
return static_cast<UInt8>(readBits(1));
}
Expand Down Expand Up @@ -123,7 +123,7 @@ class BitReader


// Fills internal bits_buffer with data from source, reads at most 64 bits
size_t fillBitBuffer()
ALWAYS_INLINE size_t fillBitBuffer()
{
const size_t available = source_end - source_current;
const auto bytes_to_read = std::min<size_t>(64 / 8, available);
Expand Down
6 changes: 5 additions & 1 deletion src/Interpreters/ThreadStatusExt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,11 +405,15 @@ void ThreadStatus::detachQuery(bool exit_if_already_detached, bool thread_exits)
query_context.reset();
thread_trace_context.trace_id = 0;
thread_trace_context.span_id = 0;

/// The contents of thread_group->finished_threads_counters_memory are moved into this vector so that they are deallocated outside the critical section.
std::vector<ThreadGroupStatus::ProfileEventsCountersAndMemory> move_to_temp;

/// Avoid leaking of ThreadGroupStatus::finished_threads_counters_memory
/// (this is in case someone uses system thread but did not call getProfileEventsCountersAndMemoryForThreads())
{
std::lock_guard guard(thread_group->mutex);
auto stats = std::move(thread_group->finished_threads_counters_memory);
move_to_temp = std::move(thread_group->finished_threads_counters_memory);
}

thread_group.reset();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Creates a SummingMergeTree table with an AggregateFunction column and a SimpleAggregateFunction column,
-- then inserts a plain string literal; the INSERT is expected to fail on the client with
-- CANNOT_ALLOCATE_MEMORY (see the expected-error annotation on that statement).
-- NOTE(review): presumably the string is parsed as a serialized aggregate state with a bogus size - confirm.
DROP TABLE IF EXISTS tab;
create table tab (d Int64, s AggregateFunction(groupUniqArrayArray, Array(UInt64)), c SimpleAggregateFunction(groupUniqArrayArray, Array(UInt64))) engine = SummingMergeTree() order by d;
INSERT INTO tab VALUES (1, 'このコー'); -- { clientError CANNOT_ALLOCATE_MEMORY }
DROP TABLE tab;