Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

eliminated unneeded offset #118

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::AVX2>(const uint8_t *pdatai
{
size_t i = 0;
size_t datasize64 = (datasize >> 6) << 6;
- if (datasize64 > 64)
+ if (datasize64 >= 64)
{
const __m256i v1 = _mm256_set1_epi8(1);
__m256i vdata = _mm256_loadu_si256((const __m256i*)pdatain);
__m256i vBfr = _mm256_set1_epi16(((m_BitBfr << 8) & 0xFF00) | ((m_BitBfr >> 8) & 0xFF));
__m256i vdata_alignr16b_init = _mm256_permute2f128_si256(vBfr, vdata, 1 | (2<<4));
__m256i vdata_prev1 = _mm256_alignr_epi8(vdata, vdata_alignr16b_init, 15);
__m256i vdata_prev2 = _mm256_alignr_epi8(vdata, vdata_alignr16b_init, 14);
- for ( ; i < datasize64 - 64; i += 64)
+ for ( ; i < datasize64; i += 64)
{
for (int c = 0; c < 64; c += 32) // this might force compiler to unroll the loop so we might have 2 loads in parallel
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::AVX512>(const uint8_t *pdat
{
size_t i = 0;
size_t datasize128 = (datasize >> 7) << 7;
- if (datasize128 > 128)
+ if (datasize128 >= 128)
{
const __m512i v1 = _mm512_set1_epi8(1);
const __m512i v254 = _mm512_set1_epi8(-2);
Expand All @@ -24,7 +24,7 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::AVX512>(const uint8_t *pdat
__m512i vdata_alignr48b_init = _mm512_alignr_epi32(vdata, vBfr, 12);
__m512i vdata_prev1 = _mm512_alignr_epi8(vdata, vdata_alignr48b_init, 15);
__m512i vdata_prev2 = _mm512_alignr_epi8(vdata, vdata_alignr48b_init, 14);
- for ( ; i < datasize128 - 128; i += 128)
+ for ( ; i < datasize128; i += 128)
{
for (int c = 0; c < 128; c += 64) // this might force compiler to unroll the loop so we might have 2 loads in parallel
{
Expand Down
4 changes: 2 additions & 2 deletions vk_video_decoder/libs/NvVideoParser/src/NextStartCodeNEON.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::NEON>(const uint8_t *pdatai
{
size_t i = 0;
size_t datasize32 = (datasize >> 5) << 5;
- if (datasize32 > 32)
+ if (datasize32 >= 32)
{
const uint8x16_t v0 = vdupq_n_u8(0);
const uint8x16_t v1 = vdupq_n_u8(1);
Expand All @@ -25,7 +25,7 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::NEON>(const uint8_t *pdatai
uint8x16_t vdata_prev2 = vextq_u8(vBfr, vdata, 14);
uint8_t idx0n[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
uint8x16_t v015 = vld1q_u8(idx0n);
- for ( ; i < datasize32 - 32; i += 32)
+ for ( ; i < datasize32; i += 32)
{
for (int c = 0; c < 32; c += 16)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::SSSE3>(const uint8_t *pdata
{
size_t i = 0;
size_t datasize32 = (datasize >> 5) << 5;
- if (datasize32 > 32)
+ if (datasize32 >= 32)
{
const __m128i v1 = _mm_set1_epi8(1);
__m128i vdata = _mm_loadu_si128((const __m128i*)pdatain);
__m128i vBfr = _mm_set1_epi16(((m_BitBfr << 8) & 0xFF00) | ((m_BitBfr >> 8) & 0xFF));
__m128i vdata_prev1 = _mm_alignr_epi8(vdata, vBfr, 15);
__m128i vdata_prev2 = _mm_alignr_epi8(vdata, vBfr, 14);
- for ( ; i < datasize32 - 32; i += 32)
+ for ( ; i < datasize32; i += 32)
{
for (int c = 0; c < 32; c += 16)
{
Expand Down