-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathvector_convert.h
397 lines (303 loc) · 9.44 KB
/
vector_convert.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
/************************** vector_convert.h *******************************
* Author: Agner Fog
* Date created: 2014-07-23
* Last modified: 2019-08-02
* Version: 1.40.00
* Project: vector class library
* Description:
* Header file for conversion between different vector classes with different
* sizes.
*
* (c) Copyright 2012-2019 Agner Fog.
* Apache License version 2.0 or later.
*****************************************************************************/
#ifndef VECTOR_CONVERT_H
#define VECTOR_CONVERT_H
#ifndef VECTORCLASS_H
#include "vectorclass.h"
#endif
#if VECTORCLASS_H > 19999
#error Incompatible versions of vector class library mixed
#endif
#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif
#if MAX_VECTOR_SIZE >= 256
/*****************************************************************************
*
* Extend from 128 to 256 bit vectors
*
*****************************************************************************/
#if INSTRSET >= 8 // AVX2. 256 bit integer vectors
// Sign extend: widen a Vec16c into a Vec16s (128 -> 256 bits)
static inline Vec16s extend (Vec16c const a) {
    Vec16s const widened = _mm256_cvtepi8_epi16(a);
    return widened;
}
// Zero extend: widen a Vec16uc into a Vec16us (128 -> 256 bits)
static inline Vec16us extend (Vec16uc const a) {
    Vec16us const widened = _mm256_cvtepu8_epi16(a);
    return widened;
}
// Sign extend: widen a Vec8s into a Vec8i (128 -> 256 bits)
static inline Vec8i extend (Vec8s const a) {
    Vec8i const widened = _mm256_cvtepi16_epi32(a);
    return widened;
}
// Zero extend: widen a Vec8us into a Vec8ui (128 -> 256 bits)
static inline Vec8ui extend (Vec8us const a) {
    Vec8ui const widened = _mm256_cvtepu16_epi32(a);
    return widened;
}
// Sign extend: widen a Vec4i into a Vec4q (128 -> 256 bits)
static inline Vec4q extend (Vec4i const a) {
    Vec4q const widened = _mm256_cvtepi32_epi64(a);
    return widened;
}
// Zero extend: widen a Vec4ui into a Vec4uq (128 -> 256 bits)
static inline Vec4uq extend (Vec4ui const a) {
    Vec4uq const widened = _mm256_cvtepu32_epi64(a);
    return widened;
}
#else // no AVX2. 256 bit vectors are emulated
// sign extend and zero extend functions:
// Vec16c -> Vec16s without AVX2: the result is assembled from
// extend_low(a) and extend_high(a)
static inline Vec16s extend (Vec16c const a) {
    Vec16s widened(extend_low(a), extend_high(a));
    return widened;
}
// Vec16uc -> Vec16us without AVX2: the result is assembled from
// extend_low(a) and extend_high(a)
static inline Vec16us extend (Vec16uc const a) {
    Vec16us widened(extend_low(a), extend_high(a));
    return widened;
}
// Vec8s -> Vec8i without AVX2: the result is assembled from
// extend_low(a) and extend_high(a)
static inline Vec8i extend (Vec8s const a) {
    Vec8i widened(extend_low(a), extend_high(a));
    return widened;
}
// Vec8us -> Vec8ui without AVX2: the result is assembled from
// extend_low(a) and extend_high(a)
static inline Vec8ui extend (Vec8us const a) {
    Vec8ui widened(extend_low(a), extend_high(a));
    return widened;
}
// Vec4i -> Vec4q without AVX2: the result is assembled from
// extend_low(a) and extend_high(a)
static inline Vec4q extend (Vec4i const a) {
    Vec4q widened(extend_low(a), extend_high(a));
    return widened;
}
// Vec4ui -> Vec4uq without AVX2: the result is assembled from
// extend_low(a) and extend_high(a)
static inline Vec4uq extend (Vec4ui const a) {
    Vec4uq widened(extend_low(a), extend_high(a));
    return widened;
}
#endif // AVX2
/*****************************************************************************
*
* Conversions between float and double
*
*****************************************************************************/
#if INSTRSET >= 7 // AVX. 256 bit float vectors
// Convert a Vec4f to a Vec4d (float -> double)
static inline Vec4d to_double (Vec4f const a) {
    Vec4d const as_double = _mm256_cvtps_pd(a);
    return as_double;
}
// Convert a Vec4d to a Vec4f (double -> float)
static inline Vec4f to_float (Vec4d const a) {
    Vec4f const as_float = _mm256_cvtpd_ps(a);
    return as_float;
}
#else // no AVX. 256 bit float vectors are emulated
// Convert a Vec4f to a Vec4d (float -> double) using two 128-bit conversions
static inline Vec4d to_double (Vec4f const a) {
    // first conversion works on a as given
    Vec2d const lower_pair = _mm_cvtps_pd(a);
    // second conversion works on a after _mm_movehl_ps(a, a)
    Vec2d const upper_pair = _mm_cvtps_pd(_mm_movehl_ps(a, a));
    Vec4d joined(lower_pair, upper_pair);
    return joined;
}
// Convert a Vec4d to a Vec4f (double -> float) using two 128-bit conversions
static inline Vec4f to_float (Vec4d const a) {
    Vec4f const from_low  = _mm_cvtpd_ps(a.get_low());
    Vec4f const from_high = _mm_cvtpd_ps(a.get_high());
    // merge the two partial results into one vector
    Vec4f const merged = _mm_movelh_ps(from_low, from_high);
    return merged;
}
#endif
/*****************************************************************************
*
* Reduce from 256 to 128 bit vectors
*
*****************************************************************************/
#if INSTRSET >= 10 // AVX512VL
// compress functions. overflow wraps around
// Compress a Vec16s into a Vec16c (256 -> 128 bits). Overflow wraps around
static inline Vec16c compress (Vec16s const a) {
    Vec16c const narrowed = _mm256_cvtepi16_epi8(a);
    return narrowed;
}
// Compress a Vec16us into a Vec16uc (256 -> 128 bits). Overflow wraps around
static inline Vec16uc compress (Vec16us const a) {
    Vec16uc const narrowed = _mm256_cvtepi16_epi8(a);
    return narrowed;
}
// Compress a Vec8i into a Vec8s (256 -> 128 bits). Overflow wraps around
static inline Vec8s compress (Vec8i const a) {
    Vec8s const narrowed = _mm256_cvtepi32_epi16(a);
    return narrowed;
}
// Compress a Vec8ui into a Vec8us (256 -> 128 bits). Overflow wraps around
static inline Vec8us compress (Vec8ui const a) {
    Vec8us const narrowed = _mm256_cvtepi32_epi16(a);
    return narrowed;
}
// Compress a Vec4q into a Vec4i (256 -> 128 bits). Overflow wraps around
static inline Vec4i compress (Vec4q const a) {
    Vec4i const narrowed = _mm256_cvtepi64_epi32(a);
    return narrowed;
}
// Compress a Vec4uq into a Vec4ui (256 -> 128 bits). Overflow wraps around
static inline Vec4ui compress (Vec4uq const a) {
    Vec4ui const narrowed = _mm256_cvtepi64_epi32(a);
    return narrowed;
}
#else // no AVX512
// compress functions. overflow wraps around
// Compress a Vec16s into a Vec16c by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec16c compress (Vec16s const a) {
    Vec16c const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec16us into a Vec16uc by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec16uc compress (Vec16us const a) {
    Vec16uc const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec8i into a Vec8s by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec8s compress (Vec8i const a) {
    Vec8s const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec8ui into a Vec8us by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec8us compress (Vec8ui const a) {
    Vec8us const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec4q into a Vec4i by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec4i compress (Vec4q const a) {
    Vec4i const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec4uq into a Vec4ui by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec4ui compress (Vec4uq const a) {
    Vec4ui const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
#endif // AVX512
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
/*****************************************************************************
*
* Extend from 256 to 512 bit vectors
*
*****************************************************************************/
#if INSTRSET >= 9 // AVX512. 512 bit integer vectors
// Sign extend: widen a Vec32c into a Vec32s (256 -> 512 bits)
static inline Vec32s extend (Vec32c const a) {
#if INSTRSET >= 10
    // direct 512-bit conversion intrinsic
    Vec32s const widened = _mm512_cvtepi8_epi16(a);
#else
    // otherwise assemble the result from extend_low(a) and extend_high(a)
    Vec32s const widened(extend_low(a), extend_high(a));
#endif
    return widened;
}
// Zero extend: widen a Vec32uc into a Vec32us (256 -> 512 bits)
static inline Vec32us extend (Vec32uc const a) {
#if INSTRSET >= 10
    // direct 512-bit conversion intrinsic
    Vec32us const widened = _mm512_cvtepu8_epi16(a);
#else
    // otherwise assemble the result from extend_low(a) and extend_high(a)
    Vec32us const widened(extend_low(a), extend_high(a));
#endif
    return widened;
}
// Sign extend: widen a Vec16s into a Vec16i (256 -> 512 bits)
static inline Vec16i extend (Vec16s const a) {
    Vec16i const widened = _mm512_cvtepi16_epi32(a);
    return widened;
}
// Zero extend: widen a Vec16us into a Vec16ui (256 -> 512 bits)
static inline Vec16ui extend (Vec16us const a) {
    Vec16ui const widened = _mm512_cvtepu16_epi32(a);
    return widened;
}
// Sign extend: widen a Vec8i into a Vec8q (256 -> 512 bits)
static inline Vec8q extend (Vec8i const a) {
    Vec8q const widened = _mm512_cvtepi32_epi64(a);
    return widened;
}
// Zero extend: widen a Vec8ui into a Vec8uq (256 -> 512 bits)
static inline Vec8uq extend (Vec8ui const a) {
    Vec8uq const widened = _mm512_cvtepu32_epi64(a);
    return widened;
}
#else // no AVX512. 512 bit vectors are emulated
// Sign extend Vec32c -> Vec32s without AVX512: the result is assembled
// from extend_low(a) and extend_high(a)
static inline Vec32s extend (Vec32c const a) {
    Vec32s widened(extend_low(a), extend_high(a));
    return widened;
}
// Zero extend Vec32uc -> Vec32us without AVX512: the result is assembled
// from extend_low(a) and extend_high(a)
static inline Vec32us extend (Vec32uc const a) {
    Vec32us widened(extend_low(a), extend_high(a));
    return widened;
}
// Sign extend Vec16s -> Vec16i without AVX512: the result is assembled
// from extend_low(a) and extend_high(a)
static inline Vec16i extend (Vec16s const a) {
    Vec16i widened(extend_low(a), extend_high(a));
    return widened;
}
// Zero extend Vec16us -> Vec16ui without AVX512: the result is assembled
// from extend_low(a) and extend_high(a)
static inline Vec16ui extend (Vec16us const a) {
    Vec16ui widened(extend_low(a), extend_high(a));
    return widened;
}
// Sign extend Vec8i -> Vec8q without AVX512: the result is assembled
// from extend_low(a) and extend_high(a)
static inline Vec8q extend (Vec8i const a) {
    Vec8q widened(extend_low(a), extend_high(a));
    return widened;
}
// Zero extend Vec8ui -> Vec8uq without AVX512: the result is assembled
// from extend_low(a) and extend_high(a)
static inline Vec8uq extend (Vec8ui const a) {
    Vec8uq widened(extend_low(a), extend_high(a));
    return widened;
}
#endif // AVX512
/*****************************************************************************
*
* Reduce from 512 to 256 bit vectors
*
*****************************************************************************/
#if INSTRSET >= 9 // AVX512F
// compress functions. overflow wraps around
// Compress a Vec32s into a Vec32c (512 -> 256 bits). Overflow wraps around
static inline Vec32c compress (Vec32s const a) {
#if INSTRSET >= 10 // AVX512BW
    Vec32c const narrowed = _mm512_cvtepi16_epi8(a);
#else
    // otherwise pass both halves of a to the two-argument compress
    Vec32c const narrowed = compress(a.get_low(), a.get_high());
#endif
    return narrowed;
}
// Compress a Vec32us into a Vec32uc (512 -> 256 bits). Overflow wraps around.
// Delegates to the Vec32s overload of compress and converts the result
static inline Vec32uc compress (Vec32us const a) {
    Vec32uc narrowed(compress(Vec32s(a)));
    return narrowed;
}
// Compress a Vec16i into a Vec16s (512 -> 256 bits). Overflow wraps around
static inline Vec16s compress (Vec16i const a) {
    Vec16s const narrowed = _mm512_cvtepi32_epi16(a);
    return narrowed;
}
// Compress a Vec16ui into a Vec16us (512 -> 256 bits). Overflow wraps around
static inline Vec16us compress (Vec16ui const a) {
    Vec16us const narrowed = _mm512_cvtepi32_epi16(a);
    return narrowed;
}
// Compress a Vec8q into a Vec8i (512 -> 256 bits). Overflow wraps around
static inline Vec8i compress (Vec8q const a) {
    Vec8i const narrowed = _mm512_cvtepi64_epi32(a);
    return narrowed;
}
// Compress a Vec8uq into a Vec8ui (512 -> 256 bits). Overflow wraps around
static inline Vec8ui compress (Vec8uq const a) {
    Vec8ui const narrowed = _mm512_cvtepi64_epi32(a);
    return narrowed;
}
#else // no AVX512
// compress functions. overflow wraps around
// Compress a Vec32s into a Vec32c by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec32c compress (Vec32s const a) {
    Vec32c const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec32us into a Vec32uc by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec32uc compress (Vec32us const a) {
    Vec32uc const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec16i into a Vec16s by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec16s compress (Vec16i const a) {
    Vec16s const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec16ui into a Vec16us by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec16us compress (Vec16ui const a) {
    Vec16us const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec8q into a Vec8i by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec8i compress (Vec8q const a) {
    Vec8i const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
// Compress a Vec8uq into a Vec8ui by passing both halves of a to the
// two-argument compress. Overflow wraps around
static inline Vec8ui compress (Vec8uq const a) {
    Vec8ui const narrowed = compress(a.get_low(), a.get_high());
    return narrowed;
}
#endif // AVX512
/*****************************************************************************
*
* Conversions between float and double
*
*****************************************************************************/
#if INSTRSET >= 9 // AVX512. 512 bit float vectors
// Convert a Vec8f to a Vec8d (float -> double)
static inline Vec8d to_double (Vec8f const a) {
    Vec8d const as_double = _mm512_cvtps_pd(a);
    return as_double;
}
// Convert a Vec8d to a Vec8f (double -> float)
static inline Vec8f to_float (Vec8d const a) {
    Vec8f const as_float = _mm512_cvtpd_ps(a);
    return as_float;
}
#else // no AVX512. 512 bit float vectors are emulated
// Convert a Vec8f to a Vec8d (float -> double) by converting each
// half of a with the Vec4f overload of to_double
static inline Vec8d to_double (Vec8f const a) {
    return Vec8d(to_double(a.get_low()), to_double(a.get_high()));
}
// Convert a Vec8d to a Vec8f (double -> float) by converting each
// half of a with the Vec4d overload of to_float
static inline Vec8f to_float (Vec8d const a) {
    return Vec8f(to_float(a.get_low()), to_float(a.get_high()));
}
#endif
#endif // MAX_VECTOR_SIZE >= 512
// Convert a Vec2d to float. The result is returned in a Vec4f
static inline Vec4f to_float (Vec2d const a) {
    Vec4f const as_float = _mm_cvtpd_ps(a);
    return as_float;
}
#ifdef VCL_NAMESPACE
}
#endif
#endif // VECTOR_CONVERT_H