diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index b5c6da832..8bec4558d 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -14376,7 +14376,7 @@ common->compiler = compiler; /* Main pcre2_jit_exec entry. */ SLJIT_ASSERT((private_data_size & (sizeof(sljit_sw) - 1)) == 0); -sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, W), 5, 5, SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS, 0, private_data_size); +sljit_emit_enter(compiler, 0, SLJIT_ARGS1(W, W), 5 | SLJIT_ENTER_FLOAT(SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS), 5, private_data_size); /* Register init. */ reset_ovector(common, (re->top_bracket + 1) * 2); diff --git a/src/pcre2_jit_simd_inc.h b/src/pcre2_jit_simd_inc.h index 502977fc3..50d5394e3 100644 --- a/src/pcre2_jit_simd_inc.h +++ b/src/pcre2_jit_simd_inc.h @@ -668,7 +668,7 @@ for (i = 0; i < 4; i++) fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind); } -sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1); +sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1, 0); sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0); /* Ignore matches before the first STR_PTR. 
*/ @@ -696,7 +696,7 @@ for (i = 0; i < 4; i++) fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind); } -sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1); +sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1, 0); sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0); CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start); diff --git a/src/sljit/sljitConfigInternal.h b/src/sljit/sljitConfigInternal.h index de06dd8e0..12efc350a 100644 --- a/src/sljit/sljitConfigInternal.h +++ b/src/sljit/sljitConfigInternal.h @@ -70,10 +70,16 @@ extern "C" { SLJIT_NUMBER_OF_SCRATCH_REGISTERS : number of available scratch registers SLJIT_NUMBER_OF_SAVED_REGISTERS : number of available saved registers SLJIT_NUMBER_OF_FLOAT_REGISTERS : number of available floating point registers - SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS : number of available floating point scratch registers - SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS : number of available floating point saved registers + SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS : number of available scratch floating point registers + SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS : number of available saved floating point registers + SLJIT_NUMBER_OF_VECTOR_REGISTERS : number of available vector registers + SLJIT_NUMBER_OF_SCRATCH_VECTOR_REGISTERS : number of available scratch vector registers + SLJIT_NUMBER_OF_SAVED_VECTOR_REGISTERS : number of available saved vector registers SLJIT_NUMBER_OF_TEMPORARY_REGISTERS : number of available temporary registers SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS : number of available temporary floating point registers + SLJIT_NUMBER_OF_TEMPORARY_VECTOR_REGISTERS : number of available temporary vector registers + SLJIT_SEPARATE_VECTOR_REGISTERS : if this macro is defined, the vector registers do not + overlap with floating point registers 
SLJIT_WORD_SHIFT : the shift required to apply when accessing a sljit_sw/sljit_uw array by index SLJIT_F32_SHIFT : the shift required to apply when accessing a single precision floating point array by index @@ -98,10 +104,13 @@ extern "C" { SLJIT_TMP_R(i) : accessing temporary registers SLJIT_TMP_FR0 .. FR9 : accessing temporary floating point registers SLJIT_TMP_FR(i) : accessing temporary floating point registers + SLJIT_TMP_VR0 .. VR9 : accessing temporary vector registers + SLJIT_TMP_VR(i) : accessing temporary vector registers SLJIT_TMP_DEST_REG : a temporary register for results SLJIT_TMP_MEM_REG : a temporary base register for accessing memory (can be the same as SLJIT_TMP_DEST_REG) SLJIT_TMP_DEST_FREG : a temporary register for float results + SLJIT_TMP_DEST_VREG : a temporary register for vector results SLJIT_FUNC : calling convention attribute for both calling JIT from C and C calling back from JIT SLJIT_W(number) : defining 64 bit constants on 64 bit architectures (platform independent helper) SLJIT_F64_SECOND(reg) : provides the register index of the second 32 bit part of a 64 bit @@ -553,7 +562,7 @@ determine the next executed instruction after return. */ #if (defined SLJIT_EXECUTABLE_ALLOCATOR && SLJIT_EXECUTABLE_ALLOCATOR) SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size); SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr); -SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void); +/* Note: sljitLir.h also defines sljit_free_unused_memory_exec() function. 
*/ #define SLJIT_BUILTIN_MALLOC_EXEC(size, exec_allocator_data) sljit_malloc_exec(size) #define SLJIT_BUILTIN_FREE_EXEC(ptr, exec_allocator_data) sljit_free_exec(ptr) @@ -591,7 +600,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void *code); #define SLJIT_TMP_DEST_REG SLJIT_TMP_R0 #define SLJIT_TMP_MEM_REG SLJIT_TMP_R0 #define SLJIT_TMP_DEST_FREG SLJIT_TMP_FR0 -#define SLJIT_LOCALS_OFFSET_BASE (8 * SSIZE_OF(sw)) +#define SLJIT_LOCALS_OFFSET_BASE (8 * (sljit_s32)sizeof(sljit_sw)) #define SLJIT_PREF_SHIFT_REG SLJIT_R2 #define SLJIT_MASKED_SHIFT 1 #define SLJIT_MASKED_SHIFT32 1 @@ -609,7 +618,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void *code); #else /* _WIN64 */ #define SLJIT_NUMBER_OF_SAVED_REGISTERS 8 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 10 -#define SLJIT_LOCALS_OFFSET_BASE (4 * SSIZE_OF(sw)) +#define SLJIT_LOCALS_OFFSET_BASE (4 * (sljit_s32)sizeof(sljit_sw)) #endif /* !_WIN64 */ #define SLJIT_TMP_DEST_REG SLJIT_TMP_R0 #define SLJIT_TMP_MEM_REG SLJIT_TMP_R0 @@ -695,9 +704,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void *code); #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 12 #define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2 +#define SLJIT_SEPARATE_VECTOR_REGISTERS 1 +#define SLJIT_NUMBER_OF_VECTOR_REGISTERS 30 +#define SLJIT_NUMBER_OF_SAVED_VECTOR_REGISTERS 0 +#define SLJIT_NUMBER_OF_TEMPORARY_VECTOR_REGISTERS 2 #define SLJIT_TMP_DEST_REG SLJIT_TMP_R1 #define SLJIT_TMP_MEM_REG SLJIT_TMP_R1 #define SLJIT_TMP_DEST_FREG SLJIT_TMP_FR0 +#define SLJIT_TMP_DEST_VREG SLJIT_TMP_VR0 #define SLJIT_LOCALS_OFFSET_BASE 0 #define SLJIT_MASKED_SHIFT 1 #define SLJIT_MASKED_SHIFT32 1 @@ -768,6 +782,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void *code); #endif +#if !(defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) +#define SLJIT_NUMBER_OF_VECTOR_REGISTERS (SLJIT_NUMBER_OF_FLOAT_REGISTERS) +#define SLJIT_NUMBER_OF_SAVED_VECTOR_REGISTERS 
(SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS) +#define SLJIT_NUMBER_OF_TEMPORARY_VECTOR_REGISTERS (SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS) +#define SLJIT_TMP_DEST_VREG (SLJIT_TMP_DEST_FREG) +#endif /* !SLJIT_SEPARATE_VECTOR_REGISTERS */ + #define SLJIT_LOCALS_OFFSET (SLJIT_LOCALS_OFFSET_BASE) #define SLJIT_NUMBER_OF_SCRATCH_REGISTERS \ @@ -776,12 +797,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void *code); #define SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS \ (SLJIT_NUMBER_OF_FLOAT_REGISTERS - SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS) +#define SLJIT_NUMBER_OF_SCRATCH_VECTOR_REGISTERS \ + (SLJIT_NUMBER_OF_VECTOR_REGISTERS - SLJIT_NUMBER_OF_SAVED_VECTOR_REGISTERS) + /**********************************/ /* Temporary register management. */ /**********************************/ #define SLJIT_TMP_REGISTER_BASE (SLJIT_NUMBER_OF_REGISTERS + 2) #define SLJIT_TMP_FREGISTER_BASE (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) +#define SLJIT_TMP_VREGISTER_BASE (SLJIT_NUMBER_OF_VECTOR_REGISTERS + 1) /* WARNING: Accessing temporary registers is not recommended, because they are also used by the JIT compiler for various computations. 
Using them @@ -815,6 +840,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void *code); #define SLJIT_TMP_FR9 (SLJIT_TMP_FREGISTER_BASE + 9) #define SLJIT_TMP_FR(i) (SLJIT_TMP_FREGISTER_BASE + (i)) +#define SLJIT_TMP_VR0 (SLJIT_TMP_VREGISTER_BASE + 0) +#define SLJIT_TMP_VR1 (SLJIT_TMP_VREGISTER_BASE + 1) +#define SLJIT_TMP_VR2 (SLJIT_TMP_VREGISTER_BASE + 2) +#define SLJIT_TMP_VR3 (SLJIT_TMP_VREGISTER_BASE + 3) +#define SLJIT_TMP_VR4 (SLJIT_TMP_VREGISTER_BASE + 4) +#define SLJIT_TMP_VR5 (SLJIT_TMP_VREGISTER_BASE + 5) +#define SLJIT_TMP_VR6 (SLJIT_TMP_VREGISTER_BASE + 6) +#define SLJIT_TMP_VR7 (SLJIT_TMP_VREGISTER_BASE + 7) +#define SLJIT_TMP_VR8 (SLJIT_TMP_VREGISTER_BASE + 8) +#define SLJIT_TMP_VR9 (SLJIT_TMP_VREGISTER_BASE + 9) +#define SLJIT_TMP_VR(i) (SLJIT_TMP_VREGISTER_BASE + (i)) + /********************************/ /* CPU status flags management. */ /********************************/ diff --git a/src/sljit/sljitLir.c b/src/sljit/sljitLir.c index 2dca17cd6..ac726ccbe 100644 --- a/src/sljit/sljitLir.c +++ b/src/sljit/sljitLir.c @@ -96,9 +96,10 @@ /* All variable flags are even. */ #define VARIABLE_FLAG_MASK (0x3e << VARIABLE_FLAG_SHIFT) #define GET_FLAG_TYPE(op) ((op) >> VARIABLE_FLAG_SHIFT) +#define GET_FLAG_TYPE_MASK(op) (((op) >> VARIABLE_FLAG_SHIFT) & 0x3e) #define GET_OPCODE(op) \ - ((op) & ~(SLJIT_32 | SLJIT_SET_Z | VARIABLE_FLAG_MASK)) + ((op) & 0xff) #define HAS_FLAGS(op) \ ((op) & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)) @@ -139,7 +140,9 @@ #define REG_PAIR_SECOND(reg) ((reg) >> 8) /* Mask for sljit_emit_enter. */ -#define SLJIT_KEPT_SAVEDS_COUNT(options) ((options) & 0x3) +#define ENTER_GET_REGS(regs) ((regs) & 0xff) +#define ENTER_GET_FLOAT_REGS(regs) (((regs) >> 8) & 0xff) +#define SLJIT_KEPT_SAVEDS_COUNT(options) ((options) & 0x3) /* Getters for simd operations, which returns with log2(size). 
*/ #define SLJIT_SIMD_GET_OPCODE(type) ((type) & 0xff) @@ -753,17 +756,17 @@ static SLJIT_INLINE sljit_uw sljit_get_next_min(sljit_uw next_label_size, #endif /* !SLJIT_CONFIG_X86 */ static SLJIT_INLINE void set_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 args, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { SLJIT_UNUSED_ARG(args); SLJIT_UNUSED_ARG(local_size); compiler->options = options; - compiler->scratches = scratches; - compiler->saveds = saveds; - compiler->fscratches = fscratches; - compiler->fsaveds = fsaveds; + compiler->scratches = ENTER_GET_REGS(scratches); + compiler->saveds = ENTER_GET_REGS(saveds); + compiler->fscratches = ENTER_GET_FLOAT_REGS(scratches); + compiler->fsaveds = ENTER_GET_FLOAT_REGS(saveds); #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) compiler->last_return = args & SLJIT_ARG_MASK; compiler->logical_local_size = local_size; @@ -771,17 +774,17 @@ static SLJIT_INLINE void set_emit_enter(struct sljit_compiler *compiler, } static SLJIT_INLINE void set_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 args, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { SLJIT_UNUSED_ARG(args); SLJIT_UNUSED_ARG(local_size); compiler->options = options; - compiler->scratches = scratches; - compiler->saveds = saveds; - compiler->fscratches = fscratches; - compiler->fsaveds = fsaveds; + compiler->scratches = ENTER_GET_REGS(scratches); + compiler->saveds = ENTER_GET_REGS(saveds); + compiler->fscratches = ENTER_GET_FLOAT_REGS(scratches); + compiler->fsaveds = ENTER_GET_FLOAT_REGS(saveds); #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) compiler->last_return = args & 
SLJIT_ARG_MASK; compiler->logical_local_size = local_size; @@ -1079,57 +1082,73 @@ static void sljit_verbose_freg(struct sljit_compiler *compiler, sljit_s32 r) fprintf(compiler->verbose, "ft%d", r - SLJIT_TMP_FREGISTER_BASE); } +static void sljit_verbose_vreg(struct sljit_compiler *compiler, sljit_s32 r) +{ +#if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \ + || (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) + if (r >= SLJIT_F64_SECOND(SLJIT_VR0)) { + fprintf(compiler->verbose, "^"); + r -= SLJIT_F64_SECOND(0); + } +#endif /* SLJIT_CONFIG_ARM_32 || SLJIT_CONFIG_MIPS_32 */ + + if (r < (SLJIT_VR0 + compiler->fscratches)) + fprintf(compiler->verbose, "vr%d", r - SLJIT_VR0); + else if (r < SLJIT_TMP_VREGISTER_BASE) + fprintf(compiler->verbose, "vs%d", SLJIT_NUMBER_OF_VECTOR_REGISTERS - r); + else + fprintf(compiler->verbose, "vt%d", r - SLJIT_TMP_VREGISTER_BASE); +} + +static void sljit_verbose_mem(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i) +{ + if (!(p & REG_MASK)) { + fprintf(compiler->verbose, "[%" SLJIT_PRINT_D "d]", i); + return; + } + + fputc('[', compiler->verbose); + sljit_verbose_reg(compiler, (p) & REG_MASK); + if (p & OFFS_REG_MASK) { + fprintf(compiler->verbose, " + "); + sljit_verbose_reg(compiler, OFFS_REG(p)); + if (i) + fprintf(compiler->verbose, " * %d", 1 << (i)); + } else if (i) + fprintf(compiler->verbose, " + %" SLJIT_PRINT_D "d", (i)); + fputc(']', compiler->verbose); +} + static void sljit_verbose_param(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i) { - if ((p) == SLJIT_IMM) - fprintf(compiler->verbose, "#%" SLJIT_PRINT_D "d", (i)); - else if ((p) & SLJIT_MEM) { - if ((p) & REG_MASK) { - fputc('[', compiler->verbose); - sljit_verbose_reg(compiler, (p) & REG_MASK); - if ((p) & OFFS_REG_MASK) { - fprintf(compiler->verbose, " + "); - sljit_verbose_reg(compiler, OFFS_REG(p)); - if (i) - fprintf(compiler->verbose, " * %d", 1 << (i)); - } - else if (i) - fprintf(compiler->verbose, " + %" SLJIT_PRINT_D "d", 
(i)); - fputc(']', compiler->verbose); - } - else - fprintf(compiler->verbose, "[#%" SLJIT_PRINT_D "d]", (i)); - } else + if (p == SLJIT_IMM) + fprintf(compiler->verbose, "#%" SLJIT_PRINT_D "d", i); + else if (p & SLJIT_MEM) + sljit_verbose_mem(compiler, p, i); + else sljit_verbose_reg(compiler, p); } static void sljit_verbose_fparam(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i) { - if ((p) & SLJIT_MEM) { - if ((p) & REG_MASK) { - fputc('[', compiler->verbose); - sljit_verbose_reg(compiler, (p) & REG_MASK); - if ((p) & OFFS_REG_MASK) { - fprintf(compiler->verbose, " + "); - sljit_verbose_reg(compiler, OFFS_REG(p)); - if (i) - fprintf(compiler->verbose, "%d", 1 << (i)); - } - else if (i) - fprintf(compiler->verbose, " + %" SLJIT_PRINT_D "d", (i)); - fputc(']', compiler->verbose); - } - else - fprintf(compiler->verbose, "[#%" SLJIT_PRINT_D "d]", (i)); - } + if (p & SLJIT_MEM) + sljit_verbose_mem(compiler, p, i); else sljit_verbose_freg(compiler, p); } +static void sljit_verbose_vparam(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i) +{ + if (p & SLJIT_MEM) + sljit_verbose_mem(compiler, p, i); + else + sljit_verbose_vreg(compiler, p); +} + static const char* op0_names[] = { "breakpoint", "nop", "lmul.uw", "lmul.sw", "divmod.u", "divmod.s", "div.u", "div.s", - "endbr", "skip_frames_before_return" + "memory_barrier", "endbr", "skip_frames_before_return" }; static const char* op1_names[] = { @@ -1184,7 +1203,7 @@ static const char* fop2r_names[] = { }; static const char* simd_op2_names[] = { - "and", "or", "xor" + "and", "or", "xor", "shuffle" }; static const char* jump_names[] = { @@ -1224,6 +1243,7 @@ static const char* call_arg_names[] = { || (defined SLJIT_VERBOSE && SLJIT_VERBOSE) #define SLJIT_SKIP_CHECKS(compiler) (compiler)->skip_checks = 1 +#define SLJIT_CHECK_OPCODE(op, flags) ((op) & ~(SLJIT_32 | SLJIT_SET_Z | VARIABLE_FLAG_MASK | (flags))) static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_generate_code(struct sljit_compiler *compiler) { 
@@ -1252,9 +1272,15 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_generate_code(struct sljit_com #endif /* !SLJIT_CONFIG_X86 */ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { +#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) + sljit_s32 real_scratches = ENTER_GET_REGS(scratches); + sljit_s32 real_saveds = ENTER_GET_REGS(saveds); + sljit_s32 real_fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 real_fsaveds = ENTER_GET_FLOAT_REGS(saveds); +#endif SLJIT_UNUSED_ARG(compiler); #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -1264,15 +1290,17 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_enter(struct sljit_compil CHECK_ARGUMENT((options & ~SLJIT_ENTER_CPU_SPECIFIC_OPTIONS) == 0); } CHECK_ARGUMENT(SLJIT_KEPT_SAVEDS_COUNT(options) <= 3 && SLJIT_KEPT_SAVEDS_COUNT(options) <= saveds); - CHECK_ARGUMENT(scratches >= 0 && scratches <= SLJIT_NUMBER_OF_REGISTERS); - CHECK_ARGUMENT(saveds >= 0 && saveds <= SLJIT_NUMBER_OF_SAVED_REGISTERS); - CHECK_ARGUMENT(scratches + saveds <= SLJIT_NUMBER_OF_REGISTERS); - CHECK_ARGUMENT(fscratches >= 0 && fscratches <= SLJIT_NUMBER_OF_FLOAT_REGISTERS); - CHECK_ARGUMENT(fsaveds >= 0 && fsaveds <= SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS); - CHECK_ARGUMENT(fscratches + fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS); + CHECK_ARGUMENT((scratches & ~0xffff) == 0 && (saveds & ~0xffff) == 0); + CHECK_ARGUMENT(real_scratches >= 0 && real_scratches <= SLJIT_NUMBER_OF_REGISTERS); + CHECK_ARGUMENT(real_saveds >= 0 && real_saveds <= SLJIT_NUMBER_OF_SAVED_REGISTERS); + CHECK_ARGUMENT(real_scratches + real_saveds <= SLJIT_NUMBER_OF_REGISTERS); + CHECK_ARGUMENT(real_fscratches >= 0 && real_fscratches <= 
SLJIT_NUMBER_OF_FLOAT_REGISTERS); + CHECK_ARGUMENT(real_fsaveds >= 0 && real_fsaveds <= SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS); + CHECK_ARGUMENT(real_fscratches + real_fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS); CHECK_ARGUMENT(local_size >= 0 && local_size <= SLJIT_MAX_LOCAL_SIZE); CHECK_ARGUMENT((arg_types & SLJIT_ARG_FULL_MASK) <= SLJIT_ARG_TYPE_F32); - CHECK_ARGUMENT(function_check_arguments(arg_types, scratches, (options & SLJIT_ENTER_REG_ARG) ? 0 : saveds, fscratches)); + CHECK_ARGUMENT(function_check_arguments(arg_types, real_scratches, + (options & SLJIT_ENTER_REG_ARG) ? 0 : real_saveds, real_fscratches)); compiler->last_flags = 0; #endif @@ -1308,16 +1336,22 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_enter(struct sljit_compil #endif /* !SLJIT_CONFIG_X86 */ fprintf(compiler->verbose, " scratches:%d, saveds:%d, fscratches:%d, fsaveds:%d, local_size:%d\n", - scratches, saveds, fscratches, fsaveds, local_size); + ENTER_GET_REGS(scratches), ENTER_GET_REGS(saveds), ENTER_GET_FLOAT_REGS(scratches), ENTER_GET_FLOAT_REGS(saveds), local_size); } #endif CHECK_RETURN_OK; } static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { +#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) + sljit_s32 real_scratches = ENTER_GET_REGS(scratches); + sljit_s32 real_saveds = ENTER_GET_REGS(saveds); + sljit_s32 real_fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 real_fsaveds = ENTER_GET_FLOAT_REGS(saveds); +#endif SLJIT_UNUSED_ARG(compiler); #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -1327,15 +1361,17 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_set_context(struct sljit_compi CHECK_ARGUMENT((options & 
~SLJIT_ENTER_CPU_SPECIFIC_OPTIONS) == 0); } CHECK_ARGUMENT(SLJIT_KEPT_SAVEDS_COUNT(options) <= 3 && SLJIT_KEPT_SAVEDS_COUNT(options) <= saveds); - CHECK_ARGUMENT(scratches >= 0 && scratches <= SLJIT_NUMBER_OF_REGISTERS); - CHECK_ARGUMENT(saveds >= 0 && saveds <= SLJIT_NUMBER_OF_SAVED_REGISTERS); - CHECK_ARGUMENT(scratches + saveds <= SLJIT_NUMBER_OF_REGISTERS); - CHECK_ARGUMENT(fscratches >= 0 && fscratches <= SLJIT_NUMBER_OF_FLOAT_REGISTERS); - CHECK_ARGUMENT(fsaveds >= 0 && fsaveds <= SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS); - CHECK_ARGUMENT(fscratches + fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS); + CHECK_ARGUMENT((scratches & ~0xffff) == 0 && (saveds & ~0xffff) == 0); + CHECK_ARGUMENT(real_scratches >= 0 && real_scratches <= SLJIT_NUMBER_OF_REGISTERS); + CHECK_ARGUMENT(real_saveds >= 0 && real_saveds <= SLJIT_NUMBER_OF_SAVED_REGISTERS); + CHECK_ARGUMENT(real_scratches + real_saveds <= SLJIT_NUMBER_OF_REGISTERS); + CHECK_ARGUMENT(real_fscratches >= 0 && real_fscratches <= SLJIT_NUMBER_OF_FLOAT_REGISTERS); + CHECK_ARGUMENT(real_fsaveds >= 0 && real_fsaveds <= SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS); + CHECK_ARGUMENT(real_fscratches + real_fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS); CHECK_ARGUMENT(local_size >= 0 && local_size <= SLJIT_MAX_LOCAL_SIZE); CHECK_ARGUMENT((arg_types & SLJIT_ARG_FULL_MASK) < SLJIT_ARG_TYPE_F64); - CHECK_ARGUMENT(function_check_arguments(arg_types, scratches, (options & SLJIT_ENTER_REG_ARG) ? 0 : saveds, fscratches)); + CHECK_ARGUMENT(function_check_arguments(arg_types, real_scratches, + (options & SLJIT_ENTER_REG_ARG) ? 
0 : real_saveds, real_fscratches)); compiler->last_flags = 0; #endif @@ -1371,7 +1407,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_set_context(struct sljit_compi #endif /* !SLJIT_CONFIG_X86 */ fprintf(compiler->verbose, " scratches:%d, saveds:%d, fscratches:%d, fsaveds:%d, local_size:%d\n", - scratches, saveds, fscratches, fsaveds, local_size); + ENTER_GET_REGS(scratches), ENTER_GET_REGS(saveds), ENTER_GET_FLOAT_REGS(scratches), ENTER_GET_FLOAT_REGS(saveds), local_size); } #endif CHECK_RETURN_OK; @@ -1427,7 +1463,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_return(struct sljit_compi break; } - if (GET_OPCODE(op) < SLJIT_MOV_F64) { + if (SLJIT_CHECK_OPCODE(op, 0) < SLJIT_MOV_F64) { FUNCTION_CHECK_SRC(src, srcw); } else { FUNCTION_FCHECK(src, srcw, op & SLJIT_32); @@ -1471,9 +1507,9 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op0(struct sljit_compiler #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT((op >= SLJIT_BREAKPOINT && op <= SLJIT_LMUL_SW) || ((op & ~SLJIT_32) >= SLJIT_DIVMOD_UW && (op & ~SLJIT_32) <= SLJIT_DIV_SW) - || (op >= SLJIT_ENDBR && op <= SLJIT_SKIP_FRAMES_BEFORE_RETURN)); - CHECK_ARGUMENT(GET_OPCODE(op) < SLJIT_LMUL_UW || GET_OPCODE(op) >= SLJIT_ENDBR || compiler->scratches >= 2); - if ((GET_OPCODE(op) >= SLJIT_LMUL_UW && GET_OPCODE(op) <= SLJIT_DIV_SW) || op == SLJIT_SKIP_FRAMES_BEFORE_RETURN) + || (op >= SLJIT_MEMORY_BARRIER && op <= SLJIT_SKIP_FRAMES_BEFORE_RETURN)); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, 0) < SLJIT_LMUL_UW || SLJIT_CHECK_OPCODE(op, 0) >= SLJIT_MEMORY_BARRIER || compiler->scratches >= 2); + if ((SLJIT_CHECK_OPCODE(op, 0) >= SLJIT_LMUL_UW && SLJIT_CHECK_OPCODE(op, 0) <= SLJIT_DIV_SW) || op == SLJIT_SKIP_FRAMES_BEFORE_RETURN) compiler->last_flags = 0; #endif #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) @@ -1499,7 +1535,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op1(struct sljit_compiler } #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) - 
CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV && GET_OPCODE(op) <= SLJIT_REV_S32); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, 0) >= SLJIT_MOV && SLJIT_CHECK_OPCODE(op, 0) <= SLJIT_REV_S32); switch (GET_OPCODE(op)) { case SLJIT_MOV: @@ -1546,26 +1582,37 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_atomic_load(struct sljit_ #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_ATOMIC)); - CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV && GET_OPCODE(op) <= SLJIT_MOV_P); - CHECK_ARGUMENT(GET_OPCODE(op) != SLJIT_MOV_S8 && GET_OPCODE(op) != SLJIT_MOV_S16 && GET_OPCODE(op) != SLJIT_MOV_S32); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, SLJIT_ATOMIC_TEST | SLJIT_ATOMIC_USE_CAS | SLJIT_ATOMIC_USE_LS | SLJIT_SET_Z | VARIABLE_FLAG_MASK) >= SLJIT_MOV + && SLJIT_CHECK_OPCODE(op, SLJIT_ATOMIC_TEST | SLJIT_ATOMIC_USE_CAS | SLJIT_ATOMIC_USE_LS | SLJIT_SET_Z | VARIABLE_FLAG_MASK) <= SLJIT_MOV_P); + CHECK_ARGUMENT((op & (SLJIT_ATOMIC_USE_CAS | SLJIT_ATOMIC_USE_LS)) != (SLJIT_ATOMIC_USE_CAS | SLJIT_ATOMIC_USE_LS)); /* All arguments must be valid registers. */ CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(mem_reg) && !CHECK_IF_VIRTUAL_REGISTER(mem_reg)); - if (op == SLJIT_MOV32_U8 || op == SLJIT_MOV32_U16) { - /* Only SLJIT_32 is allowed. */ - CHECK_ARGUMENT(!(op & (VARIABLE_FLAG_MASK | SLJIT_SET_Z))); - } else { + if (GET_OPCODE(op) < SLJIT_MOV_U8 || GET_OPCODE(op) > SLJIT_MOV_S16) { /* Nothing allowed. */ - CHECK_ARGUMENT(!(op & (SLJIT_32 | SLJIT_SET_Z | VARIABLE_FLAG_MASK))); + CHECK_ARGUMENT(!(op & SLJIT_32)); } compiler->last_flags = 0; #endif /* SLJIT_ARGUMENT_CHECKS */ #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) if (SLJIT_UNLIKELY(!!compiler->verbose)) { - fprintf(compiler->verbose, " atomic_load%s%s ", !(op & SLJIT_32) ? 
"" : "32", + if (op & SLJIT_ATOMIC_TEST) + CHECK_RETURN_OK; + if (sljit_emit_atomic_load(compiler, op | SLJIT_ATOMIC_TEST, dst_reg, mem_reg)) { + fprintf(compiler->verbose, " # atomic_load: unsupported form, no instructions are emitted\n"); + CHECK_RETURN_OK; + } + + fprintf(compiler->verbose, " atomic_load"); + if (op & SLJIT_ATOMIC_USE_CAS) + fprintf(compiler->verbose, "_cas"); + if (op & SLJIT_ATOMIC_USE_LS) + fprintf(compiler->verbose, "_ls"); + + fprintf(compiler->verbose, "%s%s ", !(op & SLJIT_32) ? "" : "32", op1_types[GET_OPCODE(op) - SLJIT_OP1_BASE]); sljit_verbose_reg(compiler, dst_reg); fprintf(compiler->verbose, ", ["); @@ -1588,29 +1635,40 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_atomic_store(struct sljit #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_ATOMIC)); - CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV && GET_OPCODE(op) <= SLJIT_MOV_P); - CHECK_ARGUMENT(GET_OPCODE(op) != SLJIT_MOV_S8 && GET_OPCODE(op) != SLJIT_MOV_S16 && GET_OPCODE(op) != SLJIT_MOV_S32); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, SLJIT_ATOMIC_TEST | SLJIT_ATOMIC_USE_CAS | SLJIT_ATOMIC_USE_LS | SLJIT_SET_Z) >= SLJIT_MOV + && SLJIT_CHECK_OPCODE(op, SLJIT_ATOMIC_TEST | SLJIT_ATOMIC_USE_CAS | SLJIT_ATOMIC_USE_LS | SLJIT_SET_Z) <= SLJIT_MOV_P); + CHECK_ARGUMENT((op & (SLJIT_ATOMIC_USE_CAS | SLJIT_ATOMIC_USE_LS)) != (SLJIT_ATOMIC_USE_CAS | SLJIT_ATOMIC_USE_LS)); /* All arguments must be valid registers. 
*/ CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(src_reg)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(mem_reg) && !CHECK_IF_VIRTUAL_REGISTER(mem_reg)); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(temp_reg) && src_reg != temp_reg); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(temp_reg) && (src_reg != temp_reg || (op & SLJIT_ATOMIC_USE_LS))); - CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK) || GET_FLAG_TYPE(op) == SLJIT_ATOMIC_STORED); + CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK) || GET_FLAG_TYPE_MASK(op) == SLJIT_ATOMIC_STORED); - if (GET_OPCODE(op) == SLJIT_MOV_U8 || GET_OPCODE(op) == SLJIT_MOV_U16) { - /* Only SLJIT_32, SLJIT_ATOMIC_STORED are allowed. */ - CHECK_ARGUMENT(!(op & SLJIT_SET_Z)); - } else { - /* Only SLJIT_ATOMIC_STORED is allowed. */ - CHECK_ARGUMENT(!(op & (SLJIT_32 | SLJIT_SET_Z))); + if (GET_OPCODE(op) < SLJIT_MOV_U8 || GET_OPCODE(op) > SLJIT_MOV_S16) { + /* Nothing allowed. */ + CHECK_ARGUMENT(!(op & SLJIT_32)); } - compiler->last_flags = GET_FLAG_TYPE(op) | (op & SLJIT_32); + compiler->last_flags = GET_FLAG_TYPE_MASK(op) | (op & SLJIT_32); #endif /* SLJIT_ARGUMENT_CHECKS */ #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) if (SLJIT_UNLIKELY(!!compiler->verbose)) { - fprintf(compiler->verbose, " atomic_store%s%s%s ", !(op & SLJIT_32) ? "" : "32", + if (op & SLJIT_ATOMIC_TEST) + CHECK_RETURN_OK; + if (sljit_emit_atomic_store(compiler, op | SLJIT_ATOMIC_TEST, src_reg, mem_reg, temp_reg)) { + fprintf(compiler->verbose, " # atomic_store: unsupported form, no instructions are emitted\n"); + CHECK_RETURN_OK; + } + + fprintf(compiler->verbose, " atomic_store"); + if (op & SLJIT_ATOMIC_USE_CAS) + fprintf(compiler->verbose, "_cas"); + if (op & SLJIT_ATOMIC_USE_LS) + fprintf(compiler->verbose, "_ls"); + + fprintf(compiler->verbose, "%s%s%s ", !(op & SLJIT_32) ? "" : "32", op1_types[GET_OPCODE(op) - SLJIT_OP1_BASE], !(op & VARIABLE_FLAG_MASK) ? 
"" : ".stored"); sljit_verbose_reg(compiler, src_reg); fprintf(compiler->verbose, ", ["); @@ -1634,7 +1692,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op2(struct sljit_compiler } #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) - CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_ADD && GET_OPCODE(op) <= SLJIT_ROTR); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, 0) >= SLJIT_ADD && SLJIT_CHECK_OPCODE(op, 0) <= SLJIT_ROTR); switch (GET_OPCODE(op)) { case SLJIT_AND: @@ -1741,8 +1799,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_shift_into(struct sljit_c sljit_s32 src3, sljit_sw src3w) { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) - CHECK_ARGUMENT(GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_LSHR - || GET_OPCODE(op) == SLJIT_MSHL || GET_OPCODE(op) == SLJIT_MLSHR); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, 0) == SLJIT_SHL || SLJIT_CHECK_OPCODE(op, 0) == SLJIT_LSHR + || SLJIT_CHECK_OPCODE(op, 0) == SLJIT_MSHL || SLJIT_CHECK_OPCODE(op, 0) == SLJIT_MLSHR); CHECK_ARGUMENT((op & ~(0xff | SLJIT_32 | SLJIT_SHIFT_INTO_NON_ZERO)) == 0); CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(src1_reg)); @@ -1876,7 +1934,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fop1(struct sljit_compile #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU)); - CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV_F64 && GET_OPCODE(op) <= SLJIT_ABS_F64); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, 0) >= SLJIT_MOV_F64 && SLJIT_CHECK_OPCODE(op, 0) <= SLJIT_ABS_F64); CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK))); FUNCTION_FCHECK(src, srcw, op & SLJIT_32); FUNCTION_FCHECK(dst, dstw, op & SLJIT_32); @@ -1914,7 +1972,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fop1_cmp(struct sljit_com #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU)); - CHECK_ARGUMENT(GET_OPCODE(op) == 
SLJIT_CMP_F64); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, 0) == SLJIT_CMP_F64); CHECK_ARGUMENT(!(op & SLJIT_SET_Z)); CHECK_ARGUMENT((op & VARIABLE_FLAG_MASK) || (GET_FLAG_TYPE(op) >= SLJIT_F_EQUAL && GET_FLAG_TYPE(op) <= SLJIT_ORDERED_LESS_EQUAL)); @@ -2007,7 +2065,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fop2(struct sljit_compile #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU)); - CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_ADD_F64 && GET_OPCODE(op) <= SLJIT_DIV_F64); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, 0) >= SLJIT_ADD_F64 && SLJIT_CHECK_OPCODE(op, 0) <= SLJIT_DIV_F64); CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK))); FUNCTION_FCHECK(src1, src1w, op & SLJIT_32); FUNCTION_FCHECK(src2, src2w, op & SLJIT_32); @@ -2034,7 +2092,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fop2r(struct sljit_compil { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU)); - CHECK_ARGUMENT(GET_OPCODE(op) == SLJIT_COPYSIGN_F64); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, 0) == SLJIT_COPYSIGN_F64); FUNCTION_FCHECK(src1, src1w, op & SLJIT_32); FUNCTION_FCHECK(src2, src2w, op & SLJIT_32); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg, op & SLJIT_32)); @@ -2106,7 +2164,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fcopy(struct sljit_compil { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU)); - CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_COPY_TO_F64 && GET_OPCODE(op) <= SLJIT_COPY_FROM_F64); + CHECK_ARGUMENT(SLJIT_CHECK_OPCODE(op, 0) >= SLJIT_COPY_TO_F64 && SLJIT_CHECK_OPCODE(op, 0) <= SLJIT_COPY_FROM_F64); CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK))); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, op & SLJIT_32)); @@ -2383,7 +2441,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_flags(struct sljit_com #if (defined SLJIT_ARGUMENT_CHECKS 
&& SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(type >= SLJIT_EQUAL && type <= SLJIT_ORDERED_LESS_EQUAL); CHECK_ARGUMENT(op == SLJIT_MOV || op == SLJIT_MOV32 - || (GET_OPCODE(op) >= SLJIT_AND && GET_OPCODE(op) <= SLJIT_XOR)); + || (SLJIT_CHECK_OPCODE(op, 0) >= SLJIT_AND && SLJIT_CHECK_OPCODE(op, 0) <= SLJIT_XOR)); CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK)); if (type <= SLJIT_NOT_ZERO) @@ -2700,7 +2758,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fmem_update(struct sljit_ } static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 srcdst, sljit_sw srcdstw) { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -2709,14 +2767,14 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_mov(struct sljit_com CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) <= SLJIT_SIMD_GET_REG_SIZE(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) <= (srcdst & SLJIT_MEM) ? 
SLJIT_SIMD_GET_REG_SIZE(type) : 0); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(vreg, 0)); FUNCTION_FCHECK(srcdst, srcdstw, 0); #endif #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) if (SLJIT_UNLIKELY(!!compiler->verbose)) { if (type & SLJIT_SIMD_TEST) CHECK_RETURN_OK; - if (sljit_emit_simd_mov(compiler, type | SLJIT_SIMD_TEST, freg, srcdst, srcdstw) == SLJIT_ERR_UNSUPPORTED) { + if (sljit_emit_simd_mov(compiler, type | SLJIT_SIMD_TEST, vreg, srcdst, srcdstw) == SLJIT_ERR_UNSUPPORTED) { fprintf(compiler->verbose, " # simd_mem: unsupported form, no instructions are emitted\n"); CHECK_RETURN_OK; } @@ -2732,9 +2790,9 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_mov(struct sljit_com else fprintf(compiler->verbose, ".al%d ", (8 << SLJIT_SIMD_GET_ELEM2_SIZE(type))); - sljit_verbose_freg(compiler, freg); + sljit_verbose_vreg(compiler, vreg); fprintf(compiler->verbose, ", "); - sljit_verbose_fparam(compiler, srcdst, srcdstw); + sljit_verbose_vparam(compiler, srcdst, srcdstw); fprintf(compiler->verbose, "\n"); } #endif @@ -2742,7 +2800,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_mov(struct sljit_com } static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -2750,7 +2808,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_replicate(struct slj CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) == 0); CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(vreg, 0)); if (type & SLJIT_SIMD_FLOAT) { if (src == SLJIT_IMM) { @@ -2766,7 +2824,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_replicate(struct slj if 
(SLJIT_UNLIKELY(!!compiler->verbose)) { if (type & SLJIT_SIMD_TEST) CHECK_RETURN_OK; - if (sljit_emit_simd_replicate(compiler, type | SLJIT_SIMD_TEST, freg, src, srcw) == SLJIT_ERR_UNSUPPORTED) { + if (sljit_emit_simd_replicate(compiler, type | SLJIT_SIMD_TEST, vreg, src, srcw) == SLJIT_ERR_UNSUPPORTED) { fprintf(compiler->verbose, " # simd_dup: unsupported form, no instructions are emitted\n"); CHECK_RETURN_OK; } @@ -2776,7 +2834,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_replicate(struct slj (type & SLJIT_SIMD_FLOAT) ? "f" : "", (8 << SLJIT_SIMD_GET_ELEM_SIZE(type))); - sljit_verbose_freg(compiler, freg); + sljit_verbose_vreg(compiler, vreg); fprintf(compiler->verbose, ", "); if (type & SLJIT_SIMD_FLOAT) sljit_verbose_fparam(compiler, src, srcw); @@ -2789,7 +2847,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_replicate(struct slj } static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 vreg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw) { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -2801,7 +2859,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_mov(struct slji CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); CHECK_ARGUMENT(!(type & SLJIT_32) || SLJIT_SIMD_GET_ELEM_SIZE(type) <= 2); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(vreg, 0)); CHECK_ARGUMENT(lane_index >= 0 && lane_index < (1 << (SLJIT_SIMD_GET_REG_SIZE(type) - SLJIT_SIMD_GET_ELEM_SIZE(type)))); if (type & SLJIT_SIMD_FLOAT) { @@ -2814,7 +2872,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_mov(struct slji if (SLJIT_UNLIKELY(!!compiler->verbose)) { if (type & SLJIT_SIMD_TEST) CHECK_RETURN_OK; - if (sljit_emit_simd_lane_mov(compiler, type | SLJIT_SIMD_TEST, 
freg, lane_index, srcdst, srcdstw) == SLJIT_ERR_UNSUPPORTED) { + if (sljit_emit_simd_lane_mov(compiler, type | SLJIT_SIMD_TEST, vreg, lane_index, srcdst, srcdstw) == SLJIT_ERR_UNSUPPORTED) { fprintf(compiler->verbose, " # simd_move_lane: unsupported form, no instructions are emitted\n"); CHECK_RETURN_OK; } @@ -2828,7 +2886,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_mov(struct slji (type & SLJIT_SIMD_FLOAT) ? "f" : "", (8 << SLJIT_SIMD_GET_ELEM_SIZE(type))); - sljit_verbose_freg(compiler, freg); + sljit_verbose_vreg(compiler, vreg); fprintf(compiler->verbose, "[%d], ", lane_index); if (type & SLJIT_SIMD_FLOAT) sljit_verbose_fparam(compiler, srcdst, srcdstw); @@ -2841,7 +2899,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_mov(struct slji } static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_s32 src_lane_index) { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -2849,7 +2907,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_replicate(struc CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) == 0); CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(vreg, 0)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src, 0)); CHECK_ARGUMENT(src_lane_index >= 0 && src_lane_index < (1 << (SLJIT_SIMD_GET_REG_SIZE(type) - SLJIT_SIMD_GET_ELEM_SIZE(type)))); #endif @@ -2857,7 +2915,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_replicate(struc if (SLJIT_UNLIKELY(!!compiler->verbose)) { if (type & SLJIT_SIMD_TEST) CHECK_RETURN_OK; - if (sljit_emit_simd_lane_replicate(compiler, type | SLJIT_SIMD_TEST, freg, src, src_lane_index) == SLJIT_ERR_UNSUPPORTED) { + if (sljit_emit_simd_lane_replicate(compiler, type 
| SLJIT_SIMD_TEST, vreg, src, src_lane_index) == SLJIT_ERR_UNSUPPORTED) { fprintf(compiler->verbose, " # simd_lane_replicate: unsupported form, no instructions are emitted\n"); CHECK_RETURN_OK; } @@ -2867,9 +2925,9 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_replicate(struc (type & SLJIT_SIMD_FLOAT) ? "f" : "", (8 << SLJIT_SIMD_GET_ELEM_SIZE(type))); - sljit_verbose_freg(compiler, freg); + sljit_verbose_vreg(compiler, vreg); fprintf(compiler->verbose, ", "); - sljit_verbose_freg(compiler, src); + sljit_verbose_vreg(compiler, src); fprintf(compiler->verbose, "[%d]\n", src_lane_index); } #endif @@ -2877,7 +2935,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_replicate(struc } static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -2887,14 +2945,14 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_extend(struct sljit_ CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_ELEM2_SIZE(type)); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(vreg, 0)); FUNCTION_FCHECK(src, srcw, SLJIT_SIMD_GET_ELEM_SIZE(type) == 2); #endif #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) if (SLJIT_UNLIKELY(!!compiler->verbose)) { if (type & SLJIT_SIMD_TEST) CHECK_RETURN_OK; - if (sljit_emit_simd_extend(compiler, type | SLJIT_SIMD_TEST, freg, src, srcw) == SLJIT_ERR_UNSUPPORTED) { + if (sljit_emit_simd_extend(compiler, type | SLJIT_SIMD_TEST, vreg, src, srcw) == SLJIT_ERR_UNSUPPORTED) { fprintf(compiler->verbose, " # simd_extend: unsupported form, no instructions are emitted\n"); CHECK_RETURN_OK; } @@ -2907,9 +2965,9 @@ static SLJIT_INLINE CHECK_RETURN_TYPE 
check_sljit_emit_simd_extend(struct sljit_ (type & SLJIT_SIMD_FLOAT) ? "f" : "", (8 << SLJIT_SIMD_GET_ELEM_SIZE(type))); - sljit_verbose_freg(compiler, freg); + sljit_verbose_vreg(compiler, vreg); fprintf(compiler->verbose, ", "); - sljit_verbose_fparam(compiler, src, srcw); + sljit_verbose_vparam(compiler, src, srcw); fprintf(compiler->verbose, "\n"); } #endif @@ -2917,7 +2975,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_extend(struct sljit_ } static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 dst, sljit_sw dstw) { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -2925,14 +2983,14 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_co CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(SLJIT_32)) == SLJIT_SIMD_STORE); CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(vreg, 0)); FUNCTION_CHECK_DST(dst, dstw); #endif #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) if (SLJIT_UNLIKELY(!!compiler->verbose)) { if (type & SLJIT_SIMD_TEST) CHECK_RETURN_OK; - if (sljit_emit_simd_sign(compiler, type | SLJIT_SIMD_TEST, freg, dst, dstw) == SLJIT_ERR_UNSUPPORTED) { + if (sljit_emit_simd_sign(compiler, type | SLJIT_SIMD_TEST, vreg, dst, dstw) == SLJIT_ERR_UNSUPPORTED) { fprintf(compiler->verbose, " # simd_sign: unsupported form, no instructions are emitted\n"); CHECK_RETURN_OK; } @@ -2943,7 +3001,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_co (type & SLJIT_SIMD_FLOAT) ? 
"f" : "", (8 << SLJIT_SIMD_GET_ELEM_SIZE(type))); - sljit_verbose_freg(compiler, freg); + sljit_verbose_vreg(compiler, vreg); fprintf(compiler->verbose, ", "); sljit_verbose_param(compiler, dst, dstw); fprintf(compiler->verbose, "\n"); @@ -2953,37 +3011,43 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_co } static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w) { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD)); - CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) >= SLJIT_SIMD_OP2_AND && (type & SLJIT_SIMD_TYPE_MASK(0)) <= SLJIT_SIMD_OP2_XOR); + CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK2(0)) >= SLJIT_SIMD_OP2_AND && (type & SLJIT_SIMD_TYPE_MASK2(0)) <= SLJIT_SIMD_OP2_SHUFFLE); CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) <= SLJIT_SIMD_GET_REG_SIZE(type)); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg, 0)); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src1_freg, 0)); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src2_freg, 0)); + CHECK_ARGUMENT(SLJIT_SIMD_GET_OPCODE(type) != SLJIT_SIMD_OP2_SHUFFLE || (SLJIT_SIMD_GET_ELEM_SIZE(type) == 0 && !(type & SLJIT_SIMD_FLOAT))); + CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) <= (src2 & SLJIT_MEM) ? 
SLJIT_SIMD_GET_REG_SIZE(type) : 0); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_vreg, 0)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src1_vreg, 0)); + FUNCTION_FCHECK(src2, src2w, 0); #endif #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) if (SLJIT_UNLIKELY(!!compiler->verbose)) { if (type & SLJIT_SIMD_TEST) CHECK_RETURN_OK; - if (sljit_emit_simd_op2(compiler, type | SLJIT_SIMD_TEST, dst_freg, src1_freg, src2_freg) == SLJIT_ERR_UNSUPPORTED) { + if (sljit_emit_simd_op2(compiler, type | SLJIT_SIMD_TEST, dst_vreg, src1_vreg, src2, src2w) == SLJIT_ERR_UNSUPPORTED) { fprintf(compiler->verbose, " # simd_op2: unsupported form, no instructions are emitted\n"); CHECK_RETURN_OK; } - fprintf(compiler->verbose, " simd_%s.%d.%s%d ", + fprintf(compiler->verbose, " simd_%s.%d.%s%d", simd_op2_names[SLJIT_SIMD_GET_OPCODE(type) - 1], (8 << SLJIT_SIMD_GET_REG_SIZE(type)), (type & SLJIT_SIMD_FLOAT) ? "f" : "", (8 << SLJIT_SIMD_GET_ELEM_SIZE(type))); - sljit_verbose_freg(compiler, dst_freg); + if ((type & 0x3f000000) != SLJIT_SIMD_MEM_UNALIGNED) + fprintf(compiler->verbose, ".al%d", (8 << SLJIT_SIMD_GET_ELEM2_SIZE(type))); + + fprintf(compiler->verbose, " "); + sljit_verbose_vreg(compiler, dst_vreg); fprintf(compiler->verbose, ", "); - sljit_verbose_freg(compiler, src1_freg); + sljit_verbose_vreg(compiler, src1_vreg); fprintf(compiler->verbose, ", "); - sljit_verbose_freg(compiler, src2_freg); + sljit_verbose_vparam(compiler, src2, src2w); fprintf(compiler->verbose, "\n"); } #endif @@ -3389,17 +3453,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler #if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \ && !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) \ && !(defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \ + && !(defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) \ && !(defined SLJIT_CONFIG_LOONGARCH && SLJIT_CONFIG_LOONGARCH) SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + 
sljit_s32 vreg, sljit_s32 srcdst, sljit_sw srcdstw) { CHECK_ERROR(); - CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_mov(compiler, type, vreg, srcdst, srcdstw)); SLJIT_UNUSED_ARG(compiler); SLJIT_UNUSED_ARG(type); - SLJIT_UNUSED_ARG(freg); + SLJIT_UNUSED_ARG(vreg); SLJIT_UNUSED_ARG(srcdst); SLJIT_UNUSED_ARG(srcdstw); @@ -3407,14 +3472,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { CHECK_ERROR(); - CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_replicate(compiler, type, vreg, src, srcw)); SLJIT_UNUSED_ARG(compiler); SLJIT_UNUSED_ARG(type); - SLJIT_UNUSED_ARG(freg); + SLJIT_UNUSED_ARG(vreg); SLJIT_UNUSED_ARG(src); SLJIT_UNUSED_ARG(srcw); @@ -3422,14 +3487,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 vreg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw) { CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, vreg, lane_index, srcdst, srcdstw)); SLJIT_UNUSED_ARG(compiler); SLJIT_UNUSED_ARG(type); - SLJIT_UNUSED_ARG(freg); + SLJIT_UNUSED_ARG(vreg); SLJIT_UNUSED_ARG(lane_index); SLJIT_UNUSED_ARG(srcdst); SLJIT_UNUSED_ARG(srcdstw); @@ -3438,14 +3503,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_s32 src_lane_index) { 
CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index)); + CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, vreg, src, src_lane_index)); SLJIT_UNUSED_ARG(compiler); SLJIT_UNUSED_ARG(type); - SLJIT_UNUSED_ARG(freg); + SLJIT_UNUSED_ARG(vreg); SLJIT_UNUSED_ARG(src); SLJIT_UNUSED_ARG(src_lane_index); @@ -3453,14 +3518,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { CHECK_ERROR(); - CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_extend(compiler, type, vreg, src, srcw)); SLJIT_UNUSED_ARG(compiler); SLJIT_UNUSED_ARG(type); - SLJIT_UNUSED_ARG(freg); + SLJIT_UNUSED_ARG(vreg); SLJIT_UNUSED_ARG(src); SLJIT_UNUSED_ARG(srcw); @@ -3468,14 +3533,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 dst, sljit_sw dstw) { CHECK_ERROR(); - CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw)); + CHECK(check_sljit_emit_simd_sign(compiler, type, vreg, dst, dstw)); SLJIT_UNUSED_ARG(compiler); SLJIT_UNUSED_ARG(type); - SLJIT_UNUSED_ARG(freg); + SLJIT_UNUSED_ARG(vreg); SLJIT_UNUSED_ARG(dst); SLJIT_UNUSED_ARG(dstw); @@ -3483,56 +3548,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w) { CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + 
CHECK(check_sljit_emit_simd_op2(compiler, type, dst_vreg, src1_vreg, src2, src2w)); SLJIT_UNUSED_ARG(compiler); SLJIT_UNUSED_ARG(type); - SLJIT_UNUSED_ARG(dst_freg); - SLJIT_UNUSED_ARG(src1_freg); - SLJIT_UNUSED_ARG(src2_freg); - - return SLJIT_ERR_UNSUPPORTED; -} - -#endif /* !SLJIT_CONFIG_X86 && !SLJIT_CONFIG_ARM */ - -#if !(defined(SLJIT_CONFIG_X86) && SLJIT_CONFIG_X86) \ - && !(defined(SLJIT_CONFIG_ARM) && SLJIT_CONFIG_ARM) \ - && !(defined(SLJIT_CONFIG_S390X) && SLJIT_CONFIG_S390X) \ - && !(defined(SLJIT_CONFIG_LOONGARCH) && SLJIT_CONFIG_LOONGARCH) - -SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, - sljit_s32 op, - sljit_s32 dst_reg, - sljit_s32 mem_reg) -{ - SLJIT_UNUSED_ARG(compiler); - SLJIT_UNUSED_ARG(op); - SLJIT_UNUSED_ARG(dst_reg); - SLJIT_UNUSED_ARG(mem_reg); - - CHECK_ERROR(); - CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); - - return SLJIT_ERR_UNSUPPORTED; -} - -SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, - sljit_s32 op, - sljit_s32 src_reg, - sljit_s32 mem_reg, - sljit_s32 temp_reg) -{ - SLJIT_UNUSED_ARG(compiler); - SLJIT_UNUSED_ARG(op); - SLJIT_UNUSED_ARG(src_reg); - SLJIT_UNUSED_ARG(mem_reg); - SLJIT_UNUSED_ARG(temp_reg); - - CHECK_ERROR(); - CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg)); + SLJIT_UNUSED_ARG(dst_vreg); + SLJIT_UNUSED_ARG(src1_vreg); + SLJIT_UNUSED_ARG(src2); + SLJIT_UNUSED_ARG(src2w); return SLJIT_ERR_UNSUPPORTED; } diff --git a/src/sljit/sljitLir.h b/src/sljit/sljitLir.h index 8b6fa69a0..6f390c281 100644 --- a/src/sljit/sljitLir.h +++ b/src/sljit/sljitLir.h @@ -251,7 +251,7 @@ extern "C" { #define SLJIT_FS7 (SLJIT_NUMBER_OF_FLOAT_REGISTERS - 7) #define SLJIT_FS8 (SLJIT_NUMBER_OF_FLOAT_REGISTERS - 8) #define SLJIT_FS9 (SLJIT_NUMBER_OF_FLOAT_REGISTERS - 9) -/* All S registers provided by the architecture can be accessed by SLJIT_FS(i) +/* All FS registers provided by the architecture 
can be accessed by SLJIT_FS(i) The i parameter must be >= 0 and < SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS. */ #define SLJIT_FS(i) (SLJIT_NUMBER_OF_FLOAT_REGISTERS - (i)) @@ -262,6 +262,52 @@ extern "C" { #define SLJIT_RETURN_FREG SLJIT_FR0 +/* --------------------------------------------------------------------- */ +/* Vector registers */ +/* --------------------------------------------------------------------- */ + +/* Vector registers are storage areas, which are used for Single Instruction + Multiple Data (SIMD) computations. The VR and VS register sets overlap + in the same way as R and S register sets. See above. + + The storage space of vector registers often overlap with floating point + registers. In this case setting the value of SLJIT_VR(i) destroys the + value of SLJIT_FR(i) and vice versa. See SLJIT_SEPARATE_VECTOR_REGISTERS + macro. */ + +/* Vector scratch registers. */ +#define SLJIT_VR0 1 +#define SLJIT_VR1 2 +#define SLJIT_VR2 3 +#define SLJIT_VR3 4 +#define SLJIT_VR4 5 +#define SLJIT_VR5 6 +#define SLJIT_VR6 7 +#define SLJIT_VR7 8 +#define SLJIT_VR8 9 +#define SLJIT_VR9 10 +/* All VR registers provided by the architecture can be accessed by SLJIT_VR(i) + The i parameter must be >= 0 and < SLJIT_NUMBER_OF_VECTOR_REGISTERS. */ +#define SLJIT_VR(i) (1 + (i)) + +/* Vector saved registers. 
*/ +#define SLJIT_VS0 (SLJIT_NUMBER_OF_VECTOR_REGISTERS) +#define SLJIT_VS1 (SLJIT_NUMBER_OF_VECTOR_REGISTERS - 1) +#define SLJIT_VS2 (SLJIT_NUMBER_OF_VECTOR_REGISTERS - 2) +#define SLJIT_VS3 (SLJIT_NUMBER_OF_VECTOR_REGISTERS - 3) +#define SLJIT_VS4 (SLJIT_NUMBER_OF_VECTOR_REGISTERS - 4) +#define SLJIT_VS5 (SLJIT_NUMBER_OF_VECTOR_REGISTERS - 5) +#define SLJIT_VS6 (SLJIT_NUMBER_OF_VECTOR_REGISTERS - 6) +#define SLJIT_VS7 (SLJIT_NUMBER_OF_VECTOR_REGISTERS - 7) +#define SLJIT_VS8 (SLJIT_NUMBER_OF_VECTOR_REGISTERS - 8) +#define SLJIT_VS9 (SLJIT_NUMBER_OF_VECTOR_REGISTERS - 9) +/* All VS registers provided by the architecture can be accessed by SLJIT_VS(i) + The i parameter must be >= 0 and < SLJIT_NUMBER_OF_SAVED_VECTOR_REGISTERS. */ +#define SLJIT_VS(i) (SLJIT_NUMBER_OF_VECTOR_REGISTERS - (i)) + +/* Vector registers >= SLJIT_FIRST_SAVED_VECTOR_REG are saved registers. */ +#define SLJIT_FIRST_SAVED_VECTOR_REG (SLJIT_VS0 - SLJIT_NUMBER_OF_SAVED_VECTOR_REGISTERS + 1) + /* --------------------------------------------------------------------- */ /* Argument type definitions */ /* --------------------------------------------------------------------- */ @@ -715,8 +761,10 @@ static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler a simd operation represents the same 128 bit register, and both SLJIT_FR0 and SLJIT_FR1 are overwritten. */ #define SLJIT_SIMD_REGS_ARE_PAIRS 13 -/* [Not emulated] Atomic support is available (fine-grained). */ -#define SLJIT_HAS_ATOMIC 14 +/* [Not emulated] Atomic support is available. */ +#define SLJIT_HAS_ATOMIC 14 +/* [Not emulated] Memory barrier support is available. */ +#define SLJIT_HAS_MEMORY_BARRIER 15 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) /* [Not emulated] AVX support is available on x86. */ @@ -749,42 +797,65 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type); with an error code. */ /* - The executable code is a function from the viewpoint of the C - language. 
The function calls must conform to the ABI (Application - Binary Interface) of the platform, which specify the purpose of - machine registers and stack handling among other things. The - sljit_emit_enter function emits the necessary instructions for - setting up a new context for the executable code. This is often - called as function prologue. Furthermore the options argument - can be used to pass configuration options to the compiler. The + The executable code is a callable function from the viewpoint + of the C language. Function calls must conform with the ABI + (Application Binary Interface) of the target platform, which + specify the purpose of machine registers and stack handling + among other things. The sljit_emit_enter function emits the + necessary instructions for setting up an entry point for the + executable code. This is often called as function prologue. + + The "options" argument can be used to pass configuration options + to the sljit compiler which affects the generated code, until + another sljit_emit_enter or sljit_set_context is called. The available options are listed before sljit_emit_enter. The function argument list is specified by the SLJIT_ARGSx (SLJIT_ARGS0 .. SLJIT_ARGS4) macros. Currently maximum four arguments are supported. See the description of SLJIT_ARGSx - macros about argument passing. Furthermore the register set - used by the function must be declared as well. The number of - scratch and saved registers available to the function must - be passed to sljit_emit_enter. Only R registers between R0 - and "scratches" argument can be used later. E.g. if "scratches" - is set to two, the scratch register set will be limited to - SLJIT_R0 and SLJIT_R1. The S registers and the floating point - registers ("fscratches" and "fsaveds") are specified in a - similar manner. The sljit_emit_enter is also capable of - allocating a stack space for local data. 
The "local_size" - argument contains the size in bytes of this local area, and - it can be accessed using SLJIT_MEM1(SLJIT_SP). The memory - area between SLJIT_SP (inclusive) and SLJIT_SP + local_size - (exclusive) can be modified freely until the function returns. - The stack space is not initialized to zero. + macros about argument passing. + + The register set used by the function must be declared as well. + The number of scratch and saved registers available to the + function must be passed to sljit_emit_enter. Only R registers + between R0 and "scratches" argument can be used later. E.g. + if "scratches" is set to two, the scratch register set will + be limited to SLJIT_R0 and SLJIT_R1. The S registers are + declared in a similar manner, but their count is specified + by "saveds" argument. The floating point scratch and saved + registers can be set by using "scratches" and "saveds" argument + as well, but their value must be passed to the SLJIT_ENTER_FLOAT + macro, see below. + + The sljit_emit_enter is also capable of allocating a stack + space for local data. The "local_size" argument contains the + size in bytes of this local area, and it can be accessed using + SLJIT_MEM1(SLJIT_SP). The memory area between SLJIT_SP (inclusive) + and SLJIT_SP + local_size (exclusive) can be modified freely + until the function returns. The alocated stack space is an + uninitialized memory area. + + Floating point scratch and saved registers must be specified + by the SLJIT_ENTER_FLOAT macro, which result value should be + combined with scratches / saveds argument. 
+ + Examples: + To use three scratch and four floating point scratch + registers, the "scratches" argument must be set to: + 3 | SLJIT_ENTER_FLOAT(4) + + To use six saved and five floating point saved + registers, the "saveds" argument must be set to: + 6 | SLJIT_ENTER_FLOAT(5) Note: the following conditions must met: 0 <= scratches <= SLJIT_NUMBER_OF_REGISTERS 0 <= saveds <= SLJIT_NUMBER_OF_SAVED_REGISTERS scratches + saveds <= SLJIT_NUMBER_OF_REGISTERS - 0 <= fscratches <= SLJIT_NUMBER_OF_FLOAT_REGISTERS - 0 <= fsaveds <= SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS - fscratches + fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS + + 0 <= float scratches <= SLJIT_NUMBER_OF_FLOAT_REGISTERS + 0 <= float saveds <= SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS + float scratches + float saveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS Note: the compiler can use saved registers as scratch registers, but the opposite is not supported @@ -793,6 +864,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type); overwrites the previous context. */ +/* The following options are available for sljit_emit_enter. */ + /* Saved registers between SLJIT_S0 and SLJIT_S(n - 1) (inclusive) are not saved / restored on function enter / return. Instead, these registers can be used to pass / return data (such as @@ -808,17 +881,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type); and all arguments must be stored in scratch registers. */ #define SLJIT_ENTER_REG_ARG 0x00000004 -/* The local_size must be >= 0 and <= SLJIT_MAX_LOCAL_SIZE. */ -#define SLJIT_MAX_LOCAL_SIZE 1048576 - #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) /* Use VEX prefix for all SIMD operations on x86. */ #define SLJIT_ENTER_USE_VEX 0x00010000 #endif /* !SLJIT_CONFIG_X86 */ +/* Macros for other sljit_emit_enter arguments. */ + +/* Floating point scratch and saved registers can be + specified by SLJIT_ENTER_FLOAT. 
*/ +#define SLJIT_ENTER_FLOAT(regs) ((regs) << 8) + +/* The local_size must be >= 0 and <= SLJIT_MAX_LOCAL_SIZE. */ +#define SLJIT_MAX_LOCAL_SIZE 1048576 + SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size); + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size); /* The SLJIT compiler has a current context (which contains the local stack space size, number of used registers, etc.) which is initialized @@ -834,8 +913,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi the previous context. */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size); + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size); /* Return to the caller function. The sljit_emit_return_void function does not return with any value. The sljit_emit_return function returns @@ -1092,16 +1171,21 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *c the behaviour is undefined. */ #define SLJIT_DIV_SW (SLJIT_OP0_BASE + 7) #define SLJIT_DIV_S32 (SLJIT_DIV_SW | SLJIT_32) +/* Flags: - (does not modify flags) + May return with SLJIT_ERR_UNSUPPORTED if SLJIT_HAS_MEMORY_BARRIER + feature is not supported (calling sljit_has_cpu_feature() with + this feature option returns with 0). */ +#define SLJIT_MEMORY_BARRIER (SLJIT_OP0_BASE + 8) /* Flags: - (does not modify flags) ENDBR32 instruction for x86-32 and ENDBR64 instruction for x86-64 when Intel Control-flow Enforcement Technology (CET) is enabled. No instructions are emitted for other architectures. 
*/ -#define SLJIT_ENDBR (SLJIT_OP0_BASE + 8) +#define SLJIT_ENDBR (SLJIT_OP0_BASE + 9) /* Flags: - (may destroy flags) Skip stack frames before return when Intel Control-flow Enforcement Technology (CET) is enabled. No instructions are emitted for other architectures. */ -#define SLJIT_SKIP_FRAMES_BEFORE_RETURN (SLJIT_OP0_BASE + 9) +#define SLJIT_SKIP_FRAMES_BEFORE_RETURN (SLJIT_OP0_BASE + 10) SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op); @@ -1890,21 +1974,21 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler /* The following options are used by several simd operations. */ -/* Load data into a simd register, this is the default */ +/* Load data into a vector register, this is the default */ #define SLJIT_SIMD_LOAD 0x000000 -/* Store data from a simd register */ +/* Store data from a vector register */ #define SLJIT_SIMD_STORE 0x000001 -/* The simd register contains floating point values */ +/* The vector register contains floating point values */ #define SLJIT_SIMD_FLOAT 0x000400 /* Tests whether the operation is available */ #define SLJIT_SIMD_TEST 0x000800 -/* Move data to/from a 64 bit (8 byte) long SIMD register */ +/* Move data to/from a 64 bit (8 byte) long vector register */ #define SLJIT_SIMD_REG_64 (3 << 12) -/* Move data to/from a 128 bit (16 byte) long SIMD register */ +/* Move data to/from a 128 bit (16 byte) long vector register */ #define SLJIT_SIMD_REG_128 (4 << 12) -/* Move data to/from a 256 bit (32 byte) long SIMD register */ +/* Move data to/from a 256 bit (32 byte) long vector register */ #define SLJIT_SIMD_REG_256 (5 << 12) -/* Move data to/from a 512 bit (64 byte) long SIMD register */ +/* Move data to/from a 512 bit (64 byte) long vector register */ #define SLJIT_SIMD_REG_512 (6 << 12) /* Element size is 8 bit long (this is the default), usually cannot be combined with SLJIT_SIMD_FLOAT */ #define SLJIT_SIMD_ELEM_8 (0 << 18) @@ -1919,7 +2003,8 @@ 
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler /* Element size is 256 bit long */ #define SLJIT_SIMD_ELEM_256 (5 << 18) -/* The following options are used by sljit_emit_simd_mov(). */ +/* The following options are used by sljit_emit_simd_mov() + and sljit_emit_simd_op2(). */ /* Memory address is unaligned (this is the default) */ #define SLJIT_SIMD_MEM_UNALIGNED (0 << 24) @@ -1936,7 +2021,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler /* Memory address is 512 bit aligned */ #define SLJIT_SIMD_MEM_ALIGNED_512 (6 << 24) -/* Moves data between a simd register and memory. +/* Moves data between a vector register and memory. If the operation is not supported, it returns with SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed, @@ -1944,21 +2029,21 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler type must be a combination of SLJIT_SIMD_* and SLJIT_SIMD_MEM_* options - freg is the source or destination simd register + vreg is the source or destination vector register of the operation - srcdst must be a memory operand or a simd register + srcdst must be a memory operand or a vector register Note: The alignment and element size must be - less or equal than simd register size. + less or equal than vector register size. Flags: - (does not modify flags) */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 srcdst, sljit_sw srcdstw); -/* Replicates a scalar value to all lanes of a simd +/* Replicates a scalar value to all lanes of a vector register. If the operation is not supported, it returns with @@ -1967,7 +2052,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co type must be a combination of SLJIT_SIMD_* options except SLJIT_SIMD_STORE. 
- freg is the destination simd register of the operation + vreg is the destination vector register of the operation src is the value which is replicated Note: @@ -1977,7 +2062,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co Flags: - (does not modify flags) */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw); /* The following options are used by sljit_emit_simd_lane_mov(). */ @@ -1987,7 +2072,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil /* Sign extend the integer value stored from the lane. */ #define SLJIT_SIMD_LANE_SIGNED 0x000004 -/* Moves data between a simd register lane and a register or +/* Moves data between a vector register lane and a register or memory. If the srcdst argument is a register, it must be a floating point register when SLJIT_SIMD_FLOAT is specified, or a general purpose register otherwise. @@ -2003,7 +2088,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil is set and SLJIT_SIMD_FLOAT is not set SLJIT_SIMD_LANE_ZERO - when SLJIT_SIMD_LOAD is specified - freg is the source or destination simd register + vreg is the source or destination vector register of the operation lane_index is the index of the lane srcdst is the destination operand for loads, and @@ -2015,11 +2100,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil Flags: - (does not modify flags) */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 vreg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw); /* Replicates a scalar value from a lane to all lanes - of a simd register. + of a vector register. If the operation is not supported, it returns with SLJIT_ERR_UNSUPPORTED. 
If SLJIT_SIMD_TEST is passed, @@ -2027,14 +2112,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile type must be a combination of SLJIT_SIMD_* options except SLJIT_SIMD_STORE. - freg is the destination simd register of the operation - src is the simd register which lane is replicated + vreg is the destination vector register of the operation + src is the vector register which lane is replicated src_lane_index is the lane index of the src register Flags: - (does not modify flags) */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_s32 src_lane_index); /* The following options are used by sljit_emit_simd_load_extend(). */ @@ -2048,7 +2133,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c /* Extend data to 64 bit */ #define SLJIT_SIMD_EXTEND_64 (3 << 24) -/* Extend elements and stores them in a simd register. +/* Extend elements and stores them in a vector register. The extension operation increases the size of the elements (e.g. from 16 bit to 64 bit). For integer values, the extension can be signed or unsigned. @@ -2059,15 +2144,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c type must be a combination of SLJIT_SIMD_*, and SLJIT_SIMD_EXTEND_* options except SLJIT_SIMD_STORE - freg is the destination simd register of the operation - src must be a memory operand or a simd register. + vreg is the destination vector register of the operation + src must be a memory operand or a vector register. In the latter case, the source elements are stored in the lower half of the register. 
Flags: - (does not modify flags) */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw); /* Extract the highest bit (usually the sign bit) from @@ -2079,16 +2164,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler type must be a combination of SLJIT_SIMD_* and SLJIT_32 options except SLJIT_SIMD_LOAD - freg is the source simd register of the operation + vreg is the source vector register of the operation dst is the destination operand Flags: - (does not modify flags) */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 dst, sljit_sw dstw); -/* The following options are used by sljit_emit_simd_op2(). */ +/* The following operations are used by sljit_emit_simd_op2(). */ /* Binary 'and' operation */ #define SLJIT_SIMD_OP2_AND 0x000001 @@ -2096,23 +2181,40 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c #define SLJIT_SIMD_OP2_OR 0x000002 /* Binary 'xor' operation */ #define SLJIT_SIMD_OP2_XOR 0x000003 +/* Shuffle bytes of src1 using the indices in src2 */ +#define SLJIT_SIMD_OP2_SHUFFLE 0x000004 -/* Perform simd operations using simd registers. +/* Perform simd operations using vector registers. If the operation is not supported, it returns with SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed, it does not emit any instructions. 
- type must be a combination of SLJIT_SIMD_* and SLJIT_SIMD_OP2_ - options except SLJIT_SIMD_LOAD and SLJIT_SIMD_STORE - dst_freg is the destination register of the operation - src1_freg is the first source register of the operation - src1_freg is the second source register of the operation + type must be a combination of SLJIT_SIMD_*, SLJIT_SIMD_MEM_* + and SLJIT_SIMD_OP2_* options except SLJIT_SIMD_LOAD + and SLJIT_SIMD_STORE + dst_vreg is the destination register of the operation + src1_vreg is the first source register of the operation + src2 is the second source operand of the operation Flags: - (does not modify flags) */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg); + sljit_s32 dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w); + +/* The following operations are used by sljit_emit_atomic_load() and + sljit_emit_atomic_store() operations. */ + +/* Tests whether the atomic operation is available (does not generate + any instructions). When a load form is allowed, its corresponding + store form is allowed and vice versa. */ +#define SLJIT_ATOMIC_TEST 0x10000 +/* The compiler must generate compare and swap instruction. + When this bit is set, calling sljit_emit_atomic_load() is optional. */ +#define SLJIT_ATOMIC_USE_CAS 0x20000 +/* The compiler must generate load-acquire and store-release instructions. + When this bit is set, the temp_reg for sljit_emit_atomic_store is not used. */ +#define SLJIT_ATOMIC_USE_LS 0x40000 /* The sljit_emit_atomic_load and sljit_emit_atomic_store operation pair can perform an atomic read-modify-write operation. First, an unsigned @@ -2121,23 +2223,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co sljit_emit_atomic_store. A thread can only perform a single atomic operation at a time. - Note: atomic operations are experimental, and not implemented 
- The following conditions must be satisfied, or the operation is undefined: - the address provided in mem_reg must be divisible by the size of the value (only naturally aligned updates are supported) - - no memory writes are allowed between the load and store operations - regardless of its target address (currently read operations are - allowed, but this might change in the future) + - no memory operations are allowed between the load and store operations - the memory operation (op) and the base address (stored in mem_reg) passed to the load/store operations must be the same (the mem_reg can be a different register, only its value must be the same) - - an store must always follow a load for the same transaction. + - a store must always follow a load for the same transaction. - op must be between SLJIT_MOV and SLJIT_MOV_P, excluding all - signed loads such as SLJIT_MOV32_S16 + op must be between SLJIT_MOV and SLJIT_MOV_P dst_reg is the register where the data will be loaded into mem_reg is the base address of the memory load (it cannot be SLJIT_SP or a virtual register on x86-32) @@ -2151,18 +2247,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler allows performing an atomic read-modify-write operation. See the description of sljit_emit_atomic_load. - op must be between SLJIT_MOV and SLJIT_MOV_P, excluding all signed - loads such as SLJIT_MOV32_S16 + op must be between SLJIT_MOV and SLJIT_MOV_P src_reg is the register which value is stored into the memory mem_reg is the base address of the memory store (it cannot be SLJIT_SP or a virtual register on x86-32) - temp_reg is a not preserved scratch register, which must be - initialized with the value loaded into the dst_reg during the - corresponding sljit_emit_atomic_load operation, or the operation - is undefined - - Flags: ATOMIC_STORED is set if the operation is successful, - otherwise the memory remains unchanged. 
*/ + temp_reg is a scratch register, which must be initialized with + the value loaded into the dst_reg during the corresponding + sljit_emit_atomic_load operation, or the operation is undefined. + The temp_reg register preserves its value, if the memory store + is successful. Otherwise, its value is undefined. + + Flags: ATOMIC_STORED + if ATOMIC_STORED flag is set, it represents that the memory + is updated with a new value. Otherwise the memory is unchanged. */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src_reg, sljit_s32 mem_reg, diff --git a/src/sljit/sljitNativeARM_32.c b/src/sljit/sljitNativeARM_32.c index a253c06f0..b22894b70 100644 --- a/src/sljit/sljitNativeARM_32.c +++ b/src/sljit/sljitNativeARM_32.c @@ -114,6 +114,7 @@ static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) #define CLZ 0xe16f0f10 #define CMN 0xe1600000 #define CMP 0xe1400000 +#define DMB_SY 0xf57ff05f #define EOR 0xe0200000 #define LDR 0xe5100000 #define LDR_POST 0xe4100000 @@ -180,6 +181,7 @@ static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) #define VST1_s 0xf4800000 #define VSTR_F32 0xed000a00 #define VSUB_F32 0xee300a40 +#define VTBL 0xf3b00800 #if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7) /* Arm v7 specific instructions. */ @@ -364,7 +366,7 @@ static sljit_uw patch_pc_relative_loads(sljit_uw *last_pc_patch, sljit_uw *code_ while (last_pc_patch < code_ptr) { /* Data transfer instruction with Rn == r15. 
*/ - if ((*last_pc_patch & 0x0e0f0000) == 0x040f0000) { + if ((*last_pc_patch & 0x0e4f0000) == 0x040f0000) { diff = (sljit_uw)(const_pool - last_pc_patch); ind = (*last_pc_patch) & 0xfff; @@ -1131,6 +1133,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) case SLJIT_HAS_COPY_F32: case SLJIT_HAS_COPY_F64: case SLJIT_HAS_ATOMIC: +#if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7) + case SLJIT_HAS_MEMORY_BARRIER: +#endif /* SLJIT_CONFIG_ARM_V7 */ return 1; case SLJIT_HAS_CTZ: @@ -1225,9 +1230,11 @@ static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s3 sljit_s32 src2, sljit_sw src2w); SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_uw imm, offset; sljit_s32 i, tmp, size, word_arg_count; sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options); @@ -1240,11 +1247,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi #endif CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); - imm = 0; + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); + imm = 0; tmp = SLJIT_S0 - saveds; for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--) imm |= (sljit_uw)1 << reg_map[i]; @@ -1391,15 +1400,19 @@ SLJIT_API_FUNC_ATTRIBUTE 
sljit_s32 sljit_emit_enter(struct sljit_compiler *compi } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 size; CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + set_set_context(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1); /* Doubles are saved, so alignment is unaffected. */ @@ -2364,6 +2377,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile | (saved_reg_list[0] << 12) /* ldr rX, [sp], #8/16 */); } return SLJIT_SUCCESS; + case SLJIT_MEMORY_BARRIER: +#if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7) + return push_inst(compiler, DMB_SY); +#else /* !SLJIT_CONFIG_ARM_V7 */ + return SLJIT_ERR_UNSUPPORTED; +#endif /* SLJIT_CONFIG_ARM_V7 */ case SLJIT_ENDBR: case SLJIT_SKIP_FRAMES_BEFORE_RETURN: return SLJIT_SUCCESS; @@ -3105,9 +3124,9 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compile if (type >= SLJIT_FAST_CALL) PTR_FAIL_IF(prepare_blx(compiler)); - jump->addr = compiler->size; PTR_FAIL_IF(push_inst_with_unique_literal(compiler, ((EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1, type <= SLJIT_JUMP ? 
TMP_PC : TMP_REG1, TMP_PC, 0)) & ~COND_MASK) | get_cc(compiler, type), 0)); + jump->addr = compiler->size - 1; if (jump->flags & SLJIT_REWRITABLE_JUMP) compiler->patches++; @@ -3907,7 +3926,7 @@ static SLJIT_INLINE sljit_s32 simd_get_quad_reg_index(sljit_s32 freg) #define SLJIT_QUAD_OTHER_HALF(freg) ((((freg) & 0x1) << 1) - 1) SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3916,7 +3935,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_mov(compiler, type, vreg, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -3930,16 +3949,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co return SLJIT_SUCCESS; if (reg_size == 4) - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); if (!(srcdst & SLJIT_MEM)) { if (reg_size == 4) srcdst = simd_get_quad_reg_index(srcdst); if (type & SLJIT_SIMD_STORE) - ins = VD(srcdst) | VN(freg) | VM(freg); + ins = VD(srcdst) | VN(vreg) | VM(vreg); else - ins = VD(freg) | VN(srcdst) | VM(srcdst); + ins = VD(vreg) | VN(srcdst) | VM(srcdst); if (reg_size == 4) ins |= (sljit_ins)1 << 6; @@ -3952,7 +3971,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co if (elem_size > 3) elem_size = 3; - ins = ((type & SLJIT_SIMD_STORE) ? VST1 : VLD1) | VD(freg) + ins = ((type & SLJIT_SIMD_STORE) ? VST1 : VLD1) | VD(vreg) | (sljit_ins)((reg_size == 3) ? 
(0x7 << 8) : (0xa << 8)); SLJIT_ASSERT(reg_size >= alignment); @@ -4060,7 +4079,7 @@ static sljit_ins simd_get_imm(sljit_s32 elem_size, sljit_uw value) } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4068,7 +4087,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil sljit_ins ins, imm; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_replicate(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -4082,24 +4101,24 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil return SLJIT_SUCCESS; if (reg_size == 4) - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); if (src == SLJIT_IMM && srcw == 0) - return push_inst(compiler, VMOV_i | ((reg_size == 4) ? (1 << 6) : 0) | VD(freg)); + return push_inst(compiler, VMOV_i | ((reg_size == 4) ? 
(1 << 6) : 0) | VD(vreg)); if (SLJIT_UNLIKELY(elem_size == 3)) { SLJIT_ASSERT(type & SLJIT_SIMD_FLOAT); if (src & SLJIT_MEM) { - FAIL_IF(emit_fop_mem(compiler, FPU_LOAD | SLJIT_32, freg, src, srcw)); - src = freg; - } else if (freg != src) - FAIL_IF(push_inst(compiler, VORR | VD(freg) | VN(src) | VM(src))); + FAIL_IF(emit_fop_mem(compiler, FPU_LOAD | SLJIT_32, vreg, src, srcw)); + src = vreg; + } else if (vreg != src) + FAIL_IF(push_inst(compiler, VORR | VD(vreg) | VN(src) | VM(src))); - freg += SLJIT_QUAD_OTHER_HALF(freg); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); - if (freg != src) - return push_inst(compiler, VORR | VD(freg) | VN(src) | VM(src)); + if (vreg != src) + return push_inst(compiler, VORR | VD(vreg) | VN(src) | VM(src)); return SLJIT_SUCCESS; } @@ -4111,7 +4130,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 4) ins |= (sljit_ins)1 << 5; - return push_inst(compiler, VLD1_r | ins | VD(freg) | RN(src) | 0xf); + return push_inst(compiler, VLD1_r | ins | VD(vreg) | RN(src) | 0xf); } if (type & SLJIT_SIMD_FLOAT) { @@ -4121,7 +4140,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 4) ins |= (sljit_ins)1 << 6; - return push_inst(compiler, VDUP_s | ins | VD(freg) | (sljit_ins)freg_map[src]); + return push_inst(compiler, VDUP_s | ins | VD(vreg) | (sljit_ins)freg_map[src]); } if (src == SLJIT_IMM) { @@ -4134,7 +4153,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 4) imm |= (sljit_ins)1 << 6; - return push_inst(compiler, VMOV_i | imm | VD(freg)); + return push_inst(compiler, VMOV_i | imm | VD(vreg)); } FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)srcw)); @@ -4156,11 +4175,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 4) ins |= (sljit_ins)1 << 21; - return push_inst(compiler, VDUP | ins | VN(freg) | RD(src)); + return push_inst(compiler, VDUP | 
ins | VN(vreg) | RD(src)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 vreg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4168,7 +4187,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, vreg, lane_index, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -4182,7 +4201,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile return SLJIT_SUCCESS; if (reg_size == 4) - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); if (type & SLJIT_SIMD_LANE_ZERO) { ins = (reg_size == 3) ? 0 : ((sljit_ins)1 << 6); @@ -4190,62 +4209,62 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (type & SLJIT_SIMD_FLOAT) { if (elem_size == 3 && !(srcdst & SLJIT_MEM)) { if (lane_index == 1) - freg += SLJIT_QUAD_OTHER_HALF(freg); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); - if (srcdst != freg) - FAIL_IF(push_inst(compiler, VORR | VD(freg) | VN(srcdst) | VM(srcdst))); + if (srcdst != vreg) + FAIL_IF(push_inst(compiler, VORR | VD(vreg) | VN(srcdst) | VM(srcdst))); - freg += SLJIT_QUAD_OTHER_HALF(freg); - return push_inst(compiler, VMOV_i | VD(freg)); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); + return push_inst(compiler, VMOV_i | VD(vreg)); } - if (srcdst == freg || (elem_size == 3 && srcdst == (freg + SLJIT_QUAD_OTHER_HALF(freg)))) { - FAIL_IF(push_inst(compiler, VORR | ins | VD(TMP_FREG2) | VN(freg) | VM(freg))); + if (srcdst == vreg || (elem_size == 3 && srcdst == (vreg + SLJIT_QUAD_OTHER_HALF(vreg)))) { + FAIL_IF(push_inst(compiler, VORR | ins | VD(TMP_FREG2) | VN(vreg) | VM(vreg))); srcdst = TMP_FREG2; 
srcdstw = 0; } } - FAIL_IF(push_inst(compiler, VMOV_i | ins | VD(freg))); + FAIL_IF(push_inst(compiler, VMOV_i | ins | VD(vreg))); } if (reg_size == 4 && lane_index >= (0x8 >> elem_size)) { lane_index -= (0x8 >> elem_size); - freg += SLJIT_QUAD_OTHER_HALF(freg); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); } if (srcdst & SLJIT_MEM) { if (elem_size == 3) - return emit_fop_mem(compiler, ((type & SLJIT_SIMD_STORE) ? 0 : FPU_LOAD) | SLJIT_32, freg, srcdst, srcdstw); + return emit_fop_mem(compiler, ((type & SLJIT_SIMD_STORE) ? 0 : FPU_LOAD) | SLJIT_32, vreg, srcdst, srcdstw); FAIL_IF(sljit_emit_simd_mem_offset(compiler, &srcdst, srcdstw)); lane_index = lane_index << elem_size; ins = (sljit_ins)((elem_size << 10) | (lane_index << 5)); - return push_inst(compiler, ((type & SLJIT_SIMD_STORE) ? VST1_s : VLD1_s) | ins | VD(freg) | RN(srcdst) | 0xf); + return push_inst(compiler, ((type & SLJIT_SIMD_STORE) ? VST1_s : VLD1_s) | ins | VD(vreg) | RN(srcdst) | 0xf); } if (type & SLJIT_SIMD_FLOAT) { if (elem_size == 3) { if (type & SLJIT_SIMD_STORE) - return push_inst(compiler, VORR | VD(srcdst) | VN(freg) | VM(freg)); - return push_inst(compiler, VMOV_F32 | SLJIT_32 | VD(freg) | VM(srcdst)); + return push_inst(compiler, VORR | VD(srcdst) | VN(vreg) | VM(vreg)); + return push_inst(compiler, VMOV_F32 | SLJIT_32 | VD(vreg) | VM(srcdst)); } if (type & SLJIT_SIMD_STORE) { - if (freg_ebit_map[freg] == 0) { + if (freg_ebit_map[vreg] == 0) { if (lane_index == 1) - freg = SLJIT_F64_SECOND(freg); + vreg = SLJIT_F64_SECOND(vreg); - return push_inst(compiler, VMOV_F32 | VD(srcdst) | VM(freg)); + return push_inst(compiler, VMOV_F32 | VD(srcdst) | VM(vreg)); } - FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | ((sljit_ins)lane_index << 21) | VN(freg) | RD(TMP_REG1))); + FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | ((sljit_ins)lane_index << 21) | VN(vreg) | RD(TMP_REG1))); return push_inst(compiler, VMOV | VN(srcdst) | RD(TMP_REG1)); } FAIL_IF(push_inst(compiler, VMOV | (1 << 20) | VN(srcdst) | 
RD(TMP_REG1))); - return push_inst(compiler, VMOV_s | ((sljit_ins)lane_index << 21) | VN(freg) | RD(TMP_REG1)); + return push_inst(compiler, VMOV_s | ((sljit_ins)lane_index << 21) | VN(vreg) | RD(TMP_REG1)); } if (srcdst == SLJIT_IMM) { @@ -4273,11 +4292,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile ins |= (1 << 23); } - return push_inst(compiler, VMOV_s | ins | VN(freg) | RD(srcdst)); + return push_inst(compiler, VMOV_s | ins | VN(vreg) | RD(srcdst)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_s32 src_lane_index) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4285,7 +4304,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index)); + CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, vreg, src, src_lane_index)); if (reg_size != 3 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -4297,7 +4316,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c return SLJIT_SUCCESS; if (reg_size == 4) { - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); src = simd_get_quad_reg_index(src); if (src_lane_index >= (0x8 >> elem_size)) { @@ -4307,13 +4326,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c } if (elem_size == 3) { - if (freg != src) - FAIL_IF(push_inst(compiler, VORR | VD(freg) | VN(src) | VM(src))); + if (vreg != src) + FAIL_IF(push_inst(compiler, VORR | VD(vreg) | VN(src) | VM(src))); - freg += SLJIT_QUAD_OTHER_HALF(freg); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); - if (freg != src) - return push_inst(compiler, VORR | VD(freg) | VN(src) | VM(src)); + if (vreg != src) + return push_inst(compiler, VORR | VD(vreg) | VN(src) | VM(src)); return 
SLJIT_SUCCESS; } @@ -4322,11 +4341,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c if (reg_size == 4) ins |= (sljit_ins)1 << 6; - return push_inst(compiler, VDUP_s | ins | VD(freg) | VM(src)); + return push_inst(compiler, VDUP_s | ins | VD(vreg) | VM(src)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4335,7 +4354,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler sljit_s32 dst_reg; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_extend(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -4349,20 +4368,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler return SLJIT_SUCCESS; if (reg_size == 4) - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); if (src & SLJIT_MEM) { FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src, srcw)); if (reg_size == 4 && elem2_size - elem_size == 1) - FAIL_IF(push_inst(compiler, VLD1 | (0x7 << 8) | VD(freg) | RN(src) | 0xf)); + FAIL_IF(push_inst(compiler, VLD1 | (0x7 << 8) | VD(vreg) | RN(src) | 0xf)); else - FAIL_IF(push_inst(compiler, VLD1_s | (sljit_ins)((reg_size - elem2_size + elem_size) << 10) | VD(freg) | RN(src) | 0xf)); - src = freg; + FAIL_IF(push_inst(compiler, VLD1_s | (sljit_ins)((reg_size - elem2_size + elem_size) << 10) | VD(vreg) | RN(src) | 0xf)); + src = vreg; } else if (reg_size == 4) src = simd_get_quad_reg_index(src); if (!(type & SLJIT_SIMD_FLOAT)) { - dst_reg = (reg_size == 4) ? freg : TMP_FREG2; + dst_reg = (reg_size == 4) ? vreg : TMP_FREG2; do { FAIL_IF(push_inst(compiler, VSHLL | ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 
0 : (1 << 24)) @@ -4371,27 +4390,27 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler } while (++elem_size < elem2_size); if (dst_reg == TMP_FREG2) - return push_inst(compiler, VORR | VD(freg) | VN(TMP_FREG2) | VM(TMP_FREG2)); + return push_inst(compiler, VORR | VD(vreg) | VN(TMP_FREG2) | VM(TMP_FREG2)); return SLJIT_SUCCESS; } /* No SIMD variant, must use VFP instead. */ SLJIT_ASSERT(reg_size == 4); - if (freg == src) { - freg += SLJIT_QUAD_OTHER_HALF(freg); - FAIL_IF(push_inst(compiler, VCVT_F64_F32 | VD(freg) | VM(src) | 0x20)); - freg += SLJIT_QUAD_OTHER_HALF(freg); - return push_inst(compiler, VCVT_F64_F32 | VD(freg) | VM(src)); + if (vreg == src) { + vreg += SLJIT_QUAD_OTHER_HALF(vreg); + FAIL_IF(push_inst(compiler, VCVT_F64_F32 | VD(vreg) | VM(src) | 0x20)); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); + return push_inst(compiler, VCVT_F64_F32 | VD(vreg) | VM(src)); } - FAIL_IF(push_inst(compiler, VCVT_F64_F32 | VD(freg) | VM(src))); - freg += SLJIT_QUAD_OTHER_HALF(freg); - return push_inst(compiler, VCVT_F64_F32 | VD(freg) | VM(src) | 0x20); + FAIL_IF(push_inst(compiler, VCVT_F64_F32 | VD(vreg) | VM(src))); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); + return push_inst(compiler, VCVT_F64_F32 | VD(vreg) | VM(src) | 0x20); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 dst, sljit_sw dstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4400,7 +4419,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c sljit_s32 dst_r; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw)); + CHECK(check_sljit_emit_simd_sign(compiler, type, vreg, dst, dstw)); ADJUST_LOCAL_OFFSET(dst, dstw); @@ -4433,12 +4452,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } if (reg_size == 4) { - freg = simd_get_quad_reg_index(freg); + vreg = 
simd_get_quad_reg_index(vreg); ins |= (sljit_ins)1 << 6; } SLJIT_ASSERT((freg_map[TMP_FREG2] & 0x1) == 0); - FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG2) | VM(freg))); + FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG2) | VM(vreg))); if (reg_size == 4 && elem_size > 0) FAIL_IF(push_inst(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD(TMP_FREG2) | VM(TMP_FREG2))); @@ -4468,14 +4487,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); - sljit_ins ins = 0; + sljit_s32 alignment; + sljit_ins ins = 0, load_ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_vreg, src1_vreg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); if (reg_size != 3 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -4483,6 +4504,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) return SLJIT_ERR_UNSUPPORTED; + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + switch (SLJIT_SIMD_GET_OPCODE(type)) { case SLJIT_SIMD_OP2_AND: ins = VAND; @@ -4493,19 +4517,51 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co case SLJIT_SIMD_OP2_XOR: ins = VEOR; break; + case SLJIT_SIMD_OP2_SHUFFLE: + ins = VTBL; + break; } - if (type & SLJIT_SIMD_TEST) - return SLJIT_SUCCESS; + if (src2 & SLJIT_MEM) { + if (elem_size > 3) + elem_size = 3; + + load_ins = VLD1 | (sljit_ins)((reg_size == 3) ? 
(0x7 << 8) : (0xa << 8)); + alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); + + SLJIT_ASSERT(reg_size >= alignment); + + if (alignment == 3) + load_ins |= 0x10; + else if (alignment >= 4) + load_ins |= 0x20; + + FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w)); + FAIL_IF(push_inst(compiler, load_ins | VD(TMP_FREG2) | RN(src2) | ((sljit_ins)elem_size) << 6 | 0xf)); + src2 = TMP_FREG2; + } if (reg_size == 4) { - dst_freg = simd_get_quad_reg_index(dst_freg); - src1_freg = simd_get_quad_reg_index(src1_freg); - src2_freg = simd_get_quad_reg_index(src2_freg); + dst_vreg = simd_get_quad_reg_index(dst_vreg); + src1_vreg = simd_get_quad_reg_index(src1_vreg); + src2 = simd_get_quad_reg_index(src2); + + if (SLJIT_SIMD_GET_OPCODE(type) == SLJIT_SIMD_OP2_SHUFFLE) { + ins |= (sljit_ins)1 << 8; + + FAIL_IF(push_inst(compiler, ins | VD(dst_vreg != src1_vreg ? dst_vreg : TMP_FREG2) | VN(src1_vreg) | VM(src2))); + src2 += SLJIT_QUAD_OTHER_HALF(src2); + FAIL_IF(push_inst(compiler, ins | VD(dst_vreg + SLJIT_QUAD_OTHER_HALF(dst_vreg)) | VN(src1_vreg) | VM(src2))); + + if (dst_vreg == src1_vreg) + return push_inst(compiler, VORR | VD(dst_vreg) | VN(TMP_FREG2) | VM(TMP_FREG2)); + return SLJIT_SUCCESS; + } + ins |= (sljit_ins)1 << 6; } - return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg)); + return push_inst(compiler, ins | VD(dst_vreg) | VN(src1_vreg) | VM(src2)); } #undef FPU_LOAD @@ -4519,7 +4575,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler CHECK_ERROR(); CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + switch (GET_OPCODE(op)) { + case SLJIT_MOV_S8: + case SLJIT_MOV_S16: + case SLJIT_MOV_S32: + return SLJIT_ERR_UNSUPPORTED; + case SLJIT_MOV_U8: ins = LDREXB; break; @@ -4531,6 +4595,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler break; } + if (op & SLJIT_ATOMIC_TEST) + return 
SLJIT_SUCCESS; + return push_inst(compiler, ins | RN(mem_reg) | RD(dst_reg)); } @@ -4547,7 +4614,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler CHECK_ERROR(); CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg)); + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + switch (GET_OPCODE(op)) { + case SLJIT_MOV_S8: + case SLJIT_MOV_S16: + case SLJIT_MOV_S32: + return SLJIT_ERR_UNSUPPORTED; + case SLJIT_MOV_U8: ins = STREXB; break; @@ -4559,6 +4634,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler break; } + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + FAIL_IF(push_inst(compiler, ins | RN(mem_reg) | RD(TMP_REG1) | RM(src_reg))); if (op & SLJIT_SET_ATOMIC_STORED) return push_inst(compiler, CMP | SET_FLAGS | SRC2_IMM | RN(TMP_REG1)); diff --git a/src/sljit/sljitNativeARM_64.c b/src/sljit/sljitNativeARM_64.c index 5331ebdf4..fbd0d2084 100644 --- a/src/sljit/sljitNativeARM_64.c +++ b/src/sljit/sljitNativeARM_64.c @@ -91,6 +91,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define CLZ 0xdac01000 #define CSEL 0x9a800000 #define CSINC 0x9a800400 +#define DMB_SY 0xd5033fbf #define DUP_e 0x0e000400 #define DUP_g 0x0e000c00 #define EOR 0xca000000 @@ -171,6 +172,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define SUBI 0xd1000000 #define SUBS 0xeb000000 #define TBZ 0x36000000 +#define TBL_v 0x0e000000 #define UBFM 0xd3400000 #define UCVTF 0x9e630000 #define UDIV 0x9ac00800 @@ -593,6 +595,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) case SLJIT_HAS_COPY_F32: case SLJIT_HAS_COPY_F64: case SLJIT_HAS_ATOMIC: + case SLJIT_HAS_MEMORY_BARRIER: return 1; default: @@ -1208,17 +1211,21 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, s /* --------------------------------------------------------------------- */ SLJIT_API_FUNC_ATTRIBUTE 
sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 prev, fprev, saved_regs_size, i, tmp; sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options); sljit_ins offs; CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 2); saved_regs_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64); @@ -1383,15 +1390,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 saved_regs_size; CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, 
local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + set_set_context(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 2); saved_regs_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64); @@ -1537,7 +1548,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile op = GET_OPCODE(op); switch (op) { case SLJIT_BREAKPOINT: - return push_inst(compiler, BRK); + return push_inst(compiler, BRK | (0xf000 << 5)); case SLJIT_NOP: return push_inst(compiler, NOP); case SLJIT_LMUL_UW: @@ -1554,6 +1565,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile case SLJIT_DIV_UW: case SLJIT_DIV_SW: return push_inst(compiler, ((op == SLJIT_DIV_UW ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1)); + case SLJIT_MEMORY_BARRIER: + return push_inst(compiler, DMB_SY); case SLJIT_ENDBR: case SLJIT_SKIP_FRAMES_BEFORE_RETURN: return SLJIT_SUCCESS; @@ -2775,7 +2788,7 @@ static sljit_s32 sljit_emit_simd_mem_offset(struct sljit_compiler *compiler, slj } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -2783,7 +2796,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_mov(compiler, type, vreg, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -2798,9 +2811,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co if (!(srcdst & SLJIT_MEM)) { if (type & SLJIT_SIMD_STORE) - ins = 
VD(srcdst) | VN(freg) | VM(freg); + ins = VD(srcdst) | VN(vreg) | VM(vreg); else - ins = VD(freg) | VN(srcdst) | VM(srcdst); + ins = VD(vreg) | VN(srcdst) | VM(srcdst); if (reg_size == 4) ins |= (1 << 30); @@ -2818,7 +2831,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co if (reg_size == 4) ins |= (1 << 30); - return push_inst(compiler, ins | ((sljit_ins)elem_size << 10) | RN(srcdst) | VT(freg)); + return push_inst(compiler, ins | ((sljit_ins)elem_size << 10) | RN(srcdst) | VT(vreg)); } static sljit_ins simd_get_imm(sljit_s32 elem_size, sljit_uw value) @@ -2923,7 +2936,7 @@ static sljit_ins simd_get_imm(sljit_s32 elem_size, sljit_uw value) } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -2931,7 +2944,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil sljit_ins ins, imm; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_replicate(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -2952,7 +2965,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 4) ins |= (sljit_ins)1 << 30; - return push_inst(compiler, LD1R | ins | RN(src) | VT(freg)); + return push_inst(compiler, LD1R | ins | RN(src) | VT(vreg)); } ins = (sljit_ins)1 << (16 + elem_size); @@ -2962,9 +2975,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (type & SLJIT_SIMD_FLOAT) { if (src == SLJIT_IMM) - return push_inst(compiler, MOVI | (ins & ((sljit_ins)1 << 30)) | VD(freg)); + return push_inst(compiler, MOVI | (ins & ((sljit_ins)1 << 30)) | VD(vreg)); - return push_inst(compiler, DUP_e | ins | VD(freg) | VN(src)); + return push_inst(compiler, DUP_e | ins | VD(vreg) | VN(src)); } if 
(src == SLJIT_IMM) { @@ -2976,18 +2989,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (imm != ~(sljit_ins)0) { imm |= ins & ((sljit_ins)1 << 30); - return push_inst(compiler, MOVI | imm | VD(freg)); + return push_inst(compiler, MOVI | imm | VD(vreg)); } FAIL_IF(load_immediate(compiler, TMP_REG2, srcw)); src = TMP_REG2; } - return push_inst(compiler, DUP_g | ins | VD(freg) | RN(src)); + return push_inst(compiler, DUP_g | ins | VD(vreg) | RN(src)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 vreg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -2995,7 +3008,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, vreg, lane_index, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -3011,13 +3024,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (type & SLJIT_SIMD_LANE_ZERO) { ins = (reg_size == 3) ? 
0 : ((sljit_ins)1 << 30); - if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) { - FAIL_IF(push_inst(compiler, ORR_v | ins | VD(TMP_FREG1) | VN(freg) | VM(freg))); + if ((type & SLJIT_SIMD_FLOAT) && vreg == srcdst) { + FAIL_IF(push_inst(compiler, ORR_v | ins | VD(TMP_FREG1) | VN(vreg) | VM(vreg))); srcdst = TMP_FREG1; srcdstw = 0; } - FAIL_IF(push_inst(compiler, MOVI | ins | VD(freg))); + FAIL_IF(push_inst(compiler, MOVI | ins | VD(vreg))); } if (srcdst & SLJIT_MEM) { @@ -3033,14 +3046,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile lane_index = lane_index << elem_size; ins |= (sljit_ins)(((lane_index & 0x8) << 27) | ((lane_index & 0x7) << 10)); - return push_inst(compiler, ((type & SLJIT_SIMD_STORE) ? ST1_s : LD1_s) | ins | RN(srcdst) | VT(freg)); + return push_inst(compiler, ((type & SLJIT_SIMD_STORE) ? ST1_s : LD1_s) | ins | RN(srcdst) | VT(vreg)); } if (type & SLJIT_SIMD_FLOAT) { if (type & SLJIT_SIMD_STORE) - ins = INS_e | ((sljit_ins)1 << (16 + elem_size)) | ((sljit_ins)lane_index << (11 + elem_size)) | VD(srcdst) | VN(freg); + ins = INS_e | ((sljit_ins)1 << (16 + elem_size)) | ((sljit_ins)lane_index << (11 + elem_size)) | VD(srcdst) | VN(vreg); else - ins = INS_e | ((((sljit_ins)lane_index << 1) | 1) << (16 + elem_size)) | VD(freg) | VN(srcdst); + ins = INS_e | ((((sljit_ins)lane_index << 1) | 1) << (16 + elem_size)) | VD(vreg) | VN(srcdst); return push_inst(compiler, ins); } @@ -3054,7 +3067,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } if (type & SLJIT_SIMD_STORE) { - ins = RD(srcdst) | VN(freg); + ins = RD(srcdst) | VN(vreg); if ((type & SLJIT_SIMD_LANE_SIGNED) && (elem_size < 2 || (elem_size == 2 && !(type & SLJIT_32)))) { ins |= SMOV; @@ -3064,7 +3077,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } else ins |= UMOV; } else - ins = INS | VD(freg) | RN(srcdst); + ins = INS | VD(vreg) | RN(srcdst); if (elem_size == 3) ins |= (sljit_ins)1 
<< 30; @@ -3073,7 +3086,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_s32 src_lane_index) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3081,7 +3094,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index)); + CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, vreg, src, src_lane_index)); if (reg_size != 3 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -3097,11 +3110,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c if (reg_size == 4) ins |= (sljit_ins)1 << 30; - return push_inst(compiler, DUP_e | ins | VD(freg) | VN(src)); + return push_inst(compiler, DUP_e | ins | VD(vreg) | VN(src)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3109,7 +3122,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type); CHECK_ERROR(); - CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_extend(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -3126,28 +3139,28 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src, srcw)); if (reg_size == 4 && elem2_size - elem_size == 1) - FAIL_IF(push_inst(compiler, LD1 | ((sljit_ins)elem_size << 10) | RN(src) | VT(freg))); + FAIL_IF(push_inst(compiler, LD1 | ((sljit_ins)elem_size << 10) | RN(src) | VT(vreg))); else - 
FAIL_IF(push_inst(compiler, LD1_s | ((sljit_ins)0x2000 << (reg_size - elem2_size + elem_size)) | RN(src) | VT(freg))); - src = freg; + FAIL_IF(push_inst(compiler, LD1_s | ((sljit_ins)0x2000 << (reg_size - elem2_size + elem_size)) | RN(src) | VT(vreg))); + src = vreg; } if (type & SLJIT_SIMD_FLOAT) { SLJIT_ASSERT(reg_size == 4); - return push_inst(compiler, FCVTL | (1 << 22) | VD(freg) | VN(src)); + return push_inst(compiler, FCVTL | (1 << 22) | VD(vreg) | VN(src)); } do { FAIL_IF(push_inst(compiler, ((type & SLJIT_SIMD_EXTEND_SIGNED) ? SSHLL : USHLL) - | ((sljit_ins)1 << (19 + elem_size)) | VD(freg) | VN(src))); - src = freg; + | ((sljit_ins)1 << (19 + elem_size)) | VD(vreg) | VN(src))); + src = vreg; } while (++elem_size < elem2_size); return SLJIT_SUCCESS; } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 dst, sljit_sw dstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3156,7 +3169,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c sljit_s32 dst_r; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw)); + CHECK(check_sljit_emit_simd_sign(compiler, type, vreg, dst, dstw)); ADJUST_LOCAL_OFFSET(dst, dstw); @@ -3191,7 +3204,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c if (reg_size == 4) ins |= (1 << 30); - FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG1) | VN(freg))); + FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG1) | VN(vreg))); if (reg_size == 4 && elem_size > 0) FAIL_IF(push_inst(compiler, XTN | ((sljit_ins)(elem_size - 1) << 22) | VD(TMP_FREG1) | VN(TMP_FREG1))); @@ -3224,14 +3237,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 
dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); sljit_ins ins = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_vreg, src1_vreg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); if (reg_size != 3 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -3239,6 +3253,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) return SLJIT_ERR_UNSUPPORTED; + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + switch (SLJIT_SIMD_GET_OPCODE(type)) { case SLJIT_SIMD_OP2_AND: ins = AND_v; @@ -3249,15 +3266,24 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co case SLJIT_SIMD_OP2_XOR: ins = EOR_v; break; + case SLJIT_SIMD_OP2_SHUFFLE: + ins = TBL_v; + break; } - if (type & SLJIT_SIMD_TEST) - return SLJIT_SUCCESS; + if (src2 & SLJIT_MEM) { + if (elem_size > 3) + elem_size = 3; + + FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w)); + push_inst(compiler, LD1 | (reg_size == 4 ? 
(1 << 30) : 0) | ((sljit_ins)elem_size << 10) | RN(src2) | VT(TMP_FREG1)); + src2 = TMP_FREG1; + } if (reg_size == 4) ins |= (sljit_ins)1 << 30; - return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg)); + return push_inst(compiler, ins | VD(dst_vreg) | VN(src1_vreg) | VM(src2)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, @@ -3269,39 +3295,55 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler CHECK_ERROR(); CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); -#ifdef __ARM_FEATURE_ATOMICS - switch (GET_OPCODE(op)) { - case SLJIT_MOV32: - case SLJIT_MOV_U32: - ins = LDR ^ (1 << 30); - break; - case SLJIT_MOV_U16: - ins = LDRH; - break; - case SLJIT_MOV_U8: - ins = LDRB; - break; - default: - ins = LDR; - break; - } -#else /* !__ARM_FEATURE_ATOMICS */ +#ifndef __ARM_FEATURE_ATOMICS + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; +#endif /* ARM_FEATURE_ATOMICS */ + switch (GET_OPCODE(op)) { + case SLJIT_MOV_S8: + case SLJIT_MOV_S16: + case SLJIT_MOV_S32: + return SLJIT_ERR_UNSUPPORTED; + case SLJIT_MOV32: case SLJIT_MOV_U32: - ins = LDXR ^ (1 << 30); +#ifdef __ARM_FEATURE_ATOMICS + if (!(op & SLJIT_ATOMIC_USE_LS)) + ins = LDR ^ (1 << 30); + else +#endif /* ARM_FEATURE_ATOMICS */ + ins = LDXR ^ (1 << 30); break; case SLJIT_MOV_U8: - ins = LDXRB; +#ifdef __ARM_FEATURE_ATOMICS + if (!(op & SLJIT_ATOMIC_USE_LS)) + ins = LDRB; + else +#endif /* ARM_FEATURE_ATOMICS */ + ins = LDXRB; break; case SLJIT_MOV_U16: - ins = LDXRH; +#ifdef __ARM_FEATURE_ATOMICS + if (!(op & SLJIT_ATOMIC_USE_LS)) + ins = LDRH; + else +#endif /* ARM_FEATURE_ATOMICS */ + ins = LDXRH; break; default: - ins = LDXR; +#ifdef __ARM_FEATURE_ATOMICS + if (!(op & SLJIT_ATOMIC_USE_LS)) + ins = LDR; + else +#endif /* ARM_FEATURE_ATOMICS */ + ins = LDXR; break; } -#endif /* ARM_FEATURE_ATOMICS */ + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + return 
push_inst(compiler, ins | RN(mem_reg) | RT(dst_reg)); } @@ -3311,55 +3353,65 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler sljit_s32 temp_reg) { sljit_ins ins; - sljit_s32 tmp = temp_reg; sljit_ins cmp = 0; - sljit_ins inv_bits = W_OP; CHECK_ERROR(); CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg)); #ifdef __ARM_FEATURE_ATOMICS - if (op & SLJIT_SET_ATOMIC_STORED) - cmp = (SUBS ^ W_OP) | RD(TMP_ZERO); + if (!(op & SLJIT_ATOMIC_USE_LS)) { + if (op & SLJIT_SET_ATOMIC_STORED) + cmp = (SUBS ^ W_OP) | RD(TMP_ZERO); + + switch (GET_OPCODE(op)) { + case SLJIT_MOV_S8: + case SLJIT_MOV_S16: + case SLJIT_MOV_S32: + return SLJIT_ERR_UNSUPPORTED; + + case SLJIT_MOV32: + case SLJIT_MOV_U32: + ins = CAS ^ (1 << 30); + break; + case SLJIT_MOV_U16: + ins = CASH; + break; + case SLJIT_MOV_U8: + ins = CASB; + break; + default: + ins = CAS; + if (cmp) + cmp ^= W_OP; + break; + } + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; - switch (GET_OPCODE(op)) { - case SLJIT_MOV32: - case SLJIT_MOV_U32: - ins = CAS ^ (1 << 30); - break; - case SLJIT_MOV_U16: - ins = CASH; - break; - case SLJIT_MOV_U8: - ins = CASB; - break; - default: - ins = CAS; - inv_bits = 0; if (cmp) - cmp ^= W_OP; - break; - } + FAIL_IF(push_inst(compiler, ((MOV ^ W_OP) ^ (cmp & W_OP)) | RM(temp_reg) | RD(TMP_REG2))); - if (cmp) { - FAIL_IF(push_inst(compiler, (MOV ^ inv_bits) | RM(temp_reg) | RD(TMP_REG1))); - tmp = TMP_REG1; - } - FAIL_IF(push_inst(compiler, ins | RM(tmp) | RN(mem_reg) | RD(src_reg))); - if (!cmp) - return SLJIT_SUCCESS; + FAIL_IF(push_inst(compiler, ins | RM(temp_reg) | RN(mem_reg) | RD(src_reg))); + if (!cmp) + return SLJIT_SUCCESS; - FAIL_IF(push_inst(compiler, cmp | RM(tmp) | RN(temp_reg))); - FAIL_IF(push_inst(compiler, (CSET ^ inv_bits) | RD(tmp))); - return push_inst(compiler, cmp | RM(tmp) | RN(TMP_ZERO)); + return push_inst(compiler, cmp | RM(TMP_REG2) | RN(temp_reg)); + } #else /* !__ARM_FEATURE_ATOMICS */ - 
SLJIT_UNUSED_ARG(tmp); - SLJIT_UNUSED_ARG(inv_bits); + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; +#endif /* __ARM_FEATURE_ATOMICS */ if (op & SLJIT_SET_ATOMIC_STORED) cmp = (SUBI ^ W_OP) | (1 << 29); switch (GET_OPCODE(op)) { + case SLJIT_MOV_S8: + case SLJIT_MOV_S16: + case SLJIT_MOV_S32: + return SLJIT_ERR_UNSUPPORTED; + case SLJIT_MOV32: case SLJIT_MOV_U32: ins = STXR ^ (1 << 30); @@ -3375,9 +3427,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler break; } - FAIL_IF(push_inst(compiler, ins | RM(TMP_REG1) | RN(mem_reg) | RT(src_reg))); - return cmp ? push_inst(compiler, cmp | RD(TMP_ZERO) | RN(TMP_REG1)) : SLJIT_SUCCESS; -#endif /* __ARM_FEATURE_ATOMICS */ + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + FAIL_IF(push_inst(compiler, ins | RM(TMP_REG2) | RN(mem_reg) | RT(src_reg))); + if (!cmp) + return SLJIT_SUCCESS; + return push_inst(compiler, cmp | RD(TMP_ZERO) | RN(TMP_REG2)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset) diff --git a/src/sljit/sljitNativeARM_T2_32.c b/src/sljit/sljitNativeARM_T2_32.c index 799954a85..233e1327b 100644 --- a/src/sljit/sljitNativeARM_T2_32.c +++ b/src/sljit/sljitNativeARM_T2_32.c @@ -138,6 +138,7 @@ static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) #define CMPI_W 0xf1b00f00 #define CMP_X 0x4500 #define CMP_W 0xebb00f00 +#define DMB_SY 0xf3bf8f5f #define EORI 0xf0800000 #define EORS 0x4040 #define EOR_W 0xea800000 @@ -253,6 +254,7 @@ static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) #define VST1_s 0xf9800000 #define VSTR_F32 0xed000a00 #define VSUB_F32 0xee300a40 +#define VTBL 0xffb00800 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -694,6 +696,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) case SLJIT_HAS_COPY_F32: case SLJIT_HAS_COPY_F64: case 
SLJIT_HAS_ATOMIC: + case SLJIT_HAS_MEMORY_BARRIER: return 1; default: @@ -1367,9 +1370,11 @@ static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit /* --------------------------------------------------------------------- */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 size, i, tmp, word_arg_count; sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options); sljit_uw offset; @@ -1383,8 +1388,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi #endif CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); + + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); tmp = SLJIT_S0 - saveds; for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--) @@ -1577,15 +1585,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + 
sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 size; CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + set_set_context(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1); /* Doubles are saved, so alignment is unaffected. */ @@ -1904,6 +1916,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile } return SLJIT_SUCCESS; #endif /* __ARM_FEATURE_IDIV || __ARM_ARCH_EXT_IDIV__ */ + case SLJIT_MEMORY_BARRIER: + return push_inst32(compiler, DMB_SY); case SLJIT_ENDBR: case SLJIT_SKIP_FRAMES_BEFORE_RETURN: return SLJIT_SUCCESS; @@ -3582,7 +3596,7 @@ static SLJIT_INLINE sljit_s32 simd_get_quad_reg_index(sljit_s32 freg) #define SLJIT_QUAD_OTHER_HALF(freg) ((((freg) & 0x1) << 1) - 1) SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3591,7 +3605,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_mov(compiler, type, vreg, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -3605,16 +3619,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co return SLJIT_SUCCESS; if (reg_size == 4) - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); if (!(srcdst & SLJIT_MEM)) { if (reg_size 
== 4) srcdst = simd_get_quad_reg_index(srcdst); if (type & SLJIT_SIMD_STORE) - ins = VD4(srcdst) | VN4(freg) | VM4(freg); + ins = VD4(srcdst) | VN4(vreg) | VM4(vreg); else - ins = VD4(freg) | VN4(srcdst) | VM4(srcdst); + ins = VD4(vreg) | VN4(srcdst) | VM4(srcdst); if (reg_size == 4) ins |= (sljit_ins)1 << 6; @@ -3627,7 +3641,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co if (elem_size > 3) elem_size = 3; - ins = ((type & SLJIT_SIMD_STORE) ? VST1 : VLD1) | VD4(freg) + ins = ((type & SLJIT_SIMD_STORE) ? VST1 : VLD1) | VD4(vreg) | (sljit_ins)((reg_size == 3) ? (0x7 << 8) : (0xa << 8)); SLJIT_ASSERT(reg_size >= alignment); @@ -3735,7 +3749,7 @@ static sljit_ins simd_get_imm(sljit_s32 elem_size, sljit_uw value) } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3743,7 +3757,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil sljit_ins ins, imm; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_replicate(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -3757,24 +3771,24 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil return SLJIT_SUCCESS; if (reg_size == 4) - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); if (src == SLJIT_IMM && srcw == 0) - return push_inst32(compiler, VMOV_i | ((reg_size == 4) ? (1 << 6) : 0) | VD4(freg)); + return push_inst32(compiler, VMOV_i | ((reg_size == 4) ? 
(1 << 6) : 0) | VD4(vreg)); if (SLJIT_UNLIKELY(elem_size == 3)) { SLJIT_ASSERT(type & SLJIT_SIMD_FLOAT); if (src & SLJIT_MEM) { - FAIL_IF(emit_fop_mem(compiler, FPU_LOAD | SLJIT_32, freg, src, srcw)); - src = freg; - } else if (freg != src) - FAIL_IF(push_inst32(compiler, VORR | VD4(freg) | VN4(src) | VM4(src))); + FAIL_IF(emit_fop_mem(compiler, FPU_LOAD | SLJIT_32, vreg, src, srcw)); + src = vreg; + } else if (vreg != src) + FAIL_IF(push_inst32(compiler, VORR | VD4(vreg) | VN4(src) | VM4(src))); - freg += SLJIT_QUAD_OTHER_HALF(freg); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); - if (freg != src) - return push_inst32(compiler, VORR | VD4(freg) | VN4(src) | VM4(src)); + if (vreg != src) + return push_inst32(compiler, VORR | VD4(vreg) | VN4(src) | VM4(src)); return SLJIT_SUCCESS; } @@ -3786,7 +3800,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 4) ins |= 1 << 5; - return push_inst32(compiler, VLD1_r | ins | VD4(freg) | RN4(src) | 0xf); + return push_inst32(compiler, VLD1_r | ins | VD4(vreg) | RN4(src) | 0xf); } if (type & SLJIT_SIMD_FLOAT) { @@ -3796,7 +3810,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 4) ins |= (sljit_ins)1 << 6; - return push_inst32(compiler, VDUP_s | ins | VD4(freg) | (sljit_ins)freg_map[src]); + return push_inst32(compiler, VDUP_s | ins | VD4(vreg) | (sljit_ins)freg_map[src]); } if (src == SLJIT_IMM) { @@ -3809,7 +3823,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 4) imm |= (sljit_ins)1 << 6; - return push_inst32(compiler, VMOV_i | imm | VD4(freg)); + return push_inst32(compiler, VMOV_i | imm | VD4(vreg)); } FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)srcw)); @@ -3831,11 +3845,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 4) ins |= (sljit_ins)1 << 21; - return push_inst32(compiler, VDUP | ins | VN4(freg) | RT4(src)); + 
return push_inst32(compiler, VDUP | ins | VN4(vreg) | RT4(src)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 vreg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3843,7 +3857,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, vreg, lane_index, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -3857,7 +3871,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile return SLJIT_SUCCESS; if (reg_size == 4) - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); if (type & SLJIT_SIMD_LANE_ZERO) { ins = (reg_size == 3) ? 0 : ((sljit_ins)1 << 6); @@ -3865,62 +3879,62 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (type & SLJIT_SIMD_FLOAT) { if (elem_size == 3 && !(srcdst & SLJIT_MEM)) { if (lane_index == 1) - freg += SLJIT_QUAD_OTHER_HALF(freg); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); - if (srcdst != freg) - FAIL_IF(push_inst32(compiler, VORR | VD4(freg) | VN4(srcdst) | VM4(srcdst))); + if (srcdst != vreg) + FAIL_IF(push_inst32(compiler, VORR | VD4(vreg) | VN4(srcdst) | VM4(srcdst))); - freg += SLJIT_QUAD_OTHER_HALF(freg); - return push_inst32(compiler, VMOV_i | VD4(freg)); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); + return push_inst32(compiler, VMOV_i | VD4(vreg)); } - if (srcdst == freg || (elem_size == 3 && srcdst == (freg + SLJIT_QUAD_OTHER_HALF(freg)))) { - FAIL_IF(push_inst32(compiler, VORR | ins | VD4(TMP_FREG2) | VN4(freg) | VM4(freg))); + if (srcdst == vreg || (elem_size == 3 && srcdst == (vreg + SLJIT_QUAD_OTHER_HALF(vreg)))) { + FAIL_IF(push_inst32(compiler, VORR | ins | 
VD4(TMP_FREG2) | VN4(vreg) | VM4(vreg))); srcdst = TMP_FREG2; srcdstw = 0; } } - FAIL_IF(push_inst32(compiler, VMOV_i | ins | VD4(freg))); + FAIL_IF(push_inst32(compiler, VMOV_i | ins | VD4(vreg))); } if (reg_size == 4 && lane_index >= (0x8 >> elem_size)) { lane_index -= (0x8 >> elem_size); - freg += SLJIT_QUAD_OTHER_HALF(freg); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); } if (srcdst & SLJIT_MEM) { if (elem_size == 3) - return emit_fop_mem(compiler, ((type & SLJIT_SIMD_STORE) ? 0 : FPU_LOAD) | SLJIT_32, freg, srcdst, srcdstw); + return emit_fop_mem(compiler, ((type & SLJIT_SIMD_STORE) ? 0 : FPU_LOAD) | SLJIT_32, vreg, srcdst, srcdstw); FAIL_IF(sljit_emit_simd_mem_offset(compiler, &srcdst, srcdstw)); lane_index = lane_index << elem_size; ins = (sljit_ins)((elem_size << 10) | (lane_index << 5)); - return push_inst32(compiler, ((type & SLJIT_SIMD_STORE) ? VST1_s : VLD1_s) | ins | VD4(freg) | RN4(srcdst) | 0xf); + return push_inst32(compiler, ((type & SLJIT_SIMD_STORE) ? VST1_s : VLD1_s) | ins | VD4(vreg) | RN4(srcdst) | 0xf); } if (type & SLJIT_SIMD_FLOAT) { if (elem_size == 3) { if (type & SLJIT_SIMD_STORE) - return push_inst32(compiler, VORR | VD4(srcdst) | VN4(freg) | VM4(freg)); - return push_inst32(compiler, VMOV_F32 | SLJIT_32 | VD4(freg) | VM4(srcdst)); + return push_inst32(compiler, VORR | VD4(srcdst) | VN4(vreg) | VM4(vreg)); + return push_inst32(compiler, VMOV_F32 | SLJIT_32 | VD4(vreg) | VM4(srcdst)); } if (type & SLJIT_SIMD_STORE) { - if (freg_ebit_map[freg] == 0) { + if (freg_ebit_map[vreg] == 0) { if (lane_index == 1) - freg = SLJIT_F64_SECOND(freg); + vreg = SLJIT_F64_SECOND(vreg); - return push_inst32(compiler, VMOV_F32 | VD4(srcdst) | VM4(freg)); + return push_inst32(compiler, VMOV_F32 | VD4(srcdst) | VM4(vreg)); } - FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | ((sljit_ins)lane_index << 21) | VN4(freg) | RT4(TMP_REG1))); + FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | ((sljit_ins)lane_index << 21) | VN4(vreg) | RT4(TMP_REG1))); return 
push_inst32(compiler, VMOV | VN4(srcdst) | RT4(TMP_REG1)); } FAIL_IF(push_inst32(compiler, VMOV | (1 << 20) | VN4(srcdst) | RT4(TMP_REG1))); - return push_inst32(compiler, VMOV_s | ((sljit_ins)lane_index << 21) | VN4(freg) | RT4(TMP_REG1)); + return push_inst32(compiler, VMOV_s | ((sljit_ins)lane_index << 21) | VN4(vreg) | RT4(TMP_REG1)); } if (srcdst == SLJIT_IMM) { @@ -3948,11 +3962,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile ins |= (1 << 23); } - return push_inst32(compiler, VMOV_s | ins | VN4(freg) | RT4(srcdst)); + return push_inst32(compiler, VMOV_s | ins | VN4(vreg) | RT4(srcdst)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_s32 src_lane_index) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3960,7 +3974,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index)); + CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, vreg, src, src_lane_index)); if (reg_size != 3 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -3972,7 +3986,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c return SLJIT_SUCCESS; if (reg_size == 4) { - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); src = simd_get_quad_reg_index(src); if (src_lane_index >= (0x8 >> elem_size)) { @@ -3982,13 +3996,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c } if (elem_size == 3) { - if (freg != src) - FAIL_IF(push_inst32(compiler, VORR | VD4(freg) | VN4(src) | VM4(src))); + if (vreg != src) + FAIL_IF(push_inst32(compiler, VORR | VD4(vreg) | VN4(src) | VM4(src))); - freg += SLJIT_QUAD_OTHER_HALF(freg); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); - if (freg != src) - return 
push_inst32(compiler, VORR | VD4(freg) | VN4(src) | VM4(src)); + if (vreg != src) + return push_inst32(compiler, VORR | VD4(vreg) | VN4(src) | VM4(src)); return SLJIT_SUCCESS; } @@ -3997,11 +4011,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c if (reg_size == 4) ins |= (sljit_ins)1 << 6; - return push_inst32(compiler, VDUP_s | ins | VD4(freg) | VM4(src)); + return push_inst32(compiler, VDUP_s | ins | VD4(vreg) | VM4(src)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4010,7 +4024,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler sljit_s32 dst_reg; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_extend(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -4024,20 +4038,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler return SLJIT_SUCCESS; if (reg_size == 4) - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); if (src & SLJIT_MEM) { FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src, srcw)); if (reg_size == 4 && elem2_size - elem_size == 1) - FAIL_IF(push_inst32(compiler, VLD1 | (0x7 << 8) | VD4(freg) | RN4(src) | 0xf)); + FAIL_IF(push_inst32(compiler, VLD1 | (0x7 << 8) | VD4(vreg) | RN4(src) | 0xf)); else - FAIL_IF(push_inst32(compiler, VLD1_s | (sljit_ins)((reg_size - elem2_size + elem_size) << 10) | VD4(freg) | RN4(src) | 0xf)); - src = freg; + FAIL_IF(push_inst32(compiler, VLD1_s | (sljit_ins)((reg_size - elem2_size + elem_size) << 10) | VD4(vreg) | RN4(src) | 0xf)); + src = vreg; } else if (reg_size == 4) src = simd_get_quad_reg_index(src); if (!(type & SLJIT_SIMD_FLOAT)) { - dst_reg = (reg_size == 4) ? freg : TMP_FREG2; + dst_reg = (reg_size == 4) ? 
vreg : TMP_FREG2; do { FAIL_IF(push_inst32(compiler, VSHLL | ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 0 : (1 << 28)) @@ -4046,27 +4060,27 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler } while (++elem_size < elem2_size); if (dst_reg == TMP_FREG2) - return push_inst32(compiler, VORR | VD4(freg) | VN4(TMP_FREG2) | VM4(TMP_FREG2)); + return push_inst32(compiler, VORR | VD4(vreg) | VN4(TMP_FREG2) | VM4(TMP_FREG2)); return SLJIT_SUCCESS; } /* No SIMD variant, must use VFP instead. */ SLJIT_ASSERT(reg_size == 4); - if (freg == src) { - freg += SLJIT_QUAD_OTHER_HALF(freg); - FAIL_IF(push_inst32(compiler, VCVT_F64_F32 | VD4(freg) | VM4(src) | 0x20)); - freg += SLJIT_QUAD_OTHER_HALF(freg); - return push_inst32(compiler, VCVT_F64_F32 | VD4(freg) | VM4(src)); + if (vreg == src) { + vreg += SLJIT_QUAD_OTHER_HALF(vreg); + FAIL_IF(push_inst32(compiler, VCVT_F64_F32 | VD4(vreg) | VM4(src) | 0x20)); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); + return push_inst32(compiler, VCVT_F64_F32 | VD4(vreg) | VM4(src)); } - FAIL_IF(push_inst32(compiler, VCVT_F64_F32 | VD4(freg) | VM4(src))); - freg += SLJIT_QUAD_OTHER_HALF(freg); - return push_inst32(compiler, VCVT_F64_F32 | VD4(freg) | VM4(src) | 0x20); + FAIL_IF(push_inst32(compiler, VCVT_F64_F32 | VD4(vreg) | VM4(src))); + vreg += SLJIT_QUAD_OTHER_HALF(vreg); + return push_inst32(compiler, VCVT_F64_F32 | VD4(vreg) | VM4(src) | 0x20); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 dst, sljit_sw dstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4075,7 +4089,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c sljit_s32 dst_r; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw)); + CHECK(check_sljit_emit_simd_sign(compiler, type, vreg, dst, dstw)); ADJUST_LOCAL_OFFSET(dst, dstw); @@ -4108,12 +4122,12 @@ SLJIT_API_FUNC_ATTRIBUTE 
sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } if (reg_size == 4) { - freg = simd_get_quad_reg_index(freg); + vreg = simd_get_quad_reg_index(vreg); ins |= (sljit_ins)1 << 6; } SLJIT_ASSERT((freg_map[TMP_FREG2] & 0x1) == 0); - FAIL_IF(push_inst32(compiler, ins | VD4(TMP_FREG2) | VM4(freg))); + FAIL_IF(push_inst32(compiler, ins | VD4(TMP_FREG2) | VM4(vreg))); if (reg_size == 4 && elem_size > 0) FAIL_IF(push_inst32(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD4(TMP_FREG2) | VM4(TMP_FREG2))); @@ -4143,14 +4157,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); - sljit_ins ins = 0; + sljit_s32 alignment; + sljit_ins ins = 0, load_ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_vreg, src1_vreg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); if (reg_size != 3 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -4158,6 +4174,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) return SLJIT_ERR_UNSUPPORTED; + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + switch (SLJIT_SIMD_GET_OPCODE(type)) { case SLJIT_SIMD_OP2_AND: ins = VAND; @@ -4168,19 +4187,51 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co case SLJIT_SIMD_OP2_XOR: ins = VEOR; break; + case SLJIT_SIMD_OP2_SHUFFLE: + ins = VTBL; + break; } - if (type & SLJIT_SIMD_TEST) - return SLJIT_SUCCESS; + if (src2 & SLJIT_MEM) { + if (elem_size > 3) + elem_size = 3; + + 
load_ins = VLD1 | (sljit_ins)((reg_size == 3) ? (0x7 << 8) : (0xa << 8)); + alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); + + SLJIT_ASSERT(reg_size >= alignment); + + if (alignment == 3) + load_ins |= 0x10; + else if (alignment >= 4) + load_ins |= 0x20; + + FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w)); + FAIL_IF(push_inst32(compiler, load_ins | VD4(TMP_FREG2) | RN4(src2) | ((sljit_ins)elem_size) << 6 | 0xf)); + src2 = TMP_FREG2; + } if (reg_size == 4) { - dst_freg = simd_get_quad_reg_index(dst_freg); - src1_freg = simd_get_quad_reg_index(src1_freg); - src2_freg = simd_get_quad_reg_index(src2_freg); + dst_vreg = simd_get_quad_reg_index(dst_vreg); + src1_vreg = simd_get_quad_reg_index(src1_vreg); + src2 = simd_get_quad_reg_index(src2); + + if (SLJIT_SIMD_GET_OPCODE(type) == SLJIT_SIMD_OP2_SHUFFLE) { + ins |= (sljit_ins)1 << 8; + + FAIL_IF(push_inst32(compiler, ins | VD4(dst_vreg != src1_vreg ? dst_vreg : TMP_FREG2) | VN4(src1_vreg) | VM4(src2))); + src2 += SLJIT_QUAD_OTHER_HALF(src2); + FAIL_IF(push_inst32(compiler, ins | VD4(dst_vreg + SLJIT_QUAD_OTHER_HALF(dst_vreg)) | VN4(src1_vreg) | VM4(src2))); + + if (dst_vreg == src1_vreg) + return push_inst32(compiler, VORR | VD4(dst_vreg) | VN4(TMP_FREG2) | VM4(TMP_FREG2)); + return SLJIT_SUCCESS; + } + ins |= (sljit_ins)1 << 6; } - return push_inst32(compiler, ins | VD4(dst_freg) | VN4(src1_freg) | VM4(src2_freg)); + return push_inst32(compiler, ins | VD4(dst_vreg) | VN4(src1_vreg) | VM4(src2)); } #undef FPU_LOAD @@ -4194,7 +4245,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler CHECK_ERROR(); CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + switch (GET_OPCODE(op)) { + case SLJIT_MOV_S8: + case SLJIT_MOV_S16: + case SLJIT_MOV_S32: + return SLJIT_ERR_UNSUPPORTED; + case SLJIT_MOV_U8: ins = LDREXB; break; @@ -4206,6 +4265,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct 
sljit_compiler break; } + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + return push_inst32(compiler, ins | RN4(mem_reg) | RT4(dst_reg)); } @@ -4222,7 +4284,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler CHECK_ERROR(); CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg)); + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + switch (GET_OPCODE(op)) { + case SLJIT_MOV_S8: + case SLJIT_MOV_S16: + case SLJIT_MOV_S32: + return SLJIT_ERR_UNSUPPORTED; + case SLJIT_MOV_U8: ins = STREXB | RM4(TMP_REG1); break; @@ -4234,6 +4304,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler break; } + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + FAIL_IF(push_inst32(compiler, ins | RN4(mem_reg) | RT4(src_reg))); if (op & SLJIT_SET_ATOMIC_STORED) return push_inst32(compiler, CMPI_W | RN4(TMP_REG1)); diff --git a/src/sljit/sljitNativeLOONGARCH_64.c b/src/sljit/sljitNativeLOONGARCH_64.c index 2e1d742ae..1d3f66cd1 100644 --- a/src/sljit/sljitNativeLOONGARCH_64.c +++ b/src/sljit/sljitNativeLOONGARCH_64.c @@ -250,6 +250,9 @@ lower parts in the instruction word, denoted by the “L” and “H” suffixes #define AMCAS_W OPC_3R(0x70B2) #define AMCAS_D OPC_3R(0x70B3) +/* Memory barrier instructions */ +#define DBAR OPC_3R(0x70e4) + /* Other instructions */ #define BREAK OPC_3R(0x54) #define DBGCALL OPC_3R(0x55) @@ -348,6 +351,7 @@ lower parts in the instruction word, denoted by the “L” and “H” suffixes #define VREPLGR2VR OPC_2R(0x1ca7c0) #define VREPLVE OPC_3R(0xe244) #define VREPLVEI OPC_2R(0x1cbde0) +#define VSHUF_B OPC_4R(0xd5) #define XVPERMI OPC_2RI8(0x1dfa) #define I12_MAX (0x7ff) @@ -386,6 +390,8 @@ static sljit_u32 hwcap_feature_list = 0; #define GET_CFG2 0 #define GET_HWCAP 1 +#define LOONGARCH_SUPPORT_AMCAS (LOONGARCH_CFG2_LAMCAS & get_cpu_features(GET_CFG2)) + static SLJIT_INLINE sljit_u32 get_cpu_features(sljit_u32 feature_type) { if (cfg2_feature_list == 0) @@ 
-804,9 +810,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) case SLJIT_HAS_SIMD: return (LOONGARCH_HWCAP_LSX & get_cpu_features(GET_HWCAP)); - case SLJIT_HAS_ATOMIC: - return (LOONGARCH_CFG2_LAMCAS & get_cpu_features(GET_CFG2)); - case SLJIT_HAS_CLZ: case SLJIT_HAS_CTZ: case SLJIT_HAS_REV: @@ -814,6 +817,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) case SLJIT_HAS_PREFETCH: case SLJIT_HAS_COPY_F32: case SLJIT_HAS_COPY_F64: + case SLJIT_HAS_ATOMIC: + case SLJIT_HAS_MEMORY_BARRIER: return 1; default: @@ -889,16 +894,20 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw); SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 i, tmp, offset; sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options); CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 1); local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64); @@ -973,13 +982,18 @@ 
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi #undef STACK_MAX_DISTANCE SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); + CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + set_set_context(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1); local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64); @@ -1884,6 +1898,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile return push_inst(compiler, ((op & SLJIT_32)? 
DIV_WU: DIV_DU) | RD(SLJIT_R0) | RJ(SLJIT_R0) | RK(SLJIT_R1)); case SLJIT_DIV_SW: return push_inst(compiler, INST(DIV, op) | RD(SLJIT_R0) | RJ(SLJIT_R0) | RK(SLJIT_R1)); + case SLJIT_MEMORY_BARRIER: + return push_inst(compiler, DBAR); case SLJIT_ENDBR: case SLJIT_SKIP_FRAMES_BEFORE_RETURN: return SLJIT_SUCCESS; @@ -2644,10 +2660,8 @@ static sljit_ins get_jump_instruction(sljit_s32 type) { switch (type) { case SLJIT_EQUAL: - case SLJIT_ATOMIC_NOT_STORED: return BNE | RJ(EQUAL_FLAG) | RD(TMP_ZERO); case SLJIT_NOT_EQUAL: - case SLJIT_ATOMIC_STORED: return BEQ | RJ(EQUAL_FLAG) | RD(TMP_ZERO); case SLJIT_LESS: case SLJIT_GREATER: @@ -2655,6 +2669,7 @@ static sljit_ins get_jump_instruction(sljit_s32 type) case SLJIT_SIG_GREATER: case SLJIT_OVERFLOW: case SLJIT_CARRY: + case SLJIT_ATOMIC_STORED: return BEQ | RJ(OTHER_FLAG) | RD(TMP_ZERO); case SLJIT_GREATER_EQUAL: case SLJIT_LESS_EQUAL: @@ -2662,6 +2677,7 @@ static sljit_ins get_jump_instruction(sljit_s32 type) case SLJIT_SIG_LESS_EQUAL: case SLJIT_NOT_OVERFLOW: case SLJIT_NOT_CARRY: + case SLJIT_ATOMIC_NOT_STORED: return BNE | RJ(OTHER_FLAG) | RD(TMP_ZERO); case SLJIT_F_EQUAL: case SLJIT_ORDERED_EQUAL: @@ -2933,7 +2949,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co break; case SLJIT_ATOMIC_STORED: case SLJIT_ATOMIC_NOT_STORED: - FAIL_IF(push_inst(compiler, SLTUI | RD(dst_r) | RJ(EQUAL_FLAG) | IMM_I12(1))); + FAIL_IF(push_inst(compiler, SLTUI | RD(dst_r) | RJ(OTHER_FLAG) | IMM_I12(1))); src_r = dst_r; invert ^= 0x1; break; @@ -3162,14 +3178,14 @@ static sljit_s32 sljit_emit_simd_mem_offset(struct sljit_compiler *compiler, slj } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_ins ins = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); + 
CHECK(check_sljit_emit_simd_mov(compiler, type, vreg, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -3184,9 +3200,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co if (!(srcdst & SLJIT_MEM)) { if (type & SLJIT_SIMD_STORE) - ins = FRD(srcdst) | FRJ(freg) | FRK(freg); + ins = FRD(srcdst) | FRJ(vreg) | FRK(vreg); else - ins = FRD(freg) | FRJ(srcdst) | FRK(srcdst); + ins = FRD(vreg) | FRJ(srcdst) | FRK(srcdst); if (reg_size == 5) ins |= VOR_V | (sljit_ins)1 << 26; @@ -3202,15 +3218,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co ins = (type & SLJIT_SIMD_STORE) ? XVST : XVLD; if (FAST_IS_REG(srcdst) && srcdst >= 0 && (srcdstw >= I12_MIN && srcdstw <= I12_MAX)) - return push_inst(compiler, ins | FRD(freg) | RJ((sljit_u8)srcdst) | IMM_I12(srcdstw)); + return push_inst(compiler, ins | FRD(vreg) | RJ((sljit_u8)srcdst) | IMM_I12(srcdstw)); else { FAIL_IF(sljit_emit_simd_mem_offset(compiler, &srcdst, srcdstw)); - return push_inst(compiler, ins | FRD(freg) | RJ(srcdst) | IMM_I12(0)); + return push_inst(compiler, ins | FRD(vreg) | RJ(srcdst) | IMM_I12(0)); } } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3218,7 +3234,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil sljit_ins ins = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_replicate(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -3237,7 +3253,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 5) ins = (sljit_ins)1 << 25; - return push_inst(compiler, VLDREPL | ins | FRD(freg) | RJ(src) | (sljit_ins)1 << (23 - elem_size)); + return push_inst(compiler, VLDREPL | ins | 
FRD(vreg) | RJ(src) | (sljit_ins)1 << (23 - elem_size)); } if (reg_size == 5) @@ -3245,13 +3261,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (type & SLJIT_SIMD_FLOAT) { if (src == SLJIT_IMM) - return push_inst(compiler, VREPLGR2VR | ins | FRD(freg) | RJ(TMP_ZERO) | (sljit_ins)elem_size << 10); + return push_inst(compiler, VREPLGR2VR | ins | FRD(vreg) | RJ(TMP_ZERO) | (sljit_ins)elem_size << 10); - FAIL_IF(push_inst(compiler, VREPLVE | ins | FRD(freg) | FRJ(src) | RK(TMP_ZERO) | (sljit_ins)elem_size << 15)); + FAIL_IF(push_inst(compiler, VREPLVE | ins | FRD(vreg) | FRJ(src) | RK(TMP_ZERO) | (sljit_ins)elem_size << 15)); if (reg_size == 5) { ins = (sljit_ins)(0x44 << 10); - return push_inst(compiler, XVPERMI | ins | FRD(freg) | FRJ(freg)); + return push_inst(compiler, XVPERMI | ins | FRD(vreg) | FRJ(vreg)); } return SLJIT_SUCCESS; @@ -3264,11 +3280,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil src = TMP_REG2; } - return push_inst(compiler, ins | FRD(freg) | RJ(src)); + return push_inst(compiler, ins | FRD(vreg) | RJ(src)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 vreg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3276,7 +3292,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile sljit_ins ins = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, vreg, lane_index, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -3298,13 +3314,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (type & SLJIT_SIMD_LANE_ZERO) { ins = (reg_size == 5) ? 
((sljit_ins)1 << 26) : 0; - if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) { - FAIL_IF(push_inst(compiler, VOR_V | ins | FRD(TMP_FREG1) | FRJ(freg) | FRK(freg))); + if ((type & SLJIT_SIMD_FLOAT) && vreg == srcdst) { + FAIL_IF(push_inst(compiler, VOR_V | ins | FRD(TMP_FREG1) | FRJ(vreg) | FRK(vreg))); srcdst = TMP_FREG1; srcdstw = 0; } - FAIL_IF(push_inst(compiler, VXOR_V | ins | FRD(freg) | FRJ(freg) | FRK(freg))); + FAIL_IF(push_inst(compiler, VXOR_V | ins | FRD(vreg) | FRJ(vreg) | FRK(vreg))); } if (srcdst & SLJIT_MEM) { @@ -3315,7 +3331,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (type & SLJIT_SIMD_STORE) { ins |= (sljit_ins)lane_index << 18 | (sljit_ins)(1 << (23 - elem_size)); - return push_inst(compiler, VSTELM | ins | FRD(freg) | RJ(srcdst)); + return push_inst(compiler, VSTELM | ins | FRD(vreg) | RJ(srcdst)); } else { emit_op_mem(compiler, (elem_size == 3 ? WORD_DATA : (elem_size == 2 ? INT_DATA : (elem_size == 1 ? HALF_DATA : BYTE_DATA))) | LOAD_DATA, TMP_REG1, srcdst | SLJIT_MEM, 0); srcdst = TMP_REG1; @@ -3323,20 +3339,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (reg_size == 5) { if (elem_size < 2) { - FAIL_IF(push_inst(compiler, VOR_V | (sljit_ins)1 << 26 | FRD(TMP_FREG1) | FRJ(freg) | FRK(freg))); + FAIL_IF(push_inst(compiler, VOR_V | (sljit_ins)1 << 26 | FRD(TMP_FREG1) | FRJ(vreg) | FRK(vreg))); if (lane_index >= (2 << (3 - elem_size))) { - FAIL_IF(push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(TMP_FREG1) | FRJ(freg) | IMM_I8(1))); + FAIL_IF(push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(TMP_FREG1) | FRJ(vreg) | IMM_I8(1))); FAIL_IF(push_inst(compiler, VINSGR2VR | ins | FRD(TMP_FREG1) | RJ(srcdst) | IMM_V(lane_index % (2 << (3 - elem_size))))); - return push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(freg) | FRJ(TMP_FREG1) | IMM_I8(2)); + return push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(vreg) | FRJ(TMP_FREG1) | IMM_I8(2)); } 
else { - FAIL_IF(push_inst(compiler, VINSGR2VR | ins | FRD(freg) | RJ(srcdst) | IMM_V(lane_index))); - return push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(freg) | FRJ(TMP_FREG1) | IMM_I8(18)); + FAIL_IF(push_inst(compiler, VINSGR2VR | ins | FRD(vreg) | RJ(srcdst) | IMM_V(lane_index))); + return push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(vreg) | FRJ(TMP_FREG1) | IMM_I8(18)); } } else ins = (sljit_ins)(0x3f ^ (0x3f >> elem_size)) << 10 | (sljit_ins)1 << 26; } - return push_inst(compiler, VINSGR2VR | ins | FRD(freg) | RJ(srcdst) | IMM_V(lane_index)); + return push_inst(compiler, VINSGR2VR | ins | FRD(vreg) | RJ(srcdst) | IMM_V(lane_index)); } } @@ -3344,11 +3360,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile ins = (reg_size == 5) ? (sljit_ins)(0x3f ^ (0x3f >> elem_size)) << 10 | (sljit_ins)1 << 26 : (sljit_ins)(0x3f ^ (0x1f >> elem_size)) << 10; if (type & SLJIT_SIMD_STORE) { - FAIL_IF(push_inst(compiler, VPICKVE2GR_U | ins | RD(TMP_REG1) | FRJ(freg) | IMM_V(lane_index))); + FAIL_IF(push_inst(compiler, VPICKVE2GR_U | ins | RD(TMP_REG1) | FRJ(vreg) | IMM_V(lane_index))); return push_inst(compiler, VINSGR2VR | ins | FRD(srcdst) | RJ(TMP_REG1) | IMM_V(0)); } else { FAIL_IF(push_inst(compiler, VPICKVE2GR_U | ins | RD(TMP_REG1) | FRJ(srcdst) | IMM_V(0))); - return push_inst(compiler, VINSGR2VR | ins | FRD(freg) | RJ(TMP_REG1) | IMM_V(lane_index)); + return push_inst(compiler, VINSGR2VR | ins | FRD(vreg) | RJ(TMP_REG1) | IMM_V(lane_index)); } } @@ -3373,8 +3389,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile else ins |= VPICKVE2GR_U; - FAIL_IF(push_inst(compiler, VOR_V | (sljit_ins)1 << 26 | FRD(TMP_FREG1) | FRJ(freg) | FRK(freg))); - FAIL_IF(push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(TMP_FREG1) | FRJ(freg) | IMM_I8(1))); + FAIL_IF(push_inst(compiler, VOR_V | (sljit_ins)1 << 26 | FRD(TMP_FREG1) | FRJ(vreg) | FRK(vreg))); + FAIL_IF(push_inst(compiler, XVPERMI | 
(sljit_ins)1 << 18 | FRD(TMP_FREG1) | FRJ(vreg) | IMM_I8(1))); return push_inst(compiler, ins | RD(srcdst) | FRJ(TMP_FREG1) | IMM_V(lane_index % (2 << (3 - elem_size)))); } } else { @@ -3383,33 +3399,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } } - return push_inst(compiler, ins | RD(srcdst) | FRJ(freg) | IMM_V(lane_index)); + return push_inst(compiler, ins | RD(srcdst) | FRJ(vreg) | IMM_V(lane_index)); } else { ins = (sljit_ins)(0x3f ^ (0x1f >> elem_size)) << 10; if (reg_size == 5) { if (elem_size < 2) { - FAIL_IF(push_inst(compiler, VOR_V | (sljit_ins)1 << 26 | FRD(TMP_FREG1) | FRJ(freg) | FRK(freg))); + FAIL_IF(push_inst(compiler, VOR_V | (sljit_ins)1 << 26 | FRD(TMP_FREG1) | FRJ(vreg) | FRK(vreg))); if (lane_index >= (2 << (3 - elem_size))) { - FAIL_IF(push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(TMP_FREG1) | FRJ(freg) | IMM_I8(1))); + FAIL_IF(push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(TMP_FREG1) | FRJ(vreg) | IMM_I8(1))); FAIL_IF(push_inst(compiler, VINSGR2VR | ins | FRD(TMP_FREG1) | RJ(srcdst) | IMM_V(lane_index % (2 << (3 - elem_size))))); - return push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(freg) | FRJ(TMP_FREG1) | IMM_I8(2)); + return push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(vreg) | FRJ(TMP_FREG1) | IMM_I8(2)); } else { - FAIL_IF(push_inst(compiler, VINSGR2VR | ins | FRD(freg) | RJ(srcdst) | IMM_V(lane_index))); - return push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(freg) | FRJ(TMP_FREG1) | IMM_I8(18)); + FAIL_IF(push_inst(compiler, VINSGR2VR | ins | FRD(vreg) | RJ(srcdst) | IMM_V(lane_index))); + return push_inst(compiler, XVPERMI | (sljit_ins)1 << 18 | FRD(vreg) | FRJ(TMP_FREG1) | IMM_I8(18)); } } else ins = (sljit_ins)(0x3f ^ (0x3f >> elem_size)) << 10 | (sljit_ins)1 << 26; } - return push_inst(compiler, VINSGR2VR | ins | FRD(freg) | RJ(srcdst) | IMM_V(lane_index)); + return push_inst(compiler, VINSGR2VR | ins | FRD(vreg) | RJ(srcdst) | 
IMM_V(lane_index)); } return SLJIT_ERR_UNSUPPORTED; } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_s32 src_lane_index) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3417,7 +3433,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c sljit_ins ins = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index)); + CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, vreg, src, src_lane_index)); if (reg_size != 5 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -3431,18 +3447,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c ins = (sljit_ins)(0x3f ^ (0x1f >> elem_size)) << 10; if (reg_size == 5) { - FAIL_IF(push_inst(compiler, VREPLVEI | (sljit_ins)1 << 26 | ins | FRD(freg) | FRJ(src) | IMM_V(src_lane_index % (2 << (3 - elem_size))))); + FAIL_IF(push_inst(compiler, VREPLVEI | (sljit_ins)1 << 26 | ins | FRD(vreg) | FRJ(src) | IMM_V(src_lane_index % (2 << (3 - elem_size))))); ins = (src_lane_index < (2 << (3 - elem_size))) ? 
(sljit_ins)(0x44 << 10) : (sljit_ins)(0xee << 10); - return push_inst(compiler, XVPERMI | ins | FRD(freg) | FRJ(freg)); + return push_inst(compiler, XVPERMI | ins | FRD(vreg) | FRJ(vreg)); } - return push_inst(compiler, VREPLVEI | ins | FRD(freg) | FRJ(src) | IMM_V(src_lane_index)); + return push_inst(compiler, VREPLVEI | ins | FRD(vreg) | FRJ(src) | IMM_V(src_lane_index)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3451,7 +3467,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler sljit_ins ins = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_extend(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -3471,12 +3487,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler ins = (type & SLJIT_SIMD_STORE) ? XVST : XVLD; if (FAST_IS_REG(src) && src >= 0 && (srcw >= I12_MIN && srcw <= I12_MAX)) - FAIL_IF(push_inst(compiler, ins | FRD(freg) | RJ(src) | IMM_I12(srcw))); + FAIL_IF(push_inst(compiler, ins | FRD(vreg) | RJ(src) | IMM_I12(srcw))); else { FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src, srcw)); - FAIL_IF(push_inst(compiler, ins | FRD(freg) | RJ(src) | IMM_I12(0))); + FAIL_IF(push_inst(compiler, ins | FRD(vreg) | RJ(src) | IMM_I12(0))); } - src = freg; + src = vreg; } if (type & SLJIT_SIMD_FLOAT) { @@ -3489,7 +3505,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler FAIL_IF(push_inst(compiler, XVPERMI | FRD(src) | FRJ(src) | IMM_I8(16))); } - return push_inst(compiler, VFCVTL_D_S | ins | FRD(freg) | FRJ(src)); + return push_inst(compiler, VFCVTL_D_S | ins | FRD(vreg) | FRJ(src)); } ins = (type & SLJIT_SIMD_EXTEND_SIGNED) ? 
VSLLWIL : (VSLLWIL | (sljit_ins)1 << 18); @@ -3501,15 +3517,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler if (reg_size == 5) FAIL_IF(push_inst(compiler, XVPERMI | FRD(src) | FRJ(src) | IMM_I8(16))); - FAIL_IF(push_inst(compiler, ins | ((sljit_ins)1 << (13 + elem_size)) | FRD(freg) | FRJ(src))); - src = freg; + FAIL_IF(push_inst(compiler, ins | ((sljit_ins)1 << (13 + elem_size)) | FRD(vreg) | FRJ(src))); + src = vreg; } while (++elem_size < elem2_size); return SLJIT_SUCCESS; } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 dst, sljit_sw dstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3518,7 +3534,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c sljit_s32 dst_r; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw)); + CHECK(check_sljit_emit_simd_sign(compiler, type, vreg, dst, dstw)); ADJUST_LOCAL_OFFSET(dst, dstw); @@ -3539,7 +3555,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c if (reg_size == 5) ins = (sljit_ins)1 << 26; - FAIL_IF(push_inst(compiler, VMSKLTZ | ins | (sljit_ins)(elem_size << 10) | FRD(TMP_FREG1) | FRJ(freg))); + FAIL_IF(push_inst(compiler, VMSKLTZ | ins | (sljit_ins)(elem_size << 10) | FRD(TMP_FREG1) | FRJ(vreg))); FAIL_IF(push_inst(compiler, VPICKVE2GR_U | (sljit_ins)(0x3c << 10) | RD(dst_r) | FRJ(TMP_FREG1))); @@ -3556,14 +3572,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); sljit_ins ins = 0; 
CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_vreg, src1_vreg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); if (reg_size != 5 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -3577,6 +3594,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co if (type & SLJIT_SIMD_TEST) return SLJIT_SUCCESS; + if (src2 & SLJIT_MEM) { + FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w)); + FAIL_IF(push_inst(compiler, (reg_size == 4 ? VLD : XVLD) | FRD(TMP_FREG1) | RJ(src2) | IMM_I12(0))); + src2 = TMP_FREG1; + } + switch (SLJIT_SIMD_GET_OPCODE(type)) { case SLJIT_SIMD_OP2_AND: ins = VAND_V; @@ -3587,12 +3610,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co case SLJIT_SIMD_OP2_XOR: ins = VXOR_V; break; + case SLJIT_SIMD_OP2_SHUFFLE: + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + return push_inst(compiler, VSHUF_B | FRD(dst_vreg) | FRJ(src1_vreg) | FRK(src1_vreg) | FRA(src2)); } if (reg_size == 5) ins |= (sljit_ins)1 << 26; - return push_inst(compiler, ins | FRD(dst_freg) | FRJ(src1_freg) | FRK(src2_freg)); + return push_inst(compiler, ins | FRD(dst_vreg) | FRJ(src1_vreg) | FRK(src2)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, @@ -3605,14 +3633,45 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler CHECK_ERROR(); CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); + if ((op & SLJIT_ATOMIC_USE_LS) || !LOONGARCH_SUPPORT_AMCAS) { + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + + switch (GET_OPCODE(op)) { + case SLJIT_MOV: + case SLJIT_MOV_P: + ins = LL_D; + break; + case SLJIT_MOV_S32: + case SLJIT_MOV32: + ins = LL_W; + break; + + default: + return SLJIT_ERR_UNSUPPORTED; + } + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + return push_inst(compiler, ins | 
RD(dst_reg) | RJ(mem_reg)); + } + switch(GET_OPCODE(op)) { + case SLJIT_MOV_S8: + ins = LD_B; + break; case SLJIT_MOV_U8: ins = LD_BU; break; + case SLJIT_MOV_S16: + ins = LD_H; + break; case SLJIT_MOV_U16: ins = LD_HU; break; case SLJIT_MOV32: + case SLJIT_MOV_S32: ins = LD_W; break; case SLJIT_MOV_U32: @@ -3623,6 +3682,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler break; } + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + return push_inst(compiler, ins | RD(dst_reg) | RJ(mem_reg) | IMM_I12(0)); } @@ -3639,16 +3701,48 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler CHECK_ERROR(); CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg)); + if ((op & SLJIT_ATOMIC_USE_LS) || !LOONGARCH_SUPPORT_AMCAS) { + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + + switch (GET_OPCODE(op)) { + case SLJIT_MOV: + case SLJIT_MOV_P: + ins = SC_D; + break; + case SLJIT_MOV_S32: + case SLJIT_MOV32: + ins = SC_W; + break; + + default: + return SLJIT_ERR_UNSUPPORTED; + } + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + FAIL_IF(push_inst(compiler, ADD_D | RD(OTHER_FLAG) | RJ(src_reg) | RK(TMP_ZERO))); + return push_inst(compiler, ins | RD(OTHER_FLAG) | RJ(mem_reg)); + } + switch (GET_OPCODE(op)) { + case SLJIT_MOV_S8: + ins = AMCAS_B; + break; case SLJIT_MOV_U8: ins = AMCAS_B; unsign = BSTRPICK_D | (7 << 16); break; + case SLJIT_MOV_S16: + ins = AMCAS_H; + break; case SLJIT_MOV_U16: ins = AMCAS_H; unsign = BSTRPICK_D | (15 << 16); break; case SLJIT_MOV32: + case SLJIT_MOV_S32: ins = AMCAS_W; break; case SLJIT_MOV_U32: @@ -3660,9 +3754,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler break; } + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + if (op & SLJIT_SET_ATOMIC_STORED) { - FAIL_IF(push_inst(compiler, XOR | RD(TMP_REG1) | RJ(temp_reg) | RK(TMP_ZERO))); - tmp = TMP_REG1; + FAIL_IF(push_inst(compiler, XOR | 
RD(TMP_REG3) | RJ(temp_reg) | RK(TMP_ZERO))); + tmp = TMP_REG3; } FAIL_IF(push_inst(compiler, ins | RD(tmp) | RJ(mem_reg) | RK(src_reg))); if (!(op & SLJIT_SET_ATOMIC_STORED)) @@ -3671,8 +3768,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler if (unsign) FAIL_IF(push_inst(compiler, unsign | RD(tmp) | RJ(tmp))); - FAIL_IF(push_inst(compiler, XOR | RD(EQUAL_FLAG) | RJ(tmp) | RK(temp_reg))); - return push_inst(compiler, SLTUI | RD(EQUAL_FLAG) | RJ(EQUAL_FLAG) | IMM_I12(1)); + FAIL_IF(push_inst(compiler, XOR | RD(OTHER_FLAG) | RJ(tmp) | RK(temp_reg))); + return push_inst(compiler, SLTUI | RD(OTHER_FLAG) | RJ(OTHER_FLAG) | IMM_I12(1)); } static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value, sljit_ins last_ins) diff --git a/src/sljit/sljitNativeMIPS_common.c b/src/sljit/sljitNativeMIPS_common.c index 88eb30b7f..3f0a26585 100644 --- a/src/sljit/sljitNativeMIPS_common.c +++ b/src/sljit/sljitNativeMIPS_common.c @@ -249,6 +249,8 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = { #define LDL (HI(26)) #define LDR (HI(27)) #define LDC1 (HI(53)) +#define LL (HI(48)) +#define LLD (HI(52)) #define LUI (HI(15)) #define LW (HI(35)) #define LWL (HI(34)) @@ -288,6 +290,8 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = { #define ROTR (HI(0) | (1 << 21) | LO(2)) #define ROTRV (HI(0) | (1 << 6) | LO(6)) #endif /* SLJIT_MIPS_REV >= 2 */ +#define SC (HI(56)) +#define SCD (HI(60)) #define SD (HI(63)) #define SDL (HI(44)) #define SDR (HI(45)) @@ -308,6 +312,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = { #define SWL (HI(42)) #define SWR (HI(46)) #define SWC1 (HI(57)) +#define SYNC (HI(0) | LO(15)) #define TRUNC_W_S (HI(17) | FMT_S | LO(13)) #if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2 #define WSBH (HI(31) | (2 << 6) | LO(32)) @@ -857,6 +862,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 
feature_type) case SLJIT_HAS_CLZ: case SLJIT_HAS_CMOV: case SLJIT_HAS_PREFETCH: + case SLJIT_HAS_ATOMIC: + case SLJIT_HAS_MEMORY_BARRIER: return 1; case SLJIT_HAS_CTZ: @@ -928,17 +935,22 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit #endif SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_ins base; sljit_s32 i, tmp, offset; sljit_s32 arg_count, word_arg_count, float_arg_count; sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options); CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); + + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 1); #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) @@ -1138,12 +1150,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = 
ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); + CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + set_set_context(compiler, options, arg_types, scratches, saveds, local_size); + + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1); #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) @@ -2462,6 +2480,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile FAIL_IF(push_inst(compiler, MFLO | D(SLJIT_R0), DR(SLJIT_R0))); return (op >= SLJIT_DIV_UW) ? SLJIT_SUCCESS : push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1)); #endif /* SLJIT_MIPS_REV >= 6 */ + case SLJIT_MEMORY_BARRIER: +#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1) + return push_inst(compiler, SYNC, UNMOVABLE_INS); +#else /* SLJIT_MIPS_REV < 1 */ + return SLJIT_ERR_UNSUPPORTED; +#endif /* SLJIT_MIPS_REV >= 1 */ case SLJIT_ENDBR: case SLJIT_SKIP_FRAMES_BEFORE_RETURN: return SLJIT_SUCCESS; @@ -3312,6 +3336,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compile case SLJIT_SIG_GREATER: case SLJIT_OVERFLOW: case SLJIT_CARRY: + case SLJIT_ATOMIC_STORED: BR_Z(OTHER_FLAG); break; case SLJIT_GREATER_EQUAL: @@ -3320,6 +3345,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compile case SLJIT_SIG_LESS_EQUAL: case SLJIT_NOT_OVERFLOW: case SLJIT_NOT_CARRY: + case SLJIT_ATOMIC_NOT_STORED: BR_NZ(OTHER_FLAG); break; case SLJIT_F_NOT_EQUAL: @@ -4209,6 +4235,80 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil #undef TO_ARGW_HI +SLJIT_API_FUNC_ATTRIBUTE 
sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 dst_reg, + sljit_s32 mem_reg) +{ + sljit_ins ins; + + CHECK_ERROR(); + CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); + + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + + switch (GET_OPCODE(op)) { + case SLJIT_MOV: + case SLJIT_MOV_P: +#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64) + ins = LLD; + break; +#endif /* SLJIT_CONFIG_MIPS_64 */ + case SLJIT_MOV_S32: + case SLJIT_MOV32: + ins = LL; + break; + + default: + return SLJIT_ERR_UNSUPPORTED; + } + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + return push_inst(compiler, ins | T(dst_reg) | S(mem_reg), DR(dst_reg)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 src_reg, + sljit_s32 mem_reg, + sljit_s32 temp_reg) +{ + sljit_ins ins; + + /* temp_reg == mem_reg is undefined so use another temp register */ + SLJIT_UNUSED_ARG(temp_reg); + + CHECK_ERROR(); + CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg)); + + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + + switch (GET_OPCODE(op)) { + case SLJIT_MOV: + case SLJIT_MOV_P: +#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64) + ins = SCD; + break; +#endif /* SLJIT_CONFIG_RISCV_64 */ + case SLJIT_MOV_S32: + case SLJIT_MOV32: + op |= SLJIT_32; + ins = SC; + break; + + default: + return SLJIT_ERR_UNSUPPORTED; + } + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src_reg) | TA(0) | DA(OTHER_FLAG), OTHER_FLAG)); + return push_inst(compiler, ins | TA(OTHER_FLAG) | S(mem_reg), OTHER_FLAG); +} + SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value) { struct sljit_const *const_; diff --git a/src/sljit/sljitNativePPC_common.c 
b/src/sljit/sljitNativePPC_common.c index 1f17d9042..8c3e4224c 100644 --- a/src/sljit/sljitNativePPC_common.c +++ b/src/sljit/sljitNativePPC_common.c @@ -187,10 +187,12 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define LD (HI(58) | 0) #define LFD (HI(50)) #define LFS (HI(48)) +#define LDARX (HI(31) | LO(84)) #if defined(_ARCH_PWR7) && _ARCH_PWR7 #define LDBRX (HI(31) | LO(532)) #endif /* POWER7 */ #define LHBRX (HI(31) | LO(790)) +#define LWARX (HI(31) | LO(20)) #define LWBRX (HI(31) | LO(534)) #define LWZ (HI(32)) #define MFCR (HI(31) | LO(19)) @@ -231,6 +233,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #if defined(_ARCH_PWR7) && _ARCH_PWR7 #define STDBRX (HI(31) | LO(660)) #endif /* POWER7 */ +#define STDCX (HI(31) | LO(214)) #define STDU (HI(62) | 1) #define STDUX (HI(31) | LO(181)) #define STFD (HI(54)) @@ -239,12 +242,14 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define STHBRX (HI(31) | LO(918)) #define STW (HI(36)) #define STWBRX (HI(31) | LO(662)) +#define STWCX (HI(31) | LO(150)) #define STWU (HI(37)) #define STWUX (HI(31) | LO(183)) #define SUBF (HI(31) | LO(40)) #define SUBFC (HI(31) | LO(8)) #define SUBFE (HI(31) | LO(136)) #define SUBFIC (HI(8)) +#define SYNC (HI(31) | LO(598)) #define XOR (HI(31) | LO(316)) #define XORI (HI(26)) #define XORIS (HI(27)) @@ -748,6 +753,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) case SLJIT_HAS_CLZ: case SLJIT_HAS_ROT: case SLJIT_HAS_PREFETCH: + case SLJIT_HAS_ATOMIC: + case SLJIT_HAS_MEMORY_BARRIER: return 1; case SLJIT_HAS_CTZ: @@ -845,9 +852,11 @@ static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 inp_flag #define STACK_MAX_DISTANCE (0x8000 - SSIZE_OF(sw) - LR_SAVE_OFFSET) SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, 
sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 i, tmp, base, offset; sljit_s32 word_arg_count = 0; sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options); @@ -856,9 +865,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi #endif CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 0) + GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64); @@ -962,13 +973,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); + CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + 
set_set_context(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 0) + GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, f64); @@ -1399,6 +1415,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile #else return push_inst(compiler, (op == SLJIT_DIV_UW ? DIVWU : DIVW) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1)); #endif + case SLJIT_MEMORY_BARRIER: + return push_inst(compiler, SYNC); case SLJIT_ENDBR: case SLJIT_SKIP_FRAMES_BEFORE_RETURN: return SLJIT_SUCCESS; @@ -2422,6 +2440,7 @@ static sljit_ins get_bo_bi_flags(struct sljit_compiler *compiler, sljit_s32 type /* fallthrough */ case SLJIT_EQUAL: + case SLJIT_ATOMIC_STORED: return (12 << 21) | (2 << 16); case SLJIT_CARRY: @@ -2430,6 +2449,7 @@ static sljit_ins get_bo_bi_flags(struct sljit_compiler *compiler, sljit_s32 type /* fallthrough */ case SLJIT_NOT_EQUAL: + case SLJIT_ATOMIC_NOT_STORED: return (4 << 21) | (2 << 16); case SLJIT_LESS: @@ -2686,10 +2706,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co break; case SLJIT_EQUAL: + case SLJIT_ATOMIC_STORED: bit = 2; break; case SLJIT_NOT_EQUAL: + case SLJIT_ATOMIC_NOT_STORED: bit = 2; invert = 1; break; @@ -3106,6 +3128,78 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler return push_inst(compiler, INST_CODE_AND_DST(inst, DOUBLE_DATA, freg) | A(mem & REG_MASK) | IMM(memw)); } +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 dst_reg, + sljit_s32 mem_reg) +{ + sljit_ins ins; + + CHECK_ERROR(); + CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); + + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + + switch (GET_OPCODE(op)) { + case SLJIT_MOV: + case SLJIT_MOV_P: +#if (defined 
SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) + ins = LDARX; + break; +#endif /* SLJIT_CONFIG_RISCV_64 */ + case SLJIT_MOV_U32: + case SLJIT_MOV32: + ins = LWARX; + break; + + default: + return SLJIT_ERR_UNSUPPORTED; + } + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + return push_inst(compiler, ins | D(dst_reg) | B(mem_reg)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 src_reg, + sljit_s32 mem_reg, + sljit_s32 temp_reg) +{ + sljit_ins ins; + + /* temp_reg == mem_reg is undefined so use another temp register */ + SLJIT_UNUSED_ARG(temp_reg); + + CHECK_ERROR(); + CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg)); + + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + + switch (GET_OPCODE(op)) { + case SLJIT_MOV: + case SLJIT_MOV_P: +#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) + ins = STDCX | 0x1; + break; +#endif /* SLJIT_CONFIG_RISCV_64 */ + case SLJIT_MOV_U32: + case SLJIT_MOV32: + ins = STWCX | 0x1; + break; + + default: + return SLJIT_ERR_UNSUPPORTED; + } + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + return push_inst(compiler, ins | D(src_reg) | B(mem_reg)); +} + SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value) { struct sljit_const *const_; diff --git a/src/sljit/sljitNativeRISCV_common.c b/src/sljit/sljitNativeRISCV_common.c index d86100a80..e487b79b8 100644 --- a/src/sljit/sljitNativeRISCV_common.c +++ b/src/sljit/sljitNativeRISCV_common.c @@ -50,6 +50,9 @@ typedef sljit_u32 sljit_ins; #define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) #define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) +#define TMP_VREG1 (SLJIT_NUMBER_OF_VECTOR_REGISTERS + 1) +#define TMP_VREG2 (SLJIT_NUMBER_OF_VECTOR_REGISTERS + 2) + static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 7] = { 0, 10, 11, 12, 13, 14, 15, 16, 17, 
29, 30, 31, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 9, 8, 2, 6, 1, 7, 5, 28 }; @@ -58,6 +61,10 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { 0, 10, 11, 12, 13, 14, 15, 16, 17, 2, 3, 4, 5, 6, 7, 28, 29, 30, 31, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 9, 8, 0, 1, }; +static const sljit_u8 vreg_map[SLJIT_NUMBER_OF_VECTOR_REGISTERS + 3] = { + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +}; + /* --------------------------------------------------------------------- */ /* Instrucion forms */ /* --------------------------------------------------------------------- */ @@ -68,6 +75,9 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define FRD(rd) ((sljit_ins)freg_map[rd] << 7) #define FRS1(rs1) ((sljit_ins)freg_map[rs1] << 15) #define FRS2(rs2) ((sljit_ins)freg_map[rs2] << 20) +#define VRD(rd) ((sljit_ins)vreg_map[rd] << 7) +#define VRS1(rs1) ((sljit_ins)vreg_map[rs1] << 15) +#define VRS2(rs2) ((sljit_ins)vreg_map[rs2] << 20) #define IMM_I(imm) ((sljit_ins)(imm) << 20) #define IMM_S(imm) ((((sljit_ins)(imm) & 0xfe0) << 20) | (((sljit_ins)(imm) & 0x1f) << 7)) @@ -77,6 +87,15 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define F12(f) ((sljit_ins)(f) << 20) #define F7(f) ((sljit_ins)(f) << 25) +/* Vector instruction types. 
*/ +#define OPFVF (F3(0x5) | OPC(0x57)) +#define OPFVV (F3(0x1) | OPC(0x57)) +#define OPIVI (F3(0x3) | OPC(0x57)) +#define OPIVV (F3(0x0) | OPC(0x57)) +#define OPIVX (F3(0x4) | OPC(0x57)) +#define OPMVV (F3(0x2) | OPC(0x57)) +#define OPMVX (F3(0x6) | OPC(0x57)) + #define ADD (F7(0x0) | F3(0x0) | OPC(0x33)) #define ADDI (F3(0x0) | OPC(0x13)) #define AND (F7(0x0) | F3(0x7) | OPC(0x33)) @@ -88,11 +107,16 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define BGE (F3(0x5) | OPC(0x63)) #define BLTU (F3(0x6) | OPC(0x63)) #define BGEU (F3(0x7) | OPC(0x63)) +#if defined __riscv_zbb +#define CLZ (F7(0x30) | F3(0x1) | OPC(0x13)) +#define CTZ (F7(0x30) | F12(0x1) | F3(0x1) | OPC(0x13)) +#endif /* __riscv_zbb */ #define DIV (F7(0x1) | F3(0x4) | OPC(0x33)) #define DIVU (F7(0x1) | F3(0x5) | OPC(0x33)) #define EBREAK (F12(0x1) | F3(0x0) | OPC(0x73)) #define FADD_S (F7(0x0) | F3(0x7) | OPC(0x53)) #define FDIV_S (F7(0xc) | F3(0x7) | OPC(0x53)) +#define FENCE (F3(0x0) | OPC(0xf)) #define FEQ_S (F7(0x50) | F3(0x2) | OPC(0x53)) #define FLD (F3(0x3) | OPC(0x7)) #define FLE_S (F7(0x50) | F3(0x0) | OPC(0x53)) @@ -116,6 +140,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define LD (F3(0x3) | OPC(0x3)) #define LUI (OPC(0x37)) #define LW (F3(0x2) | OPC(0x3)) +#define LR (F7(0x8) | OPC(0x2f)) #define MUL (F7(0x1) | F3(0x0) | OPC(0x33)) #define MULH (F7(0x1) | F3(0x1) | OPC(0x33)) #define MULHU (F7(0x1) | F3(0x3) | OPC(0x33)) @@ -123,21 +148,73 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define ORI (F3(0x6) | OPC(0x13)) #define REM (F7(0x1) | F3(0x6) | OPC(0x33)) #define REMU (F7(0x1) | F3(0x7) | OPC(0x33)) +#if defined __riscv_zbb +#if defined SLJIT_CONFIG_RISCV_32 +#define REV8 (F12(0x698) | F3(0x5) | OPC(0x13)) +#elif defined SLJIT_CONFIG_RISCV_64 +#define REV8 (F12(0x6b8) | F3(0x5) | OPC(0x13)) +#endif +#define ROL (F7(0x30) | F3(0x1) | OPC(0x33)) +#define ROR (F7(0x30) | F3(0x5) | OPC(0x33)) 
+#define RORI (F7(0x30) | F3(0x5) | OPC(0x13)) +#endif /* __riscv_zbb */ +#define SC (F7(0xc) | OPC(0x2f)) #define SD (F3(0x3) | OPC(0x23)) +#if defined __riscv_zbb +#define SEXTB (F7(0x30) | F12(0x4) | F3(0x1) | OPC(0x13)) +#define SEXTH (F7(0x30) | F12(0x5) | F3(0x1) | OPC(0x13)) +#endif /* __riscv_zbb */ +#if defined __riscv_zba +#define SH1ADD (F7(0x10) | F3(0x2) | OPC(0x33)) +#define SH2ADD (F7(0x10) | F3(0x4) | OPC(0x33)) +#define SH3ADD (F7(0x10) | F3(0x6) | OPC(0x33)) +#endif /* __riscv_zba */ #define SLL (F7(0x0) | F3(0x1) | OPC(0x33)) -#define SLLI (IMM_I(0x0) | F3(0x1) | OPC(0x13)) +#define SLLI (F3(0x1) | OPC(0x13)) #define SLT (F7(0x0) | F3(0x2) | OPC(0x33)) #define SLTI (F3(0x2) | OPC(0x13)) #define SLTU (F7(0x0) | F3(0x3) | OPC(0x33)) #define SLTUI (F3(0x3) | OPC(0x13)) #define SRL (F7(0x0) | F3(0x5) | OPC(0x33)) -#define SRLI (IMM_I(0x0) | F3(0x5) | OPC(0x13)) +#define SRLI (F3(0x5) | OPC(0x13)) #define SRA (F7(0x20) | F3(0x5) | OPC(0x33)) -#define SRAI (IMM_I(0x400) | F3(0x5) | OPC(0x13)) +#define SRAI (F7(0x20) | F3(0x5) | OPC(0x13)) #define SUB (F7(0x20) | F3(0x0) | OPC(0x33)) #define SW (F3(0x2) | OPC(0x23)) +#define VAND_VV (F7(0x13) | OPIVV) +#define VFMV_FS (F7(0x21) | OPFVV) +#define VFMV_SF (F7(0x21) | OPFVF) +#define VFMV_VF (F7(0x2f) | OPFVF) +#define VFWCVT_FFV (F7(0x25) | (0xc << 15) | OPFVV) +#define VL (F7(0x1) | OPC(0x7)) +#define VMSLE_VI (F7(0x3b) | OPIVI) +#define VMV_SX (F7(0x21) | OPMVX) +#define VMV_VI (F7(0x2f) | OPIVI) +#define VMV_VV (F7(0x2f) | OPIVV) +#define VMV_VX (F7(0x2f) | OPIVX) +#define VMV_XS (F7(0x21) | OPMVV) +#define VOR_VV (F7(0x15) | OPIVV) +#define VSETIVLI (F7(0x60) | F3(0x7) | OPC(0x57)) +#define VS (F7(0x1) | OPC(0x27)) +#define VSLIDEDOWN_VX (F7(0x1f) | OPIVX) +#define VSLIDEDOWN_VI (F7(0x1f) | OPIVI) +#define VSLIDEUP_VX (F7(0x1d) | OPIVX) +#define VSLIDEUP_VI (F7(0x1d) | OPIVI) +#define VRGATHER_VI (F7(0x19) | OPIVI) +#define VRGATHER_VV (F7(0x19) | OPIVV) +#define VXOR_VV (F7(0x17) | OPIVV) +#define 
VZEXT_VF2 (F7(0x25) | (0x6 << 15) | OPMVV) +#define VZEXT_VF4 (F7(0x25) | (0x4 << 15) | OPMVV) +#define VZEXT_VF8 (F7(0x25) | (0x2 << 15) | OPMVV) #define XOR (F7(0x0) | F3(0x4) | OPC(0x33)) #define XORI (F3(0x4) | OPC(0x13)) +#if defined __riscv_zbb +#if defined SLJIT_CONFIG_RISCV_32 +#define ZEXTH (F7(0x4) | F3(0x4) | OPC(0x33)) +#elif defined SLJIT_CONFIG_RISCV_64 +#define ZEXTH (F7(0x4) | F3(0x4) | OPC(0x3B)) +#endif +#endif /* __riscv_zbb */ #define SIMM_MAX (0x7ff) #define SIMM_MIN (-0x800) @@ -650,7 +727,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) case SLJIT_HAS_COPY_F64: #endif /* !SLJIT_CONFIG_RISCV_64 */ + case SLJIT_HAS_ATOMIC: + case SLJIT_HAS_MEMORY_BARRIER: +#ifdef __riscv_vector + case SLJIT_HAS_SIMD: +#endif + return 1; +#ifdef __riscv_zbb + case SLJIT_HAS_CLZ: + case SLJIT_HAS_CTZ: + case SLJIT_HAS_REV: + case SLJIT_HAS_ROT: return 1; +#endif default: return 0; } @@ -724,16 +813,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type) static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw); SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 i, tmp, offset; sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options); CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + 
CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 1); #if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) { @@ -821,13 +914,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi #undef STACK_MAX_DISTANCE SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); + CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + set_set_context(compiler, options, arg_types, scratches, saveds, local_size); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1); #if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) { @@ -1034,6 +1132,9 @@ static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 flags, sl sljit_s32 base = arg & REG_MASK; sljit_s32 tmp_r = (flags & MEM_USE_TMP2) ? 
TMP_REG2 : TMP_REG1; sljit_sw offset, argw_hi; +#if defined __riscv_zba + sljit_ins ins = ADD; +#endif /* __riscv_zba */ SLJIT_ASSERT(arg & SLJIT_MEM); if (!(next_arg & SLJIT_MEM)) { @@ -1044,6 +1145,20 @@ static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 flags, sl if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) { argw &= 0x3; +#if defined __riscv_zba + switch (argw) { + case 1: + ins = SH1ADD; + break; + case 2: + ins = SH2ADD; + break; + case 3: + ins = SH3ADD; + break; + } + FAIL_IF(push_inst(compiler, ins | RD(tmp_r) | RS1(OFFS_REG(arg)) | RS2(base))); +#else /* !__riscv_zba */ /* Using the cache. */ if (argw == compiler->cache_argw) { if (arg == compiler->cache_arg) @@ -1075,6 +1190,8 @@ static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 flags, sl } else FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(base) | RS2(!argw ? OFFS_REG(arg) : TMP_REG3))); +#endif /* __riscv_zba */ + return push_mem_inst(compiler, flags, reg, tmp_r, 0); } @@ -1161,7 +1278,7 @@ static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, slji #define WORD_32 0x08 #define IMM_EXTEND(v) (IMM_I((op & SLJIT_32) ? (v) : (32 + (v)))) #endif /* SLJIT_CONFIG_RISCV_32 */ - +#ifndef __riscv_zbb static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src) { sljit_s32 is_clz = (GET_OPCODE(op) == SLJIT_CLZ); @@ -1264,6 +1381,7 @@ static sljit_s32 emit_rev16(struct sljit_compiler *compiler, sljit_s32 op, sljit FAIL_IF(push_inst(compiler, (GET_OPCODE(op) == SLJIT_REV_U16 ? 
SRLI : SRAI) | WORD | RD(dst) | RS1(dst) | IMM_I(word_size - 16))); return push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(TMP_REG1)); } +#endif /* !__riscv_zbb */ #define EMIT_LOGICAL(op_imm, op_reg) \ if (flags & SRC2_IMM) { \ @@ -1309,6 +1427,9 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl return SLJIT_SUCCESS; case SLJIT_MOV_S8: +#if defined __riscv_zbb + return push_inst(compiler, SEXTB | RD(dst) | RS1(src2)); +#else /* !__riscv_zbb */ SLJIT_ASSERT(src1 == TMP_ZERO && !(flags & SRC2_IMM)); if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) { FAIL_IF(push_inst(compiler, SLLI | WORD | RD(dst) | RS1(src2) | IMM_EXTEND(24))); @@ -1316,8 +1437,12 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl } SLJIT_ASSERT(dst == src2); return SLJIT_SUCCESS; +#endif /* __riscv_zbb */ case SLJIT_MOV_U16: +#if defined __riscv_zbb + return push_inst(compiler, ZEXTH | RD(dst) | RS1(src2)); +#else /* !__riscv_zbb */ SLJIT_ASSERT(src1 == TMP_ZERO && !(flags & SRC2_IMM)); if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) { FAIL_IF(push_inst(compiler, SLLI | WORD | RD(dst) | RS1(src2) | IMM_EXTEND(16))); @@ -1325,8 +1450,12 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl } SLJIT_ASSERT(dst == src2); return SLJIT_SUCCESS; +#endif /* __riscv_zbb */ case SLJIT_MOV_S16: +#if defined __riscv_zbb + return push_inst(compiler, SEXTH | RD(dst) | RS1(src2)); +#else /* !__riscv_zbb */ SLJIT_ASSERT(src1 == TMP_ZERO && !(flags & SRC2_IMM)); if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) { FAIL_IF(push_inst(compiler, SLLI | WORD | RD(dst) | RS1(src2) | IMM_EXTEND(16))); @@ -1334,6 +1463,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl } SLJIT_ASSERT(dst == src2); return SLJIT_SUCCESS; +#endif /* !__riscv_zbb */ #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) case SLJIT_MOV_U32: @@ 
-1354,24 +1484,59 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl #endif /* SLJIT_CONFIG_RISCV_64 */ case SLJIT_CLZ: +#if defined __riscv_zbb + return push_inst(compiler, CLZ | WORD | RD(dst) | RS1(src2)); +#endif /* __riscv_zbb */ case SLJIT_CTZ: +#if defined __riscv_zbb + return push_inst(compiler, CTZ | WORD | RD(dst) | RS1(src2)); +#else /* !__riscv_zbb */ SLJIT_ASSERT(src1 == TMP_ZERO && !(flags & SRC2_IMM)); return emit_clz_ctz(compiler, op, dst, src2); +#endif /* __riscv_zbb */ case SLJIT_REV: +#if defined __riscv_zbb + SLJIT_ASSERT(src1 == TMP_ZERO && !(flags & SRC2_IMM)); + FAIL_IF(push_inst(compiler, REV8 | RD(dst) | RS1(src2))); +#if defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64 + if (op & SLJIT_32) + return push_inst(compiler, SRAI | RD(dst) | RS1(dst) | IMM_I(32)); + return SLJIT_SUCCESS; +#else /* !SLJIT_CONFIG_RISCV_64 */ + return SLJIT_SUCCESS; +#endif /* SLJIT_CONFIG_RISCV_64 */ +#endif /* __riscv_zbb */ case SLJIT_REV_S32: -#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) +#if ((defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) || defined __riscv_zbb) case SLJIT_REV_U32: -#endif /* SLJIT_CONFIG_RISCV_32 */ +#endif /* SLJIT_CONFIG_RISCV_32 || __riscv_zbb */ SLJIT_ASSERT(src1 == TMP_ZERO && !(flags & SRC2_IMM)); +#if defined __riscv_zbb + FAIL_IF(push_inst(compiler, REV8 | RD(dst) | RS1(src2))); +#if defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64 + return push_inst(compiler, (GET_OPCODE(op) == SLJIT_REV_U32 ? 
SRLI : SRAI )| RD(dst) | RS1(dst) | IMM_I(32)); +#else /* !SLJIT_CONFIG_RISCV_64 */ + return SLJIT_SUCCESS; +#endif /* SLJIT_CONFIG_RISCV_64 */ +#else /* !__riscv_zbb */ return emit_rev(compiler, op, dst, src2); - +#endif /* __riscv_zbb */ case SLJIT_REV_U16: case SLJIT_REV_S16: SLJIT_ASSERT(src1 == TMP_ZERO && !(flags & SRC2_IMM)); +#if defined __riscv_zbb + FAIL_IF(push_inst(compiler, REV8 | RD(dst) | RS1(src2))); +#if defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64 + return push_inst(compiler, (GET_OPCODE(op) == SLJIT_REV_U16 ? SRLI : SRAI )| RD(dst) | RS1(dst) | IMM_I(48)); +#else /* !SLJIT_CONFIG_RISCV_64 */ + return push_inst(compiler, (GET_OPCODE(op) == SLJIT_REV_U16 ? SRLI : SRAI) | RD(dst) | RS1(dst) | IMM_I(16)); +#endif /* SLJIT_CONFIG_RISCV_64 */ +#else /* !__riscv_zbb */ return emit_rev16(compiler, op, dst, src2); +#endif /* __riscv_zbb */ -#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) +#if ((defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) && !defined __riscv_zbb) case SLJIT_REV_U32: SLJIT_ASSERT(src1 == TMP_ZERO && !(flags & SRC2_IMM) && dst != TMP_REG1); FAIL_IF(emit_rev(compiler, op, dst, src2)); @@ -1379,8 +1544,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl return SLJIT_SUCCESS; FAIL_IF(push_inst(compiler, SLLI | RD(dst) | RS1(dst) | IMM_I(32))); return push_inst(compiler, SRLI | RD(dst) | RS1(dst) | IMM_I(32)); -#endif /* SLJIT_CONFIG_RISCV_32 */ - +#endif /* SLJIT_CONFIG_RISCV_64 && !__riscv_zbb */ case SLJIT_ADD: /* Overflow computation (both add and sub): overflow = src1_sign ^ src2_sign ^ result_sign ^ carry_flag */ is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW; @@ -1668,7 +1832,16 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl case SLJIT_ROTR: if (flags & SRC2_IMM) { SLJIT_ASSERT(src2 != 0); - +#if defined __riscv_zbb + if (GET_OPCODE(op) == SLJIT_ROTL) { +#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) + src2 = ((op & 
SLJIT_32) ? 32 : 64) - src2; +#else /* !SLJIT_CONFIG_RISCV_64 */ + src2 = 32 - src2; +#endif /* SLJIT_CONFIG_RISCV_64 */ + } + return push_inst(compiler, RORI | WORD | RD(dst) | RS1(src1) | IMM_I(src2)); +#else /* !__riscv_zbb */ op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SLLI : SRLI; FAIL_IF(push_inst(compiler, op_imm | WORD | RD(OTHER_FLAG) | RS1(src1) | IMM_I(src2))); @@ -1680,8 +1853,12 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SRLI : SLLI; FAIL_IF(push_inst(compiler, op_imm | WORD | RD(dst) | RS1(src1) | IMM_I(src2))); return push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(OTHER_FLAG)); +#endif /* !__riscv_zbb */ } +#if defined __riscv_zbb + return push_inst(compiler, (GET_OPCODE(op) == SLJIT_ROTL ? ROL : ROR) | WORD | RD(dst) | RS1(src1) | RS2(src2)); +#else /* !__riscv_zbb */ if (src2 == TMP_ZERO) { if (dst != src1) return push_inst(compiler, ADDI | WORD | RD(dst) | RS1(src1) | IMM_I(0)); @@ -1694,7 +1871,7 @@ static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sl op_reg = (GET_OPCODE(op) == SLJIT_ROTL) ? 
SRL : SLL; FAIL_IF(push_inst(compiler, op_reg | WORD | RD(dst) | RS1(src1) | RS2(EQUAL_FLAG))); return push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(OTHER_FLAG)); - +#endif /* !__riscv_zbb */ default: SLJIT_UNREACHABLE(); return SLJIT_SUCCESS; @@ -1881,6 +2058,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile return push_inst(compiler, DIVU | WORD | RD(SLJIT_R0) | RS1(SLJIT_R0) | RS2(SLJIT_R1)); case SLJIT_DIV_SW: return push_inst(compiler, DIV | WORD | RD(SLJIT_R0) | RS1(SLJIT_R0) | RS2(SLJIT_R1)); + case SLJIT_MEMORY_BARRIER: + return push_inst(compiler, FENCE | 0x0ff00000); case SLJIT_ENDBR: case SLJIT_SKIP_FRAMES_BEFORE_RETURN: return SLJIT_SUCCESS; @@ -2573,6 +2752,7 @@ static sljit_ins get_jump_instruction(sljit_s32 type) case SLJIT_SIG_GREATER: case SLJIT_OVERFLOW: case SLJIT_CARRY: + case SLJIT_ATOMIC_NOT_STORED: case SLJIT_F_EQUAL: case SLJIT_ORDERED_EQUAL: case SLJIT_ORDERED_NOT_EQUAL: @@ -2591,6 +2771,7 @@ static sljit_ins get_jump_instruction(sljit_s32 type) case SLJIT_SIG_LESS_EQUAL: case SLJIT_NOT_OVERFLOW: case SLJIT_NOT_CARRY: + case SLJIT_ATOMIC_STORED: case SLJIT_F_NOT_EQUAL: case SLJIT_UNORDERED_OR_NOT_EQUAL: case SLJIT_UNORDERED_OR_EQUAL: @@ -2687,7 +2868,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_cmp(struct sljit_compiler } if (src2 & SLJIT_MEM) { - PTR_FAIL_IF(emit_op_mem2(compiler, flags, src2_tmp_reg, src2, src2w, 0, 0)); + PTR_FAIL_IF(emit_op_mem2(compiler, flags | (src1 == TMP_REG1 ?
MEM_USE_TMP2 : 0), src2_tmp_reg, src2, src2w, 0, 0)); src2 = src2_tmp_reg; } @@ -2862,6 +3043,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co src_r = dst_r; invert ^= 0x1; break; + case SLJIT_ATOMIC_STORED: + case SLJIT_ATOMIC_NOT_STORED: + invert ^= 0x1; + break; } } else { invert = 0; @@ -3066,6 +3251,561 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile #undef TO_ARGW_HI +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 dst_reg, + sljit_s32 mem_reg) +{ + sljit_ins ins; + + CHECK_ERROR(); + CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); + + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + + switch (GET_OPCODE(op)) { + case SLJIT_MOV: + case SLJIT_MOV_P: +#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) + ins = LR | (3 << 12); + break; +#endif /* SLJIT_CONFIG_RISCV_64 */ + case SLJIT_MOV_S32: + case SLJIT_MOV32: + ins = LR | (2 << 12); + break; + + default: + return SLJIT_ERR_UNSUPPORTED; + } + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + return push_inst(compiler, ins | RD(dst_reg) | RS1(mem_reg)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 src_reg, + sljit_s32 mem_reg, + sljit_s32 temp_reg) +{ + sljit_ins ins; + + /* temp_reg == mem_reg is undefined so use another temp register */ + SLJIT_UNUSED_ARG(temp_reg); + + CHECK_ERROR(); + CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg)); + + if (op & SLJIT_ATOMIC_USE_CAS) + return SLJIT_ERR_UNSUPPORTED; + + switch (GET_OPCODE(op)) { + case SLJIT_MOV: + case SLJIT_MOV_P: +#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) + ins = SC | (3 << 12); + break; +#endif /* SLJIT_CONFIG_RISCV_64 */ + case SLJIT_MOV_S32: + case SLJIT_MOV32: + ins = SC | (2 << 12); + break; + + default: + return 
SLJIT_ERR_UNSUPPORTED; + } + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + return push_inst(compiler, ins | RD(OTHER_FLAG) | RS1(mem_reg) | RS2(src_reg)); +} + +/* + SEW = Selected element width + LMUL = Vector register group multiplier + + VLMUL values (in binary): + 100 : reserved + 101 : 1/8 + 110 : 1/4 + 111 : 1/2 + 000 : 1 + 001 : 2 + 010 : 4 + 011 : 8 +*/ + +static SLJIT_INLINE sljit_s32 sljit_emit_vsetivli(struct sljit_compiler *compiler, sljit_s32 type, sljit_ins vlmul) +{ + sljit_ins elem_size = (sljit_ins)SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_ins avl = (sljit_ins)1 << (SLJIT_SIMD_GET_REG_SIZE(type) - elem_size); + + return push_inst(compiler, VSETIVLI | RD(TMP_REG1) | (elem_size << 23) | (vlmul << 20) | (avl << 15)); +} + +static SLJIT_INLINE sljit_s32 sljit_emit_vsetivli_size(struct sljit_compiler *compiler, sljit_s32 reg_size, sljit_s32 elem_size) +{ + sljit_ins avl = (sljit_ins)1 << (reg_size - elem_size); + return push_inst(compiler, VSETIVLI | RD(TMP_REG1) | ((sljit_ins)elem_size << 23) | (avl << 15)); +} + +static sljit_s32 sljit_emit_vmem(struct sljit_compiler *compiler, sljit_ins ins, sljit_s32 elem_size, sljit_s32 mem, sljit_sw memw) +{ + sljit_s32 base = mem & REG_MASK; + + if (elem_size > 0) + ins |= (1 << 14) | ((sljit_ins)elem_size << 12); + + if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) { + memw &= 0x3; + + if (SLJIT_UNLIKELY(memw)) { + FAIL_IF(push_inst(compiler, SLLI | RD(TMP_REG1) | RS1(OFFS_REG(mem)) | IMM_I(memw))); + } + + FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RS1(base) | RS2(!memw ? 
OFFS_REG(mem) : TMP_REG1))); + return push_inst(compiler, ins | RS1(TMP_REG1)); + } + + if (memw == 0) + return push_inst(compiler, ins | RS1(base)); + + if (memw <= SIMM_MAX && memw >= SIMM_MIN) { + FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(base) | IMM_I(memw))); + return push_inst(compiler, ins | RS1(TMP_REG1)); + } + + FAIL_IF(load_immediate(compiler, TMP_REG1, memw, TMP_REG3)); + + if (base != 0) + FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RS1(TMP_REG1) | RS2(base))); + + return push_inst(compiler, ins | RS1(TMP_REG1)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 vreg, + sljit_s32 srcdst, sljit_sw srcdstw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_ins ins; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_mov(compiler, type, vreg, srcdst, srcdstw)); + + ADJUST_LOCAL_OFFSET(srcdst, srcdstw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (elem_size > 3) + elem_size = 3; + + FAIL_IF(sljit_emit_vsetivli_size(compiler, reg_size, elem_size)); + + if (srcdst & SLJIT_MEM) { + ins = (type & SLJIT_SIMD_STORE) ? 
VS : VL; + return sljit_emit_vmem(compiler, ins | VRD(vreg), elem_size, srcdst, srcdstw); + } + + if (type & SLJIT_SIMD_STORE) + ins = VRD(srcdst) | VRS1(vreg); + else + ins = VRD(vreg) | VRS1(srcdst); + + return push_inst(compiler, VMV_VV | ins); +} + +static sljit_s32 sljit_simd_get_mem_flags(sljit_s32 elem_size) +{ + switch (elem_size) { + case 0: + return BYTE_DATA; + case 1: + return HALF_DATA; +#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) + case 2: + return INT_DATA; +#endif /* SLJIT_CONFIG_RISCV_64 */ + default: + return WORD_DATA; + } +} + +static sljit_sw sljit_simd_get_imm(sljit_s32 elem_size, sljit_sw imm) +{ + switch (elem_size) { + case 0: + return (sljit_s8)imm; + case 1: + return (sljit_s16)imm; +#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) + case 2: + return (sljit_s32)imm; +#endif /* SLJIT_CONFIG_RISCV_64 */ + default: + return imm; + } +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 vreg, + sljit_s32 src, sljit_sw srcw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 flags; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_replicate(compiler, type, vreg, src, srcw)); + + ADJUST_LOCAL_OFFSET(src, srcw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + +#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) + if ((type & SLJIT_SIMD_FLOAT) ? 
(elem_size < 2 || elem_size > 3) : elem_size > 2) + return SLJIT_ERR_UNSUPPORTED; +#else /* !SLJIT_CONFIG_RISCV_32 */ + if (((type & SLJIT_SIMD_FLOAT) && elem_size < 2) || elem_size > 3) + return SLJIT_ERR_UNSUPPORTED; +#endif /* SLJIT_CONFIG_RISCV_32 */ + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + FAIL_IF(sljit_emit_vsetivli(compiler, type, 0)); + + if (type & SLJIT_SIMD_FLOAT) { + if (src == SLJIT_IMM) + return push_inst(compiler, VMV_VI | VRD(vreg) | ((sljit_ins)(srcw & 0x1f) << 15)); + + if (src & SLJIT_MEM) { + flags = (elem_size == 2) ? SINGLE_DATA : DOUBLE_DATA; + FAIL_IF(emit_op_mem(compiler, flags | LOAD_DATA, TMP_FREG1, src, srcw)); + src = TMP_FREG1; + } + + return push_inst(compiler, VFMV_VF | VRD(vreg) | FRS1(src)); + } + + if (src == SLJIT_IMM) { + srcw = sljit_simd_get_imm(elem_size, srcw); + + if (srcw >= -0x10 && srcw <= 0xf) + return push_inst(compiler, VMV_VI | VRD(vreg) | ((sljit_ins)(srcw & 0x1f) << 15)); + + FAIL_IF(load_immediate(compiler, TMP_REG1, srcw, TMP_REG3)); + src = TMP_REG1; + } else if (src & SLJIT_MEM) { + FAIL_IF(emit_op_mem(compiler, sljit_simd_get_mem_flags(elem_size) | LOAD_DATA, TMP_REG1, src, srcw)); + src = TMP_REG1; + } + + return push_inst(compiler, VMV_VX | VRD(vreg) | RS1(src)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 vreg, sljit_s32 lane_index, + sljit_s32 srcdst, sljit_sw srcdstw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 flags; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, vreg, lane_index, srcdst, srcdstw)); + + ADJUST_LOCAL_OFFSET(srcdst, srcdstw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + +#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) + if ((type & SLJIT_SIMD_FLOAT) ? 
(elem_size < 2 || elem_size > 3) : elem_size > 2) + return SLJIT_ERR_UNSUPPORTED; +#else /* !SLJIT_CONFIG_RISCV_32 */ + if (((type & SLJIT_SIMD_FLOAT) && elem_size < 2) || elem_size > 3) + return SLJIT_ERR_UNSUPPORTED; +#endif /* SLJIT_CONFIG_RISCV_32 */ + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (type & SLJIT_SIMD_STORE) { + FAIL_IF(push_inst(compiler, VSETIVLI | RD(TMP_REG1) | ((sljit_ins)elem_size << 23) | (1 << 15))); + + if (lane_index > 0) { + FAIL_IF(push_inst(compiler, VSLIDEDOWN_VI | VRD(TMP_VREG1) | ((sljit_ins)lane_index << 15) | VRS2(vreg))); + vreg = TMP_VREG1; + } + + if (srcdst & SLJIT_MEM) + return sljit_emit_vmem(compiler, VS | VRD(vreg), elem_size, srcdst, srcdstw); + + if (type & SLJIT_SIMD_FLOAT) + return push_inst(compiler, VFMV_FS | FRD(srcdst) | VRS2(vreg)); + + FAIL_IF(push_inst(compiler, VMV_XS | RD(srcdst) | VRS2(vreg))); + +#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) + if ((type & SLJIT_SIMD_LANE_SIGNED) || elem_size >= 2) + return SLJIT_SUCCESS; +#else /* !SLJIT_CONFIG_RISCV_32 */ + if ((type & SLJIT_SIMD_LANE_SIGNED) || elem_size >= 3 || (elem_size == 2 && (type & SLJIT_32))) + return SLJIT_SUCCESS; +#endif /* SLJIT_CONFIG_RISCV_32 */ + + if (elem_size == 0) + return push_inst(compiler, ANDI | RD(srcdst) | RS1(srcdst) | IMM_I(0xff)); + +#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) + flags = 16; +#else /* !SLJIT_CONFIG_RISCV_32 */ + flags = (elem_size == 1) ? 
48 : 32; +#endif /* SLJIT_CONFIG_RISCV_32 */ + + FAIL_IF(push_inst(compiler, SLLI | RD(srcdst) | RS1(srcdst) | IMM_I(flags))); + return push_inst(compiler, SRLI | RD(srcdst) | RS1(srcdst) | IMM_I(flags)); + } + + if (type & SLJIT_SIMD_LANE_ZERO) { + FAIL_IF(sljit_emit_vsetivli(compiler, type, 0)); + FAIL_IF(push_inst(compiler, VMV_VI | VRD(vreg))); + } + + if (srcdst & SLJIT_MEM) { + FAIL_IF(push_inst(compiler, VSETIVLI | RD(TMP_REG1) | ((sljit_ins)elem_size << 23) | (1 << 15))); + FAIL_IF(sljit_emit_vmem(compiler, VL | VRD(lane_index > 0 ? TMP_VREG1 : vreg), elem_size, srcdst, srcdstw)); + + if (lane_index == 0) + return SLJIT_SUCCESS; + + FAIL_IF(push_inst(compiler, VSETIVLI | RD(TMP_REG1) | ((sljit_ins)elem_size << 23) | ((sljit_ins)(lane_index + 1) << 15))); + return push_inst(compiler, VSLIDEUP_VI | VRD(vreg) | ((sljit_ins)lane_index << 15) | VRS2(TMP_VREG1)); + } + + if (!(type & SLJIT_SIMD_LANE_ZERO) || lane_index > 0) + FAIL_IF(push_inst(compiler, VSETIVLI | RD(TMP_REG1) | ((sljit_ins)elem_size << 23) | ((sljit_ins)(lane_index + 1) << 15))); + + if (type & SLJIT_SIMD_FLOAT) { + FAIL_IF(push_inst(compiler, VFMV_SF | VRD(lane_index > 0 ? TMP_VREG1 : vreg) | FRS1(srcdst))); + + if (lane_index == 0) + return SLJIT_SUCCESS; + + return push_inst(compiler, VSLIDEUP_VI | VRD(vreg) | ((sljit_ins)lane_index << 15) | VRS2(TMP_VREG1)); + } + + if (srcdst == SLJIT_IMM) { + srcdstw = sljit_simd_get_imm(elem_size, srcdstw); + FAIL_IF(load_immediate(compiler, TMP_REG1, srcdstw, TMP_REG3)); + srcdst = TMP_REG1; + } + + FAIL_IF(push_inst(compiler, VMV_SX | VRD(lane_index > 0 ? 
TMP_VREG1 : vreg) | RS1(srcdst))); + + if (lane_index == 0) + return SLJIT_SUCCESS; + + return push_inst(compiler, VSLIDEUP_VI | VRD(vreg) | ((sljit_ins)lane_index << 15) | VRS2(TMP_VREG1)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 vreg, + sljit_s32 src, sljit_s32 src_lane_index) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, vreg, src, src_lane_index)); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if (((type & SLJIT_SIMD_FLOAT) && elem_size < 2) || elem_size > 3) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + FAIL_IF(sljit_emit_vsetivli(compiler, type, 0)); + + FAIL_IF(push_inst(compiler, VRGATHER_VI | VRD(vreg != src ? vreg : TMP_VREG1) | ((sljit_ins)src_lane_index << 15) | VRS2(src))); + if (vreg == src) + return push_inst(compiler, VMV_VV | VRD(vreg) | VRS1(TMP_VREG1)); + return SLJIT_SUCCESS; +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 vreg, + sljit_s32 src, sljit_sw srcw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type); + sljit_ins ins; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_extend(compiler, type, vreg, src, srcw)); + + ADJUST_LOCAL_OFFSET(src, srcw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + +#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) + if ((type & SLJIT_SIMD_FLOAT) ? 
(elem_size < 2 || elem_size > 3) : elem_size > 2) + return SLJIT_ERR_UNSUPPORTED; +#else /* !SLJIT_CONFIG_RISCV_32 */ + if (((type & SLJIT_SIMD_FLOAT) && elem_size < 2) || elem_size > 3) + return SLJIT_ERR_UNSUPPORTED; +#endif /* SLJIT_CONFIG_RISCV_32 */ + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if ((src & SLJIT_MEM) || vreg == src) { + ins = (sljit_ins)1 << (reg_size - elem2_size); + FAIL_IF(push_inst(compiler, VSETIVLI | RD(TMP_REG1) | ((sljit_ins)elem_size << 23) | (ins << 15))); + + if (src & SLJIT_MEM) + FAIL_IF(sljit_emit_vmem(compiler, VL | VRD(TMP_VREG1), elem_size, src, srcw)); + else + FAIL_IF(push_inst(compiler, VMV_VV | VRD(TMP_VREG1) | VRS1(src))); + + src = TMP_VREG1; + } + + if (type & SLJIT_SIMD_FLOAT) { + FAIL_IF(sljit_emit_vsetivli(compiler, type, 0x7)); + return push_inst(compiler, VFWCVT_FFV | VRD(vreg) | VRS2(src)); + } + + ins = (sljit_ins)1 << (reg_size - elem2_size); + FAIL_IF(push_inst(compiler, VSETIVLI | RD(TMP_REG1) | ((sljit_ins)elem2_size << 23) | (ins << 15))); + + switch (elem2_size - elem_size) { + case 1: + ins = VZEXT_VF2; + break; + case 2: + ins = VZEXT_VF4; + break; + default: + ins = VZEXT_VF8; + break; + } + + if (type & SLJIT_SIMD_EXTEND_SIGNED) + ins |= 1 << 15; + + return push_inst(compiler, ins | VRD(vreg) | VRS2(src)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 vreg, + sljit_s32 dst, sljit_sw dstw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 dst_r = FAST_IS_REG(dst) ? 
dst : TMP_REG2; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_sign(compiler, type, vreg, dst, dstw)); + + ADJUST_LOCAL_OFFSET(dst, dstw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if (((type & SLJIT_SIMD_FLOAT) && elem_size < 2) || elem_size > 3) + return SLJIT_ERR_UNSUPPORTED; + + FAIL_IF(sljit_emit_vsetivli(compiler, type, 0)); + FAIL_IF(push_inst(compiler, VMV_VI | VRD(TMP_VREG1) | (0x0 << 15))); + FAIL_IF(push_inst(compiler, VMSLE_VI | VRD(TMP_VREG1) | (0x0 << 15) | VRS2(vreg))); + + FAIL_IF(sljit_emit_vsetivli_size(compiler, 2, 2)); + FAIL_IF(push_inst(compiler, VMV_XS | RD(dst_r) | VRS2(TMP_VREG1))); + + if (dst & SLJIT_MEM) + return emit_op_mem(compiler, (type & SLJIT_32) ? INT_DATA : WORD_DATA, dst_r, dst, dstw); + return SLJIT_SUCCESS; +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_ins ins = 0; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_vreg, src1_vreg, src2, src2w)); + + ADJUST_LOCAL_OFFSET(src2, src2w); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + switch (SLJIT_SIMD_GET_OPCODE(type)) { + case SLJIT_SIMD_OP2_AND: + ins = VAND_VV; + break; + case SLJIT_SIMD_OP2_OR: + ins = VOR_VV; + break; + case SLJIT_SIMD_OP2_XOR: + ins = VXOR_VV; + break; + case SLJIT_SIMD_OP2_SHUFFLE: + ins = VRGATHER_VV; + elem_size = 0; + break; + } + + if (elem_size > 3) + elem_size = 3; + + FAIL_IF(sljit_emit_vsetivli_size(compiler, reg_size, elem_size)); + + if (src2 & SLJIT_MEM) { + FAIL_IF(sljit_emit_vmem(compiler, VL | VRD(TMP_VREG1), elem_size, src2, src2w)); + src2 = TMP_VREG1; + } + + if 
(SLJIT_SIMD_GET_OPCODE(type) != SLJIT_SIMD_OP2_SHUFFLE) + return push_inst(compiler, ins | VRD(dst_vreg) | VRS1(src1_vreg) | VRS2(src2)); + + if (dst_vreg == src2) { + FAIL_IF(push_inst(compiler, VMV_VV | VRD(TMP_VREG1) | VRS1(src2))); + src2 = TMP_VREG1; + } + + if (dst_vreg == src1_vreg) { + FAIL_IF(push_inst(compiler, VMV_VV | VRD(TMP_VREG2) | VRS1(src1_vreg))); + src1_vreg = TMP_VREG2; + } + + return push_inst(compiler, ins | VRD(dst_vreg) | VRS1(src2) | VRS2(src1_vreg)); +} + SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value) { struct sljit_const *const_; diff --git a/src/sljit/sljitNativeS390X.c b/src/sljit/sljitNativeS390X.c index 99e846350..ce78689e8 100644 --- a/src/sljit/sljitNativeS390X.c +++ b/src/sljit/sljitNativeS390X.c @@ -1638,6 +1638,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) case SLJIT_HAS_COPY_F64: case SLJIT_HAS_SIMD: case SLJIT_HAS_ATOMIC: + case SLJIT_HAS_MEMORY_BARRIER: return 1; case SLJIT_HAS_CTZ: @@ -1660,19 +1661,24 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type) /* --------------------------------------------------------------------- */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options); sljit_s32 offset, i, tmp; CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, 
local_size); + CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); /* Saved registers are stored in callee allocated save area. */ SLJIT_ASSERT(gpr(SLJIT_FIRST_SAVED_REG) == r6 && gpr(SLJIT_S0) == r13); + scratches = ENTER_GET_REGS(scratches); + saveds = ENTER_GET_REGS(saveds); + offset = 2 * SSIZE_OF(sw); if (saveds + scratches >= SLJIT_NUMBER_OF_REGISTERS) { if (saved_arg_count == 0) { @@ -1756,12 +1762,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + set_set_context(compiler, options, arg_types, scratches, saveds, local_size); compiler->local_size = (local_size + SLJIT_S390X_DEFAULT_STACK_FRAME_SIZE + 0xf) & ~0xf; return SLJIT_SUCCESS; @@ -1950,6 +1956,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile return push_inst(compiler, lgr(arg1, tmp0)); /* remainder */ return SLJIT_SUCCESS; + case SLJIT_MEMORY_BARRIER: + return push_inst(compiler, 0x0700 /* bcr */ | (0xe << 4) | 0); case SLJIT_ENDBR: return SLJIT_SUCCESS; case SLJIT_SKIP_FRAMES_BEFORE_RETURN: @@ -2475,14 +2483,9 @@ static sljit_s32 sljit_emit_sub(struct sljit_compiler *compiler, sljit_s32 op, ins = (op & SLJIT_32) ? 
0xc20d00000000 /* cfi */ : 0xc20c00000000 /* cgfi */; return emit_ri(compiler, ins, src1, src1, src1w, src2w, RIL_A); } - } - else { - if ((op & SLJIT_32) || is_u32(src2w)) { - ins = (op & SLJIT_32) ? 0xc20f00000000 /* clfi */ : 0xc20e00000000 /* clgfi */; - return emit_ri(compiler, ins, src1, src1, src1w, src2w, RIL_A); - } - if (is_s16(src2w)) - return emit_rie_d(compiler, 0xec00000000db /* alghsik */, (sljit_s32)tmp0, src1, src1w, src2w); + } else if ((op & SLJIT_32) || is_u32(src2w)) { + ins = (op & SLJIT_32) ? 0xc20f00000000 /* clfi */ : 0xc20e00000000 /* clgfi */; + return emit_ri(compiler, ins, src1, src1, src1w, src2w, RIL_A); } } else if (src2 & SLJIT_MEM) { @@ -3934,7 +3937,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3944,7 +3947,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_mov(compiler, type, vreg, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -3959,15 +3962,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co if (!(srcdst & SLJIT_MEM)) { if (type & SLJIT_SIMD_STORE) - ins = F36(srcdst) | F32(freg); + ins = F36(srcdst) | F32(vreg); else - ins = F36(freg) | F32(srcdst); + ins = F36(vreg) | F32(srcdst); return push_inst(compiler, 0xe70000000056 /* vlr */ | ins); } FAIL_IF(make_addr_bx(compiler, &addr, srcdst, srcdstw, tmp1)); - ins = F36(freg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); + ins = F36(vreg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); if (alignment >= 4) ins |= 4 << 12; @@ -3978,7 +3981,7 @@ SLJIT_API_FUNC_ATTRIBUTE 
sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3988,7 +3991,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil sljit_sw sign_ext; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_replicate(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -4003,15 +4006,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (src & SLJIT_MEM) { FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp1)); - return push_inst(compiler, 0xe70000000005 /* vlrep */ | F36(freg) + return push_inst(compiler, 0xe70000000005 /* vlrep */ | F36(vreg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset) | ((sljit_ins)elem_size << 12)); } if (type & SLJIT_SIMD_FLOAT) { if (src == SLJIT_IMM) - return push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(freg)); + return push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(vreg)); - return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(freg) | F32(src) | ((sljit_ins)elem_size << 12)); + return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(vreg) | F32(src) | ((sljit_ins)elem_size << 12)); } if (src == SLJIT_IMM) { @@ -4043,10 +4046,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (sign_ext != 0x10000) { if (sign_ext == 0 || sign_ext == -1) - return push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(freg) + return push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(vreg) | (sign_ext == 0 ? 
0 : ((sljit_ins)0xffff << 16))); - return push_inst(compiler, 0xe70000000045 /* vrepi */ | F36(freg) + return push_inst(compiler, 0xe70000000045 /* vrepi */ | F36(vreg) | ((sljit_ins)srcw << 16) | ((sljit_ins)elem_size << 12)); } @@ -4055,12 +4058,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil } else reg = gpr(src); - FAIL_IF(push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(freg) | R32A(reg) | ((sljit_ins)elem_size << 12))); - return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(freg) | F32(freg) | ((sljit_ins)elem_size << 12)); + FAIL_IF(push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(vreg) | R32A(reg) | ((sljit_ins)elem_size << 12))); + return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(vreg) | F32(vreg) | ((sljit_ins)elem_size << 12)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 vreg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4070,7 +4073,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile sljit_ins ins = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, vreg, lane_index, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -4085,20 +4088,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (srcdst & SLJIT_MEM) { FAIL_IF(make_addr_bx(compiler, &addr, srcdst, srcdstw, tmp1)); - ins = F36(freg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); + ins = F36(vreg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); } if (type & SLJIT_SIMD_LANE_ZERO) { if ((srcdst & SLJIT_MEM) && lane_index == ((1 << (3 - elem_size)) - 1)) return push_inst(compiler, 0xe70000000004 /* vllez */ | ins | ((sljit_ins)elem_size << 
12)); - if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) { - FAIL_IF(push_inst(compiler, 0xe70000000056 /* vlr */ | F36(TMP_FREG1) | F32(freg))); + if ((type & SLJIT_SIMD_FLOAT) && vreg == srcdst) { + FAIL_IF(push_inst(compiler, 0xe70000000056 /* vlr */ | F36(TMP_FREG1) | F32(vreg))); srcdst = TMP_FREG1; srcdstw = 0; } - FAIL_IF(push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(freg))); + FAIL_IF(push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(vreg))); } if (srcdst & SLJIT_MEM) { @@ -4126,19 +4129,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (type & SLJIT_SIMD_FLOAT) { if (type & SLJIT_SIMD_STORE) - return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(srcdst) | F32(freg) | ((sljit_ins)lane_index << 16) | ((sljit_ins)elem_size << 12)); + return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(srcdst) | F32(vreg) | ((sljit_ins)lane_index << 16) | ((sljit_ins)elem_size << 12)); if (elem_size == 3) { if (lane_index == 0) - ins = F32(srcdst) | F28(freg) | (1 << 12); + ins = F32(srcdst) | F28(vreg) | (1 << 12); else - ins = F32(freg) | F28(srcdst); + ins = F32(vreg) | F28(srcdst); - return push_inst(compiler, 0xe70000000084 /* vpdi */ | F36(freg) | ins); + return push_inst(compiler, 0xe70000000084 /* vpdi */ | F36(vreg) | ins); } FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(tmp0) | F32(srcdst) | ((sljit_ins)2 << 12))); - return push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(freg) | R32A(tmp0) | ((sljit_ins)lane_index << 16) | ((sljit_ins)2 << 12)); + return push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(vreg) | R32A(tmp0) | ((sljit_ins)lane_index << 16) | ((sljit_ins)2 << 12)); } if (srcdst == SLJIT_IMM) { @@ -4167,7 +4170,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } if (ins != 0) - return push_inst(compiler, ins | F36(freg) | ((sljit_ins)srcdstw << 16) | ((sljit_ins)lane_index << 12)); + return push_inst(compiler, ins | F36(vreg) | 
((sljit_ins)srcdstw << 16) | ((sljit_ins)lane_index << 12)); push_load_imm_inst(compiler, tmp0, srcdstw); reg = tmp0; @@ -4177,9 +4180,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile ins = ((sljit_ins)lane_index << 16) | ((sljit_ins)elem_size << 12); if (!(type & SLJIT_SIMD_STORE)) - return push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(freg) | R32A(reg) | ins); + return push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(vreg) | R32A(reg) | ins); - FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(reg) | F32(freg) | ins)); + FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(reg) | F32(vreg) | ins)); if (!(type & SLJIT_SIMD_LANE_SIGNED) || elem_size >= 3) return SLJIT_SUCCESS; @@ -4200,14 +4203,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_s32 src_lane_index) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index)); + CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, vreg, src, src_lane_index)); if (reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -4218,12 +4221,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c if (type & SLJIT_SIMD_TEST) return SLJIT_SUCCESS; - return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(freg) | F32(src) + return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(vreg) | F32(src) | ((sljit_ins)src_lane_index << 16) | ((sljit_ins)elem_size << 12)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = 
SLJIT_SIMD_GET_REG_SIZE(type); @@ -4233,7 +4236,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler sljit_ins ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_extend(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -4248,7 +4251,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler if (src & SLJIT_MEM) { FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp1)); - ins = F36(freg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); + ins = F36(vreg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); switch (elem2_size - elem_size) { case 1: @@ -4263,27 +4266,27 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler } FAIL_IF(push_inst(compiler, ins)); - src = freg; + src = vreg; } if (type & SLJIT_SIMD_FLOAT) { - FAIL_IF(push_inst(compiler, 0xe700000000d5 /* vuplh */ | F36(freg) | F32(src) | (2 << 12))); - FAIL_IF(push_inst(compiler, 0xe70000000030 /* vesl */ | F36(freg) | F32(freg) | (32 << 16) | (3 << 12))); - return push_inst(compiler, 0xe700000000c4 /* vfll */ | F36(freg) | F32(freg) | (2 << 12)); + FAIL_IF(push_inst(compiler, 0xe700000000d5 /* vuplh */ | F36(vreg) | F32(src) | (2 << 12))); + FAIL_IF(push_inst(compiler, 0xe70000000030 /* vesl */ | F36(vreg) | F32(vreg) | (32 << 16) | (3 << 12))); + return push_inst(compiler, 0xe700000000c4 /* vfll */ | F36(vreg) | F32(vreg) | (2 << 12)); } - ins = ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 0xe700000000d7 /* vuph */ : 0xe700000000d5 /* vuplh */) | F36(freg); + ins = ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 
0xe700000000d7 /* vuph */ : 0xe700000000d5 /* vuplh */) | F36(vreg); do { FAIL_IF(push_inst(compiler, ins | F32(src) | ((sljit_ins)elem_size << 12))); - src = freg; + src = vreg; } while (++elem_size < elem2_size); return SLJIT_SUCCESS; } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 dst, sljit_sw dstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4291,7 +4294,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c sljit_gpr dst_r; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw)); + CHECK(check_sljit_emit_simd_sign(compiler, type, vreg, dst, dstw)); ADJUST_LOCAL_OFFSET(dst, dstw); @@ -4324,7 +4327,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c if (elem_size != 0) FAIL_IF(push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(TMP_FREG1) | R32A(tmp0) | (1 << 16) | (3 << 12))); - FAIL_IF(push_inst(compiler, 0xe70000000085 /* vbperm */ | F36(TMP_FREG1) | F32(freg) | F28(TMP_FREG1))); + FAIL_IF(push_inst(compiler, 0xe70000000085 /* vbperm */ | F36(TMP_FREG1) | F32(vreg) | F28(TMP_FREG1))); dst_r = FAST_IS_REG(dst) ? 
gpr(dst) : tmp0; FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(dst_r) | F32(TMP_FREG1) @@ -4337,14 +4340,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); - sljit_ins ins = 0; + sljit_s32 alignment; + struct addr addr; + sljit_ins ins = 0, load_ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_vreg, src1_vreg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); if (reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -4365,12 +4371,29 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co case SLJIT_SIMD_OP2_XOR: ins = 0xe7000000006d /* vx */; break; + case SLJIT_SIMD_OP2_SHUFFLE: + ins = 0xe7000000008c /* vperm */; + break; } - if (type & SLJIT_SIMD_TEST) - return SLJIT_SUCCESS; + if (src2 & SLJIT_MEM) { + FAIL_IF(make_addr_bx(compiler, &addr, src2, src2w, tmp1)); + load_ins = 0xe70000000006 /* vl */ | F36(TMP_FREG1) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); + alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); + + if (alignment >= 4) + load_ins |= 4 << 12; + else if (alignment == 3) + load_ins |= 3 << 12; + + FAIL_IF(push_inst(compiler, load_ins)); + src2 = TMP_FREG1; + } + + if (SLJIT_SIMD_GET_OPCODE(type) == SLJIT_SIMD_OP2_SHUFFLE) + return push_inst(compiler, ins | F36(dst_vreg) | F32(src1_vreg) | F28(src1_vreg) | F12(src2)); - return push_inst(compiler, ins | F36(dst_freg) | F32(src1_freg) | F28(src2_freg)); + return push_inst(compiler, ins | F36(dst_vreg) | F32(src1_vreg) | F28(src2)); } SLJIT_API_FUNC_ATTRIBUTE 
sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, @@ -4380,8 +4403,22 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler CHECK_ERROR(); CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); - SLJIT_SKIP_CHECKS(compiler); - return sljit_emit_op1(compiler, op, dst_reg, 0, SLJIT_MEM1(mem_reg), 0); + if (op & SLJIT_ATOMIC_USE_LS) + return SLJIT_ERR_UNSUPPORTED; + + switch (GET_OPCODE(op)) { + case SLJIT_MOV32: + case SLJIT_MOV_U32: + case SLJIT_MOV: + case SLJIT_MOV_P: + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + + SLJIT_SKIP_CHECKS(compiler); + return sljit_emit_op1(compiler, op & ~SLJIT_ATOMIC_USE_CAS, dst_reg, 0, SLJIT_MEM1(mem_reg), 0); + default: + return SLJIT_ERR_UNSUPPORTED; + } } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op, @@ -4389,44 +4426,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler sljit_s32 mem_reg, sljit_s32 temp_reg) { - sljit_ins mask; + sljit_ins ins; sljit_gpr tmp_r = gpr(temp_reg); sljit_gpr mem_r = gpr(mem_reg); CHECK_ERROR(); CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg)); + if (op & SLJIT_ATOMIC_USE_LS) + return SLJIT_ERR_UNSUPPORTED; + switch (GET_OPCODE(op)) { case SLJIT_MOV32: case SLJIT_MOV_U32: - return push_inst(compiler, 0xba000000 /* cs */ | R20A(tmp_r) | R16A(gpr(src_reg)) | R12A(mem_r)); - case SLJIT_MOV_U8: - mask = 0xff; + ins = 0xba000000 /* cs */ | R20A(tmp_r) | R16A(gpr(src_reg)) | R12A(mem_r); break; - case SLJIT_MOV_U16: - mask = 0xffff; + case SLJIT_MOV: + case SLJIT_MOV_P: + ins = 0xeb0000000030 /* csg */ | R36A(tmp_r) | R32A(gpr(src_reg)) | R28A(mem_r); break; default: - return push_inst(compiler, 0xeb0000000030 /* csg */ | R36A(tmp_r) | R32A(gpr(src_reg)) | R28A(mem_r)); + return SLJIT_ERR_UNSUPPORTED; } - /* tmp0 = (src_reg ^ tmp_r) & mask */ - FAIL_IF(push_inst(compiler, 0xa50f0000 /* llill */ | 
R20A(tmp1) | mask)); - FAIL_IF(push_inst(compiler, 0xb9e70000 /* xgrk */ | R4A(tmp0) | R0A(gpr(src_reg)) | R12A(tmp_r))); - FAIL_IF(push_inst(compiler, 0xa7090000 /* lghi */ | R20A(tmp_r) | 0xfffc)); - FAIL_IF(push_inst(compiler, 0xb9800000 /* ngr */ | R4A(tmp0) | R0A(tmp1))); - - /* tmp0 = tmp0 << (((mem_r ^ 0x3) & 0x3) << 3) */ - FAIL_IF(push_inst(compiler, 0xa50f0000 /* llill */ | R20A(tmp1) | (sljit_ins)((mask == 0xff) ? 0x18 : 0x10))); - FAIL_IF(push_inst(compiler, 0xb9800000 /* ngr */ | R4A(tmp_r) | R0A(mem_r))); - FAIL_IF(push_inst(compiler, 0xec0000000057 /* rxsbg */ | R36A(tmp1) | R32A(mem_r) | (59 << 24) | (60 << 16) | (3 << 8))); - FAIL_IF(push_inst(compiler, 0xeb000000000d /* sllg */ | R36A(tmp0) | R32A(tmp0) | R28A(tmp1))); - - /* Already computed: tmp_r = mem_r & ~0x3 */ + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; - FAIL_IF(push_inst(compiler, 0x58000000 /* l */ | R20A(tmp1) | R12A(tmp_r))); - FAIL_IF(push_inst(compiler, 0x1700 /* x */ | R4A(tmp0) | R0A(tmp1))); - return push_inst(compiler, 0xba000000 /* cs */ | R20A(tmp1) | R16A(tmp0) | R12A(tmp_r)); + return push_inst(compiler, ins); } /* --------------------------------------------------------------------- */ diff --git a/src/sljit/sljitNativeX86_32.c b/src/sljit/sljitNativeX86_32.c index 59ea04a5c..281fa0aa0 100644 --- a/src/sljit/sljitNativeX86_32.c +++ b/src/sljit/sljitNativeX86_32.c @@ -311,8 +311,8 @@ static sljit_u8* detect_far_jump_type(struct sljit_jump *jump, sljit_u8 *code_pt #define ENTER_TMP_TO_S 0x00002 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { sljit_s32 word_arg_count, saved_arg_count, float_arg_count; sljit_s32 size, args_size, types, status; @@ -323,8 +323,10 @@ 
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi #endif CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); + + scratches = ENTER_GET_REGS(scratches); /* Emit ENDBR32 at function entry if needed. */ FAIL_IF(emit_endbranch(compiler)); @@ -536,14 +538,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { sljit_s32 args_size; CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + set_set_context(compiler, options, arg_types, scratches, saveds, local_size); + + scratches = ENTER_GET_REGS(scratches); arg_types >>= SLJIT_ARG_SHIFT; args_size = 0; diff --git a/src/sljit/sljitNativeX86_64.c b/src/sljit/sljitNativeX86_64.c index 1ab79293c..ce7e2e579 100644 --- a/src/sljit/sljitNativeX86_64.c +++ b/src/sljit/sljitNativeX86_64.c @@ -454,14 +454,16 @@ typedef struct { #endif /* _WIN64 */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 
fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { sljit_uw size; sljit_s32 word_arg_count = 0; sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options); sljit_s32 saved_regs_size, tmp, i; #ifdef _WIN64 + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 saved_float_regs_size; sljit_s32 saved_float_regs_offset = 0; sljit_s32 float_arg_count = 0; @@ -469,8 +471,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi sljit_u8 *inst; CHECK_ERROR(); - CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, local_size)); + set_emit_enter(compiler, options, arg_types, scratches, saveds, local_size); + + scratches = ENTER_GET_REGS(scratches); +#ifdef _WIN64 + saveds = ENTER_GET_REGS(saveds); +#endif /* _WIN64 */ if (options & SLJIT_ENTER_REG_ARG) arg_types = 0; @@ -630,19 +637,25 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler, - sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds, - sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size) + sljit_s32 options, sljit_s32 arg_types, + sljit_s32 scratches, sljit_s32 saveds, sljit_s32 local_size) { sljit_s32 saved_regs_size; #ifdef _WIN64 + sljit_s32 fscratches = ENTER_GET_FLOAT_REGS(scratches); + sljit_s32 fsaveds = ENTER_GET_FLOAT_REGS(saveds); sljit_s32 saved_float_regs_size; #endif /* _WIN64 */ CHECK_ERROR(); - CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size)); - 
set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size); + CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, local_size)); + set_set_context(compiler, options, arg_types, scratches, saveds, local_size); + + scratches = ENTER_GET_REGS(scratches); #ifdef _WIN64 + saveds = ENTER_GET_REGS(saveds); + local_size += SLJIT_LOCALS_OFFSET; saved_float_regs_size = GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sse2_reg); diff --git a/src/sljit/sljitNativeX86_common.c b/src/sljit/sljitNativeX86_common.c index ecb7e9be3..496f80c55 100644 --- a/src/sljit/sljitNativeX86_common.c +++ b/src/sljit/sljitNativeX86_common.c @@ -239,6 +239,7 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = { #define MOVDDUP_x_xm 0x12 #define MOVDQA_x_xm 0x6f #define MOVDQA_xm_x 0x7f +#define MOVDQU_x_xm 0x6f #define MOVHLPS_x_x 0x12 #define MOVHPD_m_x 0x17 #define MOVHPD_x_m 0x16 @@ -398,6 +399,13 @@ static sljit_u32 cpu_feature_list = 0; #include #elif defined(_MSC_VER) && _MSC_VER >= 1400 #include +#elif defined(__INTEL_COMPILER) +#include +#endif + +#if (defined(_MSC_VER) && _MSC_VER >= 1400) || defined(__INTEL_COMPILER) \ + || (defined(__INTEL_LLVM_COMPILER) && defined(__XSAVE__)) +#include #endif /******************************************************/ @@ -425,49 +433,20 @@ static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value) static void execute_cpu_id(sljit_u32 info[4]) { -#if defined(_MSC_VER) && _MSC_VER >= 1400 +#if (defined(_MSC_VER) && _MSC_VER >= 1400) \ + || (defined(__INTEL_COMPILER) && __INTEL_COMPILER == 2021 && __INTEL_COMPILER_UPDATE >= 7) __cpuidex((int*)info, (int)info[0], (int)info[2]); -#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__) +#elif (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1900) - /* AT&T syntax. 
*/ - __asm__ ( -#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - "movl %0, %%esi\n" - "movl (%%esi), %%eax\n" - "movl 8(%%esi), %%ecx\n" - "pushl %%ebx\n" - "cpuid\n" - "movl %%eax, (%%esi)\n" - "movl %%ebx, 4(%%esi)\n" - "popl %%ebx\n" - "movl %%ecx, 8(%%esi)\n" - "movl %%edx, 12(%%esi)\n" -#else /* !SLJIT_CONFIG_X86_32 */ - "movq %0, %%rsi\n" - "movl (%%rsi), %%eax\n" - "movl 8(%%rsi), %%ecx\n" - "cpuid\n" - "movl %%eax, (%%rsi)\n" - "movl %%ebx, 4(%%rsi)\n" - "movl %%ecx, 8(%%rsi)\n" - "movl %%edx, 12(%%rsi)\n" -#endif /* SLJIT_CONFIG_X86_32 */ - : - : "r" (info) -#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - : "memory", "eax", "ecx", "edx", "esi" -#else /* !SLJIT_CONFIG_X86_32 */ - : "memory", "rax", "rbx", "rcx", "rdx", "rsi" -#endif /* SLJIT_CONFIG_X86_32 */ - ); + __get_cpuid_count(info[0], info[2], info, info + 1, info + 2, info + 3); -#else /* _MSC_VER < 1400 */ +#elif (defined(_MSC_VER) || defined(__INTEL_COMPILER)) \ + && (defined(SLJIT_CONFIG_X86_32) && SLJIT_CONFIG_X86_32) /* Intel syntax. 
*/ __asm { -#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) mov esi, info mov eax, [esi] mov ecx, [esi + 8] @@ -476,30 +455,48 @@ static void execute_cpu_id(sljit_u32 info[4]) mov [esi + 4], ebx mov [esi + 8], ecx mov [esi + 12], edx -#else /* !SLJIT_CONFIG_X86_32 */ - mov rsi, info - mov eax, [rsi] - mov ecx, [rsi + 8] - cpuid - mov [rsi], eax - mov [rsi + 4], ebx - mov [rsi + 8], ecx - mov [rsi + 12], edx -#endif /* SLJIT_CONFIG_X86_32 */ } -#endif /* _MSC_VER && _MSC_VER >= 1400 */ +#else + + __asm__ __volatile__ ( + "cpuid\n" + : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3]) + : "0" (info[0]), "2" (info[2]) + ); + +#endif } static sljit_u32 execute_get_xcr0_low(void) { sljit_u32 xcr0; -#if defined(_MSC_VER) && _MSC_VER >= 1400 +#if (defined(_MSC_VER) && _MSC_VER >= 1400) || defined(__INTEL_COMPILER) \ + || (defined(__INTEL_LLVM_COMPILER) && defined(__XSAVE__)) xcr0 = (sljit_u32)_xgetbv(0); -#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__) +#elif defined(__TINYC__) + + __asm__ ( + "xorl %%ecx, %%ecx\n" + ".byte 0x0f\n" + ".byte 0x01\n" + ".byte 0xd0\n" + : "=a" (xcr0) + : +#if defined(SLJIT_CONFIG_X86_32) && SLJIT_CONFIG_X86_32 + : "ecx", "edx" +#else /* !SLJIT_CONFIG_X86_32 */ + : "rcx", "rdx" +#endif /* SLJIT_CONFIG_X86_32 */ + ); + +#elif (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20220100) \ + || (defined(__clang__) && __clang_major__ < 14) \ + || (defined(__GNUC__) && __GNUC__ < 3) \ + || defined(__SUNPRO_C) || defined(__SUNPRO_CC) /* AT&T syntax. */ __asm__ ( @@ -507,23 +504,37 @@ static sljit_u32 execute_get_xcr0_low(void) "xgetbv\n" : "=a" (xcr0) : -#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) +#if defined(SLJIT_CONFIG_X86_32) && SLJIT_CONFIG_X86_32 : "ecx", "edx" #else /* !SLJIT_CONFIG_X86_32 */ : "rcx", "rdx" #endif /* SLJIT_CONFIG_X86_32 */ ); -#else /* _MSC_VER < 1400 */ +#elif defined(_MSC_VER) /* Intel syntax. 
*/ __asm { - mov ecx, 0 + xor ecx, ecx xgetbv mov xcr0, eax } -#endif /* _MSC_VER && _MSC_VER >= 1400 */ +#else + + __asm__ ( + "xor{l %%ecx, %%ecx | ecx, ecx}\n" + "xgetbv\n" + : "=a" (xcr0) + : +#if defined(SLJIT_CONFIG_X86_32) && SLJIT_CONFIG_X86_32 + : "ecx", "edx" +#else /* !SLJIT_CONFIG_X86_32 */ + : "rcx", "rdx" +#endif /* SLJIT_CONFIG_X86_32 */ + ); + +#endif return xcr0; } @@ -549,6 +560,10 @@ static void get_cpu_features(void) if (max_id >= 1) { info[0] = 1; +#if defined(SLJIT_CONFIG_X86_32) && SLJIT_CONFIG_X86_32 + /* Winchip 2 and Cyrix MII bugs */ + info[1] = info[2] = 0; +#endif execute_cpu_id(info); if (info[2] & 0x80000) @@ -565,11 +580,17 @@ static void get_cpu_features(void) feature_list |= CPU_FEATURE_CMOV; } - info[0] = 0x80000001; + info[0] = 0x80000000; execute_cpu_id(info); + max_id = info[0]; + + if (max_id >= 0x80000001) { + info[0] = 0x80000001; + execute_cpu_id(info); - if (info[2] & 0x20) - feature_list |= CPU_FEATURE_LZCNT; + if (info[2] & 0x20) + feature_list |= CPU_FEATURE_LZCNT; + } if ((feature_list & CPU_FEATURE_OSXSAVE) && (execute_get_xcr0_low() & 0x4) == 0) feature_list &= ~(sljit_u32)(CPU_FEATURE_AVX | CPU_FEATURE_AVX2); @@ -1017,6 +1038,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) case SLJIT_HAS_COPY_F32: case SLJIT_HAS_COPY_F64: case SLJIT_HAS_ATOMIC: + case SLJIT_HAS_MEMORY_BARRIER: return 1; #if !(defined SLJIT_IS_FPU_AVAILABLE) || SLJIT_IS_FPU_AVAILABLE @@ -1476,6 +1498,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0); #endif break; + case SLJIT_MEMORY_BARRIER: + inst = (sljit_u8*)ensure_buf(compiler, 1 + 3); + FAIL_IF(!inst); + INC_SIZE(3); + inst[0] = GROUP_0F; + inst[1] = 0xae; + inst[2] = 0xf0; + return SLJIT_SUCCESS; case SLJIT_ENDBR: return emit_endbranch(compiler); case SLJIT_SKIP_FRAMES_BEFORE_RETURN: @@ -3617,7 +3647,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fselect(struct 
sljit_compiler *com } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3626,7 +3656,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co sljit_uw op; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_mov(compiler, type, vreg, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -3670,13 +3700,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co return SLJIT_SUCCESS; if ((op & VEX_256) || ((cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX))) - return emit_vex_instruction(compiler, op, freg, 0, srcdst, srcdstw); + return emit_vex_instruction(compiler, op, vreg, 0, srcdst, srcdstw); - return emit_groupf(compiler, op, freg, srcdst, srcdstw); + return emit_groupf(compiler, op, vreg, srcdst, srcdstw); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3687,7 +3717,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil sljit_uw op; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_replicate(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -3753,48 +3783,48 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (elem_size >= 3) compiler->mode32 = 0; #endif /* SLJIT_CONFIG_X86_64 */ - FAIL_IF(emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw)); + FAIL_IF(emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | 
EX86_SSE2_OP1, vreg, 0, src, srcw)); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 1; #endif /* SLJIT_CONFIG_X86_64 */ - src = freg; + src = vreg; srcw = 0; } if (reg_size == 5) op |= VEX_256; - return emit_vex_instruction(compiler, op, freg, 0, src, srcw); + return emit_vex_instruction(compiler, op, vreg, 0, src, srcw); } } if (type & SLJIT_SIMD_FLOAT) { if (src == SLJIT_IMM) { if (use_vex) - return emit_vex_instruction(compiler, XORPD_x_xm | (reg_size == 5 ? VEX_256 : 0) | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0); + return emit_vex_instruction(compiler, XORPD_x_xm | (reg_size == 5 ? VEX_256 : 0) | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2 | VEX_SSE2_OPV, vreg, vreg, vreg, 0); - return emit_groupf(compiler, XORPD_x_xm | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0); + return emit_groupf(compiler, XORPD_x_xm | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, vreg, vreg, 0); } SLJIT_ASSERT(reg_size == 4); if (use_vex) { if (elem_size == 3) - return emit_vex_instruction(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, 0, src, srcw); + return emit_vex_instruction(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, vreg, 0, src, srcw); SLJIT_ASSERT(!(src & SLJIT_MEM)); - FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, vreg, src, src, 0)); return emit_byte(compiler, 0); } - if (elem_size == 2 && freg != src) { - FAIL_IF(emit_sse2_load(compiler, 1, freg, src, srcw)); - src = freg; + if (elem_size == 2 && vreg != src) { + FAIL_IF(emit_sse2_load(compiler, 1, vreg, src, srcw)); + src = vreg; srcw = 0; } op = (elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm) | (elem_size == 2 ? 
0 : EX86_PREF_F2) | EX86_SSE2; - FAIL_IF(emit_groupf(compiler, op, freg, src, srcw)); + FAIL_IF(emit_groupf(compiler, op, vreg, src, srcw)); if (elem_size == 2) return emit_byte(compiler, 0); @@ -3820,9 +3850,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (srcw == 0 || srcw == -1) { if (use_vex) - return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0); + return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, vreg, vreg, vreg, 0); - return emit_groupf(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | EX86_PREF_66 | EX86_SSE2, freg, freg, 0); + return emit_groupf(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | EX86_PREF_66 | EX86_SSE2, vreg, vreg, 0); } #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) @@ -3864,11 +3894,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (use_vex) { if (opcode != MOVD_x_rm) { op = (opcode == 0x3a) ? 
(PINSRB_x_rm_i8 | VEX_OP_0F3A) : opcode; - FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1 | VEX_SSE2_OPV, freg, freg, src, srcw)); + FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1 | VEX_SSE2_OPV, vreg, vreg, src, srcw)); } else - FAIL_IF(emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw)); + FAIL_IF(emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, vreg, 0, src, srcw)); } else { - inst = emit_x86_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw); + inst = emit_x86_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1, vreg, 0, src, srcw); FAIL_IF(!inst); inst[0] = GROUP_0F; inst[1] = opcode; @@ -3879,13 +3909,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil } } - if (use_vex && elem_size >= 2) { + if ((cpu_feature_list & CPU_FEATURE_AVX2) && use_vex && elem_size >= 2) { #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) op = VPBROADCASTD_x_xm; #else /* !SLJIT_CONFIG_X86_32 */ op = (elem_size == 3) ? VPBROADCASTQ_x_xm : VPBROADCASTD_x_xm; #endif /* SLJIT_CONFIG_X86_32 */ - return emit_vex_instruction(compiler, op | ((reg_size == 5) ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0); + return emit_vex_instruction(compiler, op | ((reg_size == 5) ? 
VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, 0, vreg, 0); } SLJIT_ASSERT(reg_size == 4); @@ -3897,37 +3927,37 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil case 0: if (use_vex) { FAIL_IF(emit_vex_instruction(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, TMP_FREG, TMP_FREG, 0)); - return emit_vex_instruction(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, TMP_FREG, 0); + return emit_vex_instruction(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2 | VEX_SSE2_OPV, vreg, vreg, TMP_FREG, 0); } FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0)); - return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0); + return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, TMP_FREG, 0); case 1: if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, 0, freg, 0)); + FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, vreg, 0, vreg, 0)); else - FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, vreg, vreg, 0)); FAIL_IF(emit_byte(compiler, 0)); /* fallthrough */ default: if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0)); + FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, vreg, 0, vreg, 0)); else - FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, vreg, vreg, 0)); return emit_byte(compiler, 0); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) case 3: compiler->mode32 = 1; if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | 
EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0)); + FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, vreg, 0, vreg, 0)); else - FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, vreg, vreg, 0)); return emit_byte(compiler, 0x44); #endif /* SLJIT_CONFIG_X86_64 */ } } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 vreg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -3936,7 +3966,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile sljit_u8 *inst; sljit_u8 opcode = 0; sljit_uw op; - sljit_s32 freg_orig = freg; + sljit_s32 vreg_orig = vreg; #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) sljit_s32 srcdst_is_ereg = 0; sljit_s32 srcdst_orig = 0; @@ -3944,7 +3974,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile #endif /* SLJIT_CONFIG_X86_32 */ CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw)); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, vreg, lane_index, srcdst, srcdstw)); ADJUST_LOCAL_OFFSET(srcdst, srcdstw); @@ -4004,29 +4034,29 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (elem_size == 2) { if (use_vex) - return emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw); - return emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw); + return emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, vreg, 0, srcdst, srcdstw); + return emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, vreg, srcdst, srcdstw); } } else if (srcdst & SLJIT_MEM) { 
SLJIT_ASSERT(elem_size == 2 || elem_size == 3); if (use_vex) - return emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, 0, srcdst, srcdstw); - return emit_groupf(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw); + return emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, vreg, 0, srcdst, srcdstw); + return emit_groupf(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, vreg, srcdst, srcdstw); } else if (elem_size == 3) { if (use_vex) - return emit_vex_instruction(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, 0, srcdst, 0); - return emit_groupf(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0); + return emit_vex_instruction(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, vreg, 0, srcdst, 0); + return emit_groupf(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, vreg, srcdst, 0); } else if (use_vex) { FAIL_IF(emit_vex_instruction(compiler, XORPD_x_xm | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, TMP_FREG, TMP_FREG, 0)); - return emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F3 | EX86_SSE2 | VEX_SSE2_OPV, freg, TMP_FREG, srcdst, 0); + return emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F3 | EX86_SSE2 | VEX_SSE2_OPV, vreg, TMP_FREG, srcdst, 0); } } if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) { - freg = TMP_FREG; + vreg = TMP_FREG; lane_index -= (1 << (4 - elem_size)); - } else if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) { + } else if ((type & SLJIT_SIMD_FLOAT) && vreg == srcdst) { if (use_vex) FAIL_IF(emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, TMP_FREG, srcdst, srcdstw)); else @@ -4039,14 +4069,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile | ((type & SLJIT_SIMD_FLOAT) ? 
XORPD_x_xm : PXOR_x_xm) | EX86_SSE2; if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, op | (reg_size == 5 ? VEX_256 : 0) | VEX_SSE2_OPV, freg, freg, freg, 0)); + FAIL_IF(emit_vex_instruction(compiler, op | (reg_size == 5 ? VEX_256 : 0) | VEX_SSE2_OPV, vreg, vreg, vreg, 0)); else - FAIL_IF(emit_groupf(compiler, op, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, op, vreg, vreg, 0)); } else if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) { - FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? VEXTRACTF128_x_ym : VEXTRACTI128_x_ym) | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0)); + FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? VEXTRACTF128_x_ym : VEXTRACTI128_x_ym) | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, vreg, 0, TMP_FREG, 0)); FAIL_IF(emit_byte(compiler, 1)); - freg = TMP_FREG; + vreg = TMP_FREG; lane_index -= (1 << (4 - elem_size)); } @@ -4059,55 +4089,55 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile op = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m; /* VEX prefix clears upper bits of the target register. */ - if (use_vex && ((type & SLJIT_SIMD_STORE) || reg_size == 4 || freg == TMP_FREG)) + if (use_vex && ((type & SLJIT_SIMD_STORE) || reg_size == 4 || vreg == TMP_FREG)) FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2 - | ((type & SLJIT_SIMD_STORE) ? 0 : VEX_SSE2_OPV), freg, (type & SLJIT_SIMD_STORE) ? 0 : freg, srcdst, srcdstw)); + | ((type & SLJIT_SIMD_STORE) ? 0 : VEX_SSE2_OPV), vreg, (type & SLJIT_SIMD_STORE) ? 0 : vreg, srcdst, srcdstw)); else - FAIL_IF(emit_groupf(compiler, op | EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw)); + FAIL_IF(emit_groupf(compiler, op | EX86_PREF_66 | EX86_SSE2, vreg, srcdst, srcdstw)); - /* In case of store, freg is not TMP_FREG. */ + /* In case of store, vreg is not TMP_FREG. 
*/ } else if (type & SLJIT_SIMD_STORE) { if (lane_index == 1) { if (use_vex) - return emit_vex_instruction(compiler, MOVHLPS_x_x | EX86_SSE2 | VEX_SSE2_OPV, srcdst, srcdst, freg, 0); - return emit_groupf(compiler, MOVHLPS_x_x | EX86_SSE2, srcdst, freg, 0); + return emit_vex_instruction(compiler, MOVHLPS_x_x | EX86_SSE2 | VEX_SSE2_OPV, srcdst, srcdst, vreg, 0); + return emit_groupf(compiler, MOVHLPS_x_x | EX86_SSE2, srcdst, vreg, 0); } if (use_vex) - return emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F2 | EX86_SSE2 | VEX_SSE2_OPV, srcdst, srcdst, freg, 0); - return emit_sse2_load(compiler, 0, srcdst, freg, 0); - } else if (use_vex && (reg_size == 4 || freg == TMP_FREG)) { + return emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F2 | EX86_SSE2 | VEX_SSE2_OPV, srcdst, srcdst, vreg, 0); + return emit_sse2_load(compiler, 0, srcdst, vreg, 0); + } else if (use_vex && (reg_size == 4 || vreg == TMP_FREG)) { if (lane_index == 1) - FAIL_IF(emit_vex_instruction(compiler, MOVLHPS_x_x | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, srcdst, 0)); + FAIL_IF(emit_vex_instruction(compiler, MOVLHPS_x_x | EX86_SSE2 | VEX_SSE2_OPV, vreg, vreg, srcdst, 0)); else - FAIL_IF(emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F2 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, srcdst, 0)); + FAIL_IF(emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F2 | EX86_SSE2 | VEX_SSE2_OPV, vreg, vreg, srcdst, 0)); } else { if (lane_index == 1) - FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x | EX86_SSE2, freg, srcdst, 0)); + FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x | EX86_SSE2, vreg, srcdst, 0)); else - FAIL_IF(emit_sse2_load(compiler, 0, freg, srcdst, 0)); + FAIL_IF(emit_sse2_load(compiler, 0, vreg, srcdst, 0)); } } else if (type & SLJIT_SIMD_STORE) { if (lane_index == 0) { if (use_vex) - return emit_vex_instruction(compiler, ((srcdst & SLJIT_MEM) ? MOVSD_xm_x : MOVSD_x_xm) | EX86_PREF_F3 | EX86_SSE2 - | ((srcdst & SLJIT_MEM) ? 0 : VEX_SSE2_OPV), freg, ((srcdst & SLJIT_MEM) ? 
0 : freg), srcdst, srcdstw); - return emit_sse2_store(compiler, 1, srcdst, srcdstw, freg); + return emit_vex_instruction(compiler, MOVSD_xm_x | EX86_PREF_F3 | EX86_SSE2 | ((srcdst & SLJIT_MEM) ? 0 : VEX_SSE2_OPV), + vreg, ((srcdst & SLJIT_MEM) ? 0 : srcdst), srcdst, srcdstw); + return emit_sse2_store(compiler, 1, srcdst, srcdstw, vreg); } if (srcdst & SLJIT_MEM) { if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, srcdst, srcdstw)); + FAIL_IF(emit_vex_instruction(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, vreg, 0, srcdst, srcdstw)); else - FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw)); + FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, vreg, srcdst, srcdstw)); return emit_byte(compiler, U8(lane_index)); } if (use_vex) { - FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, srcdst, freg, freg, 0)); + FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, srcdst, vreg, vreg, 0)); return emit_byte(compiler, U8(lane_index)); } - if (srcdst == freg) + if (srcdst == vreg) op = SHUFPS_x_xm | EX86_SSE2; else { switch (lane_index) { @@ -4124,7 +4154,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } } - FAIL_IF(emit_groupf(compiler, op, srcdst, freg, 0)); + FAIL_IF(emit_groupf(compiler, op, srcdst, vreg, 0)); op &= 0xff; if (op == SHUFPS_x_xm || op == PSHUFD_x_xm) @@ -4133,23 +4163,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile return SLJIT_SUCCESS; } else { if (lane_index != 0 || (srcdst & SLJIT_MEM)) { - FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw)); + FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, vreg, srcdst, srcdstw)); 
FAIL_IF(emit_byte(compiler, U8(lane_index << 4))); } else - FAIL_IF(emit_sse2_store(compiler, 1, freg, 0, srcdst)); + FAIL_IF(emit_sse2_store(compiler, 1, vreg, 0, srcdst)); } - if (freg != TMP_FREG || (type & SLJIT_SIMD_STORE)) + if (vreg != TMP_FREG || (type & SLJIT_SIMD_STORE)) return SLJIT_SUCCESS; SLJIT_ASSERT(reg_size == 5); if (type & SLJIT_SIMD_LANE_ZERO) { - FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0)); + FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, vreg_orig, 0, TMP_FREG, 0)); return emit_byte(compiler, 0x4e); } - FAIL_IF(emit_vex_instruction(compiler, VINSERTF128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0)); + FAIL_IF(emit_vex_instruction(compiler, VINSERTF128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, vreg_orig, vreg_orig, TMP_FREG, 0)); return emit_byte(compiler, 1); } @@ -4186,9 +4216,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (use_vex && (type & SLJIT_SIMD_STORE)) { op = opcode | ((op == 3) ? 
VEX_OP_0F3A : 0); - FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | VEX_AUTO_W | EX86_SSE2_OP1 | VEX_SSE2_OPV, freg, 0, srcdst, srcdstw)); + FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | VEX_AUTO_W | EX86_SSE2_OP1 | VEX_SSE2_OPV, vreg, 0, srcdst, srcdstw)); } else { - inst = emit_x86_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw); + inst = emit_x86_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1, vreg, 0, srcdst, srcdstw); FAIL_IF(!inst); inst[0] = GROUP_0F; @@ -4202,15 +4232,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile FAIL_IF(emit_byte(compiler, U8(lane_index))); if (!(type & SLJIT_SIMD_LANE_SIGNED) || (srcdst & SLJIT_MEM)) { - if (freg == TMP_FREG && !(type & SLJIT_SIMD_STORE)) { + if (vreg == TMP_FREG && !(type & SLJIT_SIMD_STORE)) { SLJIT_ASSERT(reg_size == 5); if (type & SLJIT_SIMD_LANE_ZERO) { - FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0)); + FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, vreg_orig, 0, TMP_FREG, 0)); return emit_byte(compiler, 0x4e); } - FAIL_IF(emit_vex_instruction(compiler, VINSERTI128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0)); + FAIL_IF(emit_vex_instruction(compiler, VINSERTI128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, vreg_orig, vreg_orig, TMP_FREG, 0)); return emit_byte(compiler, 1); } @@ -4262,7 +4292,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_s32 src_lane_index) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4277,7 +4307,7 @@ 
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c #endif /* SLJIT_CONFIG_X86_32 */ CHECK_ERROR(); - CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index)); + CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, vreg, src, src_lane_index)); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 1; @@ -4301,9 +4331,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c if (reg_size == 5) { if (src_lane_index == 0) - return emit_vex_instruction(compiler, VBROADCASTSD_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0); + return emit_vex_instruction(compiler, VBROADCASTSD_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, 0, src, 0); - FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, vreg, 0, src, 0)); byte = U8(byte | (byte << 2)); return emit_byte(compiler, U8(byte | (byte << 4))); @@ -4311,8 +4341,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c if (src_lane_index == 0) { if (use_vex) - return emit_vex_instruction(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, 0, src, 0); - return emit_groupf(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, src, 0); + return emit_vex_instruction(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, vreg, 0, src, 0); + return emit_groupf(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, vreg, src, 0); } /* Changes it to SHUFPD_x_xm. 
*/ @@ -4326,9 +4356,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c SLJIT_ASSERT(elem_size == 2); if (src_lane_index == 0) - return emit_vex_instruction(compiler, VBROADCASTSS_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0); + return emit_vex_instruction(compiler, VBROADCASTSS_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, 0, src, 0); - FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, vreg, 0, src, 0)); byte = 0x44; if (src_lane_index >= 4) { @@ -4337,15 +4367,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c } FAIL_IF(emit_byte(compiler, byte)); - FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | VEX_256 | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0)); + FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | VEX_256 | pref | EX86_SSE2 | VEX_SSE2_OPV, vreg, vreg, vreg, 0)); byte = U8(src_lane_index); } else if (use_vex) { - FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | pref | EX86_SSE2 | VEX_SSE2_OPV, vreg, src, src, 0)); } else { - if (freg != src) - FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | pref | EX86_SSE2, freg, src, 0)); + if (vreg != src) + FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | pref | EX86_SSE2, vreg, src, 0)); - FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm | pref | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm | pref | EX86_SSE2, vreg, vreg, 0)); } if (elem_size == 2) { @@ -4362,13 +4392,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c if (elem_size == 0) { if (reg_size == 5 && src_lane_index >= 16) { - FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | 
VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, vreg, 0, src, 0)); FAIL_IF(emit_byte(compiler, src_lane_index >= 24 ? 0xff : 0xaa)); src_lane_index &= 0x7; - src = freg; + src = vreg; } - if (src_lane_index != 0 || (freg != src && (!(cpu_feature_list & CPU_FEATURE_AVX2) || !use_vex))) { + if (src_lane_index != 0 || (vreg != src && (!(cpu_feature_list & CPU_FEATURE_AVX2) || !use_vex))) { pref = 0; if ((src_lane_index & 0x3) == 0) { @@ -4379,33 +4409,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c byte = U8(src_lane_index >> 1); } else { if (!use_vex) { - if (freg != src) - FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0)); + if (vreg != src) + FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm | EX86_PREF_66 | EX86_SSE2, vreg, src, 0)); - FAIL_IF(emit_groupf(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0)); + FAIL_IF(emit_groupf(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, vreg, 0)); } else - FAIL_IF(emit_vex_instruction(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2 | VEX_SSE2_OPV, opcode3, freg, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2 | VEX_SSE2_OPV, opcode3, vreg, src, 0)); FAIL_IF(emit_byte(compiler, U8(src_lane_index))); } if (pref != 0) { if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, 0, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, vreg, 0, src, 0)); else - FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, vreg, src, 0)); FAIL_IF(emit_byte(compiler, byte)); } - src = freg; + src = vreg; } if (use_vex && (cpu_feature_list & CPU_FEATURE_AVX2)) - return emit_vex_instruction(compiler, 
VPBROADCASTB_x_xm | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0); + return emit_vex_instruction(compiler, VPBROADCASTB_x_xm | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, 0, src, 0); SLJIT_ASSERT(reg_size == 4); FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0)); - return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0); + return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, TMP_FREG, 0); } if ((cpu_feature_list & CPU_FEATURE_AVX2) && use_vex && src_lane_index == 0 && elem_size <= 3) { @@ -4424,7 +4454,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c if (reg_size == 5) pref |= VEX_256; - return emit_vex_instruction(compiler, pref, freg, 0, src, 0); + return emit_vex_instruction(compiler, pref, vreg, 0, src, 0); } if (reg_size == 5) { @@ -4443,22 +4473,22 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c pref = 0; break; default: - FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, vreg, 0, src, 0)); return emit_byte(compiler, U8(src_lane_index == 0 ? 
0x44 : 0xee)); } if (pref != 0) { - FAIL_IF(emit_vex_instruction(compiler, pref, freg, 0, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, pref, vreg, 0, src, 0)); byte = U8(byte | (byte << 2)); FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4)))); if (src_lane_index == 0) - return emit_vex_instruction(compiler, VPBROADCASTQ_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0); + return emit_vex_instruction(compiler, VPBROADCASTQ_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, 0, vreg, 0); - src = freg; + src = vreg; } - FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, vreg, 0, src, 0)); byte = U8(src_lane_index); byte = U8(byte | (byte << 2)); return emit_byte(compiler, U8(byte | (byte << 4))); @@ -4471,16 +4501,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c pref = (src_lane_index & 2) == 0 ? 
EX86_PREF_F2 : EX86_PREF_F3; if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, 0, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, vreg, 0, src, 0)); else - FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, vreg, src, 0)); byte = U8(byte | (byte << 2)); FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4)))); if ((cpu_feature_list & CPU_FEATURE_AVX2) && use_vex && pref == EX86_PREF_F2) - return emit_vex_instruction(compiler, VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0); + return emit_vex_instruction(compiler, VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, 0, vreg, 0); - src = freg; + src = vreg; /* fallthrough */ case 2: byte = U8(src_lane_index); @@ -4493,14 +4523,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c } if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, 0, src, 0)); + FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, vreg, 0, src, 0)); else - FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, vreg, src, 0)); return emit_byte(compiler, U8(byte | (byte << 4))); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 src, sljit_sw srcw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4510,7 +4540,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler sljit_u8 opcode; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw)); + CHECK(check_sljit_emit_simd_extend(compiler, type, vreg, src, srcw)); ADJUST_LOCAL_OFFSET(src, srcw); @@ -4533,8 +4563,8 @@ 
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler return SLJIT_SUCCESS; if (use_vex) - return emit_vex_instruction(compiler, CVTPS2PD_x_xm | ((reg_size == 5) ? VEX_256 : 0) | EX86_SSE2, freg, 0, src, srcw); - return emit_groupf(compiler, CVTPS2PD_x_xm | EX86_SSE2, freg, src, srcw); + return emit_vex_instruction(compiler, CVTPS2PD_x_xm | ((reg_size == 5) ? VEX_256 : 0) | EX86_SSE2, vreg, 0, src, srcw); + return emit_groupf(compiler, CVTPS2PD_x_xm | EX86_SSE2, vreg, src, srcw); } switch (elem_size) { @@ -4570,12 +4600,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler return SLJIT_SUCCESS; if (use_vex) - return emit_vex_instruction(compiler, opcode | ((reg_size == 5) ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, srcw); - return emit_groupf_ext(compiler, opcode | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw); + return emit_vex_instruction(compiler, opcode | ((reg_size == 5) ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, 0, src, srcw); + return emit_groupf_ext(compiler, opcode | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, vreg, src, srcw); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 freg, + sljit_s32 vreg, sljit_s32 dst, sljit_sw dstw) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); @@ -4586,7 +4616,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c sljit_u8 *inst; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw)); + CHECK(check_sljit_emit_simd_sign(compiler, type, vreg, dst, dstw)); ADJUST_LOCAL_OFFSET(dst, dstw); @@ -4607,10 +4637,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c switch (elem_size) { case 1: if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, freg, 0)); + 
FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, vreg, vreg, 0)); else - FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0)); - freg = TMP_FREG; + FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, vreg, 0)); + vreg = TMP_FREG; break; case 2: op = EX86_SSE2_OP2; @@ -4621,9 +4651,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c op |= (elem_size < 2) ? PMOVMSKB_r_x : MOVMSKPS_r_x; if (use_vex) - FAIL_IF(emit_vex_instruction(compiler, op, dst_r, 0, freg, 0)); + FAIL_IF(emit_vex_instruction(compiler, op, dst_r, 0, vreg, 0)); else - FAIL_IF(emit_groupf(compiler, op, dst_r, freg, 0)); + FAIL_IF(emit_groupf(compiler, op, dst_r, vreg, 0)); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = type & SLJIT_32; @@ -4650,9 +4680,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c dst_r = FAST_IS_REG(dst) ? 
dst : TMP_REG1; if (elem_size == 1) { - FAIL_IF(emit_vex_instruction(compiler, VEXTRACTI128_x_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0)); + FAIL_IF(emit_vex_instruction(compiler, VEXTRACTI128_x_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, vreg, 0, TMP_FREG, 0)); FAIL_IF(emit_byte(compiler, 1)); - FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, TMP_FREG, 0)); + FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, vreg, TMP_FREG, 0)); FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x | EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0)); } else { op = MOVMSKPS_r_x | VEX_256 | EX86_SSE2_OP2; @@ -4662,7 +4692,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c else if (elem_size == 3) op |= EX86_PREF_66; - FAIL_IF(emit_vex_instruction(compiler, op, dst_r, 0, freg, 0)); + FAIL_IF(emit_vex_instruction(compiler, op, dst_r, 0, vreg, 0)); } if (dst_r == TMP_REG1) { @@ -4676,7 +4706,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } static sljit_s32 emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src_freg) + sljit_s32 dst_vreg, sljit_s32 src_vreg) { sljit_uw op = ((type & SLJIT_SIMD_FLOAT) ? 
MOVAPS_x_xm : MOVDQA_x_xm) | EX86_SSE2; @@ -4685,18 +4715,21 @@ static sljit_s32 emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, if (!(type & SLJIT_SIMD_FLOAT) || SLJIT_SIMD_GET_ELEM_SIZE(type) == 3) op |= EX86_PREF_66; - return emit_groupf(compiler, op, dst_freg, src_freg, 0); + return emit_groupf(compiler, op, dst_vreg, src_vreg, 0); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_vreg, sljit_s32 src1_vreg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX); sljit_uw op = 0; + sljit_uw mov_op = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_vreg, src1_vreg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 1; @@ -4730,27 +4763,52 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3) op |= EX86_PREF_66; break; + + case SLJIT_SIMD_OP2_SHUFFLE: + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + op = PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38; + break; } if (type & SLJIT_SIMD_TEST) return SLJIT_SUCCESS; - if (reg_size == 5 || ((cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX))) { + if ((src2 & SLJIT_MEM) && SLJIT_SIMD_GET_ELEM2_SIZE(type) < reg_size) { + mov_op = ((type & SLJIT_SIMD_FLOAT) ? (MOVUPS_x_xm | (elem_size == 3 ? 
EX86_PREF_66 : 0)) : (MOVDQU_x_xm | EX86_PREF_F3)) | EX86_SSE2; + if (use_vex) + FAIL_IF(emit_vex_instruction(compiler, mov_op, TMP_FREG, 0, src2, src2w)); + else + FAIL_IF(emit_groupf(compiler, mov_op, TMP_FREG, src2, src2w)); + + src2 = TMP_FREG; + src2w = 0; + } + + if (reg_size == 5 || use_vex) { if (reg_size == 5) op |= VEX_256; - return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_freg, src1_freg, src2_freg, 0); + return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_vreg, src1_vreg, src2, src2w); } - if (dst_freg != src1_freg) { - if (dst_freg == src2_freg) - src2_freg = src1_freg; - else - FAIL_IF(emit_simd_mov(compiler, type, dst_freg, src1_freg)); + if (dst_vreg != src1_vreg) { + if (dst_vreg == src2) { + if (SLJIT_SIMD_GET_OPCODE(type) == SLJIT_SIMD_OP2_SHUFFLE) { + FAIL_IF(emit_simd_mov(compiler, type, TMP_FREG, src2)); + FAIL_IF(emit_simd_mov(compiler, type, dst_vreg, src1_vreg)); + src2 = TMP_FREG; + src2w = 0; + } else + src2 = src1_vreg; + } else + FAIL_IF(emit_simd_mov(compiler, type, dst_vreg, src1_vreg)); } - FAIL_IF(emit_groupf(compiler, op | EX86_SSE2, dst_freg, src2_freg, 0)); - return SLJIT_SUCCESS; + if (op & (VEX_OP_0F38 | VEX_OP_0F3A)) + return emit_groupf_ext(compiler, op | EX86_SSE2, dst_vreg, src2, src2w); + return emit_groupf(compiler, op | EX86_SSE2, dst_vreg, src2, src2w); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, @@ -4760,8 +4818,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler CHECK_ERROR(); CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg)); + if ((op & SLJIT_ATOMIC_USE_LS) || GET_OPCODE(op) == SLJIT_MOV_S8 || GET_OPCODE(op) == SLJIT_MOV_S16 || GET_OPCODE(op) == SLJIT_MOV_S32) + return SLJIT_ERR_UNSUPPORTED; + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + SLJIT_SKIP_CHECKS(compiler); - return sljit_emit_op1(compiler, op, dst_reg, 0, SLJIT_MEM1(mem_reg), 0); + 
return sljit_emit_op1(compiler, op & ~SLJIT_ATOMIC_USE_CAS, dst_reg, 0, SLJIT_MEM1(mem_reg), 0); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op, @@ -4770,8 +4834,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler sljit_s32 temp_reg) { sljit_uw pref; - sljit_s32 free_reg = TMP_REG1; #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + sljit_s32 saved_reg = TMP_REG1; + sljit_s32 swap_tmp = 0; sljit_sw srcw = 0; sljit_sw tempw = 0; #endif /* SLJIT_CONFIG_X86_32 */ @@ -4784,18 +4849,43 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler SLJIT_ASSERT(FAST_IS_REG(src_reg) || src_reg == SLJIT_MEM1(SLJIT_SP)); SLJIT_ASSERT(FAST_IS_REG(temp_reg) || temp_reg == SLJIT_MEM1(SLJIT_SP)); + if ((op & SLJIT_ATOMIC_USE_LS) || GET_OPCODE(op) == SLJIT_MOV_S8 || GET_OPCODE(op) == SLJIT_MOV_S16 || GET_OPCODE(op) == SLJIT_MOV_S32) + return SLJIT_ERR_UNSUPPORTED; + + if (op & SLJIT_ATOMIC_TEST) + return SLJIT_SUCCESS; + op = GET_OPCODE(op); + #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + if (temp_reg == SLJIT_TMP_DEST_REG) { + FAIL_IF(emit_byte(compiler, XCHG_EAX_r | reg_map[TMP_REG1])); + + if (src_reg == SLJIT_R0) + src_reg = TMP_REG1; + if (mem_reg == SLJIT_R0) + mem_reg = TMP_REG1; + + temp_reg = SLJIT_R0; + swap_tmp = 1; + } + + /* Src is virtual register or its low byte is not accessible. */ if ((src_reg & SLJIT_MEM) || (op == SLJIT_MOV_U8 && reg_map[src_reg] >= 4)) { - /* Src is virtual register or its low byte is not accessible. */ - SLJIT_ASSERT(src_reg != SLJIT_R1); - free_reg = src_reg; + SLJIT_ASSERT(src_reg != SLJIT_R1 && temp_reg != SLJIT_TMP_DEST_REG); + + if (swap_tmp) { + saved_reg = (mem_reg != SLJIT_R1) ? 
SLJIT_R1 : SLJIT_R2; - EMIT_MOV(compiler, TMP_REG1, 0, src_reg, srcw); - src_reg = TMP_REG1; + EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, saved_reg, 0); + EMIT_MOV(compiler, saved_reg, 0, src_reg, srcw); + } else + EMIT_MOV(compiler, TMP_REG1, 0, src_reg, srcw); + + src_reg = saved_reg; if (mem_reg == src_reg) - mem_reg = TMP_REG1; + mem_reg = saved_reg; } #endif /* SLJIT_CONFIG_X86_32 */ @@ -4803,29 +4893,37 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 0; - EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0); + EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_R0, 0); EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, 0); if (src_reg == SLJIT_R0) - src_reg = free_reg; + src_reg = TMP_REG2; if (mem_reg == SLJIT_R0) - mem_reg = free_reg; + mem_reg = TMP_REG2; #else /* !SLJIT_CONFIG_X86_64 */ - if (src_reg == TMP_REG1 && mem_reg == SLJIT_R0 && (free_reg & SLJIT_MEM)) { - EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_R1, 0); - EMIT_MOV(compiler, SLJIT_R1, 0, SLJIT_R0, 0); - EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw); + SLJIT_ASSERT(!swap_tmp); - mem_reg = SLJIT_R1; - free_reg = SLJIT_R1; + if (src_reg == TMP_REG1) { + if (mem_reg == SLJIT_R0) { + EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_R1, 0); + EMIT_MOV(compiler, SLJIT_R1, 0, SLJIT_R0, 0); + EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw); + + mem_reg = SLJIT_R1; + saved_reg = SLJIT_R1; + } else { + EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_R0, 0); + EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw); + saved_reg = SLJIT_R0; + } } else { - EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0); + EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R0, 0); EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw); if (src_reg == SLJIT_R0) - src_reg = free_reg; + src_reg = TMP_REG1; if (mem_reg == SLJIT_R0) - mem_reg = free_reg; + mem_reg = TMP_REG1; } #endif /* SLJIT_CONFIG_X86_64 */ } @@ -4847,14 +4945,25 @@ SLJIT_API_FUNC_ATTRIBUTE 
sljit_s32 sljit_emit_atomic_store(struct sljit_compiler FAIL_IF(emit_groupf(compiler, (op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r) | pref, src_reg, SLJIT_MEM1(mem_reg), 0)); +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + if (swap_tmp) { + SLJIT_ASSERT(temp_reg == SLJIT_R0); + FAIL_IF(emit_byte(compiler, XCHG_EAX_r | reg_map[TMP_REG1])); + + if (saved_reg != TMP_REG1) + return emit_mov(compiler, saved_reg, 0, SLJIT_MEM1(SLJIT_SP), 0); + return SLJIT_SUCCESS; + } +#endif /* SLJIT_CONFIG_X86_32 */ + if (temp_reg != SLJIT_R0) { #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 0; - return emit_mov(compiler, SLJIT_R0, 0, TMP_REG1, 0); + return emit_mov(compiler, SLJIT_R0, 0, TMP_REG2, 0); #else /* !SLJIT_CONFIG_X86_64 */ - EMIT_MOV(compiler, SLJIT_R0, 0, free_reg, 0); - if (free_reg != TMP_REG1) - return emit_mov(compiler, free_reg, 0, (free_reg == SLJIT_R1) ? SLJIT_MEM1(SLJIT_SP) : TMP_REG1, 0); + EMIT_MOV(compiler, SLJIT_R0, 0, (saved_reg == SLJIT_R0) ? SLJIT_MEM1(SLJIT_SP) : saved_reg, 0); + if (saved_reg == SLJIT_R1) + return emit_mov(compiler, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_SP), 0); #endif /* SLJIT_CONFIG_X86_64 */ } return SLJIT_SUCCESS;