Skip to content
This repository has been archived by the owner on Jun 10, 2024. It is now read-only.

Commit

Permalink
fix: dynamic apply for x86_64 on macOS/linux
Browse files Browse the repository at this point in the history
  • Loading branch information
bitwalker committed Feb 27, 2023
1 parent 8b87eba commit 8d0dd93
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 90 deletions.
81 changes: 43 additions & 38 deletions library/rt/src/function/apply/dynamic/asm/dynamic_apply_linux.s
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,28 @@ __firefly_dynamic_apply:
# At this point, the following registers are bound:
#
# rdi <- callee
# rsi <- argv
# rdx <- argc
# rsi <- process
# rdx <- argv
# rcx <- argc
#
# Save the parent base pointer for when control returns to this call frame.
# CFA directives will inform the unwinder to expect %rbp at the bottom of the
# CFA directives will inform the unwinder to expect rbp at the bottom of the
# stack for this frame, so this should be the last value on the stack in the caller
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp

# Save our callee and argv pointers, and argc
mov r10, rdi
mov r11, rsi
mov rax, rdx
# Save our callee, process and argv pointers, and argc
mov r10, rdi
mov r11, rsi
mov rdi, rdx
mov rax, rcx

# Determine if spills are needed
# In the common case in which they are not, we perform a tail call
cmp rdx, 7
cmp rcx, 6
ja .L_dyn_call_spill

.L_dyn_call_no_spill:
Expand All @@ -42,9 +44,9 @@ __firefly_dynamic_apply:

# Calculate offset in jump table to block which handles the specific
# number of registers we have arguments for, then jump to that block
lea rcx, [rip + .L_dyn_call_jt]
mov rcx, [rcx + rdx * 8]
jmp rcx
lea rdx, [rip + .L_dyn_call_jt]
mov rax, [rdx + 8*rax]
jmp rax

# All of these basic blocks perform a tail call. As such,
# the unwinder will skip over this frame should the callee
Expand All @@ -54,39 +56,39 @@ __firefly_dynamic_apply:
jmp r10

.L_dyn_call_regs1:
mov rdi, [r11]
mov rsi, [rdi]
pop rbp
jmp r10

.L_dyn_call_regs2:
mov rdi, [r11]
mov rsi, [r11 + 8]
pop rbp
jmp r10
mov rsi, [rdi]
mov rdx, [rdi + 8]
pop rbp
jmp r10

.L_dyn_call_regs3:
mov rdi, [r11]
mov rsi, [r11 + 8]
mov rdx, [r11 + 16]
pop rbp
jmp r10
mov rsi, [rdi]
mov rdx, [rdi + 8]
mov rcx, [rdi + 16]
pop rbp
jmp r10

.L_dyn_call_regs4:
mov rdi, [r11]
mov rsi, [r11 + 8]
mov rdx, [r11 + 16]
mov rcx, [r11 + 24]
pop rbp
jmp r10
mov rsi, [rdi]
mov rdx, [rdi + 8]
mov rcx, [rdi + 16]
mov r8, [rdi + 24]
pop rbp
jmp r10

.L_dyn_call_regs5:
mov rdi, [r11]
mov rsi, [r11 + 8]
mov rdx, [r11 + 16]
mov rcx, [r11 + 24]
mov r8, [r11 + 32]
pop rbp
jmp r10
mov rsi, [rdi]
mov rdx, [rdi + 8]
mov rcx, [rdi + 16]
mov r8, [rdi + 24]
mov r9, [rdi + 32]
pop rbp
jmp r10

.L_dyn_call_regs6:
mov rdi, [r11]
Expand All @@ -105,7 +107,7 @@ __firefly_dynamic_apply:

# Calculate spill count for later (rep uses rcx for the iteration count,
# which in this case is the number of quadwords to copy)
mov rcx, rdx
mov r8, rcx
sub rcx, 6

# Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
Expand All @@ -116,16 +118,19 @@ __firefly_dynamic_apply:
sub rsp, rax

# load source pointer (last item of argv)
lea rsi, [r11 + rdx * 8 + -8]
lea rsi, [rdi + r8 * 8 - 8]
# load destination pointer (top of spill region)
lea rdi, [rsp + rcx * 8 + -8]
lea rdi, [rsp + rcx * 8 - 8]
# copy rcx quadwords from rsi to rdi, in reverse
std
rep movsq
cld

# We've spilled arguments, so we have more than 6 args
mov rdi, [r11]
mov r8, rdi # We need to move rdi to r11, but it is occupied, so temporarily move to r8
mov rdi, r11 # Move process pointer to rdi
mov r11, r8 # Move r8 to r11
mov rsi, [r11]
mov rsi, [r11 + 8]
mov rdx, [r11 + 16]
mov rcx, [r11 + 24]
Expand Down
97 changes: 45 additions & 52 deletions library/rt/src/function/apply/dynamic/asm/dynamic_apply_macos.s
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ L_dyn_call_begin:
# At this point, the following registers are bound:
#
# rdi <- callee
# rsi <- argv
# rdx <- argc
# rsi <- process
# rdx <- argv
# rcx <- argc
#
# Save the parent base pointer for when control returns to this call frame.
# CFA directives will inform the unwinder to expect rbp at the bottom of the
Expand All @@ -20,14 +21,15 @@ L_dyn_call_begin:
mov rbp, rsp
.cfi_def_cfa_register rbp

# Save our callee and argv pointers, and argc
# Save our callee, process and argv pointers, and argc
mov r10, rdi
mov r11, rsi
mov rax, rdx
mov rdi, rdx
mov rax, rcx

# Determine if spills are needed
# Determine if spills are needed (argc must be <= 6 for the no-spill path)
# In the common case in which they are not, we perform a tail call
cmp rdx, 7
cmp rcx, 6
ja L_dyn_call_spill

L_dyn_call_no_spill:
Expand All @@ -38,62 +40,52 @@ L_dyn_call_no_spill:

# Calculate offset in jump table to block which handles the specific
# number of registers we have arguments for, then jump to that block
lea rcx, [rip + L_dyn_call_jt]
mov rax, [rcx + rax * 4]
add rax, rcx
jmp [rax]
lea rdx, [rip + L_dyn_call_jt]
movsxd rax, dword ptr [rdx + 4*rax]
add rax, rdx
jmp rax

# All of these basic blocks perform a tail call. As such,
# the unwinder will skip over this frame should the callee
# throw an exception
L_dyn_call_regs0:
pop rbp
jmp [r10]
jmp r10

L_dyn_call_regs1:
mov rdi, [r11]
mov rsi, [rdi]
pop rbp
jmp [r10]
jmp r10

L_dyn_call_regs2:
mov rdi, [r11]
mov rsi, [r11 + 8]
mov rsi, [rdi]
mov rdx, [rdi + 8]
pop rbp
jmp [r10]
jmp r10

L_dyn_call_regs3:
mov rdi, [r11]
mov rsi, [r11 + 8]
mov rdx, [r11 + 16]
mov rsi, [rdi]
mov rdx, [rdi + 8]
mov rcx, [rdi + 16]
pop rbp
jmp [r10]
jmp r10

L_dyn_call_regs4:
mov rdi, [r11]
mov rsi, [r11 + 8]
mov rdx, [r11 + 16]
mov rcx, [r11 + 24]
mov rsi, [rdi]
mov rdx, [rdi + 8]
mov rcx, [rdi + 16]
mov r8, [rdi + 24]
pop rbp
jmp [r10]
jmp r10

L_dyn_call_regs5:
mov rdi, [r11]
mov rsi, [r11 + 8]
mov rdx, [r11 + 16]
mov rcx, [r11 + 24]
mov r8, [r11 + 32]
mov rsi, [rdi]
mov rdx, [rdi + 8]
mov rcx, [rdi + 16]
mov r8, [rdi + 24]
mov r9, [rdi + 32]
pop rbp
jmp [r10]

L_dyn_call_regs6:
mov rdi, [r11]
mov rsi, [r11 + 8]
mov rdx, [r11 + 16]
mov rcx, [r11 + 24]
mov r8, [r11 + 32]
mov r9, [r11 + 40]
pop rbp
jmp [r10]
jmp r10

L_dyn_call_spill:
# If we hit this block, we have identified that there are
Expand All @@ -102,7 +94,7 @@ L_dyn_call_spill:

# Calculate spill count for later (rep uses rcx for the iteration count,
# which in this case is the number of quadwords to copy)
mov rcx, rdx
mov r8, rcx
sub rcx, 6

# Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
Expand All @@ -113,21 +105,24 @@ L_dyn_call_spill:
sub rsp, rax

# load source pointer (last item of argv)
lea rsi, [r11 + rdx * 8 - 8]
lea rsi, [rdi + r8 * 8 - 8]
# load destination pointer (top of spill region)
lea rdi, [rsp + rcx * 8 - 8]
lea rdi, [rsp + rcx * 8 - 8]
# copy rcx quadwords from rsi to rdi, in reverse
std
rep movsq
cld

# We've spilled arguments, so we have more than 6 args
mov rdi, [r11]
mov rsi, [r11 + 8]
mov r8, rdi # We need to move rdi to r11, but it is occupied, so temporarily move to r8
mov rdi, r11 # Move process pointer to rdi
mov r11, r8 # Move r8 to r11
mov rsi, [r11]
mov rsi, [r11 + 8]
mov rdx, [r11 + 16]
mov rcx, [r11 + 24]
mov r8, [r11 + 32]
mov r9, [r11 + 40]
mov r8, [r11 + 32]
mov r9, [r11 + 40]

L_dyn_call_exec:
# If we spill arguments to the stack, we can't perform
Expand All @@ -141,7 +136,7 @@ L_dyn_call_exec:
# This instruction will push the return address and jump,
# and we can expect rbp to be the same as we left it upon
# return.
call [r10]
call r10

L_dyn_call_ret:
# Non-tail call completed successfully
Expand All @@ -156,21 +151,19 @@ L_dyn_call_end:
# a variable number of register-based arguments
.p2align 2
.data_region jt32
.set L_dyn_call_jt_entry0, L_dyn_call_exec-L_dyn_call_jt
.set L_dyn_call_jt_entry0, L_dyn_call_regs0-L_dyn_call_jt
.set L_dyn_call_jt_entry1, L_dyn_call_regs1-L_dyn_call_jt
.set L_dyn_call_jt_entry2, L_dyn_call_regs2-L_dyn_call_jt
.set L_dyn_call_jt_entry3, L_dyn_call_regs3-L_dyn_call_jt
.set L_dyn_call_jt_entry4, L_dyn_call_regs4-L_dyn_call_jt
.set L_dyn_call_jt_entry5, L_dyn_call_regs5-L_dyn_call_jt
.set L_dyn_call_jt_entry6, L_dyn_call_regs6-L_dyn_call_jt
L_dyn_call_jt:
.long L_dyn_call_jt_entry0
.long L_dyn_call_jt_entry1
.long L_dyn_call_jt_entry2
.long L_dyn_call_jt_entry3
.long L_dyn_call_jt_entry4
.long L_dyn_call_jt_entry5
.long L_dyn_call_jt_entry6
.end_data_region

# The following is the LSDA metadata for exception handling
Expand Down

0 comments on commit 8d0dd93

Please sign in to comment.