Date: Sun, 12 Apr 2026 02:23:11 +0100
On Sun, Apr 12, 2026 at 1:37 AM Thiago Macieira wrote:
>
> It isn't designed for a tail call. In fact, one major problem of your proposal
> is that you need one register to be clobbered because you need to save the
> address to which you'll jump in it.
What about putting the address inside a thread_local variable?
> If we stick to the general ABIs, then there are several caller-save
> registers that aren't used for parameter passing, and one of them specifically
> reserved for function call thunks to use (I think it's r11).
Here's what I'm thinking for System V x86_64 to accommodate __m512:
.macro __builtin_push_all_argument_registers_onto_stack
# Spill every SysV x86_64 argument-passing register into a 576-byte
# stack frame. Layout (byte offsets from %rsp after the sub):
#     0.. 55 : rax, rdi, rsi, rdx, rcx, r8, r9      (7 * 8  =  56 bytes)
#    56..567 : zmm0-zmm7                            (8 * 64 = 512 bytes)
#   568..575 : slot for the tail-call target address (filled in later)
# 576 is a multiple of 16, so the stack's 16-byte alignment parity at
# call boundaries is preserved.
# NOTE(review): vmovdqu64 requires AVX-512F and will #UD on older CPUs
# — confirm the target baseline, or dispatch on CPUID.
# NOTE(review): r10 (static-chain register) and r11 (thunk scratch) are
# not saved — presumably out of scope for this thunk; confirm that
# static-chain (nested-function) calls need not be forwarded.
sub $576, %rsp # 56 + (8 * 64) + 8 = 576 bytes
mov %rax, 0(%rsp) # save %al too, for SysV varargs
mov %rdi, 8(%rsp)
mov %rsi, 16(%rsp)
mov %rdx, 24(%rsp)
mov %rcx, 32(%rsp)
mov %r8, 40(%rsp)
mov %r9, 48(%rsp)
vmovdqu64 %zmm0, 56(%rsp)
vmovdqu64 %zmm1, 120(%rsp)
vmovdqu64 %zmm2, 184(%rsp)
vmovdqu64 %zmm3, 248(%rsp)
vmovdqu64 %zmm4, 312(%rsp)
vmovdqu64 %zmm5, 376(%rsp)
vmovdqu64 %zmm6, 440(%rsp)
vmovdqu64 %zmm7, 504(%rsp)
# jump address gets stored later at 568(%rsp)
.endm
.macro __builtin_pop_all_arguments_off_stack_and_perform_tailcall
# Restore every SysV x86_64 argument-passing register from the frame
# built by __builtin_push_all_argument_registers_onto_stack, release
# the frame, and tail-jump to the address stored at 568(%rsp).
# Clobbers: %r11 (the register the SysV psABI sets aside as scratch for
# call thunks; it is caller-saved and never carries an argument, so no
# conforming caller or callee can observe the clobber).
mov 0(%rsp), %rax
mov 8(%rsp), %rdi
mov 16(%rsp), %rsi
mov 24(%rsp), %rdx
mov 32(%rsp), %rcx
mov 40(%rsp), %r8
mov 48(%rsp), %r9
vmovdqu64 56(%rsp), %zmm0
vmovdqu64 120(%rsp), %zmm1
vmovdqu64 184(%rsp), %zmm2
vmovdqu64 248(%rsp), %zmm3
vmovdqu64 312(%rsp), %zmm4
vmovdqu64 376(%rsp), %zmm5
vmovdqu64 440(%rsp), %zmm6
vmovdqu64 504(%rsp), %zmm7
# Load the target BEFORE releasing the frame. The previous sequence
# (add $576 then jmp *-8(%rsp)) read memory below %rsp, which is only
# defined where the 128-byte red zone exists (userland SysV); it breaks
# in kernel code, signal handlers, and -mno-red-zone builds.
mov 568(%rsp), %r11
add $576, %rsp
jmp *%r11
.endm
I think that will work perfectly for every conceivable function call
under System V x86_64.
>
> It isn't designed for a tail call. In fact, one major problem of your proposal
> is that you need one register to be clobbered because you need to save the
> address to which you'll jump in it.
What about putting the address inside a thread_local variable?
> If we stick to the general ABIs, then there are several caller-save
> registers that aren't used for parameter passing, and one of them specifically
> reserved for function call thunks to use (I think it's r11).
Here's what I'm thinking for System V x86_64 to accommodate __m512:
.macro __builtin_push_all_argument_registers_onto_stack
# Spill every SysV x86_64 argument-passing register into a 576-byte
# stack frame. Layout (byte offsets from %rsp after the sub):
#     0.. 55 : rax, rdi, rsi, rdx, rcx, r8, r9      (7 * 8  =  56 bytes)
#    56..567 : zmm0-zmm7                            (8 * 64 = 512 bytes)
#   568..575 : slot for the tail-call target address (filled in later)
# 576 is a multiple of 16, so the stack's 16-byte alignment parity at
# call boundaries is preserved.
# NOTE(review): vmovdqu64 requires AVX-512F and will #UD on older CPUs
# — confirm the target baseline, or dispatch on CPUID.
# NOTE(review): r10 (static-chain register) and r11 (thunk scratch) are
# not saved — presumably out of scope for this thunk; confirm that
# static-chain (nested-function) calls need not be forwarded.
sub $576, %rsp # 56 + (8 * 64) + 8 = 576 bytes
mov %rax, 0(%rsp) # save %al too, for SysV varargs
mov %rdi, 8(%rsp)
mov %rsi, 16(%rsp)
mov %rdx, 24(%rsp)
mov %rcx, 32(%rsp)
mov %r8, 40(%rsp)
mov %r9, 48(%rsp)
vmovdqu64 %zmm0, 56(%rsp)
vmovdqu64 %zmm1, 120(%rsp)
vmovdqu64 %zmm2, 184(%rsp)
vmovdqu64 %zmm3, 248(%rsp)
vmovdqu64 %zmm4, 312(%rsp)
vmovdqu64 %zmm5, 376(%rsp)
vmovdqu64 %zmm6, 440(%rsp)
vmovdqu64 %zmm7, 504(%rsp)
# jump address gets stored later at 568(%rsp)
.endm
.macro __builtin_pop_all_arguments_off_stack_and_perform_tailcall
# Restore every SysV x86_64 argument-passing register from the frame
# built by __builtin_push_all_argument_registers_onto_stack, release
# the frame, and tail-jump to the address stored at 568(%rsp).
# Clobbers: %r11 (the register the SysV psABI sets aside as scratch for
# call thunks; it is caller-saved and never carries an argument, so no
# conforming caller or callee can observe the clobber).
mov 0(%rsp), %rax
mov 8(%rsp), %rdi
mov 16(%rsp), %rsi
mov 24(%rsp), %rdx
mov 32(%rsp), %rcx
mov 40(%rsp), %r8
mov 48(%rsp), %r9
vmovdqu64 56(%rsp), %zmm0
vmovdqu64 120(%rsp), %zmm1
vmovdqu64 184(%rsp), %zmm2
vmovdqu64 248(%rsp), %zmm3
vmovdqu64 312(%rsp), %zmm4
vmovdqu64 376(%rsp), %zmm5
vmovdqu64 440(%rsp), %zmm6
vmovdqu64 504(%rsp), %zmm7
# Load the target BEFORE releasing the frame. The previous sequence
# (add $576 then jmp *-8(%rsp)) read memory below %rsp, which is only
# defined where the 128-byte red zone exists (userland SysV); it breaks
# in kernel code, signal handlers, and -mno-red-zone builds.
mov 568(%rsp), %r11
add $576, %rsp
jmp *%r11
.endm
I think that will work perfectly for every conceivable function call
under System V x86_64.
Received on 2026-04-12 01:23:24
