Date: Tue, 14 Apr 2026 10:42:20 +0100
Forget about all the [[musttail]] stuff. I think I've settled on a better
way of doing this.
Firstly, here's how you'd write an interceptor:
[[interceptor]] void Logger(void)
{
puts("Function called");
goto -> GetProcAddress( LoadLibraryA("graphics.dll"), "RotateMatrix" );
}
When the compiler encounters the above, it will pretend a few things:
(1) Function name will have "__core_" prepended to it (i.e.
__core_Logger). The name will be mangled, so it can be a template function
inside a namespace.
(2) The signature will be changed to "auto (void) -> void(*)(void)"
(3) The "goto ->" will be changed to "return"
So here's what you'll have:
auto __core__Z6Loggerv(void) -> void(*)(void)
{
puts("Function called");
return dlsym( dlopen("graphics", RTLD_NOW), "RotateMatrix" );
}
Next the compiler will emit a second function in assembler named
'_Z6Loggerv' as follows:
_Z6Loggerv:
save_call_state
call __core__Z6Loggerv
stash_address_in_scratch_register
restore_call_state
jmp *scratch_register
Of course, 'save_call_state' will differ by architecture (e.g. x86_64 Vs
aarch64), and also differ by convention (e.g. SystemV Vs msabi). While I
can't write 'save_call_state' once for every architecture, I *can* write it
so that it works for all known calling conventions for the selected
architecture (so for example on x86_32, it will work for cdecl, stdcall,
thiscall).
I am implementing this first of all for x86_64. The registers needed for
SystemV are a superset of the registers needed for msabi, so I just save
all the SystemV registers. Msabi requires 32 bytes of shadow space on the
stack before a function call, so I've added that in (it's harmless on
SystemV).
Some x86_64 processors only have SSE, some have AVX-256, and some have
AVX-512, so at runtime I need to execute the CPUID instruction to find out.
Here's the assembler I've got for all this at the moment:
# ============================================================
# PUSH_ARGS_AND_MODE
#
# Saves:
# GP: rax, rdi, rsi, rdx, rcx, r8, r9
# mode tag:
# 1 = SSE (xmm0..xmm7)
# 2 = AVX (ymm0..ymm7)
# 3 = AVX512 (zmm0..zmm7 + k0..k7)
#
# Notes:
# - CPUID is executed every entry.
# - RBX is preserved around CPUID because CPUID clobbers it.
# - RSP starts 8 mod 16 on both SysV and Microsoft x64 entry.
# - The chosen frame sizes keep RSP 0 mod 16 before calling
#   __core_Logger
# ============================================================
.macro PUSH_ARGS_AND_MODE
        # Spill the inbound GP argument state of the intercepted call.
        # Block layout (56 bytes):
        #    0 rax   (SysV: al = vector-register count for variadics)
        #    8 rdi  16 rsi  24 rdx  32 rcx  40 r8  48 r9
        # This is a superset of the Microsoft x64 GP argument registers
        # (rcx, rdx, r8, r9), so one layout serves both conventions.
        # RSP is 8 mod 16 on entry; 56 + the 8-byte rbx push below
        # restores 16-byte alignment before the second subtraction.
        sub     $56, %rsp
        mov     %rax, 0(%rsp)
        mov     %rdi, 8(%rsp)
        mov     %rsi, 16(%rsp)
        mov     %rdx, 24(%rsp)
        mov     %rcx, 32(%rsp)
        mov     %r8, 40(%rsp)
        mov     %r9, 48(%rsp)

        # CPUID clobbers eax/ebx/ecx/edx; rbx is callee-saved in both
        # ABIs, so preserve it across the detection sequence.
        push    %rbx

        # Runtime vector-extension detection, executed on every entry.
        # Clobbers r10 as scratch (volatile in both ABIs).
        # Default fallback is SSE.
        mov     $1, %eax
        cpuid
        bt      $27, %ecx               # CPUID.1:ECX.OSXSAVE
        jnc     .Lpush_sse\@
        bt      $28, %ecx               # CPUID.1:ECX.AVX
        jnc     .Lpush_sse\@
        xor     %ecx, %ecx
        xgetbv                          # XCR0 -> edx:eax
        mov     %eax, %r10d
        and     $0x6, %r10d             # XCR0 bits 1,2 => XMM+YMM enabled
        cmp     $0x6, %r10d
        jne     .Lpush_sse\@
        mov     $7, %eax
        xor     %ecx, %ecx
        cpuid
        bt      $16, %ebx               # CPUID.7.0:EBX bit 16 = AVX512F
        jnc     .Lpush_avx\@
        # BUGFIX: kmovq (64-bit mask spill/restore) is an AVX512BW
        # instruction, not AVX512F. Without this extra gate the AVX512
        # path raises #UD on AVX512F-only parts (e.g. Knights Landing).
        bt      $30, %ebx               # CPUID.7.0:EBX bit 30 = AVX512BW
        jnc     .Lpush_avx\@
        xor     %ecx, %ecx
        xgetbv
        and     $0xE6, %eax             # XCR0 bits 1,2,5,6,7
        cmp     $0xE6, %eax
        je      .Lpush_avx512\@
        # Fall through: CPU advertises AVX512 but the OS has not
        # enabled its XSAVE state, so only ymm state is live.
.Lpush_avx\@:
        pop     %rbx
        # AVX frame, additional 272 bytes (total 328; keeps RSP 0 mod 16):
        #   0 mode=2
        #   8/40/72/104/136/168/200/232 ymm0..ymm7 (32 bytes each)
        #   264 pad, 272 saved GP block (56 bytes)
        sub     $272, %rsp
        movq    $2, 0(%rsp)
        vmovdqu %ymm0, 8(%rsp)
        vmovdqu %ymm1, 40(%rsp)
        vmovdqu %ymm2, 72(%rsp)
        vmovdqu %ymm3, 104(%rsp)
        vmovdqu %ymm4, 136(%rsp)
        vmovdqu %ymm5, 168(%rsp)
        vmovdqu %ymm6, 200(%rsp)
        vmovdqu %ymm7, 232(%rsp)
        jmp     .Lpush_done\@
.Lpush_avx512\@:
        pop     %rbx
        # AVX512 frame, additional 592 bytes (total 648):
        #   0 mode=3
        #   8/72/136/200/264/328/392/456 zmm0..zmm7 (64 bytes each)
        #   520..576 k0..k7 (8 bytes each), 584 pad
        #   592 saved GP block (56 bytes)
        sub     $592, %rsp
        movq    $3, 0(%rsp)
        vmovdqu64 %zmm0, 8(%rsp)
        vmovdqu64 %zmm1, 72(%rsp)
        vmovdqu64 %zmm2, 136(%rsp)
        vmovdqu64 %zmm3, 200(%rsp)
        vmovdqu64 %zmm4, 264(%rsp)
        vmovdqu64 %zmm5, 328(%rsp)
        vmovdqu64 %zmm6, 392(%rsp)
        vmovdqu64 %zmm7, 456(%rsp)
        kmovq   %k0, 520(%rsp)          # kmovq requires AVX512BW (checked)
        kmovq   %k1, 528(%rsp)
        kmovq   %k2, 536(%rsp)
        kmovq   %k3, 544(%rsp)
        kmovq   %k4, 552(%rsp)
        kmovq   %k5, 560(%rsp)
        kmovq   %k6, 568(%rsp)
        kmovq   %k7, 576(%rsp)
        jmp     .Lpush_done\@
.Lpush_sse\@:
        pop     %rbx
        # SSE frame, additional 144 bytes (total 200):
        #   0 mode=1
        #   8/24/40/56/72/88/104/120 xmm0..xmm7 (16 bytes each)
        #   136 pad, 144 saved GP block (56 bytes)
        sub     $144, %rsp
        movq    $1, 0(%rsp)
        movdqu  %xmm0, 8(%rsp)          # movdqu: slots are 8 mod 16
        movdqu  %xmm1, 24(%rsp)
        movdqu  %xmm2, 40(%rsp)
        movdqu  %xmm3, 56(%rsp)
        movdqu  %xmm4, 72(%rsp)
        movdqu  %xmm5, 88(%rsp)
        movdqu  %xmm6, 104(%rsp)
        movdqu  %xmm7, 120(%rsp)
.Lpush_done\@:
.endm
# ============================================================
# POP_ARGS_AND_JUMP
#
# Expects:
# r11 = final jump target returned by __core_Logger
#
# Restores according to mode tag, then jumps through r11.
# ============================================================
.macro POP_ARGS_AND_JUMP
# Counterpart of PUSH_ARGS_AND_MODE: dispatch on the mode tag that
# macro stored at 0(%rsp), restore the saved vector + GP argument
# state, release the frame, then tail-jump to the target in r11.
# r11 is volatile (caller-saved) in both SysV and Microsoft x64, so
# clobbering it is invisible to both the caller and the target.
# The offsets below must stay in byte-exact lockstep with the three
# PUSH layouts (SSE: 200-byte frame, AVX: 328, AVX512: 648).
cmpq $1, 0(%rsp)
je .Lpop_sse\@
cmpq $2, 0(%rsp)
je .Lpop_avx\@
cmpq $3, 0(%rsp)
je .Lpop_avx512\@
ud2 # unreachable unless the mode tag was corrupted
.Lpop_sse\@:
# Mode 1: restore xmm0..xmm7 from 8..120, GP block at 144..192.
movdqu 8(%rsp), %xmm0
movdqu 24(%rsp), %xmm1
movdqu 40(%rsp), %xmm2
movdqu 56(%rsp), %xmm3
movdqu 72(%rsp), %xmm4
movdqu 88(%rsp), %xmm5
movdqu 104(%rsp), %xmm6
movdqu 120(%rsp), %xmm7
mov 144(%rsp), %rax
mov 152(%rsp), %rdi
mov 160(%rsp), %rsi
mov 168(%rsp), %rdx
mov 176(%rsp), %rcx
mov 184(%rsp), %r8
mov 192(%rsp), %r9
add $200, %rsp # drop entire SSE frame
# Stack is now exactly as at interception entry: the original
# caller's return address is on top, so the target returns to it.
jmp *%r11
.Lpop_avx\@:
# Mode 2: restore ymm0..ymm7 from 8..232, GP block at 272..320.
vmovdqu 8(%rsp), %ymm0
vmovdqu 40(%rsp), %ymm1
vmovdqu 72(%rsp), %ymm2
vmovdqu 104(%rsp), %ymm3
vmovdqu 136(%rsp), %ymm4
vmovdqu 168(%rsp), %ymm5
vmovdqu 200(%rsp), %ymm6
vmovdqu 232(%rsp), %ymm7
mov 272(%rsp), %rax
mov 280(%rsp), %rdi
mov 288(%rsp), %rsi
mov 296(%rsp), %rdx
mov 304(%rsp), %rcx
mov 312(%rsp), %r8
mov 320(%rsp), %r9
add $328, %rsp # drop entire AVX frame
jmp *%r11
.Lpop_avx512\@:
# Mode 3: restore zmm0..zmm7 from 8..456, k0..k7 from 520..576,
# GP block at 592..640.
# NOTE(review): kmovq needs AVX512BW, not just AVX512F — confirm the
# detection in PUSH_ARGS_AND_MODE gates mode 3 on CPUID.7.0:EBX bit 30,
# otherwise this path #UDs on AVX512F-only parts.
vmovdqu64 8(%rsp), %zmm0
vmovdqu64 72(%rsp), %zmm1
vmovdqu64 136(%rsp), %zmm2
vmovdqu64 200(%rsp), %zmm3
vmovdqu64 264(%rsp), %zmm4
vmovdqu64 328(%rsp), %zmm5
vmovdqu64 392(%rsp), %zmm6
vmovdqu64 456(%rsp), %zmm7
kmovq 520(%rsp), %k0
kmovq 528(%rsp), %k1
kmovq 536(%rsp), %k2
kmovq 544(%rsp), %k3
kmovq 552(%rsp), %k4
kmovq 560(%rsp), %k5
kmovq 568(%rsp), %k6
kmovq 576(%rsp), %k7
mov 592(%rsp), %rax
mov 600(%rsp), %rdi
mov 608(%rsp), %rsi
mov 616(%rsp), %rdx
mov 624(%rsp), %rcx
mov 632(%rsp), %r8
mov 640(%rsp), %r9
add $648, %rsp # drop entire AVX512 frame
jmp *%r11
.endm
# ============================================================
# Public stub
#
# Microsoft x64 requires 32 bytes of shadow space for the call.
# Reserving it here is harmless on SysV if alignment stays correct.
# ============================================================
Logger:
        PUSH_ARGS_AND_MODE              # spill GP + vector arg state, tag mode
        lea     -32(%rsp), %rsp         # Win64 shadow space; 32 is a multiple
                                        # of 16 so SysV alignment is unaffected
        call    __core_Logger           # returns the real target address in rax
        mov     %rax, %r11              # POP_ARGS_AND_JUMP expects target in r11
        lea     32(%rsp), %rsp          # release shadow space
        POP_ARGS_AND_JUMP               # restore arg state, then jmp *%r11
So I think I've got a decent plan here for the implementation. This will
successfully intercept any function call on x86_64 (even if binaries are
mixing calling conventions).
way of doing this.
Firstly, here's how you'd write an interceptor:
[[interceptor]] void Logger(void)
{
puts("Function called");
goto -> GetProcAddress( LoadLibraryA("graphics.dll"), "RotateMatrix" );
}
When the compiler encounters the above, it will pretend a few things:
(1) Function name will have "__core_" prepended to it (i.e.
__core_Logger). The name will be mangled, so it can be a template function
inside a namespace.
(2) The signature will be changed to "auto (void) -> void(*)(void)"
(3) The "goto ->" will be changed to "return"
So here's what you'll have:
auto __core__Z6Loggerv(void) -> void(*)(void)
{
puts("Function called");
return dlsym( dlopen("graphics", RTLD_NOW), "RotateMatrix" );
}
Next the compiler will emit a second function in assembler named
'_Z6Loggerv' as follows:
_Z6Loggerv:
save_call_state
call __core__Z6Loggerv
stash_address_in_scratch_register
restore_call_state
jmp *scratch_register
Of course, 'save_call_state' will differ by architecture (e.g. x86_64 Vs
aarch64), and also differ by convention (e.g. SystemV Vs msabi). While I
can't write 'save_call_state' once for every architecture, I *can* write it
so that it works for all known calling conventions for the selected
architecture (so for example on x86_32, it will work for cdecl, stdcall,
thiscall).
I am implementing this first of all for x86_64. The registers needed for
SystemV are a superset of the registers needed for msabi, so I just save
all the SystemV registers. Msabi requires 32 bytes of shadow space on the
stack before a function call, so I've added that in (it's harmless on
SystemV).
Some x86_64 processors only have SSE, some have AVX-256, and some have
AVX-512, so at runtime I need to execute the CPUID instruction to find out.
Here's the assembler I've got for all this at the moment:
# ============================================================
# PUSH_ARGS_AND_MODE
#
# Saves:
# GP: rax, rdi, rsi, rdx, rcx, r8, r9
# mode tag:
# 1 = SSE (xmm0..xmm7)
# 2 = AVX (ymm0..ymm7)
# 3 = AVX512 (zmm0..zmm7 + k0..k7)
#
# Notes:
# - CPUID is executed every entry.
# - RBX is preserved around CPUID because CPUID clobbers it.
# - RSP starts 8 mod 16 on both SysV and Microsoft x64 entry.
# - The chosen frame sizes keep RSP 0 mod 16 before calling
#   __core_Logger
# ============================================================
.macro PUSH_ARGS_AND_MODE
        # Spill the inbound GP argument state of the intercepted call.
        # Block layout (56 bytes):
        #    0 rax   (SysV: al = vector-register count for variadics)
        #    8 rdi  16 rsi  24 rdx  32 rcx  40 r8  48 r9
        # This is a superset of the Microsoft x64 GP argument registers
        # (rcx, rdx, r8, r9), so one layout serves both conventions.
        # RSP is 8 mod 16 on entry; 56 + the 8-byte rbx push below
        # restores 16-byte alignment before the second subtraction.
        sub     $56, %rsp
        mov     %rax, 0(%rsp)
        mov     %rdi, 8(%rsp)
        mov     %rsi, 16(%rsp)
        mov     %rdx, 24(%rsp)
        mov     %rcx, 32(%rsp)
        mov     %r8, 40(%rsp)
        mov     %r9, 48(%rsp)

        # CPUID clobbers eax/ebx/ecx/edx; rbx is callee-saved in both
        # ABIs, so preserve it across the detection sequence.
        push    %rbx

        # Runtime vector-extension detection, executed on every entry.
        # Clobbers r10 as scratch (volatile in both ABIs).
        # Default fallback is SSE.
        mov     $1, %eax
        cpuid
        bt      $27, %ecx               # CPUID.1:ECX.OSXSAVE
        jnc     .Lpush_sse\@
        bt      $28, %ecx               # CPUID.1:ECX.AVX
        jnc     .Lpush_sse\@
        xor     %ecx, %ecx
        xgetbv                          # XCR0 -> edx:eax
        mov     %eax, %r10d
        and     $0x6, %r10d             # XCR0 bits 1,2 => XMM+YMM enabled
        cmp     $0x6, %r10d
        jne     .Lpush_sse\@
        mov     $7, %eax
        xor     %ecx, %ecx
        cpuid
        bt      $16, %ebx               # CPUID.7.0:EBX bit 16 = AVX512F
        jnc     .Lpush_avx\@
        # BUGFIX: kmovq (64-bit mask spill/restore) is an AVX512BW
        # instruction, not AVX512F. Without this extra gate the AVX512
        # path raises #UD on AVX512F-only parts (e.g. Knights Landing).
        bt      $30, %ebx               # CPUID.7.0:EBX bit 30 = AVX512BW
        jnc     .Lpush_avx\@
        xor     %ecx, %ecx
        xgetbv
        and     $0xE6, %eax             # XCR0 bits 1,2,5,6,7
        cmp     $0xE6, %eax
        je      .Lpush_avx512\@
        # Fall through: CPU advertises AVX512 but the OS has not
        # enabled its XSAVE state, so only ymm state is live.
.Lpush_avx\@:
        pop     %rbx
        # AVX frame, additional 272 bytes (total 328; keeps RSP 0 mod 16):
        #   0 mode=2
        #   8/40/72/104/136/168/200/232 ymm0..ymm7 (32 bytes each)
        #   264 pad, 272 saved GP block (56 bytes)
        sub     $272, %rsp
        movq    $2, 0(%rsp)
        vmovdqu %ymm0, 8(%rsp)
        vmovdqu %ymm1, 40(%rsp)
        vmovdqu %ymm2, 72(%rsp)
        vmovdqu %ymm3, 104(%rsp)
        vmovdqu %ymm4, 136(%rsp)
        vmovdqu %ymm5, 168(%rsp)
        vmovdqu %ymm6, 200(%rsp)
        vmovdqu %ymm7, 232(%rsp)
        jmp     .Lpush_done\@
.Lpush_avx512\@:
        pop     %rbx
        # AVX512 frame, additional 592 bytes (total 648):
        #   0 mode=3
        #   8/72/136/200/264/328/392/456 zmm0..zmm7 (64 bytes each)
        #   520..576 k0..k7 (8 bytes each), 584 pad
        #   592 saved GP block (56 bytes)
        sub     $592, %rsp
        movq    $3, 0(%rsp)
        vmovdqu64 %zmm0, 8(%rsp)
        vmovdqu64 %zmm1, 72(%rsp)
        vmovdqu64 %zmm2, 136(%rsp)
        vmovdqu64 %zmm3, 200(%rsp)
        vmovdqu64 %zmm4, 264(%rsp)
        vmovdqu64 %zmm5, 328(%rsp)
        vmovdqu64 %zmm6, 392(%rsp)
        vmovdqu64 %zmm7, 456(%rsp)
        kmovq   %k0, 520(%rsp)          # kmovq requires AVX512BW (checked)
        kmovq   %k1, 528(%rsp)
        kmovq   %k2, 536(%rsp)
        kmovq   %k3, 544(%rsp)
        kmovq   %k4, 552(%rsp)
        kmovq   %k5, 560(%rsp)
        kmovq   %k6, 568(%rsp)
        kmovq   %k7, 576(%rsp)
        jmp     .Lpush_done\@
.Lpush_sse\@:
        pop     %rbx
        # SSE frame, additional 144 bytes (total 200):
        #   0 mode=1
        #   8/24/40/56/72/88/104/120 xmm0..xmm7 (16 bytes each)
        #   136 pad, 144 saved GP block (56 bytes)
        sub     $144, %rsp
        movq    $1, 0(%rsp)
        movdqu  %xmm0, 8(%rsp)          # movdqu: slots are 8 mod 16
        movdqu  %xmm1, 24(%rsp)
        movdqu  %xmm2, 40(%rsp)
        movdqu  %xmm3, 56(%rsp)
        movdqu  %xmm4, 72(%rsp)
        movdqu  %xmm5, 88(%rsp)
        movdqu  %xmm6, 104(%rsp)
        movdqu  %xmm7, 120(%rsp)
.Lpush_done\@:
.endm
# ============================================================
# POP_ARGS_AND_JUMP
#
# Expects:
# r11 = final jump target returned by __core_Logger
#
# Restores according to mode tag, then jumps through r11.
# ============================================================
.macro POP_ARGS_AND_JUMP
# Counterpart of PUSH_ARGS_AND_MODE: dispatch on the mode tag that
# macro stored at 0(%rsp), restore the saved vector + GP argument
# state, release the frame, then tail-jump to the target in r11.
# r11 is volatile (caller-saved) in both SysV and Microsoft x64, so
# clobbering it is invisible to both the caller and the target.
# The offsets below must stay in byte-exact lockstep with the three
# PUSH layouts (SSE: 200-byte frame, AVX: 328, AVX512: 648).
cmpq $1, 0(%rsp)
je .Lpop_sse\@
cmpq $2, 0(%rsp)
je .Lpop_avx\@
cmpq $3, 0(%rsp)
je .Lpop_avx512\@
ud2 # unreachable unless the mode tag was corrupted
.Lpop_sse\@:
# Mode 1: restore xmm0..xmm7 from 8..120, GP block at 144..192.
movdqu 8(%rsp), %xmm0
movdqu 24(%rsp), %xmm1
movdqu 40(%rsp), %xmm2
movdqu 56(%rsp), %xmm3
movdqu 72(%rsp), %xmm4
movdqu 88(%rsp), %xmm5
movdqu 104(%rsp), %xmm6
movdqu 120(%rsp), %xmm7
mov 144(%rsp), %rax
mov 152(%rsp), %rdi
mov 160(%rsp), %rsi
mov 168(%rsp), %rdx
mov 176(%rsp), %rcx
mov 184(%rsp), %r8
mov 192(%rsp), %r9
add $200, %rsp # drop entire SSE frame
# Stack is now exactly as at interception entry: the original
# caller's return address is on top, so the target returns to it.
jmp *%r11
.Lpop_avx\@:
# Mode 2: restore ymm0..ymm7 from 8..232, GP block at 272..320.
vmovdqu 8(%rsp), %ymm0
vmovdqu 40(%rsp), %ymm1
vmovdqu 72(%rsp), %ymm2
vmovdqu 104(%rsp), %ymm3
vmovdqu 136(%rsp), %ymm4
vmovdqu 168(%rsp), %ymm5
vmovdqu 200(%rsp), %ymm6
vmovdqu 232(%rsp), %ymm7
mov 272(%rsp), %rax
mov 280(%rsp), %rdi
mov 288(%rsp), %rsi
mov 296(%rsp), %rdx
mov 304(%rsp), %rcx
mov 312(%rsp), %r8
mov 320(%rsp), %r9
add $328, %rsp # drop entire AVX frame
jmp *%r11
.Lpop_avx512\@:
# Mode 3: restore zmm0..zmm7 from 8..456, k0..k7 from 520..576,
# GP block at 592..640.
# NOTE(review): kmovq needs AVX512BW, not just AVX512F — confirm the
# detection in PUSH_ARGS_AND_MODE gates mode 3 on CPUID.7.0:EBX bit 30,
# otherwise this path #UDs on AVX512F-only parts.
vmovdqu64 8(%rsp), %zmm0
vmovdqu64 72(%rsp), %zmm1
vmovdqu64 136(%rsp), %zmm2
vmovdqu64 200(%rsp), %zmm3
vmovdqu64 264(%rsp), %zmm4
vmovdqu64 328(%rsp), %zmm5
vmovdqu64 392(%rsp), %zmm6
vmovdqu64 456(%rsp), %zmm7
kmovq 520(%rsp), %k0
kmovq 528(%rsp), %k1
kmovq 536(%rsp), %k2
kmovq 544(%rsp), %k3
kmovq 552(%rsp), %k4
kmovq 560(%rsp), %k5
kmovq 568(%rsp), %k6
kmovq 576(%rsp), %k7
mov 592(%rsp), %rax
mov 600(%rsp), %rdi
mov 608(%rsp), %rsi
mov 616(%rsp), %rdx
mov 624(%rsp), %rcx
mov 632(%rsp), %r8
mov 640(%rsp), %r9
add $648, %rsp # drop entire AVX512 frame
jmp *%r11
.endm
# ============================================================
# Public stub
#
# Microsoft x64 requires 32 bytes of shadow space for the call.
# Reserving it here is harmless on SysV if alignment stays correct.
# ============================================================
Logger:
        PUSH_ARGS_AND_MODE              # spill GP + vector arg state, tag mode
        lea     -32(%rsp), %rsp         # Win64 shadow space; 32 is a multiple
                                        # of 16 so SysV alignment is unaffected
        call    __core_Logger           # returns the real target address in rax
        mov     %rax, %r11              # POP_ARGS_AND_JUMP expects target in r11
        lea     32(%rsp), %rsp          # release shadow space
        POP_ARGS_AND_JUMP               # restore arg state, then jmp *%r11
So I think I've got a decent plan here for the implementation. This will
successfully intercept any function call on x86_64 (even if binaries are
mixing calling conventions).
Received on 2026-04-14 09:42:24
