Date: Tue, 14 Apr 2026 13:52:18 +0200
If the compiler processes it, there is no need for you to provide assembler, or to define how the implementations do it.
It is just compiled or even optimized like normal C++ code.
You can (with a C++ standard proposal) just declare two template functions (to be flexible about the parameters/overload), which are called before a call to the marked function and after a call to the marked function. (The question is, whether the marking may be attributes, as it changes the program)
Then you
- get notified about the call,
- can set a debug breakpoint (if you have some location, which is not optimized away)
- can log the parameters
- get notified about the return and return type
Something like _penter, but inserted on the call site.
https://learn.microsoft.com/en-us/cpp/build/reference/gh-enable-penter-hook-function
-----Ursprüngliche Nachricht-----
Von:Frederick Virchanza Gotham via Std-Proposals <std-proposals_at_[hidden]>
Gesendet:Di 14.04.2026 11:42
Betreff:Re: [std-proposals] Interceptor Function (preserve stack and all registers)
An:std-proposals_at_[hidden];
CC:Frederick Virchanza Gotham <cauldwell.thomas_at_[hidden]>;
Forget about all the [[musttail]] stuff. I think I've settled on a better way of doing this.
Firstly, here's how you'd write an interceptor:
[[interceptor]] void Logger(void)
{
puts("Function called");
goto -> GetProcAddress( LoadLibraryA("graphics.dll"), "RotateMatrix" );
}
When the compiler encounters the above, it will pretend a few things:
(1) The function name will have "__core_" prepended to it (i.e. __core_Logger). The name will be mangled, so it can be a template function inside a namespace.
(2) The signature will be changed to "auto (*)(void) -> void(*)(void)"
(3) The "goto ->" will be changed to "return"
So here's what you'll have:
// Compiler-generated "core" function: runs the interceptor body and
// returns the address the stub should tail-jump to.
// NOTE: dlsym() returns void*, which has no implicit conversion to a
// function pointer in standard C++ -- an explicit cast is required
// (POSIX guarantees the conversion is meaningful on dlsym results).
auto __core__Z6Loggerv(void) -> void(*)(void)
{
puts("Function called");
return reinterpret_cast<void(*)(void)>(
    dlsym( dlopen("graphics", RTLD_NOW), "RotateMatrix" ) );
}
Next the compiler will emit a second function in assembler named '_Z6Loggerv' as follows:
_Z6Loggerv:
save_call_state
call __core__Z6Loggerv
stash_address_in_scratch_register
restore_call_state
jmp *scratch_register
Of course, 'save_call_state' will differ by architecture (e.g. x86_64 Vs aarch64), and also differ by convention (e.g. SystemV Vs msabi). While I can't write 'save_call_state' once for every architecture, I *can* write it so that it works for all known calling conventions for the selected architecture (so for example on x86_32, it will work for cdecl, stdcall, thiscall).
I am implementing this first of all for x86_64. The registers needed for SystemV are a superset of the registers needed for msabi, so I just save all the SystemV registers. Msabi requires 32 bytes of shadow space on the stack before a function call, so I've added that in (it's harmless on SystemV).
Some x86_64 processors only have SSE, some have AVX-256, and some have AVX-512, so at runtime I need to execute the CPUID instruction to find out. Here's the assembler I've got for all this at the moment:
# ============================================================
# PUSH_ARGS_AND_MODE
#
# Saves:
# GP: rax, rdi, rsi, rdx, rcx, r8, r9
# mode tag:
# 1 = SSE (xmm0..xmm7)
# 2 = AVX (ymm0..ymm7)
# 3 = AVX512 (zmm0..zmm7 + k0..k7)
#
# Notes:
# - CPUID is executed every entry.
# - RBX is preserved around CPUID because CPUID clobbers it.
# - RSP starts 8 mod 16 on both SysV and Microsoft x64 entry.
# - The chosen frame sizes keep RSP 0 mod 16 before calling __core_Logger
# - Mode 3 requires AVX512BW in addition to AVX512F: the 64-bit
#   mask spills below use KMOVQ, which is an AVX512BW instruction
#   (KMOVW is the AVX512F-only form). CPUs with AVX512F but no
#   AVX512BW fall back to mode 2 instead of faulting with #UD.
# - CPUID leaf 7 is only queried after confirming the max basic
#   leaf is >= 7; on older CPUs leaf 7 data is undefined.
# - r10 is used as scratch and is NOT preserved.
# ============================================================
.macro PUSH_ARGS_AND_MODE
# Save inbound GP argument state first.
# Offsets:
# 0 rax
# 8 rdi
# 16 rsi
# 24 rdx
# 32 rcx
# 40 r8
# 48 r9
sub $56, %rsp
mov %rax, 0(%rsp)
mov %rdi, 8(%rsp)
mov %rsi, 16(%rsp)
mov %rdx, 24(%rsp)
mov %rcx, 32(%rsp)
mov %r8, 40(%rsp)
mov %r9, 48(%rsp)
# Preserve RBX around CPUID.
push %rbx
# Detect AVX / AVX-512 each time.
# Default fallback is SSE.
mov $1, %eax
cpuid
bt $27, %ecx # OSXSAVE
jnc .Lpush_sse\@
bt $28, %ecx # AVX
jnc .Lpush_sse\@
xor %ecx, %ecx
xgetbv # XCR0 -> edx:eax
mov %eax, %r10d
and $0x6, %r10d # bits 1,2 => XMM+YMM
cmp $0x6, %r10d
jne .Lpush_sse\@
# AVX is usable. Before probing AVX-512, confirm CPUID leaf 7
# actually exists: leaf 7 output is undefined when the maximum
# basic leaf is below 7.
xor %eax, %eax
cpuid # eax = max basic leaf
cmp $7, %eax
jb .Lpush_avx\@
mov $7, %eax
xor %ecx, %ecx
cpuid
bt $16, %ebx # AVX512F
jnc .Lpush_avx\@
bt $30, %ebx # AVX512BW -- required by the KMOVQ spills in mode 3
jnc .Lpush_avx\@
xor %ecx, %ecx
xgetbv
and $0xE6, %eax # bits 1,2,5,6,7 => XMM+YMM+OPMASK+ZMM_Hi256+Hi16_ZMM
cmp $0xE6, %eax
je .Lpush_avx512\@
.Lpush_avx\@:
pop %rbx
# Layout after second subtraction:
# 0 mode = 2
# 8 ymm0
# 40 ymm1
# 72 ymm2
# 104 ymm3
# 136 ymm4
# 168 ymm5
# 200 ymm6
# 232 ymm7
# 264 pad
# 272 saved GP block (56 bytes)
#
# Additional size = 272, total frame = 328 bytes.
sub $272, %rsp
movq $2, 0(%rsp)
vmovdqu %ymm0, 8(%rsp)
vmovdqu %ymm1, 40(%rsp)
vmovdqu %ymm2, 72(%rsp)
vmovdqu %ymm3, 104(%rsp)
vmovdqu %ymm4, 136(%rsp)
vmovdqu %ymm5, 168(%rsp)
vmovdqu %ymm6, 200(%rsp)
vmovdqu %ymm7, 232(%rsp)
jmp .Lpush_done\@
.Lpush_avx512\@:
pop %rbx
# Layout after second subtraction:
# 0 mode = 3
# 8 zmm0
# 72 zmm1
# 136 zmm2
# 200 zmm3
# 264 zmm4
# 328 zmm5
# 392 zmm6
# 456 zmm7
# 520 k0
# 528 k1
# 536 k2
# 544 k3
# 552 k4
# 560 k5
# 568 k6
# 576 k7
# 584 pad
# 592 saved GP block (56 bytes)
#
# Additional size = 592, total frame = 648 bytes.
sub $592, %rsp
movq $3, 0(%rsp)
vmovdqu64 %zmm0, 8(%rsp)
vmovdqu64 %zmm1, 72(%rsp)
vmovdqu64 %zmm2, 136(%rsp)
vmovdqu64 %zmm3, 200(%rsp)
vmovdqu64 %zmm4, 264(%rsp)
vmovdqu64 %zmm5, 328(%rsp)
vmovdqu64 %zmm6, 392(%rsp)
vmovdqu64 %zmm7, 456(%rsp)
kmovq %k0, 520(%rsp) # KMOVQ = AVX512BW, guaranteed by the bit-30 check above
kmovq %k1, 528(%rsp)
kmovq %k2, 536(%rsp)
kmovq %k3, 544(%rsp)
kmovq %k4, 552(%rsp)
kmovq %k5, 560(%rsp)
kmovq %k6, 568(%rsp)
kmovq %k7, 576(%rsp)
jmp .Lpush_done\@
.Lpush_sse\@:
pop %rbx
# Layout after second subtraction:
# 0 mode = 1
# 8 xmm0
# 24 xmm1
# 40 xmm2
# 56 xmm3
# 72 xmm4
# 88 xmm5
# 104 xmm6
# 120 xmm7
# 136 pad
# 144 saved GP block (56 bytes)
#
# Additional size = 144, total frame = 200 bytes.
sub $144, %rsp
movq $1, 0(%rsp)
movdqu %xmm0, 8(%rsp)
movdqu %xmm1, 24(%rsp)
movdqu %xmm2, 40(%rsp)
movdqu %xmm3, 56(%rsp)
movdqu %xmm4, 72(%rsp)
movdqu %xmm5, 88(%rsp)
movdqu %xmm6, 104(%rsp)
movdqu %xmm7, 120(%rsp)
.Lpush_done\@:
.endm
# ============================================================
# POP_ARGS_AND_JUMP
#
# Counterpart of PUSH_ARGS_AND_MODE: restores the register state
# that macro saved, frees the frame, and tail-jumps onward.
#
# Expects:
# r11 = final jump target returned by __core_Logger
# 0(%rsp) = mode tag written by PUSH_ARGS_AND_MODE (1/2/3)
#
# Restores according to mode tag, then jumps through r11.
# Every offset and frame size here must mirror PUSH_ARGS_AND_MODE
# exactly; they are maintained as a pair.
# NOTE(review): r10 and r11 are clobbered by the stub and are not
# restored here -- assumes no supported convention passes
# arguments in them; confirm for each targeted convention.
# ============================================================
.macro POP_ARGS_AND_JUMP
# Dispatch on the mode tag stored at the top of the frame.
cmpq $1, 0(%rsp)
je .Lpop_sse\@
cmpq $2, 0(%rsp)
je .Lpop_avx\@
cmpq $3, 0(%rsp)
je .Lpop_avx512\@
ud2 # unreachable unless the frame is corrupt: trap rather than continue
.Lpop_sse\@:
# Mode 1 frame: 0 tag, 8..135 xmm0-7 (16 B each), 144 GP block, total 200.
movdqu 8(%rsp), %xmm0
movdqu 24(%rsp), %xmm1
movdqu 40(%rsp), %xmm2
movdqu 56(%rsp), %xmm3
movdqu 72(%rsp), %xmm4
movdqu 88(%rsp), %xmm5
movdqu 104(%rsp), %xmm6
movdqu 120(%rsp), %xmm7
mov 144(%rsp), %rax
mov 152(%rsp), %rdi
mov 160(%rsp), %rsi
mov 168(%rsp), %rdx
mov 176(%rsp), %rcx
mov 184(%rsp), %r8
mov 192(%rsp), %r9
add $200, %rsp # drop the whole frame, rsp back to entry value
jmp *%r11 # tail-jump: target returns directly to the original caller
.Lpop_avx\@:
# Mode 2 frame: 0 tag, 8..263 ymm0-7 (32 B each), 272 GP block, total 328.
vmovdqu 8(%rsp), %ymm0
vmovdqu 40(%rsp), %ymm1
vmovdqu 72(%rsp), %ymm2
vmovdqu 104(%rsp), %ymm3
vmovdqu 136(%rsp), %ymm4
vmovdqu 168(%rsp), %ymm5
vmovdqu 200(%rsp), %ymm6
vmovdqu 232(%rsp), %ymm7
mov 272(%rsp), %rax
mov 280(%rsp), %rdi
mov 288(%rsp), %rsi
mov 296(%rsp), %rdx
mov 304(%rsp), %rcx
mov 312(%rsp), %r8
mov 320(%rsp), %r9
add $328, %rsp # drop the whole frame, rsp back to entry value
jmp *%r11
.Lpop_avx512\@:
# Mode 3 frame: 0 tag, 8..519 zmm0-7 (64 B each), 520..583 k0-7, 592 GP, total 648.
# NOTE(review): kmovq (64-bit mask move) is AVX512BW, not plain
# AVX512F -- confirm the detection that selects mode 3 guarantees
# AVX512BW is present, otherwise this path can raise #UD.
vmovdqu64 8(%rsp), %zmm0
vmovdqu64 72(%rsp), %zmm1
vmovdqu64 136(%rsp), %zmm2
vmovdqu64 200(%rsp), %zmm3
vmovdqu64 264(%rsp), %zmm4
vmovdqu64 328(%rsp), %zmm5
vmovdqu64 392(%rsp), %zmm6
vmovdqu64 456(%rsp), %zmm7
kmovq 520(%rsp), %k0
kmovq 528(%rsp), %k1
kmovq 536(%rsp), %k2
kmovq 544(%rsp), %k3
kmovq 552(%rsp), %k4
kmovq 560(%rsp), %k5
kmovq 568(%rsp), %k6
kmovq 576(%rsp), %k7
mov 592(%rsp), %rax
mov 600(%rsp), %rdi
mov 608(%rsp), %rsi
mov 616(%rsp), %rdx
mov 624(%rsp), %rcx
mov 632(%rsp), %r8
mov 640(%rsp), %r9
add $648, %rsp # drop the whole frame, rsp back to entry value
jmp *%r11
.endm
# ============================================================
# Public stub
#
# Replaces the original 'Logger' symbol: spills the full inbound
# argument state, calls the generated "core" interceptor body,
# then restores the state and tail-jumps to the address the core
# function returned.
#
# Microsoft x64 requires 32 bytes of shadow space for the call.
# Reserving it here is harmless on SysV if alignment stays correct.
# (PUSH_ARGS_AND_MODE leaves rsp 0 mod 16, and 32 is a multiple of
# 16, so the call site stays correctly aligned either way.)
# NOTE(review): no vzeroupper is issued before the call; when the
# AVX/AVX-512 path was taken this may cost SSE/AVX transition
# penalties inside the core function -- confirm acceptable.
# ============================================================
Logger:
PUSH_ARGS_AND_MODE # save GP args, vector args (+masks) and the mode tag
sub $32, %rsp # Win x64 shadow space; keeps rsp 0 mod 16 at the call
call __core_Logger # runs the interceptor body; rax = final jump target
add $32, %rsp # discard shadow space so mode tag is back at 0(%rsp)
mov %rax, %r11 # POP_ARGS_AND_JUMP expects the target in r11
POP_ARGS_AND_JUMP # restore everything saved above, then jmp *%r11
So I think I've got a decent plan here for the implementation. This will successfully intercept any function call on x86_64 (even if binaries are mixing calling conventions).
--
Std-Proposals mailing list
Std-Proposals_at_[hidden]
https://lists.isocpp.org/mailman/listinfo.cgi/std-proposals
Received on 2026-04-14 11:53:43
