Rust bumped LLVM from v21 to v22 (rust-lang/rust#150722).
This caused a nasty performance regression (oxc-project/oxc#21509 (comment)).
In oxc, the Token is heavily optimised, and the updates to the token are in a very hot path. This regression only appears on x86-64.
I reduced it to LLVM IR in which an i128 value is loaded, part of the low 64 bits is masked out, a new i32 field is inserted, the i128 is stored back, and the new value is returned.
It can be reduced to the following LLVM IR:
; Reproducer: load the 128-bit value at %token, replace bits 32..63 with %end,
; store the result back to %token, and return the updated value.
define noundef i128 @hot_end_mask(ptr noalias noundef align 16 captures(none) dereferenceable(16) %token, i32 noundef %end) unnamed_addr {
start:
%0 = load i128, ptr %token, align 16
; -18446744069414584321 == ~(0xFFFFFFFF << 32): clears bits 32..63 and keeps
; every other bit, including the whole upper 64 bits.
%1 = and i128 %0, -18446744069414584321
%_4 = zext i32 %end to i128
; Move the zero-extended %end into bits 32..63.
%_3 = shl nuw nsw i128 %_4, 32
; `disjoint`: the two operands have no set bits in common (bits 32..63 of %1
; were just cleared, and %_3 is zero everywhere else).
%2 = or disjoint i128 %1, %_3
; Full 16-byte store, although only bits 32..63 can differ from the loaded value.
store i128 %2, ptr %token, align 16
ret i128 %2
}
; Reproducer: load the 128-bit value at %token, replace bits 0..31 with
; %start1, store the result back to %token, and return the updated value.
define noundef i128 @hot_start_mask(ptr noalias noundef align 16 captures(none) dereferenceable(16) %token, i32 noundef %start1) unnamed_addr {
start:
%0 = load i128, ptr %token, align 16
; -4294967296 == ~0xFFFFFFFF: clears bits 0..31 and keeps bits 32..127.
%1 = and i128 %0, -4294967296
%_3 = zext i32 %start1 to i128
; `disjoint`: bits 0..31 of %1 were just cleared, and %_3 is zero above bit 31.
%2 = or disjoint i128 %1, %_3
; Full 16-byte store, although only bits 0..31 can differ from the loaded value.
store i128 %2, ptr %token, align 16
ret i128 %2
}
On v21, the following is emitted:
# LLVM v21 output: scalar code only; memory is written once (low 8 bytes).
# Presumably SysV AMD64: rdi = %token, esi = %end, i128 result in rdx:rax -- confirm target ABI.
hot_end_mask: # @hot_end_mask
# rdx = upper 64 bits of the token; loaded only for the return value, never stored back.
mov rdx, qword ptr [rdi + 8]
# eax = bits 0..31 of the token; the 32-bit load zero-extends, so bits 32..63 of rax are 0.
mov eax, dword ptr [rdi]
# Move the new field into bits 32..63.
shl rsi, 32
or rax, rsi
# Store only the low 64 bits; bytes 8..15 in memory are left untouched.
mov qword ptr [rdi], rax
ret
# LLVM v21 output: scalar code only; memory is written once (low 8 bytes).
# Presumably SysV AMD64: rdi = %token, esi = %start1, i128 result in rdx:rax -- confirm target ABI.
hot_start_mask: # @hot_start_mask
# rdx = upper 64 bits of the token; loaded only for the return value, never stored back.
mov rdx, qword ptr [rdi + 8]
# ecx = bits 32..63 of the token (the dword at offset 4), zero-extended into rcx.
mov ecx, dword ptr [rdi + 4]
# Put the preserved dword back at bits 32..63.
shl rcx, 32
# eax = new low 32 bits; the 32-bit move zero-extends into rax.
mov eax, esi
or rax, rcx
# Store only the low 64 bits; bytes 8..15 in memory are left untouched.
mov qword ptr [rdi], rax
ret
On v22, the following is emitted instead:
# LLVM v22 output: the upper half goes through an XMM register and is stored back unchanged.
# Presumably SysV AMD64: rdi = %token, esi = %end, i128 result in rdx:rax -- confirm target ABI.
hot_end_mask: # @hot_end_mask
# eax = bits 0..31 of the token, zero-extended into rax.
mov eax, dword ptr [rdi]
# 16-byte load; the shuffle places the upper 64 bits of the token in the low qword of xmm0.
pshufd xmm0, xmmword ptr [rdi], 238 # xmm0 = mem[2,3,2,3]
# Extra XMM -> GPR transfer to return the upper half in rdx.
movq rdx, xmm0
# Move the new field into bits 32..63.
shl rsi, 32
or rax, rsi
# Writes back the upper 64 bits exactly as they were loaded (store absent from the v21 output).
movq qword ptr [rdi + 8], xmm0
mov qword ptr [rdi], rax
ret
# 128-bit constant used by hot_start_mask: lowest dword is 0 and the other three
# are all-ones, so ANDing with it clears bits 0..31 (same mask as the IR's -4294967296).
.LCPI1_0:
.long 0 # 0x0
.long 4294967295 # 0xffffffff
.long 4294967295 # 0xffffffff
.long 4294967295 # 0xffffffff
# LLVM v22 output: the mask is applied in an XMM register and the upper half is stored back unchanged.
# Presumably SysV AMD64: rdi = %token, esi = %start1, i128 result in rdx:rax -- confirm target ABI.
hot_start_mask: # @hot_start_mask
# Aligned 16-byte load of the whole token.
movdqa xmm0, xmmword ptr [rdi]
# xmm1 low qword = upper 64 bits of the token.
pshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
# Extra XMM -> GPR transfer to return the upper half in rdx.
movq rdx, xmm1
# Clear bits 0..31 using the constant-pool mask (an additional memory load).
pand xmm0, xmmword ptr [rip + .LCPI1_0]
# rcx = masked low 64 bits (bits 32..63 preserved, bits 0..31 zero).
movq rcx, xmm0
# eax = new low 32 bits; the 32-bit move zero-extends into rax.
mov eax, esi
or rax, rcx
# Writes back the upper 64 bits exactly as they were loaded (store absent from the v21 output).
movq qword ptr [rdi + 8], xmm1
mov qword ptr [rdi], rax
ret
The following is heavily AI-assisted, as I'm not an LLVM expert, so feel free to ignore.
I bisected LLVM, and the regression appears to start at #171616.
The parent of this commit emits scalar code with only the necessary low-half store, while this commit routes the i128 bit logic through XMM registers and stores the high half back unnecessarily.
I'm happy to attempt to look at a fix for this, but direction is appreciated - thanks!
Rust bumped LLVM from v21 to v22 (rust-lang/rust#150722).
This caused a nasty performance regression (oxc-project/oxc#21509 (comment)).
In oxc, the `Token` is heavily optimised, and the updates to the token are in a very hot path. This regression only appears on x86-64.
I reduced it to LLVM IR in which an `i128` value is loaded, part of the low 64 bits is masked out, a new `i32` field is inserted, the `i128` is stored back, and the new value is returned.
It can be reduced to the following LLVM IR:
On v21, the following is emitted:
On v22, the following is emitted instead:
The following is heavily AI-assisted, as I'm not an LLVM expert, so feel free to ignore.
I bisected LLVM, and the regression appears to start at #171616.
The parent of this commit emits scalar code with only the necessary low-half store, while this commit routes the `i128` bit logic through XMM registers and stores the high half back unnecessarily.
I'm happy to attempt to look at a fix for this, but direction is appreciated - thanks!