; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code, optimized code
; No PGO data; 3 single block inlinees
;
; NOTE(review): captured RyuJIT disassembly (DOTNET_JitDisasm output) pasted as
; evidence for the issue below — the instructions themselves must not be edited.
; Annotations only. "gword ptr" is RyuJIT notation for a GC-tracked pointer slot.
; ABI: Windows x64 (rcx = 'this', rdx = return buffer for the 16-byte struct).
G_M000_IG01:
sub rsp, 40                              ; prologue: locals + padding, keeps rsp 16-aligned
vmovaps xmmword ptr [rsp+0x10], xmm6     ; save callee-saved xmm6 (Windows x64 ABI)
xor eax, eax
mov qword ptr [rsp+0x08], rax            ; zero-init the two GC-tracked stack slots
mov qword ptr [rsp], rax
G_M000_IG02:
vxorps xmm0, xmm0, xmm0                  ; xmm0 = 0.0 — first double accumulator
vxorps xmm1, xmm1, xmm1                  ; xmm1 = 0.0 — second double accumulator
vbroadcastss ymm2, dword ptr [reloc @RWD00]  ; splat a float constant to all 8 lanes
mov rax, gword ptr [rcx+0x08]            ; load first array reference from 'this'
mov gword ptr [rsp+0x08], rax            ; spill to tracked slot so the GC sees it
test rax, rax
je SHORT G_M000_IG04                     ; null array -> use null data pointer
G_M000_IG03:
mov r8d, dword ptr [rax+0x08]            ; array Length (object header offset 8)
test r8d, r8d
je SHORT G_M000_IG04                     ; empty array -> null data pointer
add rax, 16                              ; rax = &array[0] (skip header + length)
jmp SHORT G_M000_IG05
G_M000_IG04:
xor eax, eax                             ; data pointer = null
G_M000_IG05:
mov r8, gword ptr [rcx+0x10]             ; second array reference — same guard pattern
mov gword ptr [rsp], r8
test r8, r8
je SHORT G_M000_IG07
G_M000_IG06:
mov r10d, dword ptr [r8+0x08]            ; second array Length
test r10d, r10d
je SHORT G_M000_IG07
add r8, 16                               ; r8 = &array2[0]
jmp SHORT G_M000_IG08
G_M000_IG07:
xor r8d, r8d                             ; data pointer = null
G_M000_IG08:
xor r10d, r10d                           ; r10d = loop counter i = 0
mov ecx, dword ptr [rcx+0x18]            ; int field at this+0x18 — element count (per issue text)
mov r9d, ecx                             ; next 4 instructions: signed ecx = count / 8
sar r9d, 31                              ;   sign mask of count
and r9d, 7                               ;   rounding bias for negative counts
add ecx, r9d
sar ecx, 3                               ;   ecx = count / 8 = number of vector iterations
cmp r10d, ecx
jge G_M000_IG13                          ; no full vector iterations -> skip loop
G_M000_IG09: ;; HOT LOOP START
vmovups ymm3, ymmword ptr [rax]          ; load 8 floats from array 1
vcmpeqps ymm4, ymm2, ymm3                ; mask: lanes equal to the broadcast constant
vmovups ymm5, ymmword ptr [r8]           ; load 8 floats from array 2
vcmpeqps ymm6, ymm2, ymm5
vpternlogd ymm4, ymm6, ymm4, 17          ; combine the masks (imm8 = 3-input truth table 0x11)
vxorps ymm16, ymm16, ymm16               ; NOTE(review): ymm16 forces EVEX encoding — see issue below
vcmpeqps k1, ymm16, ymm4                 ; k1 set for lanes where the combined mask == 0.0
kortestb k1, k1
jb G_M000_IG12                           ; jb == jc; kortest sets CF when k1 is all-ones -> no lane selected, skip body
G_M000_IG10:
vxorps ymm16, ymm16, ymm16
vpternlogd ymm16, ymm5, ymm4, -40        ; 3-input bitwise op (imm8 0xD8): mask-select from ymm5
vxorps ymm5, ymm5, ymm5
vpternlogd ymm4, ymm5, ymm3, -84         ; 3-input bitwise op (imm8 0xAC): mask-select from ymm3
vsubps ymm3, ymm16, ymm4                 ; per-lane difference of the two selected vectors
vcmpgtps ymm4, ymm3, ymmword ptr [reloc @RWD32]  ; lanes where diff > constant @RWD32
vmovmskps r9, ymm4                       ; r9 = the 8 lane sign bits
test r9d, r9d
je SHORT G_M000_IG11                     ; no lane above threshold -> skip this sum
vxorps ymm5, ymm5, ymm5
vpternlogd ymm4, ymm5, ymm3, -84         ; zero the non-selected lanes (imm8 0xAC)
vmovaps ymm5, ymm4
vpermilps xmm16, xmm5, -79               ; 0xB1: swap adjacent pairs — horizontal-sum shuffle 1
vaddps xmm5, xmm16, xmm5
vpermilps xmm16, xmm5, 78                ; 0x4E: swap 64-bit halves — shuffle 2
vaddps xmm5, xmm16, xmm5
vextractf128 xmm4, ymm4                  ; NOTE(review): vextractf128 requires an imm8 (likely ", 1") — lost in paste
vpermilps xmm16, xmm4, -79               ; same reduction on the upper 128-bit half
vaddps xmm4, xmm16, xmm4
vpermilps xmm16, xmm4, 78
vaddps xmm4, xmm16, xmm4
vaddss xmm4, xmm5, xmm4                  ; scalar sum of both halves
vcvtss2sd xmm4, xmm4, xmm4               ; widen float -> double
vaddsd xmm1, xmm4, xmm1                  ; accumulate into second total (xmm1)
G_M000_IG11:
vcmpltps ymm4, ymm3, ymmword ptr [reloc @RWD64]  ; lanes where diff < constant @RWD64
vmovmskps r9, ymm4
test r9d, r9d
je SHORT G_M000_IG12                     ; no lane below threshold -> skip this sum
vxorps ymm5, ymm5, ymm5
vpternlogd ymm4, ymm5, ymm3, -84         ; zero the non-selected lanes (imm8 0xAC)
vmovaps ymm3, ymm4
vpermilps xmm5, xmm3, -79                ; identical 10-instruction Sum() pattern, duplicated per call site
vaddps xmm3, xmm5, xmm3
vpermilps xmm5, xmm3, 78
vaddps xmm3, xmm5, xmm3
vextractf128 xmm4, ymm4                  ; NOTE(review): missing imm8 here as well
vpermilps xmm5, xmm4, -79
vaddps xmm4, xmm5, xmm4
vpermilps xmm5, xmm4, 78
vaddps xmm4, xmm5, xmm4
vaddss xmm3, xmm3, xmm4
vcvtss2sd xmm3, xmm3, xmm3
vaddsd xmm0, xmm3, xmm0                  ; accumulate into first total (xmm0)
G_M000_IG12: ;; HOT LOOP END
inc r10d                                 ; i++
add rax, 32                              ; advance both data pointers by 8 floats (32 bytes)
add r8, 32
cmp r10d, ecx
jl G_M000_IG09
G_M000_IG13:
xor rax, rax
mov gword ptr [rsp+0x08], rax            ; clear the GC-tracked slots before returning
mov gword ptr [rsp], rax
vmovsd qword ptr [rdx], xmm0             ; write both doubles into the return buffer (rdx)
vmovsd qword ptr [rdx+0x08], xmm1
mov rax, rdx                             ; struct-return convention: rax = buffer pointer
G_M000_IG15:
vzeroupper                               ; avoid AVX->SSE transition penalty in the caller
vmovaps xmm6, xmmword ptr [rsp+0x10]     ; restore callee-saved xmm6
add rsp, 40                              ; epilogue
ret
; Total bytes of code 462
Labels: area-CodeGen-coreclr, tenet-performance

Title
Vector256 explicit intrinsics regress ~71% on .NET 10 vs .NET 8 on AVX-512 hardware (Tiger Lake)
Description
Using `System.Runtime.Intrinsics.Vector256<float>` operations in a tight loop shows a 71-73% performance regression on .NET 10 GA compared to .NET 8 when running on AVX-512-capable hardware (Intel Tiger Lake). The equivalent portable `System.Numerics.Vector<float>` code (which also operates at 256-bit width on this hardware) shows a 3-6% improvement on .NET 10 over .NET 8, confirming this is specific to the explicit `Vector256<T>` intrinsics codepath. `Vector512<float>` explicit intrinsics also regress, but only by 12-13%.

Environment
Benchmark Results
Aggregation loop processing two float arrays (null-check, conditional select, subtract, compare, masked horizontal sum). Each benchmark method processes the same data at the same vector width.
Cross-runtime comparison (4096 elements, confirmed across two independent runs):
`Vector<float>` portable (256-bit) · `Vector256<float>` explicit (256-bit) · `Vector512<float>` explicit (512-bit)

Cross-runtime comparison (1024 elements):
`Vector<float>` portable (256-bit) · `Vector256<float>` explicit (256-bit) · `Vector512<float>` explicit (512-bit)

Within-runtime ratios (4096 elements):
`Vector<float>` portable (baseline) · `Vector256<float>` explicit · `Vector512<float>` explicit

On .NET 8, `Vector256` explicit is 39% faster than portable. On .NET 10, it's 8% slower.

Reproduction
Benchmark code
Project file
Run
dotnet run -c Release -- --filter "*Vector256Regression*"

Root Cause Analysis via JIT Disassembly
Disassembly captured with
`DOTNET_TieredCompilation=0` + `DOTNET_JitDisasm=ExplicitVector256`.

JIT header change
.NET 8:  `Emitting BLENDED_CODE for X64 with AVX512 - Windows`
.NET 10: `Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows`
This reflects the EVEX/AVX-512 rework (likely #115983).
Three key codegen differences
1. Extended register usage forces EVEX everywhere
.NET 10 uses `ymm16`, `xmm16`, `xmm17` throughout the hot loop. These registers require EVEX encoding (4-byte prefix vs 2-3 byte VEX). .NET 8 stays within `ymm0`–`ymm7`.

.NET 10:
.NET 8:
2. Horizontal sum strategy changed:
`vhaddps` → `vpermilps + vaddps`

This is the largest codegen change. The `Vector256.Sum()` lowering switched from compact horizontal adds to a much longer shuffle+add sequence.

.NET 8 (4 instructions, ~9 µops, ~20 bytes):
.NET 10 (10 instructions, ~13 µops, ~50 bytes):
This pattern is emitted twice per iteration (once for fill, once for cut), adding ~60 bytes of code to the loop body.
3. Resulting code size and loop body growth
The 28% larger loop body may cause micro-op cache (DSB) pressure on Tiger Lake, forcing fallback to the slower legacy instruction decoder.
Full optimized disassembly
ExplicitVector256 — .NET 10 (FullOpts, 462 bytes)
ExplicitVector256 — .NET 8 (FullOpts, 433 bytes)
Possibly Related
.NET 10 JIT header changed from "X64 with AVX512" to "generic X64 + VEX + EVEX", suggesting the instruction encoding selection was significantly reworked.

Summary
On .NET 10 GA running on AVX-512-capable Tiger Lake hardware:
- `Vector256.Sum()` lowering changed from `vhaddps` (4 instructions, ~20 bytes) to `vpermilps + vaddps` (10 instructions, ~50 bytes), duplicated for every call site in the loop
- The hot loop uses `ymm16`+/`xmm16`+, forcing EVEX encoding on instructions that could be VEX-encoded
- ~71% regression for `Vector256<float>` explicit intrinsics vs .NET 8