Skip to content

[AArch64] Use EXT for byte shuffles with leading zeros#193466

Open
bojle wants to merge 1 commit into llvm:main from
bojle:upstream_shufflevector
Open

[AArch64] Use EXT for byte shuffles with leading zeros#193466
bojle wants to merge 1 commit into llvm:main from
bojle:upstream_shufflevector

Conversation

@bojle
Copy link
Copy Markdown
Contributor

@bojle bojle commented Apr 22, 2026

Fixes: #191735

Teach AArch64 LowerVECTOR_SHUFFLE to recognize byte shuffles that are a zero-fill right shift and lower them to EXT with a zero vector. Adds a regression test too.

Change-Id: Iffe97ff7e35cfaff790f537b4f1f5ba9aded4f92

Fixes: llvm#191735

Teach AArch64 LowerVECTOR_SHUFFLE to recognize byte shuffles that are a
zero-fill right shift and lower them to EXT with a zero vector. Adds a
regression test too.

Change-Id: Iffe97ff7e35cfaff790f537b4f1f5ba9aded4f92
@llvmbot
Copy link
Copy Markdown
Member

llvmbot commented Apr 22, 2026

@llvm/pr-subscribers-backend-aarch64

Author: Shreeyash Pandey (bojle)

Changes

Fixes: #191735

Teach AArch64 LowerVECTOR_SHUFFLE to recognize byte shuffles that are a zero-fill right shift and lower them to EXT with a zero vector. Adds a regression test too.

Change-Id: Iffe97ff7e35cfaff790f537b4f1f5ba9aded4f92


Full diff: https://github.com/llvm/llvm-project/pull/193466.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+21)
  • (modified) llvm/test/CodeGen/AArch64/shuffles.ll (+29-17)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9b34d9b385b4e..13190d8472c18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15048,6 +15048,27 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                        DAG.getConstant(8, DL, MVT::i32));
   }
 
+  if (EltSize == 8 && V2.getValueType() == VT && isZeroOrZeroSplat(V2, true)) {
+    unsigned PrefixElts = 0;
+    while (PrefixElts != NumElts && (ShuffleMask[PrefixElts] < 0 ||
+                                     ShuffleMask[PrefixElts] >= (int)NumElts))
+      ++PrefixElts;
+
+    if (0 < PrefixElts && PrefixElts < NumElts) {
+      bool IsZeroShift = true;
+      for (unsigned I = PrefixElts; I != NumElts; ++I) {
+        if (ShuffleMask[I] >= 0 && ShuffleMask[I] != (int)(I - PrefixElts)) {
+          IsZeroShift = false;
+          break;
+        }
+      }
+
+      if (IsZeroShift)
+        return DAG.getNode(AArch64ISD::EXT, DL, VT, V2, V1,
+                           DAG.getConstant(NumElts - PrefixElts, DL, MVT::i32));
+    }
+  }
+
   bool ReverseEXT = false;
   unsigned Imm;
   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
diff --git a/llvm/test/CodeGen/AArch64/shuffles.ll b/llvm/test/CodeGen/AArch64/shuffles.ll
index 930c3dfc54730..69ef3620395fd 100644
--- a/llvm/test/CodeGen/AArch64/shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/shuffles.ll
@@ -443,12 +443,25 @@ define <8 x half> @test_shuf11(<8 x half> %a, <8 x half> %b)
   ret <8 x half> %r
 }
 
+define <16 x i8> @test_shuf_zero_ext_rhs(<16 x i8> %a) {
+; CHECKLE-LABEL: test_shuf_zero_ext_rhs:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    movi v1.2d, #0000000000000000
+; CHECKLE-NEXT:    ext v0.16b, v1.16b, v0.16b, #15
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: test_shuf_zero_ext_rhs:
+; CHECKBE:       // %bb.0:
+  %r = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+  ret <16 x i8> %r
+}
+
 define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
 ; CHECKLE-LABEL: test_shuf12:
 ; CHECKLE:       // %bb.0:
-; CHECKLE-NEXT:    adrp x8, .LCPI16_0
+; CHECKLE-NEXT:    adrp x8, .LCPI17_0
 ; CHECKLE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECKLE-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECKLE-NEXT:    ldr q2, [x8, :lo12:.LCPI17_0]
 ; CHECKLE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECKLE-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECKLE-NEXT:    ret
@@ -457,8 +470,8 @@ define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
 ; CHECKBE:       // %bb.0:
 ; CHECKBE-NEXT:    rev64 v1.16b, v1.16b
 ; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
-; CHECKBE-NEXT:    adrp x8, .LCPI16_0
-; CHECKBE-NEXT:    add x8, x8, :lo12:.LCPI16_0
+; CHECKBE-NEXT:    adrp x8, .LCPI17_0
+; CHECKBE-NEXT:    add x8, x8, :lo12:.LCPI17_0
 ; CHECKBE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
 ; CHECKBE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECKBE-NEXT:    ld1 { v0.16b }, [x8]
@@ -474,9 +487,9 @@ define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
 define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
 ; CHECKLE-LABEL: test_shuf13:
 ; CHECKLE:       // %bb.0:
-; CHECKLE-NEXT:    adrp x8, .LCPI17_0
+; CHECKLE-NEXT:    adrp x8, .LCPI18_0
 ; CHECKLE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECKLE-NEXT:    ldr q2, [x8, :lo12:.LCPI17_0]
+; CHECKLE-NEXT:    ldr q2, [x8, :lo12:.LCPI18_0]
 ; CHECKLE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECKLE-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECKLE-NEXT:    ret
@@ -485,8 +498,8 @@ define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
 ; CHECKBE:       // %bb.0:
 ; CHECKBE-NEXT:    rev64 v1.16b, v1.16b
 ; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
-; CHECKBE-NEXT:    adrp x8, .LCPI17_0
-; CHECKBE-NEXT:    add x8, x8, :lo12:.LCPI17_0
+; CHECKBE-NEXT:    adrp x8, .LCPI18_0
+; CHECKBE-NEXT:    add x8, x8, :lo12:.LCPI18_0
 ; CHECKBE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
 ; CHECKBE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECKBE-NEXT:    ld1 { v0.16b }, [x8]
@@ -502,9 +515,9 @@ define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
 define <8 x half> @test_shuf14(<8 x half> %a, <8 x half> %b)
 ; CHECKLE-LABEL: test_shuf14:
 ; CHECKLE:       // %bb.0:
-; CHECKLE-NEXT:    adrp x8, .LCPI18_0
+; CHECKLE-NEXT:    adrp x8, .LCPI19_0
 ; CHECKLE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECKLE-NEXT:    ldr q2, [x8, :lo12:.LCPI18_0]
+; CHECKLE-NEXT:    ldr q2, [x8, :lo12:.LCPI19_0]
 ; CHECKLE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECKLE-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECKLE-NEXT:    ret
@@ -513,8 +526,8 @@ define <8 x half> @test_shuf14(<8 x half> %a, <8 x half> %b)
 ; CHECKBE:       // %bb.0:
 ; CHECKBE-NEXT:    rev64 v1.16b, v1.16b
 ; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
-; CHECKBE-NEXT:    adrp x8, .LCPI18_0
-; CHECKBE-NEXT:    add x8, x8, :lo12:.LCPI18_0
+; CHECKBE-NEXT:    adrp x8, .LCPI19_0
+; CHECKBE-NEXT:    add x8, x8, :lo12:.LCPI19_0
 ; CHECKBE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
 ; CHECKBE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECKBE-NEXT:    ld1 { v0.16b }, [x8]
@@ -530,9 +543,9 @@ define <8 x half> @test_shuf14(<8 x half> %a, <8 x half> %b)
 define <8 x half> @test_shuf15(<8 x half> %a, <8 x half> %b)
 ; CHECKLE-LABEL: test_shuf15:
 ; CHECKLE:       // %bb.0:
-; CHECKLE-NEXT:    adrp x8, .LCPI19_0
+; CHECKLE-NEXT:    adrp x8, .LCPI20_0
 ; CHECKLE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECKLE-NEXT:    ldr q2, [x8, :lo12:.LCPI19_0]
+; CHECKLE-NEXT:    ldr q2, [x8, :lo12:.LCPI20_0]
 ; CHECKLE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECKLE-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECKLE-NEXT:    ret
@@ -541,8 +554,8 @@ define <8 x half> @test_shuf15(<8 x half> %a, <8 x half> %b)
 ; CHECKBE:       // %bb.0:
 ; CHECKBE-NEXT:    rev64 v1.16b, v1.16b
 ; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
-; CHECKBE-NEXT:    adrp x8, .LCPI19_0
-; CHECKBE-NEXT:    add x8, x8, :lo12:.LCPI19_0
+; CHECKBE-NEXT:    adrp x8, .LCPI20_0
+; CHECKBE-NEXT:    add x8, x8, :lo12:.LCPI20_0
 ; CHECKBE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
 ; CHECKBE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECKBE-NEXT:    ld1 { v0.16b }, [x8]
@@ -575,4 +588,3 @@ define <4 x i32> @extract_shuffle(<8 x i16> %j, <4 x i16> %k) {
   %d = shl <4 x i32> %c, <i32 3, i32 3, i32 3, i32 3>
   ret <4 x i32> %d
 }
-

Copy link
Copy Markdown
Collaborator

@davemgreen davemgreen left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some examples in https://godbolt.org/z/a3zevE7jc that it would be good to get working too, if I have them right. It would probably be useful to have the logic for detecting the shuffle mask in a separate function so that it can be reused (like isEXTMask). It can also be expanded to any splats and (legal) type sizes.

@bojle
Copy link
Copy Markdown
Contributor Author

bojle commented Apr 23, 2026

@davemgreen sure, let me try to incorporate those examples in this patch

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[AArch64] shufflevector(v, splat(0)) produces worse code than shufflevector(splat(0), v0)

3 participants