[AArch64] Use EXT for byte shuffles with leading zeros (#193466)
[AArch64] Use EXT for byte shuffles with leading zeros (#193466)
Conversation
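Fixes: llvm#191735. Teach AArch64 LowerVECTOR_SHUFFLE to recognize byte shuffles that are a zero-fill shift, i.e. whose leading lanes are zero and whose remaining lanes are the first lanes of the source vector in order (for example the mask <16, 0, 1, ..., 14> with a zero second operand), and lower them to a single EXT whose first operand is the zero vector. Also adds a regression test. Change-Id: Iffe97ff7e35cfaff790f537b4f1f5ba9aded4f92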
|
@llvm/pr-subscribers-backend-aarch64 Author: Shreeyash Pandey (bojle). Changes: Fixes: #191735. Teach AArch64 LowerVECTOR_SHUFFLE to recognize byte shuffles that are a zero-fill right shift and lower them to EXT with a zero vector. Also adds a regression test. Change-Id: Iffe97ff7e35cfaff790f537b4f1f5ba9aded4f92. Full diff: https://github.com/llvm/llvm-project/pull/193466.diff (2 files affected):
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9b34d9b385b4e..13190d8472c18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15048,6 +15048,27 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
DAG.getConstant(8, DL, MVT::i32));
}
+ if (EltSize == 8 && V2.getValueType() == VT && isZeroOrZeroSplat(V2, true)) {
+ unsigned PrefixElts = 0;
+ while (PrefixElts != NumElts && (ShuffleMask[PrefixElts] < 0 ||
+ ShuffleMask[PrefixElts] >= (int)NumElts))
+ ++PrefixElts;
+
+ if (0 < PrefixElts && PrefixElts < NumElts) {
+ bool IsZeroShift = true;
+ for (unsigned I = PrefixElts; I != NumElts; ++I) {
+ if (ShuffleMask[I] >= 0 && ShuffleMask[I] != (int)(I - PrefixElts)) {
+ IsZeroShift = false;
+ break;
+ }
+ }
+
+ if (IsZeroShift)
+ return DAG.getNode(AArch64ISD::EXT, DL, VT, V2, V1,
+ DAG.getConstant(NumElts - PrefixElts, DL, MVT::i32));
+ }
+ }
+
bool ReverseEXT = false;
unsigned Imm;
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
diff --git a/llvm/test/CodeGen/AArch64/shuffles.ll b/llvm/test/CodeGen/AArch64/shuffles.ll
index 930c3dfc54730..69ef3620395fd 100644
--- a/llvm/test/CodeGen/AArch64/shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/shuffles.ll
@@ -443,12 +443,25 @@ define <8 x half> @test_shuf11(<8 x half> %a, <8 x half> %b)
ret <8 x half> %r
}
+define <16 x i8> @test_shuf_zero_ext_rhs(<16 x i8> %a) {
+; CHECKLE-LABEL: test_shuf_zero_ext_rhs:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: movi v1.2d, #0000000000000000
+; CHECKLE-NEXT: ext v0.16b, v1.16b, v0.16b, #15
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: test_shuf_zero_ext_rhs:
+; CHECKBE: // %bb.0:
+ %r = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+ ret <16 x i8> %r
+}
+
define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
; CHECKLE-LABEL: test_shuf12:
; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: adrp x8, .LCPI16_0
+; CHECKLE-NEXT: adrp x8, .LCPI17_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI17_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
@@ -457,8 +470,8 @@ define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
; CHECKBE: // %bb.0:
; CHECKBE-NEXT: rev64 v1.16b, v1.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
-; CHECKBE-NEXT: adrp x8, .LCPI16_0
-; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI16_0
+; CHECKBE-NEXT: adrp x8, .LCPI17_0
+; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI17_0
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
@@ -474,9 +487,9 @@ define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
; CHECKLE-LABEL: test_shuf13:
; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: adrp x8, .LCPI17_0
+; CHECKLE-NEXT: adrp x8, .LCPI18_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI17_0]
+; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI18_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
@@ -485,8 +498,8 @@ define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
; CHECKBE: // %bb.0:
; CHECKBE-NEXT: rev64 v1.16b, v1.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
-; CHECKBE-NEXT: adrp x8, .LCPI17_0
-; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI17_0
+; CHECKBE-NEXT: adrp x8, .LCPI18_0
+; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI18_0
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
@@ -502,9 +515,9 @@ define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
define <8 x half> @test_shuf14(<8 x half> %a, <8 x half> %b)
; CHECKLE-LABEL: test_shuf14:
; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: adrp x8, .LCPI18_0
+; CHECKLE-NEXT: adrp x8, .LCPI19_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI18_0]
+; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI19_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
@@ -513,8 +526,8 @@ define <8 x half> @test_shuf14(<8 x half> %a, <8 x half> %b)
; CHECKBE: // %bb.0:
; CHECKBE-NEXT: rev64 v1.16b, v1.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
-; CHECKBE-NEXT: adrp x8, .LCPI18_0
-; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI18_0
+; CHECKBE-NEXT: adrp x8, .LCPI19_0
+; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI19_0
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
@@ -530,9 +543,9 @@ define <8 x half> @test_shuf14(<8 x half> %a, <8 x half> %b)
define <8 x half> @test_shuf15(<8 x half> %a, <8 x half> %b)
; CHECKLE-LABEL: test_shuf15:
; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: adrp x8, .LCPI19_0
+; CHECKLE-NEXT: adrp x8, .LCPI20_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI19_0]
+; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI20_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
@@ -541,8 +554,8 @@ define <8 x half> @test_shuf15(<8 x half> %a, <8 x half> %b)
; CHECKBE: // %bb.0:
; CHECKBE-NEXT: rev64 v1.16b, v1.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
-; CHECKBE-NEXT: adrp x8, .LCPI19_0
-; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI19_0
+; CHECKBE-NEXT: adrp x8, .LCPI20_0
+; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI20_0
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
@@ -575,4 +588,3 @@ define <4 x i32> @extract_shuffle(<8 x i16> %j, <4 x i16> %k) {
%d = shl <4 x i32> %c, <i32 3, i32 3, i32 3, i32 3>
ret <4 x i32> %d
}
-
|
davemgreen
left a comment
There was a problem hiding this comment.
There are some examples in https://godbolt.org/z/a3zevE7jc; it would be good if they worked too, assuming I have them right. It would probably be useful to move the logic for detecting the shuffle mask into a separate function so that it can be reused (like isEXTMask). It could also be expanded to handle any splat value and any (legal) type size.
|
@davemgreen Sure, let me try to incorporate those examples into this patch.
Fixes: #191735
Teach AArch64 LowerVECTOR_SHUFFLE to recognize byte shuffles that are a zero-fill right shift and lower them to EXT with a zero vector. Also adds a regression test.
Change-Id: Iffe97ff7e35cfaff790f537b4f1f5ba9aded4f92