; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=FALLBACK,GISEL

; FALLBACK-NOT: remark{{.*}}test_rev_w
define i32 @test_rev_w(i32 %a) nounwind {
; CHECK-LABEL: test_rev_w:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev w0, w0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_w:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    rev w0, w0
; FALLBACK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
  ret i32 %0
}

; FALLBACK-NOT: remark{{.*}}test_rev_x
define i64 @test_rev_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x0, x0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_x:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    rev x0, x0
; FALLBACK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  ret i64 %0
}

; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16-bits
; of %a are zero. This optimizes rev + lsr 16 to rev16.
define i32 @test_rev_w_srl16(i16 %a) {
; CHECK-LABEL: test_rev_w_srl16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev w8, w0
; CHECK-NEXT:    lsr w0, w8, #16
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_w_srl16:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    and w8, w0, #0xffff
; FALLBACK-NEXT:    rev w8, w8
; FALLBACK-NEXT:    lsr w0, w8, #16
; FALLBACK-NEXT:    ret
entry:
  %0 = zext i16 %a to i32
  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
  %2 = lshr i32 %1, 16
  ret i32 %2
}

define i32 @test_rev_w_srl16_load(i16 *%a) {
; CHECK-LABEL: test_rev_w_srl16_load:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldrh w8, [x0]
; CHECK-NEXT:    rev w8, w8
; CHECK-NEXT:    lsr w0, w8, #16
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_w_srl16_load:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    ldrh w8, [x0]
; FALLBACK-NEXT:    rev w8, w8
; FALLBACK-NEXT:    lsr w0, w8, #16
; FALLBACK-NEXT:    ret
entry:
  %0 = load i16, i16 *%a
  %1 = zext i16 %0 to i32
  %2 = tail call i32 @llvm.bswap.i32(i32 %1)
  %3 = lshr i32 %2, 16
  ret i32 %3
}

define i32 @test_rev_w_srl16_add(i8 %a, i8 %b) {
; CHECK-LABEL: test_rev_w_srl16_add:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and w8, w0, #0xff
; CHECK-NEXT:    add w8, w8, w1, uxtb
; CHECK-NEXT:    rev16 w0, w8
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_w_srl16_add:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    and w8, w1, #0xff
; FALLBACK-NEXT:    add w8, w8, w0, uxtb
; FALLBACK-NEXT:    rev w8, w8
; FALLBACK-NEXT:    lsr w0, w8, #16
; FALLBACK-NEXT:    ret
entry:
  %0 = zext i8 %a to i32
  %1 = zext i8 %b to i32
  %2 = add i32 %0, %1
  %3 = tail call i32 @llvm.bswap.i32(i32 %2)
  %4 = lshr i32 %3, 16
  ret i32 %4
}

; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32-bits
; of %a are zero. This optimizes rev + lsr 32 to rev32.
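; For reference, with the high 32 bits of x known to be zero
; (bytes, most significant first: 00 00 00 00 b3 b2 b1 b0):
;   bswap x            = b0 b1 b2 b3 00 00 00 00
;   srl (bswap x), 32  = 00 00 00 00 b0 b1 b2 b3
;   rotr (bswap x), 32 = 00 00 00 00 b0 b1 b2 b3   (only zero bytes rotate back in)
; so the shift can be treated as a rotate, and the rotated form is what REV32
; computes on the low word.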
define i64 @test_rev_x_srl32(i32 %a) {
; CHECK-LABEL: test_rev_x_srl32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT:    rev x8, x0
; CHECK-NEXT:    lsr x0, x8, #32
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_x_srl32:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    // kill: def $w0 killed $w0 def $x0
; FALLBACK-NEXT:    ubfx x8, x0, #0, #32
; FALLBACK-NEXT:    rev x8, x8
; FALLBACK-NEXT:    lsr x0, x8, #32
; FALLBACK-NEXT:    ret
entry:
  %0 = zext i32 %a to i64
  %1 = tail call i64 @llvm.bswap.i64(i64 %0)
  %2 = lshr i64 %1, 32
  ret i64 %2
}

define i64 @test_rev_x_srl32_load(i32 *%a) {
; CHECK-LABEL: test_rev_x_srl32_load:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    rev x8, x8
; CHECK-NEXT:    lsr x0, x8, #32
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_x_srl32_load:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    ldr w8, [x0]
; FALLBACK-NEXT:    rev x8, x8
; FALLBACK-NEXT:    lsr x0, x8, #32
; FALLBACK-NEXT:    ret
entry:
  %0 = load i32, i32 *%a
  %1 = zext i32 %0 to i64
  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
  %3 = lshr i64 %2, 32
  ret i64 %3
}

define i64 @test_rev_x_srl32_shift(i64 %a) {
; CHECK-LABEL: test_rev_x_srl32_shift:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ubfx x8, x0, #2, #29
; CHECK-NEXT:    rev32 x0, x8
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_x_srl32_shift:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    lsl x8, x0, #33
; FALLBACK-NEXT:    lsr x8, x8, #35
; FALLBACK-NEXT:    rev x8, x8
; FALLBACK-NEXT:    lsr x0, x8, #32
; FALLBACK-NEXT:    ret
entry:
  %0 = shl i64 %a, 33
  %1 = lshr i64 %0, 35
  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
  %3 = lshr i64 %2, 32
  ret i64 %3
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone

define i32 @test_rev16_w(i32 %X) nounwind {
; CHECK-LABEL: test_rev16_w:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev16 w0, w0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev16_w:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    lsr w8, w0, #8
; FALLBACK-NEXT:    lsl w9, w0, #8
; FALLBACK-NEXT:    and w10, w8, #0xff0000
; FALLBACK-NEXT:    and w11, w9, #0xff000000
; FALLBACK-NEXT:    and w9, w9, #0xff00
; FALLBACK-NEXT:    orr w10, w11, w10
; FALLBACK-NEXT:    and w8, w8, #0xff
; FALLBACK-NEXT:    orr w9, w10, w9
; FALLBACK-NEXT:    orr w0, w9, w8
; FALLBACK-NEXT:    ret
entry:
  %tmp1 = lshr i32 %X, 8
  %X15 = bitcast i32 %X to i32
  %tmp4 = shl i32 %X15, 8
  %tmp2 = and i32 %tmp1, 16711680
  %tmp5 = and i32 %tmp4, -16777216
  %tmp9 = and i32 %tmp1, 255
  %tmp13 = and i32 %tmp4, 65280
  %tmp6 = or i32 %tmp5, %tmp2
  %tmp10 = or i32 %tmp6, %tmp13
  %tmp14 = or i32 %tmp10, %tmp9
  ret i32 %tmp14
}

; 64-bit REV16 is *not* a swap then a 16-bit rotation:
;  01234567 ->(bswap) 76543210 ->(rotr) 10765432
;  01234567 ->(rev16) 10325476
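; For 32-bit values the identity does hold (bswap followed by a 16-bit rotation
; is REV16), which is what the (srl (bswap x), 16) canonicalization above
; relies on:
;  0123 ->(bswap) 3210 ->(rotr) 1032
;  0123 ->(rev16) 1032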
define i64 @test_rev16_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev16_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x8, x0
; CHECK-NEXT:    ror x0, x8, #16
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev16_x:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    rev x8, x0
; FALLBACK-NEXT:    lsl x9, x8, #48
; FALLBACK-NEXT:    orr x0, x9, x8, lsr #16
; FALLBACK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 16
  %2 = shl i64 %0, 48
  %3 = or i64 %1, %2
  ret i64 %3
}

define i64 @test_rev32_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev32_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev32 x0, x0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev32_x:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    rev x8, x0
; FALLBACK-NEXT:    lsl x9, x8, #32
; FALLBACK-NEXT:    orr x0, x9, x8, lsr #32
; FALLBACK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 32
  %2 = shl i64 %0, 32
  %3 = or i64 %1, %2
  ret i64 %3
}

define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64D8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev64.8b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev64D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.4h v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64D16:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev64.4h v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %tmp2
}

define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: test_vrev64D32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64D32:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev64.2s v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %tmp2
}

define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
; CHECK-LABEL: test_vrev64Df:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Df:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev64.2s v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %tmp2
}

define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.16b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Q8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev64.16b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.8h v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Q16:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev64.8h v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %tmp2
}

define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Q32:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev64.4s v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %tmp2
}

define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
; CHECK-LABEL: test_vrev64Qf:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Qf:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev64.4s v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %tmp2
}
define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev32D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.8b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev32D8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev32.8b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.4h v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev32D16:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev32.4h v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}

define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
;
; GISEL-LABEL: test_vrev32Q8:
; GISEL:       // %bb.0:
; GISEL:    tbl.16b v0, { v0, v1 }, v2
; GISEL:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
;
; GISEL-LABEL: test_vrev32Q16:
; GISEL:       // %bb.0:
; GISEL:    tbl.16b v0, { v0, v1 }, v2
; GISEL:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}

define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev16D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev16.8b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev16D8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev16.8b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev16Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev16.16b v0, v0
; CHECK-NEXT:    ret
;
; GISEL-LABEL: test_vrev16Q8:
; GISEL:       // %bb.0:
; GISEL:    tbl.16b v0, { v0, v1 }, v2
; GISEL:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}

; Undef shuffle indices should not prevent matching to VREV:

define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64D8_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64D8_undef:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev64.8b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q16_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
;
; GISEL-LABEL: test_vrev32Q16_undef:
; GISEL:       // %bb.0:
; GISEL:    tbl.16b v0, { v0, v1 }, v2
; GISEL:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}
; vrev <4 x i16> should use REV32 and not REV64
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    add x8, x1, #2 // =2
; CHECK-NEXT:    st1.h { v0 }[5], [x8]
; CHECK-NEXT:    st1.h { v0 }[6], [x1]
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    add x8, x1, #2 // =2
; FALLBACK-NEXT:    st1.h { v0 }[5], [x8]
; FALLBACK-NEXT:    st1.h { v0 }[6], [x1]
; FALLBACK-NEXT:    ret
entry:
  %0 = bitcast <4 x i16>* %source to <8 x i16>*
  %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
  ret void
}

; Test vrev of float4
define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    dup.4s v1, v1[0]
; CHECK-NEXT:    ext.16b v0, v0, v1, #12
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    str q0, [x1, #176]
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: float_vrev64:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    fmov s0, wzr
; FALLBACK-NEXT:    mov.s v0[1], v0[0]
; FALLBACK-NEXT:    mov.s v0[2], v0[0]
; FALLBACK-NEXT:    adrp x8, .LCPI28_0
; FALLBACK-NEXT:    mov.s v0[3], v0[0]
; FALLBACK-NEXT:    ldr q1, [x0]
; FALLBACK-NEXT:    ldr q2, [x8, :lo12:.LCPI28_0]
; FALLBACK-NEXT:    tbl.16b v0, { v0, v1 }, v2
; FALLBACK-NEXT:    str q0, [x1, #176]
; FALLBACK-NEXT:    ret
entry:
  %0 = bitcast float* %source to <4 x float>*
  %tmp2 = load <4 x float>, <4 x float>* %0, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, <4 x float>* %dest, i32 11
  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
  ret void
}

define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev32_bswap:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    rev32.16b v0, v0
; FALLBACK-NEXT:    ret
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone