llvm-for-llvmta/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll

; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

;
; BFDOT
;

define <vscale x 4 x float> @bfdot_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfdot_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_0_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[0]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfdot_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_1_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[1]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfdot_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_2_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[2]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfdot_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_3_f32:
; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[3]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}

;
; BFMLALB
;

define <vscale x 4 x float> @bfmlalb_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_0_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[0]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_1_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[1]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_2_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[2]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_3_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[3]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_4_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[4]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_5_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[5]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_6_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[6]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_7_f32:
; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[7]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
  ret <vscale x 4 x float> %out
}

;
; BFMLALT
;

define <vscale x 4 x float> @bfmlalt_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_0_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[0]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_1_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[1]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_2_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[2]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_3_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[3]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_4_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[4]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_5_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[5]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_6_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[6]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_7_f32:
; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[7]
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
  ret <vscale x 4 x float> %out
}

;
; BFMMLA
;

define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmmla_f32:
; CHECK-NEXT:  bfmmla z0.s, z1.h, z2.h
; CHECK-NEXT:  ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

;
; BFCVT
;

define <vscale x 8 x bfloat> @fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
; CHECK-LABEL: fcvt_bf16_f32:
; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s
; CHECK-NEXT: ret
  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
  ret <vscale x 8 x bfloat> %out
}

;
; BFCVTNT
;

define <vscale x 8 x bfloat> @fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
; CHECK-LABEL: fcvtnt_bf16_f32:
; CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.s
; CHECK-NEXT: ret
  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
  ret <vscale x 8 x bfloat> %out
}

declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
first commit 2022-04-25 10:02:23 +02:00			`; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s 2>%t \| FileCheck %s`
			`; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t`

			`; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.`
			`; WARN-NOT: warning`

			`;`
			`; BFDOT`
			`;`

			`define <vscale x 4 x float> @bfdot_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfdot_f32:`
			`; CHECK-NEXT: bfdot z0.s, z1.h, z2.h`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfdot_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfdot_lane_0_f32:`
			`; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[0]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfdot_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfdot_lane_1_f32:`
			`; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[1]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfdot_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfdot_lane_2_f32:`
			`; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[2]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfdot_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfdot_lane_3_f32:`
			`; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[3]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)`
			`ret <vscale x 4 x float> %out`
			`}`

			`;`
			`; BFMLALB`
			`;`

			`define <vscale x 4 x float> @bfmlalb_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalb_f32:`
			`; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalb_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalb_lane_0_f32:`
			`; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[0]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalb_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalb_lane_1_f32:`
			`; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[1]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalb_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalb_lane_2_f32:`
			`; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[2]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalb_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalb_lane_3_f32:`
			`; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[3]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalb_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalb_lane_4_f32:`
			`; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[4]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalb_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalb_lane_5_f32:`
			`; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[5]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalb_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalb_lane_6_f32:`
			`; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[6]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalb_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalb_lane_7_f32:`
			`; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[7]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)`
			`ret <vscale x 4 x float> %out`
			`}`

			`;`
			`; BFMLALT`
			`;`

			`define <vscale x 4 x float> @bfmlalt_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalt_f32:`
			`; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalt_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalt_lane_0_f32:`
			`; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[0]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalt_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalt_lane_1_f32:`
			`; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[1]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalt_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalt_lane_2_f32:`
			`; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[2]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalt_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalt_lane_3_f32:`
			`; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[3]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalt_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalt_lane_4_f32:`
			`; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[4]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalt_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalt_lane_5_f32:`
			`; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[5]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalt_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalt_lane_6_f32:`
			`; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[6]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)`
			`ret <vscale x 4 x float> %out`
			`}`

			`define <vscale x 4 x float> @bfmlalt_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmlalt_lane_7_f32:`
			`; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[7]`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)`
			`ret <vscale x 4 x float> %out`
			`}`

			`;`
			`; BFMMLA`
			`;`

			`define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {`
			`; CHECK-LABEL: bfmmla_f32:`
			`; CHECK-NEXT: bfmmla z0.s, z1.h, z2.h`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)`
			`ret <vscale x 4 x float> %out`
			`}`

			`;`
			`; BFCVT`
			`;`

			`define <vscale x 8 x bfloat> @fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {`
			`; CHECK-LABEL: fcvt_bf16_f32:`
			`; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)`
			`ret <vscale x 8 x bfloat> %out`
			`}`

			`;`
			`; BFCVTNT`
			`;`

			`define <vscale x 8 x bfloat> @fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {`
			`; CHECK-LABEL: fcvtnt_bf16_f32:`
			`; CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.s`
			`; CHECK-NEXT: ret`
			`%out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)`
			`ret <vscale x 8 x bfloat> %out`
			`}`

			`declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)`
			`declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)`
			`declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)`
			`declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)`
			`declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)`
			`declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)`
			`declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)`
			`declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)`
			`declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)`