; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

; PR34072 - failure to canonicalize to (sub (shuffle a, b),(shuffle a, b)) for optimal horizontal sub patterns (with undemanded elements)

;
; v8i16
;

; Horizontal subtract of two <8 x i16> vectors with every result lane demanded.
; The input spells the pattern out with scalar extractelement/sub/insertelement;
; the shared assertions expect every opt invocation to reduce it to an
; even-lane shuffle, an odd-lane shuffle, and one vector sub.
define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: @sub_v8i16_01234567(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1), (2,3), (4,5), (6,7).
  %a0 = extractelement <8 x i16> %a, i32 0
  %a1 = extractelement <8 x i16> %a, i32 1
  %a2 = extractelement <8 x i16> %a, i32 2
  %a3 = extractelement <8 x i16> %a, i32 3
  %a4 = extractelement <8 x i16> %a, i32 4
  %a5 = extractelement <8 x i16> %a, i32 5
  %a6 = extractelement <8 x i16> %a, i32 6
  %a7 = extractelement <8 x i16> %a, i32 7
  %a01 = sub i16 %a0, %a1
  %a23 = sub i16 %a2, %a3
  %a45 = sub i16 %a4, %a5
  %a67 = sub i16 %a6, %a7
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <8 x i16> %b, i32 0
  %b1 = extractelement <8 x i16> %b, i32 1
  %b2 = extractelement <8 x i16> %b, i32 2
  %b3 = extractelement <8 x i16> %b, i32 3
  %b4 = extractelement <8 x i16> %b, i32 4
  %b5 = extractelement <8 x i16> %b, i32 5
  %b6 = extractelement <8 x i16> %b, i32 6
  %b7 = extractelement <8 x i16> %b, i32 7
  %b01 = sub i16 %b0, %b1
  %b23 = sub i16 %b2, %b3
  %b45 = sub i16 %b4, %b5
  %b67 = sub i16 %b6, %b7
  ; Assemble the result: %a differences in lanes 0-3, %b differences in lanes 4-7.
  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
  ; Identity mask: all indices are below 8, so only %hsub7 is selected and the
  ; second operand %a is never read.
  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %result
}

; Same <8 x i16> horizontal-subtract input, but the final shuffle mask leaves
; lane 0 as poison, so the %a0 - %a1 difference is not demanded.
; The expected output depends on the target: the -mcpu=x86-64 runs keep lanes
; 1-3 as separate single-lane shuffle/sub steps that are blended with a
; vectorized %b half, while the remaining runs expect the two-shuffle-plus-sub
; form with a poison index in lane 0 of both shuffle masks.
define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @sub_v8i16_u1234567(
; SSE2-NEXT:    [[SHIFT2:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[A]], [[SHIFT2]]
; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
; SSE2-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[A]], [[SHIFT3]]
; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[HSUB1:%.*]] = sub <8 x i16> [[TMP7]], [[TMP4]]
; SSE2-NEXT:    [[HSUB2:%.*]] = shufflevector <8 x i16> [[HSUB1]], <8 x i16> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[HSUB3:%.*]] = shufflevector <8 x i16> [[HSUB2]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HSUB3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
;
; SSE4-LABEL: @sub_v8i16_u1234567(
; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; SSE4-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
;
; AVX-LABEL: @sub_v8i16_u1234567(
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
; AVX-NEXT:    ret <8 x i16> [[TMP7]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1), (2,3), (4,5), (6,7).
  %a0 = extractelement <8 x i16> %a, i32 0
  %a1 = extractelement <8 x i16> %a, i32 1
  %a2 = extractelement <8 x i16> %a, i32 2
  %a3 = extractelement <8 x i16> %a, i32 3
  %a4 = extractelement <8 x i16> %a, i32 4
  %a5 = extractelement <8 x i16> %a, i32 5
  %a6 = extractelement <8 x i16> %a, i32 6
  %a7 = extractelement <8 x i16> %a, i32 7
  %a01 = sub i16 %a0, %a1
  %a23 = sub i16 %a2, %a3
  %a45 = sub i16 %a4, %a5
  %a67 = sub i16 %a6, %a7
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <8 x i16> %b, i32 0
  %b1 = extractelement <8 x i16> %b, i32 1
  %b2 = extractelement <8 x i16> %b, i32 2
  %b3 = extractelement <8 x i16> %b, i32 3
  %b4 = extractelement <8 x i16> %b, i32 4
  %b5 = extractelement <8 x i16> %b, i32 5
  %b6 = extractelement <8 x i16> %b, i32 6
  %b7 = extractelement <8 x i16> %b, i32 7
  %b01 = sub i16 %b0, %b1
  %b23 = sub i16 %b2, %b3
  %b45 = sub i16 %b4, %b5
  %b67 = sub i16 %b6, %b7
  ; Assemble the result: %a differences in lanes 0-3, %b differences in lanes 4-7.
  ; %a01 is still inserted here even though the final mask discards lane 0.
  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
  ; Lane 0 is poison (undemanded); lanes 1-7 come from %hsub7. All indices are
  ; below 8, so the second operand %a is never read.
  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %result
}

;
; v4i32
;

; Horizontal subtract of two <4 x i32> vectors with every result lane demanded.
; The shared assertions expect an even-lane shuffle, an odd-lane shuffle, and
; one vector sub for every opt invocation.
define <4 x i32> @sub_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @sub_v4i32_0123(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x i32> %a, i32 0
  %a1 = extractelement <4 x i32> %a, i32 1
  %a2 = extractelement <4 x i32> %a, i32 2
  %a3 = extractelement <4 x i32> %a, i32 3
  %a01 = sub i32 %a0, %a1
  %a23 = sub i32 %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x i32> %b, i32 0
  %b1 = extractelement <4 x i32> %b, i32 1
  %b2 = extractelement <4 x i32> %b, i32 2
  %b3 = extractelement <4 x i32> %b, i32 3
  %b01 = sub i32 %b0, %b1
  %b23 = sub i32 %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Identity mask: all indices are below 4, so the second operand %a is never read.
  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %result
}

; <4 x i32> horizontal subtract where the final shuffle mask leaves lane 0 as
; poison. The shared assertions expect the shuffle/shuffle/sub form with a
; poison index in lane 0 of both shuffle masks.
define <4 x i32> @sub_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @sub_v4i32_u123(
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x i32> %a, i32 0
  %a1 = extractelement <4 x i32> %a, i32 1
  %a2 = extractelement <4 x i32> %a, i32 2
  %a3 = extractelement <4 x i32> %a, i32 3
  %a01 = sub i32 %a0, %a1
  %a23 = sub i32 %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x i32> %b, i32 0
  %b1 = extractelement <4 x i32> %b, i32 1
  %b2 = extractelement <4 x i32> %b, i32 2
  %b3 = extractelement <4 x i32> %b, i32 3
  %b01 = sub i32 %b0, %b1
  %b23 = sub i32 %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lane 0 is poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
  ret <4 x i32> %result
}

; <4 x i32> horizontal subtract where the final shuffle mask leaves lane 1 as
; poison. The shared assertions expect the shuffle/shuffle/sub form with a
; poison index in lane 1 of both shuffle masks.
define <4 x i32> @sub_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @sub_v4i32_0u23(
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x i32> %a, i32 0
  %a1 = extractelement <4 x i32> %a, i32 1
  %a2 = extractelement <4 x i32> %a, i32 2
  %a3 = extractelement <4 x i32> %a, i32 3
  %a01 = sub i32 %a0, %a1
  %a23 = sub i32 %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x i32> %b, i32 0
  %b1 = extractelement <4 x i32> %b, i32 1
  %b2 = extractelement <4 x i32> %b, i32 2
  %b3 = extractelement <4 x i32> %b, i32 3
  %b01 = sub i32 %b0, %b1
  %b23 = sub i32 %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lane 1 is poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
  ret <4 x i32> %result
}

; <4 x i32> horizontal subtract where the final shuffle mask leaves lane 2 as
; poison. Each target prefix carries its own assertion group here, but all four
; groups expect the same shuffle/shuffle/sub form with a poison index in lane 2.
define <4 x i32> @sub_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: @sub_v4i32_01u3(
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
; SSE2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
;
; SSE4-LABEL: @sub_v4i32_01u3(
; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
; SSE4-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
;
; AVX2-LABEL: @sub_v4i32_01u3(
; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
; AVX2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
;
; AVX512-LABEL: @sub_v4i32_01u3(
; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
; AVX512-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
; AVX512-NEXT:    ret <4 x i32> [[TMP4]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x i32> %a, i32 0
  %a1 = extractelement <4 x i32> %a, i32 1
  %a2 = extractelement <4 x i32> %a, i32 2
  %a3 = extractelement <4 x i32> %a, i32 3
  %a01 = sub i32 %a0, %a1
  %a23 = sub i32 %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x i32> %b, i32 0
  %b1 = extractelement <4 x i32> %b, i32 1
  %b2 = extractelement <4 x i32> %b, i32 2
  %b3 = extractelement <4 x i32> %b, i32 3
  %b01 = sub i32 %b0, %b1
  %b23 = sub i32 %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lane 2 is poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
  ret <4 x i32> %result
}

; <4 x i32> horizontal subtract where the final shuffle mask leaves lane 3 as
; poison. The shared assertions expect the shuffle/shuffle/sub form with a
; poison index in lane 3 of both shuffle masks.
define <4 x i32> @sub_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @sub_v4i32_012u(
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x i32> %a, i32 0
  %a1 = extractelement <4 x i32> %a, i32 1
  %a2 = extractelement <4 x i32> %a, i32 2
  %a3 = extractelement <4 x i32> %a, i32 3
  %a01 = sub i32 %a0, %a1
  %a23 = sub i32 %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x i32> %b, i32 0
  %b1 = extractelement <4 x i32> %b, i32 1
  %b2 = extractelement <4 x i32> %b, i32 2
  %b3 = extractelement <4 x i32> %b, i32 3
  %b01 = sub i32 %b0, %b1
  %b23 = sub i32 %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lane 3 is poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
  ret <4 x i32> %result
}

; <4 x i32> horizontal subtract where only lanes 2-3 (the %b differences) are
; demanded. The shared assertions expect the output to shuffle %b alone against
; poison; %a does not appear in the expected instructions.
define <4 x i32> @sub_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @sub_v4i32_uu23(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
; CHECK-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
;
  ; Differences of the adjacent lane pairs of %a; both end up in undemanded lanes.
  %a0 = extractelement <4 x i32> %a, i32 0
  %a1 = extractelement <4 x i32> %a, i32 1
  %a2 = extractelement <4 x i32> %a, i32 2
  %a3 = extractelement <4 x i32> %a, i32 3
  %a01 = sub i32 %a0, %a1
  %a23 = sub i32 %a2, %a3
  ; Differences of the adjacent lane pairs of %b: (0,1) and (2,3).
  %b0 = extractelement <4 x i32> %b, i32 0
  %b1 = extractelement <4 x i32> %b, i32 1
  %b2 = extractelement <4 x i32> %b, i32 2
  %b3 = extractelement <4 x i32> %b, i32 3
  %b01 = sub i32 %b0, %b1
  %b23 = sub i32 %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lanes 0-1 are poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
  ret <4 x i32> %result
}

; <4 x i32> horizontal subtract where only lanes 0-1 (the %a differences) are
; demanded. The shared assertions expect the output to shuffle %a alone against
; poison; %b does not appear in the expected instructions.
define <4 x i32> @sub_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @sub_v4i32_01uu(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x i32> %a, i32 0
  %a1 = extractelement <4 x i32> %a, i32 1
  %a2 = extractelement <4 x i32> %a, i32 2
  %a3 = extractelement <4 x i32> %a, i32 3
  %a01 = sub i32 %a0, %a1
  %a23 = sub i32 %a2, %a3
  ; Differences of the adjacent lane pairs of %b; both end up in undemanded lanes.
  %b0 = extractelement <4 x i32> %b, i32 0
  %b1 = extractelement <4 x i32> %b, i32 1
  %b2 = extractelement <4 x i32> %b, i32 2
  %b3 = extractelement <4 x i32> %b, i32 3
  %b01 = sub i32 %b0, %b1
  %b23 = sub i32 %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lanes 2-3 are poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  ret <4 x i32> %result
}

;
; v8i32
;

; Horizontal subtract of two <8 x i32> vectors with every result lane demanded.
; Unlike the <8 x i16> tests, the differences are interleaved per group of four
; lanes: %a pairs in lanes 0-1 and 4-5, %b pairs in lanes 2-3 and 6-7.
; NOTE(review): this looks like the per-128-bit-half layout of a 256-bit
; horizontal op - confirm.
; The shared assertions expect two shuffles whose masks follow that
; interleaving, followed by one vector sub.
define <8 x i32> @sub_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: @sub_v8i32_01234567(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1), (2,3), (4,5), (6,7).
  %a0 = extractelement <8 x i32> %a, i32 0
  %a1 = extractelement <8 x i32> %a, i32 1
  %a2 = extractelement <8 x i32> %a, i32 2
  %a3 = extractelement <8 x i32> %a, i32 3
  %a4 = extractelement <8 x i32> %a, i32 4
  %a5 = extractelement <8 x i32> %a, i32 5
  %a6 = extractelement <8 x i32> %a, i32 6
  %a7 = extractelement <8 x i32> %a, i32 7
  %a01 = sub i32 %a0, %a1
  %a23 = sub i32 %a2, %a3
  %a45 = sub i32 %a4, %a5
  %a67 = sub i32 %a6, %a7
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <8 x i32> %b, i32 0
  %b1 = extractelement <8 x i32> %b, i32 1
  %b2 = extractelement <8 x i32> %b, i32 2
  %b3 = extractelement <8 x i32> %b, i32 3
  %b4 = extractelement <8 x i32> %b, i32 4
  %b5 = extractelement <8 x i32> %b, i32 5
  %b6 = extractelement <8 x i32> %b, i32 6
  %b7 = extractelement <8 x i32> %b, i32 7
  %b01 = sub i32 %b0, %b1
  %b23 = sub i32 %b2, %b3
  %b45 = sub i32 %b4, %b5
  %b67 = sub i32 %b6, %b7
  ; Assemble the result as [a01, a23, b01, b23, a45, a67, b45, b67].
  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
  ; Identity mask: all indices are below 8, so the second operand %a is never read.
  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %result
}

; <8 x i32> horizontal subtract with the interleaved layout
; [a01, a23, b01, b23, a45, a67, b45, b67], where the final shuffle mask leaves
; lane 5 (the %a6 - %a7 difference) as poison.
; The expected output depends on the target: the -mcpu=x86-64-v2 runs keep a
; scalar extract/sub/insert for the %a4 - %a5 difference and vectorize the rest
; in two pieces, while the other runs expect the two-shuffle-plus-sub form with
; a poison index in lane 5.
define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: @sub_v8i32_01234u67(
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
; SSE2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[TMP3]]
; SSE2-NEXT:    ret <8 x i32> [[TMP4]]
;
; SSE4-LABEL: @sub_v8i32_01234u67(
; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
; SSE4-NEXT:    [[A45:%.*]] = sub i32 [[A4]], [[A5]]
; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]]
; SSE4-NEXT:    [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP6:%.*]] = sub <8 x i32> [[TMP7]], [[TMP5]]
; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
;
; AVX-LABEL: @sub_v8i32_01234u67(
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
; AVX-NEXT:    ret <8 x i32> [[TMP7]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1), (2,3), (4,5), (6,7).
  %a0 = extractelement <8 x i32> %a, i32 0
  %a1 = extractelement <8 x i32> %a, i32 1
  %a2 = extractelement <8 x i32> %a, i32 2
  %a3 = extractelement <8 x i32> %a, i32 3
  %a4 = extractelement <8 x i32> %a, i32 4
  %a5 = extractelement <8 x i32> %a, i32 5
  %a6 = extractelement <8 x i32> %a, i32 6
  %a7 = extractelement <8 x i32> %a, i32 7
  %a01 = sub i32 %a0, %a1
  %a23 = sub i32 %a2, %a3
  %a45 = sub i32 %a4, %a5
  %a67 = sub i32 %a6, %a7
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <8 x i32> %b, i32 0
  %b1 = extractelement <8 x i32> %b, i32 1
  %b2 = extractelement <8 x i32> %b, i32 2
  %b3 = extractelement <8 x i32> %b, i32 3
  %b4 = extractelement <8 x i32> %b, i32 4
  %b5 = extractelement <8 x i32> %b, i32 5
  %b6 = extractelement <8 x i32> %b, i32 6
  %b7 = extractelement <8 x i32> %b, i32 7
  %b01 = sub i32 %b0, %b1
  %b23 = sub i32 %b2, %b3
  %b45 = sub i32 %b4, %b5
  %b67 = sub i32 %b6, %b7
  ; Assemble the result as [a01, a23, b01, b23, a45, a67, b45, b67].
  ; %a67 is still inserted here even though the final mask discards lane 5.
  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
  ; Lane 5 is poison (undemanded); all other indices are below 8, so the second
  ; operand %a is never read.
  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 6, i32 7>
  ret <8 x i32> %result
}

;
; v4f32
;

; Floating-point horizontal subtract of two <4 x float> vectors with every
; result lane demanded. The shared assertions expect an even-lane shuffle, an
; odd-lane shuffle, and one vector fsub for every opt invocation.
define <4 x float> @sub_v4f32_0123(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @sub_v4f32_0123(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <4 x float> [[TMP3]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %a01 = fsub float %a0, %a1
  %a23 = fsub float %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %b01 = fsub float %b0, %b1
  %b23 = fsub float %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Identity mask: all indices are below 4, so the second operand %a is never read.
  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %result
}

; <4 x float> horizontal subtract where the final shuffle mask leaves lane 0 as
; poison. The shared assertions expect the shuffle/shuffle/fsub form with a
; poison index in lane 0 of both shuffle masks.
define <4 x float> @sub_v4f32_u123(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @sub_v4f32_u123(
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    ret <4 x float> [[TMP4]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %a01 = fsub float %a0, %a1
  %a23 = fsub float %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %b01 = fsub float %b0, %b1
  %b23 = fsub float %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Lane 0 is poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
  ret <4 x float> %result
}

; <4 x float> horizontal subtract where the final shuffle mask leaves lane 1 as
; poison. The shared assertions expect the shuffle/shuffle/fsub form with a
; poison index in lane 1 of both shuffle masks.
define <4 x float> @sub_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @sub_v4f32_0u23(
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    ret <4 x float> [[TMP4]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %a01 = fsub float %a0, %a1
  %a23 = fsub float %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %b01 = fsub float %b0, %b1
  %b23 = fsub float %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Lane 1 is poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
  ret <4 x float> %result
}

; <4 x float> horizontal subtract where the final shuffle mask leaves lane 2 as
; poison. The shared assertions expect the shuffle/shuffle/fsub form with a
; poison index in lane 2 of both shuffle masks.
define <4 x float> @sub_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @sub_v4f32_01u3(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %a01 = fsub float %a0, %a1
  %a23 = fsub float %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %b01 = fsub float %b0, %b1
  %b23 = fsub float %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Lane 2 is poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
  ret <4 x float> %result
}

; <4 x float> horizontal subtract where the final shuffle mask leaves lane 3 as
; poison. Each target prefix carries its own assertion group here; the groups
; differ only in their capture-variable names and all expect the same
; shuffle/shuffle/fsub form with a poison index in lane 3.
define <4 x float> @sub_v4f32_012u(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: @sub_v4f32_012u(
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
; SSE2-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
;
; SSE4-LABEL: @sub_v4f32_012u(
; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
; SSE4-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
; SSE4-NEXT:    ret <4 x float> [[TMP4]]
;
; AVX2-LABEL: @sub_v4f32_012u(
; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
; AVX2-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
; AVX2-NEXT:    ret <4 x float> [[TMP4]]
;
; AVX512-LABEL: @sub_v4f32_012u(
; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
; AVX512-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
;
  ; Differences of the adjacent lane pairs of %a: (0,1) and (2,3).
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %a01 = fsub float %a0, %a1
  %a23 = fsub float %a2, %a3
  ; The same adjacent-pair differences for %b.
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %b01 = fsub float %b0, %b1
  %b23 = fsub float %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Lane 3 is poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
  ret <4 x float> %result
}

; <4 x float> horizontal subtract where only lanes 2-3 (the %b differences) are
; demanded. The shared assertions expect both shuffles to read one and the same
; input vector (captured under a generic name) against poison, then one fsub.
define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @sub_v4f32_uu23(
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
; CHECK-NEXT:    [[RESULT2:%.*]] = fsub <4 x float> [[TMP2]], [[RESULT1]]
; CHECK-NEXT:    ret <4 x float> [[RESULT2]]
;
  ; Differences of the adjacent lane pairs of %a; both end up in undemanded lanes.
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %a01 = fsub float %a0, %a1
  %a23 = fsub float %a2, %a3
  ; Differences of the adjacent lane pairs of %b: (0,1) and (2,3).
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %b01 = fsub float %b0, %b1
  %b23 = fsub float %b2, %b3
  ; Assemble the result: %a differences in lanes 0-1, %b differences in lanes 2-3.
  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Lanes 0-1 are poison (undemanded); the second operand %a is never selected.
  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
  ret <4 x float> %result
}

; Horizontal fsub on <4 x float> with only the lower two result lanes demanded.
; The packed vector is <a0-a1, a2-a3, b0-b1, b2-b3>; the final shuffle keeps
; lanes 0 and 1 (the %a differences) and leaves lanes 2 and 3 poison. The
; expected output is the same for every tested configuration: two shuffles of
; one wildcard-captured source vector followed by a single vector fsub.
define <4 x float> @sub_v4f32_01uu(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @sub_v4f32_01uu(
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    ret <4 x float> [[TMP4]]
;
  ; Differences of adjacent element pairs of %a.
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %a01 = fsub float %a0, %a1
  %a23 = fsub float %a2, %a3
  ; Differences of adjacent element pairs of %b (not demanded by the result).
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %b01 = fsub float %b0, %b1
  %b23 = fsub float %b2, %b3
  ; Pack the four differences into one vector.
  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Keep lanes 0 and 1 only; the mask never selects from the second operand %a.
  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  ret <4 x float> %result
}

;
; v8f32
;

; Horizontal fsub on <8 x float> with every result lane demanded.
; The packed vector is laid out as
;   <a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5, b6-b7>
; i.e. the %a and %b differences alternate per group of four input elements.
; All tested configurations expect the same output: one shuffle gathering the
; minuends, one gathering the subtrahends, and a single <8 x float> fsub.
define <8 x float> @sub_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
; CHECK-LABEL: @sub_v8f32_01234567(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <8 x float> [[TMP3]]
;
  ; Differences of adjacent element pairs of %a.
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %a2 = extractelement <8 x float> %a, i32 2
  %a3 = extractelement <8 x float> %a, i32 3
  %a4 = extractelement <8 x float> %a, i32 4
  %a5 = extractelement <8 x float> %a, i32 5
  %a6 = extractelement <8 x float> %a, i32 6
  %a7 = extractelement <8 x float> %a, i32 7
  %a01 = fsub float %a0, %a1
  %a23 = fsub float %a2, %a3
  %a45 = fsub float %a4, %a5
  %a67 = fsub float %a6, %a7
  ; Differences of adjacent element pairs of %b.
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %b2 = extractelement <8 x float> %b, i32 2
  %b3 = extractelement <8 x float> %b, i32 3
  %b4 = extractelement <8 x float> %b, i32 4
  %b5 = extractelement <8 x float> %b, i32 5
  %b6 = extractelement <8 x float> %b, i32 6
  %b7 = extractelement <8 x float> %b, i32 7
  %b01 = fsub float %b0, %b1
  %b23 = fsub float %b2, %b3
  %b45 = fsub float %b4, %b5
  %b67 = fsub float %b6, %b7
  ; Pack the eight differences into one vector.
  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
  ; Identity mask over the first operand: all eight lanes are demanded.
  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %result
}

; Horizontal fsub on <8 x float> where result lane 3 (b2-b3) is not demanded.
; The packed vector is laid out as
;   <a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5, b6-b7>
; and the final shuffle replaces lane 3 with poison.
; Expected output differs per configuration:
;  - x86-64-v3 and x86-64-v4: two 8-wide shuffles and one <8 x float> fsub.
;  - baseline x86-64 and x86-64-v2: a scalar fsub for a6-a7 remains and is
;    inserted at lane 5; the b4-b5 / b6-b7 lanes come from a separate vector
;    fsub that is blended in with a trailing shuffle.
define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: @sub_v8f32_012u4567(
; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
; SSE2-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP5]], [[TMP8]]
; SSE2-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
; SSE2-NEXT:    ret <8 x float> [[RESULT]]
;
; SSE4-LABEL: @sub_v8f32_012u4567(
; SSE4-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
; SSE4-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
; SSE4-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP2]], [[TMP5]]
; SSE4-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP7]]
; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
; SSE4-NEXT:    ret <8 x float> [[RESULT]]
;
; AVX-LABEL: @sub_v8f32_012u4567(
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
; AVX-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
; AVX-NEXT:    ret <8 x float> [[TMP7]]
;
  ; Differences of adjacent element pairs of %a.
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %a2 = extractelement <8 x float> %a, i32 2
  %a3 = extractelement <8 x float> %a, i32 3
  %a4 = extractelement <8 x float> %a, i32 4
  %a5 = extractelement <8 x float> %a, i32 5
  %a6 = extractelement <8 x float> %a, i32 6
  %a7 = extractelement <8 x float> %a, i32 7
  %a01 = fsub float %a0, %a1
  %a23 = fsub float %a2, %a3
  %a45 = fsub float %a4, %a5
  %a67 = fsub float %a6, %a7
  ; Differences of adjacent element pairs of %b (%b23 ends up undemanded).
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %b2 = extractelement <8 x float> %b, i32 2
  %b3 = extractelement <8 x float> %b, i32 3
  %b4 = extractelement <8 x float> %b, i32 4
  %b5 = extractelement <8 x float> %b, i32 5
  %b6 = extractelement <8 x float> %b, i32 6
  %b7 = extractelement <8 x float> %b, i32 7
  %b01 = fsub float %b0, %b1
  %b23 = fsub float %b2, %b3
  %b45 = fsub float %b4, %b5
  %b67 = fsub float %b6, %b7
  ; Pack the eight differences into one vector.
  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
  ; Drop lane 3; the mask never selects from the second operand %a.
  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %result
}

;
; v2f64
;

; Horizontal fsub on <2 x double> with both result lanes demanded:
; the result is <a0-a1, b0-b1>. Every tested configuration expects one shuffle
; gathering element 0 of each input, one gathering element 1 of each input,
; and a single <2 x double> fsub.
define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_01(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <2 x double> [[TMP3]]
;
  ; Difference of the two elements of %a.
  %a0 = extractelement <2 x double> %a, i32 0
  %a1 = extractelement <2 x double> %a, i32 1
  %a01 = fsub double %a0, %a1
  ; Difference of the two elements of %b.
  %b0 = extractelement <2 x double> %b, i32 0
  %b1 = extractelement <2 x double> %b, i32 1
  %b01 = fsub double %b0, %b1
  ; Pack both differences, then apply an identity mask over the first operand.
  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %result
}

; Horizontal fsub on <2 x double> where only lane 1 (b0-b1) is demanded.
; Every tested configuration expects a single shuffle that moves element 0 of
; %b into lane 1, followed by a vector fsub against %b itself, so lane 1 of
; the result is b0-b1 and lane 0 is poison.
define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_u1(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <2 x double> [[TMP1]], [[B]]
; CHECK-NEXT:    ret <2 x double> [[RESULT1]]
;
  ; Difference of the two elements of %a (not demanded by the result).
  %a0 = extractelement <2 x double> %a, i32 0
  %a1 = extractelement <2 x double> %a, i32 1
  %a01 = fsub double %a0, %a1
  ; Difference of the two elements of %b.
  %b0 = extractelement <2 x double> %b, i32 0
  %b1 = extractelement <2 x double> %b, i32 1
  %b01 = fsub double %b0, %b1
  ; Pack both differences, then keep lane 1 only.
  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 poison, i32 1>
  ret <2 x double> %result
}

; Horizontal fsub on <2 x double> where only lane 0 (a0-a1) is demanded.
; Every tested configuration expects a single shuffle that moves element 1 of
; a wildcard-captured source vector into lane 0, followed by a vector fsub of
; that source minus the shuffle; lane 1 of the result is poison.
define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_0u(
; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <2 x double> [[TMP1]], [[RESULT]]
; CHECK-NEXT:    ret <2 x double> [[RESULT1]]
;
  ; Difference of the two elements of %a.
  %a0 = extractelement <2 x double> %a, i32 0
  %a1 = extractelement <2 x double> %a, i32 1
  %a01 = fsub double %a0, %a1
  ; Difference of the two elements of %b (not demanded by the result).
  %b0 = extractelement <2 x double> %b, i32 0
  %b1 = extractelement <2 x double> %b, i32 1
  %b01 = fsub double %b0, %b1
  ; Pack both differences, then keep lane 0 only.
  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 poison>
  ret <2 x double> %result
}

;
; v4f64
;

; Horizontal fsub on <4 x double> with every result lane demanded.
; The packed vector is <a0-a1, b0-b1, a2-a3, b2-b3>, i.e. %a and %b differences
; are interleaved. Every tested configuration expects one shuffle gathering the
; minuends, one gathering the subtrahends, and a single <4 x double> fsub.
define <4 x double> @sub_v4f64_0123(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @sub_v4f64_0123(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    ret <4 x double> [[TMP3]]
;
  ; Differences of adjacent element pairs of %a.
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %a01 = fsub double %a0, %a1
  %a23 = fsub double %a2, %a3
  ; Differences of adjacent element pairs of %b.
  %b0 = extractelement <4 x double> %b, i32 0
  %b1 = extractelement <4 x double> %b, i32 1
  %b2 = extractelement <4 x double> %b, i32 2
  %b3 = extractelement <4 x double> %b, i32 3
  %b01 = fsub double %b0, %b1
  %b23 = fsub double %b2, %b3
  ; Pack the differences, alternating %a and %b results.
  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Identity mask over the first operand: all four lanes are demanded.
  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %result
}

; Horizontal fsub on <4 x double> where result lane 0 (a0-a1) is not demanded.
; The packed vector is <a0-a1, b0-b1, a2-a3, b2-b3>; the final shuffle keeps
; lanes 1, 2 and 3.
; Expected output differs per configuration:
;  - x86-64-v3 and x86-64-v4: two 4-wide shuffles and one <4 x double> fsub.
;  - baseline x86-64 and x86-64-v2: b0-b1 and a2-a3 are computed with a vector
;    fsub (2-wide on baseline, 4-wide on v2) while b2-b3 stays a scalar fsub
;    that is inserted at lane 3.
define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
; SSE2-LABEL: @sub_v4f64_u123(
; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
; SSE2-NEXT:    ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_u123(
; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
; SSE4-NEXT:    ret <4 x double> [[RESULT]]
;
; AVX-LABEL: @sub_v4f64_u123(
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
; AVX-NEXT:    ret <4 x double> [[TMP4]]
;
  ; Differences of adjacent element pairs of %a (%a01 ends up undemanded).
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %a01 = fsub double %a0, %a1
  %a23 = fsub double %a2, %a3
  ; Differences of adjacent element pairs of %b.
  %b0 = extractelement <4 x double> %b, i32 0
  %b1 = extractelement <4 x double> %b, i32 1
  %b2 = extractelement <4 x double> %b, i32 2
  %b3 = extractelement <4 x double> %b, i32 3
  %b01 = fsub double %b0, %b1
  %b23 = fsub double %b2, %b3
  ; Pack the differences, alternating %a and %b results.
  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Drop lane 0; the mask never selects from the second operand %a.
  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
  ret <4 x double> %result
}

; Horizontal fsub on <4 x double> where result lane 1 (b0-b1) is not demanded.
; The packed vector is <a0-a1, b0-b1, a2-a3, b2-b3>; the final shuffle keeps
; lanes 0, 2 and 3.
; Expected output differs per configuration:
;  - x86-64-v3 and x86-64-v4: two 4-wide shuffles and one <4 x double> fsub.
;  - baseline x86-64 and x86-64-v2: a0-a1 and a2-a3 come from a vector fsub
;    (2-wide on baseline, 4-wide on v2) while b2-b3 stays a scalar fsub that
;    is inserted at lane 3.
; NOTE(review): in the x86-64-v3/v4 expectation the first shuffle mask has
; index 1 in the undemanded lane 1 while the second has poison there, so that
; lane of the fsub is poison either way. Presumably this mirrors the optimizer
; output as captured by the autogeneration script - confirm when regenerating.
define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
; SSE2-LABEL: @sub_v4f64_0u23(
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
; SSE2-NEXT:    ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_0u23(
; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
; SSE4-NEXT:    [[TMP2:%.*]] = fsub <4 x double> [[TMP1]], [[TMP3]]
; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP2]], double [[B23]], i64 3
; SSE4-NEXT:    ret <4 x double> [[RESULT]]
;
; AVX-LABEL: @sub_v4f64_0u23(
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
; AVX-NEXT:    ret <4 x double> [[TMP4]]
;
  ; Differences of adjacent element pairs of %a.
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %a01 = fsub double %a0, %a1
  %a23 = fsub double %a2, %a3
  ; Differences of adjacent element pairs of %b (%b01 ends up undemanded).
  %b0 = extractelement <4 x double> %b, i32 0
  %b1 = extractelement <4 x double> %b, i32 1
  %b2 = extractelement <4 x double> %b, i32 2
  %b3 = extractelement <4 x double> %b, i32 3
  %b01 = fsub double %b0, %b1
  %b23 = fsub double %b2, %b3
  ; Pack the differences, alternating %a and %b results.
  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Drop lane 1; the mask never selects from the second operand %a.
  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
  ret <4 x double> %result
}

; Horizontal fsub on <4 x double> where result lane 2 (a2-a3) is not demanded.
; The packed vector is <a0-a1, b0-b1, a2-a3, b2-b3>; the final shuffle keeps
; lanes 0, 1 and 3.
; Expected output differs per configuration:
;  - x86-64-v3 and x86-64-v4: two 4-wide shuffles and one <4 x double> fsub.
;  - baseline x86-64 and x86-64-v2: a0-a1 and b0-b1 come from a vector fsub
;    (2-wide on baseline, 4-wide on v2) while b2-b3 stays a scalar fsub that
;    is inserted at lane 3.
define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
; SSE2-LABEL: @sub_v4f64_01u3(
; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
; SSE2-NEXT:    ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_01u3(
; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
; SSE4-NEXT:    ret <4 x double> [[RESULT]]
;
; AVX-LABEL: @sub_v4f64_01u3(
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
; AVX-NEXT:    ret <4 x double> [[TMP4]]
;
  ; Differences of adjacent element pairs of %a (%a23 ends up undemanded).
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %a01 = fsub double %a0, %a1
  %a23 = fsub double %a2, %a3
  ; Differences of adjacent element pairs of %b.
  %b0 = extractelement <4 x double> %b, i32 0
  %b1 = extractelement <4 x double> %b, i32 1
  %b2 = extractelement <4 x double> %b, i32 2
  %b3 = extractelement <4 x double> %b, i32 3
  %b01 = fsub double %b0, %b1
  %b23 = fsub double %b2, %b3
  ; Pack the differences, alternating %a and %b results.
  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Drop lane 2; the mask never selects from the second operand %a.
  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
  ret <4 x double> %result
}

; Horizontal fsub on <4 x double> where result lane 3 (b2-b3) is not demanded.
; The packed vector is <a0-a1, b0-b1, a2-a3, b2-b3>; the final shuffle keeps
; lanes 0, 1 and 2.
; Expected output differs per configuration:
;  - x86-64-v3 and x86-64-v4: two 4-wide shuffles and one <4 x double> fsub.
;  - baseline x86-64 and x86-64-v2: a0-a1 and b0-b1 come from a vector fsub
;    (2-wide on baseline, 4-wide on v2) while a2-a3 stays a scalar fsub that
;    is inserted at lane 2.
define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) {
; SSE2-LABEL: @sub_v4f64_012u(
; SSE2-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
; SSE2-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
; SSE2-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A23]], i64 2
; SSE2-NEXT:    ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_012u(
; SSE4-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
; SSE4-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
; SSE4-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
; SSE4-NEXT:    ret <4 x double> [[RESULT]]
;
; AVX-LABEL: @sub_v4f64_012u(
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
; AVX-NEXT:    ret <4 x double> [[TMP4]]
;
  ; Differences of adjacent element pairs of %a.
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %a01 = fsub double %a0, %a1
  %a23 = fsub double %a2, %a3
  ; Differences of adjacent element pairs of %b (%b23 ends up undemanded).
  %b0 = extractelement <4 x double> %b, i32 0
  %b1 = extractelement <4 x double> %b, i32 1
  %b2 = extractelement <4 x double> %b, i32 2
  %b3 = extractelement <4 x double> %b, i32 3
  %b01 = fsub double %b0, %b1
  %b23 = fsub double %b2, %b3
  ; Pack the differences, alternating %a and %b results.
  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Drop lane 3; the mask never selects from the second operand %a.
  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
  ret <4 x double> %result
}

; Horizontal fsub on <4 x double> where only the upper two result lanes
; (a2-a3 and b2-b3) are demanded. The packed vector is
; <a0-a1, b0-b1, a2-a3, b2-b3> and the final shuffle leaves lanes 0 and 1
; poison.
; Expected output: no scalar fsub remains in any configuration. Baseline x86-64
; performs a <2 x double> fsub and widens the result into lanes 2 and 3; the
; x86-64-v2, v3 and v4 expectations use a <4 x double> fsub of two shuffles
; directly.
define <4 x double> @sub_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
; SSE2-LABEL: @sub_v4f64_uu23(
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
; SSE2-NEXT:    ret <4 x double> [[RESULT1]]
;
; SSE4-LABEL: @sub_v4f64_uu23(
; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
; SSE4-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
;
; AVX-LABEL: @sub_v4f64_uu23(
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
; AVX-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
; AVX-NEXT:    ret <4 x double> [[RESULT1]]
;
  ; Differences of adjacent element pairs of %a (%a01 ends up undemanded).
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %a01 = fsub double %a0, %a1
  %a23 = fsub double %a2, %a3
  ; Differences of adjacent element pairs of %b (%b01 ends up undemanded).
  %b0 = extractelement <4 x double> %b, i32 0
  %b1 = extractelement <4 x double> %b, i32 1
  %b2 = extractelement <4 x double> %b, i32 2
  %b3 = extractelement <4 x double> %b, i32 3
  %b01 = fsub double %b0, %b1
  %b23 = fsub double %b2, %b3
  ; Pack the differences, alternating %a and %b results.
  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Keep lanes 2 and 3 only; the mask never selects from the second operand %a.
  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
  ret <4 x double> %result
}

; Horizontal fsub on <4 x double> where only the lower two result lanes
; (a0-a1 and b0-b1) are demanded. The packed vector is
; <a0-a1, b0-b1, a2-a3, b2-b3> and the final shuffle leaves lanes 2 and 3
; poison.
; Expected output: no scalar fsub remains in any configuration. Baseline x86-64
; performs a <2 x double> fsub and widens the result into lanes 0 and 1; the
; x86-64-v2, v3 and v4 expectations use a <4 x double> fsub of two shuffles
; directly.
define <4 x double> @sub_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
; SSE2-LABEL: @sub_v4f64_01uu(
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE2-NEXT:    ret <4 x double> [[TMP4]]
;
; SSE4-LABEL: @sub_v4f64_01uu(
; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT:    ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @sub_v4f64_01uu(
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
; AVX-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; AVX-NEXT:    ret <4 x double> [[TMP3]]
;
  ; Differences of adjacent element pairs of %a (%a23 ends up undemanded).
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %a01 = fsub double %a0, %a1
  %a23 = fsub double %a2, %a3
  ; Differences of adjacent element pairs of %b (%b23 ends up undemanded).
  %b0 = extractelement <4 x double> %b, i32 0
  %b1 = extractelement <4 x double> %b, i32 1
  %b2 = extractelement <4 x double> %b, i32 2
  %b3 = extractelement <4 x double> %b, i32 3
  %b01 = fsub double %b0, %b1
  %b23 = fsub double %b2, %b3
  ; Pack the differences, alternating %a and %b results.
  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Keep lanes 0 and 1 only; the mask never selects from the second operand %a.
  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  ret <4 x double> %result
}
