llvm-for-llvmta/lib/Transforms/Vectorize/VPlanSLP.cpp

//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements SLP analysis based on VPlan. The analysis is based on
/// the ideas described in
///
///   Look-ahead SLP: auto-vectorization in the presence of commutative
///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
///   Luís F. W. Góes
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <cassert>
#include <iterator>
#include <string>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "vplan-slp"

// Exclusive upper bound on the number of levels to look ahead when re-ordering
// multi node operands (getBest tries depths 1 .. LookaheadMaxDepth - 1).
// The value is never modified, so it is declared const.
static const unsigned LookaheadMaxDepth = 5;

/// Records that the current bundle tree could not be completely SLP'ized and
/// hands back the "no node" result so callers can write
/// `return markFailed();`.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: At the moment this only signals that we ran into instructions that
  //        cannot be trivially SLP'ized.
  VPInstruction *const NoNode = nullptr;
  CompletelySLP = false;
  return NoNode;
}

/// Registers \p New as the combined instruction for the bundle \p Operands.
///
/// When every bundle member wraps an IR instruction, the summed scalar bit
/// width of the bundle is also folded into WidestBundleBits. A bundle may be
/// registered only once; registering it twice trips an assertion.
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  auto HasUnderlyingInstr = [](VPValue *Member) {
    return cast<VPInstruction>(Member)->getUnderlyingInstr() != nullptr;
  };

  if (all_of(Operands, HasUnderlyingInstr)) {
    unsigned TotalBits = 0;
    for (VPValue *Member : Operands) {
      Type *ElemTy =
          cast<VPInstruction>(Member)->getUnderlyingInstr()->getType();
      assert(!ElemTy->isVectorTy() && "Only scalar types supported for now");
      TotalBits += ElemTy->getScalarSizeInBits();
    }
    // Keep track of the widest bundle seen so far.
    if (TotalBits > WidestBundleBits)
      WidestBundleBits = TotalBits;
  }

  bool Inserted =
      BundleToCombined.try_emplace(to_vector<4>(Operands), New).second;
  assert(Inserted &&
         "Already created a combined instruction for the operand bundle");
  (void)Inserted;
}

/// Returns true if the bundle \p Operands may be combined into one SLP node.
///
/// The bundle is rejected when any of the following holds:
///  * a member is null, is not a VPInstruction, or has no underlying IR
///    instruction;
///  * the underlying opcodes or primitive type widths differ;
///  * a member lives outside of the basic block this analysis runs on;
///  * a member has more than one unique user;
///  * for loads: an instruction that may write to memory sits between the
///    bundled loads, or a load is not simple;
///  * for stores: a store is not simple.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  const bool HasUnsupportedMember = any_of(Operands, [](VPValue *Member) {
    auto *VPI = dyn_cast_or_null<VPInstruction>(Member);
    return !VPI || !VPI->getUnderlyingInstr();
  });
  if (HasUnsupportedMember) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Every member has to agree with the first one on opcode and type width.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  //        instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *Reference =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = Reference->getOpcode();
  unsigned Width = Reference->getType()->getPrimitiveSizeInBits();
  for (VPValue *Member : Operands) {
    const Instruction *I = cast<VPInstruction>(Member)->getUnderlyingInstr();
    if (I->getOpcode() == Opcode &&
        I->getType()->getPrimitiveSizeInBits() == Width)
      continue;
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  for (VPValue *Member : Operands) {
    if (cast<VPInstruction>(Member)->getParent() == &BB)
      continue;
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  // Members that feed several distinct users are not supported.
  for (VPValue *Member : Operands) {
    if (!Member->hasMoreThanOneUniqueUser())
      continue;
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  if (Opcode == Instruction::Load) {
    // Walk the block in program order. Once the first bundled load has been
    // seen, no instruction may write to memory until the last bundled load.
    // TODO: we only have to forbid instructions writing to memory that could
    //       interfere with any of the loads in the bundle
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    unsigned BundleLoadsVisited = 0;
    for (auto &Recipe : *Parent) {
      auto *VPI = cast<VPInstruction>(&Recipe);
      const bool IsBundleLoad = VPI->getOpcode() == Instruction::Load &&
                                llvm::is_contained(Operands, VPI);
      if (IsBundleLoad)
        ++BundleLoadsVisited;

      if (BundleLoadsVisited == Operands.size())
        break;
      if (BundleLoadsVisited > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    for (VPValue *Member : Operands) {
      const Instruction *I = cast<VPInstruction>(Member)->getUnderlyingInstr();
      if (cast<LoadInst>(I)->isSimple())
        continue;
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  } else if (Opcode == Instruction::Store) {
    for (VPValue *Member : Operands) {
      const Instruction *I = cast<VPInstruction>(Member)->getUnderlyingInstr();
      if (cast<StoreInst>(I)->isSimple())
        continue;
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }
  }

  return true;
}

/// Collects operand number \p OperandIndex of every instruction in \p Values,
/// preserving the order of \p Values.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Column;
  for (unsigned Lane = 0, NumLanes = Values.size(); Lane != NumLanes; ++Lane) {
    // Currently we only support VPInstructions.
    auto *Inst = cast<VPInstruction>(Values[Lane]);
    Column.push_back(Inst->getOperand(OperandIndex));
  }
  return Column;
}

/// Returns true if the opcode of the first instruction in \p Values is
/// commutative. Only the first element is inspected.
static bool areCommutative(ArrayRef<VPValue *> Values) {
  auto *Leader = cast<VPInstruction>(Values[0]);
  const unsigned LeaderOpcode = Leader->getOpcode();
  return Instruction::isCommutative(LeaderOpcode);
}

/// Returns one operand bundle per operand position of the instructions in
/// \p Values. The operand count is taken from the first instruction. For
/// stores only operand 0 is collected; loads must not be passed in.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  auto *Leader = cast<VPInstruction>(Values[0]);
  const unsigned LeaderOpcode = Leader->getOpcode();

  if (LeaderOpcode == Instruction::Load)
    llvm_unreachable("Loads terminate a tree, no need to get operands");

  // Stores contribute a single operand bundle, everything else contributes
  // one bundle per operand.
  const unsigned NumColumns =
      LeaderOpcode == Instruction::Store ? 1 : Leader->getNumOperands();

  SmallVector<SmallVector<VPValue *, 4>, 4> PerOperand;
  for (unsigned Idx = 0; Idx < NumColumns; ++Idx)
    PerOperand.push_back(getOperands(Values, Idx));
  return PerOperand;
}

/// Returns the opcode shared by all instructions in \p Values, or None if
/// they do not all agree.
static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  unsigned Expected = cast<VPInstruction>(Values[0])->getOpcode();
  for (VPValue *Member : Values)
    if (cast<VPInstruction>(Member)->getOpcode() != Expected)
      return None;
  return {Expected};
}

/// Decides whether \p B is a suitable successor of \p A within a bundle.
///
/// Instructions with different opcodes never match. Instructions other than
/// loads and stores match as soon as the opcodes are equal. Loads and stores
/// additionally have to belong to the same interleave group in \p IAI, with
/// the index of \p B being exactly one past the index of \p A.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  const unsigned OpcA = A->getOpcode();
  if (OpcA != B->getOpcode())
    return false;

  const bool IsMemAccess =
      OpcA == Instruction::Load || OpcA == Instruction::Store;
  if (!IsMemAccess)
    return true;

  auto *GroupA = IAI.getInterleaveGroup(A);
  auto *GroupB = IAI.getInterleaveGroup(B);
  if (!GroupA || !GroupB || GroupA != GroupB)
    return false;

  return GroupA->getIndex(A) + 1 == GroupB->getIndex(B);
}

/// Look-ahead score of the pair (\p V1, \p V2), following getLAScore from
/// Listing 7 in the paper.
///
/// At level 0 the score is 1 if the two instructions are consecutive or
/// matching and 0 otherwise. Above level 0 the score is the sum of the scores
/// of all operand pairs, evaluated one level lower. Values that are not
/// VPInstructions score 0.
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
                           VPInterleavedAccessInfo &IAI) {
  // Currently we only support VPInstructions.
  auto *LHS = dyn_cast<VPInstruction>(V1);
  auto *RHS = dyn_cast<VPInstruction>(V2);
  if (!(LHS && RHS))
    return 0;

  if (MaxLevel == 0)
    return areConsecutiveOrMatch(LHS, RHS, IAI) ? 1u : 0u;

  const unsigned NumLHSOps = LHS->getNumOperands();
  const unsigned NumRHSOps = RHS->getNumOperands();
  unsigned Total = 0;
  for (unsigned A = 0; A < NumLHSOps; ++A) {
    VPValue *LHSOp = LHS->getOperand(A);
    for (unsigned B = 0; B < NumRHSOps; ++B)
      Total += getLAScore(LHSOp, RHS->getOperand(B), MaxLevel - 1, IAI);
  }
  return Total;
}

/// Selects from \p Candidates the value that best follows \p Last in an
/// operand lane.
///
/// \param Mode       Matching mode of the lane; must be Load or Opcode.
/// \param Last       The value chosen for the previous lane. May be null if
///                   the previous lane already failed.
/// \param Candidates Values still available for the current lane. When the
///                   look-ahead heuristic picks a winner it is removed from
///                   this set.
/// \param IAI        Interleaved access info used to detect consecutive
///                   memory accesses.
/// \returns {Mode, Value} on success and {OpMode::Failed, nullptr} if no
///          candidate could be selected.
std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << "      getBest\n");

  // Without a predecessor there is nothing to match against. Bail out
  // instead of casting and dereferencing a null pointer below.
  if (!Last)
    return {OpMode::Failed, nullptr};

  auto *LastI = cast<VPInstruction>(Last);

  // First filter: keep only candidates that are consecutive to / match Last.
  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << "        Candidates  for "
                    << *LastI->getUnderlyingInstr() << " ");
  for (auto *Candidate : Candidates) {
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *CandidateI->getUnderlyingInstr() << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  // NOTE(review): unlike the look-ahead path below, a unique match is not
  // erased from Candidates -- confirm this asymmetry is intended.
  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  // Several candidates match: break the tie with the look-ahead score,
  // increasing the depth for as long as all candidates score the same.
  VPValue *Best = nullptr;
  unsigned BestScore = 0;
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    if (!AllSame)
      break;
  }

  // If every candidate scored 0 at every depth, no winner was picked. Report
  // failure rather than dereferencing a null Best in the debug output.
  if (!Best) {
    LLVM_DEBUG(dbgs() << "Found no best candidate\n");
    return {OpMode::Failed, nullptr};
  }

  LLVM_DEBUG(dbgs() << "Found best "
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                    << "\n");
  Candidates.erase(Best);

  return {Mode, Best};
}

/// Re-orders the operand bundles collected in MultiNodeOps lane by lane.
///
/// Lane 0 of every bundle is kept as is. For each following lane, every
/// bundle picks (via getBest) the candidate of that lane that best follows
/// the value it chose for the previous lane. If no candidate can be found for
/// a bundle, a null entry is appended, the analysis is marked as failed and
/// the bundle is skipped for all remaining lanes.
///
/// \returns One entry per element of MultiNodeOps, pairing the placeholder
///          instruction with its re-ordered bundle. Empty if MultiNodeOps is
///          empty.
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  // Nothing to re-order. This also keeps MultiNodeOps[0] below in bounds.
  if (MultiNodeOps.empty())
    return FinalOrder;

  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  // Seed every bundle with its lane-0 value and derive the matching mode
  // from that value's opcode.
  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, NumLanes = MultiNodeOps[0].second.size();
       Lane < NumLanes; ++Lane) {
    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << "  Candidates  ");
    // Iterate by reference: each element owns a SmallVector that would
    // otherwise be copied on every iteration.
    for (const auto &Ops : MultiNodeOps) {
      VPValue *LaneValue = Ops.second[Lane];
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(LaneValue)->getUnderlyingInstr()
                 << " ");
      Candidates.insert(LaneValue);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, NumOps = MultiNodeOps.size(); Op < NumOps; ++Op) {
      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second) {
        FinalOrder[Op].second.push_back(Res.second);
        continue;
      }

      // TODO: handle this case
      FinalOrder[Op].second.push_back(markFailed());
      // Remember the failure so that later lanes do not call getBest with
      // the null entry just appended as their predecessor.
      Mode[Op] = OpMode::Failed;
    }
  }

  return FinalOrder;
}

/// Prints the underlying IR instruction of every member of \p Values to the
/// debug stream. Members that are null or have no underlying instruction are
/// printed as " nullptr".
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  raw_ostream &OS = dbgs();
  OS << " Ops: ";
  for (VPValue *Member : Values) {
    const Instruction *IRInstr = nullptr;
    if (auto *VPI = cast_or_null<VPInstruction>(Member))
      IRInstr = VPI->getUnderlyingInstr();

    if (IRInstr)
      OS << *IRInstr << " | ";
    else
      OS << " nullptr | ";
  }
  OS << "\n";
}

/// Recursively builds the SLP graph node for the bundle \p Values.
///
/// If a node was already created for exactly this bundle it is returned.
/// Otherwise the bundle is checked with areVectorizable, the nodes for its
/// operand bundles are built, and a new VPInstruction combining them is
/// created and registered through addCombined.
///
/// For commutative opcodes, operand bundles whose opcode differs from the
/// opcode of \p Values are not visited immediately. They are recorded in
/// MultiNodeOps behind a placeholder instruction. The outermost commutative
/// invocation (the one that found MultiNodeActive unset) re-orders them with
/// reorderMultiNodeOps and only then builds their nodes.
///
/// \param Values Non-empty bundle of values to combine.
/// \returns The combined instruction, or nullptr (via markFailed) if the
///          bundle, or anything visited so far, could not be SLP'ized.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this means
    // its values have multiple users. We only allow this, if all users of each
    // value are the same instruction.
    for (auto *V : Values) {
      auto UI = V->user_begin();
      // NOTE(review): dereferences the first user unconditionally, i.e. it
      // assumes every value of a re-used bundle has at least one user.
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = getOpcode(Values).getValue();

  // Operands of the combined instruction created at the end of this function.
  SmallVector<VPValue *, 4> CombinedOperands;
  if (areCommutative(Values)) {
    // Only the outermost commutative invocation finds MultiNodeActive unset;
    // it is the one that later re-orders the collected operand bundles.
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << "  Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
        // Create a dummy VPInstruction, which we will replace later by the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      // FinalOrder holds its own copy of the bundles, so the member list can
      // be cleared before the recursive calls below.
      MultiNodeOps.clear();
      for (auto &Ops : FinalOrder) {
        // Build the node for the re-ordered bundle and substitute it for the
        // placeholder, both in its users and in our own operand list.
        // NOTE(review): NewOp is null when the recursive call fails; confirm
        // that replaceAllUsesWith tolerates a null replacement.
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        // The placeholder was allocated with new above and is released here.
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << "  NonCommuntative\n");
    // Loads end the recursion: operand 0 of each load is used directly as an
    // operand of the combined instruction. Everything else recurses into its
    // operand bundles.
    if (ValuesOpcode == Instruction::Load)
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

  // Loads and stores map to the dedicated SLP opcodes; all other opcodes are
  // kept unchanged.
  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  // A failure anywhere during the analysis so far (CompletelySLP is only ever
  // cleared by markFailed) means no node is created for this bundle.
  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need more some operands");
  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
  // The combined instruction takes its underlying IR instruction from the
  // first value of the bundle.
  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
                    << *cast<VPInstruction>(Values[0]) << "\n");
  addCombined(Values, VPI);
  return VPI;
}
first commit 2022-04-25 10:02:23 +02:00			`//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`
			`/// This file implements SLP analysis based on VPlan. The analysis is based on`
			`/// the ideas described in`
			`///`
			`/// Look-ahead SLP: auto-vectorization in the presence of commutative`
			`/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,`
			`/// Luís F. W. Góes`
			`///`
			`//===----------------------------------------------------------------------===//`

			`#include "VPlan.h"`
			`#include "llvm/ADT/DepthFirstIterator.h"`
			`#include "llvm/ADT/PostOrderIterator.h"`
			`#include "llvm/ADT/SmallVector.h"`
			`#include "llvm/ADT/Twine.h"`
			`#include "llvm/Analysis/LoopInfo.h"`
			`#include "llvm/Analysis/VectorUtils.h"`
			`#include "llvm/IR/BasicBlock.h"`
			`#include "llvm/IR/CFG.h"`
			`#include "llvm/IR/Dominators.h"`
			`#include "llvm/IR/InstrTypes.h"`
			`#include "llvm/IR/Instruction.h"`
			`#include "llvm/IR/Instructions.h"`
			`#include "llvm/IR/Type.h"`
			`#include "llvm/IR/Value.h"`
			`#include "llvm/Support/Casting.h"`
			`#include "llvm/Support/Debug.h"`
			`#include "llvm/Support/ErrorHandling.h"`
			`#include "llvm/Support/GraphWriter.h"`
			`#include "llvm/Support/raw_ostream.h"`
			`#include "llvm/Transforms/Utils/BasicBlockUtils.h"`
			`#include <cassert>`
			`#include <iterator>`
			`#include <string>`
			`#include <vector>`

			`using namespace llvm;`

			`#define DEBUG_TYPE "vplan-slp"`

			`// Number of levels to look ahead when re-ordering multi node operands.`
			`static unsigned LookaheadMaxDepth = 5;`

			`VPInstruction *VPlanSlp::markFailed() {`
			`// FIXME: Currently this is used to signal we hit instructions we cannot`
			`// trivially SLP'ize.`
			`CompletelySLP = false;`
			`return nullptr;`
			`}`

			`void VPlanSlp::addCombined(ArrayRef<VPValue > Operands, VPInstruction New) {`
			`if (all_of(Operands, [](VPValue *V) {`
			`return cast<VPInstruction>(V)->getUnderlyingInstr();`
			`})) {`
			`unsigned BundleSize = 0;`
			`for (VPValue *V : Operands) {`
			`Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();`
			`assert(!T->isVectorTy() && "Only scalar types supported for now");`
			`BundleSize += T->getScalarSizeInBits();`
			`}`
			`WidestBundleBits = std::max(WidestBundleBits, BundleSize);`
			`}`

			`auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);`
			`assert(Res.second &&`
			`"Already created a combined instruction for the operand bundle");`
			`(void)Res;`
			`}`

			`bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {`
			`// Currently we only support VPInstructions.`
			`if (!all_of(Operands, [](VPValue *Op) {`
			`return Op && isa<VPInstruction>(Op) &&`
			`cast<VPInstruction>(Op)->getUnderlyingInstr();`
			`})) {`
			`LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");`
			`return false;`
			`}`

			`// Check if opcodes and type width agree for all instructions in the bundle.`
			`// FIXME: Differing widths/opcodes can be handled by inserting additional`
			`// instructions.`
			`// FIXME: Deal with non-primitive types.`
			`const Instruction *OriginalInstr =`
			`cast<VPInstruction>(Operands[0])->getUnderlyingInstr();`
			`unsigned Opcode = OriginalInstr->getOpcode();`
			`unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();`
			`if (!all_of(Operands, [Opcode, Width](VPValue *Op) {`
			`const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();`
			`return I->getOpcode() == Opcode &&`
			`I->getType()->getPrimitiveSizeInBits() == Width;`
			`})) {`
			`LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");`
			`return false;`
			`}`

			`// For now, all operands must be defined in the same BB.`
			`if (any_of(Operands, [this](VPValue *Op) {`
			`return cast<VPInstruction>(Op)->getParent() != &this->BB;`
			`})) {`
			`LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");`
			`return false;`
			`}`

			`if (any_of(Operands,`
			`[](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {`
			`LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");`
			`return false;`
			`}`

			`// For loads, check that there are no instructions writing to memory in`
			`// between them.`
			`// TODO: we only have to forbid instructions writing to memory that could`
			`// interfere with any of the loads in the bundle`
			`if (Opcode == Instruction::Load) {`
			`unsigned LoadsSeen = 0;`
			`VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();`
			`for (auto &I : *Parent) {`
			`auto *VPI = cast<VPInstruction>(&I);`
			`if (VPI->getOpcode() == Instruction::Load &&`
			`llvm::is_contained(Operands, VPI))`
			`LoadsSeen++;`

			`if (LoadsSeen == Operands.size())`
			`break;`
			`if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {`
			`LLVM_DEBUG(`
			`dbgs() << "VPSLP: instruction modifying memory between loads\n");`
			`return false;`
			`}`
			`}`

			`if (!all_of(Operands, [](VPValue *Op) {`
			`return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())`
			`->isSimple();`
			`})) {`
			`LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");`
			`return false;`
			`}`
			`}`

			`if (Opcode == Instruction::Store)`
			`if (!all_of(Operands, [](VPValue *Op) {`
			`return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())`
			`->isSimple();`
			`})) {`
			`LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");`
			`return false;`
			`}`

			`return true;`
			`}`

			`static SmallVector<VPValue , 4> getOperands(ArrayRef<VPValue > Values,`
			`unsigned OperandIndex) {`
			`SmallVector<VPValue *, 4> Operands;`
			`for (VPValue *V : Values) {`
			`// Currently we only support VPInstructions.`
			`auto *U = cast<VPInstruction>(V);`
			`Operands.push_back(U->getOperand(OperandIndex));`
			`}`
			`return Operands;`
			`}`

			`static bool areCommutative(ArrayRef<VPValue *> Values) {`
			`return Instruction::isCommutative(`
			`cast<VPInstruction>(Values[0])->getOpcode());`
			`}`

			`static SmallVector<SmallVector<VPValue *, 4>, 4>`
			`getOperands(ArrayRef<VPValue *> Values) {`
			`SmallVector<SmallVector<VPValue *, 4>, 4> Result;`
			`auto *VPI = cast<VPInstruction>(Values[0]);`

			`switch (VPI->getOpcode()) {`
			`case Instruction::Load:`
			`llvm_unreachable("Loads terminate a tree, no need to get operands");`
			`case Instruction::Store:`
			`Result.push_back(getOperands(Values, 0));`
			`break;`
			`default:`
			`for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)`
			`Result.push_back(getOperands(Values, I));`
			`break;`
			`}`

			`return Result;`
			`}`

			`/// Returns the opcode of Values or ~0 if they do not all agree.`
			`static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {`
			`unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();`
			`if (any_of(Values, [Opcode](VPValue *V) {`
			`return cast<VPInstruction>(V)->getOpcode() != Opcode;`
			`}))`
			`return None;`
			`return {Opcode};`
			`}`

			`/// Returns true if A and B access sequential memory if they are loads or`
			`/// stores or if they have identical opcodes otherwise.`
			`static bool areConsecutiveOrMatch(VPInstruction A, VPInstruction B,`
			`VPInterleavedAccessInfo &IAI) {`
			`if (A->getOpcode() != B->getOpcode())`
			`return false;`

			`if (A->getOpcode() != Instruction::Load &&`
			`A->getOpcode() != Instruction::Store)`
			`return true;`
			`auto *GA = IAI.getInterleaveGroup(A);`
			`auto *GB = IAI.getInterleaveGroup(B);`

			`return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);`
			`}`

			`/// Implements getLAScore from Listing 7 in the paper.`
			`/// Traverses and compares operands of V1 and V2 to MaxLevel.`
			`static unsigned getLAScore(VPValue V1, VPValue V2, unsigned MaxLevel,`
			`VPInterleavedAccessInfo &IAI) {`
			`auto *I1 = dyn_cast<VPInstruction>(V1);`
			`auto *I2 = dyn_cast<VPInstruction>(V2);`
			`// Currently we only support VPInstructions.`
			`if (!I1 \|\| !I2)`
			`return 0;`

			`if (MaxLevel == 0)`
			`return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);`

			`unsigned Score = 0;`
			`for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)`
			`for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)`
			`Score +=`
			`getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);`
			`return Score;`
			`}`

			`std::pair<VPlanSlp::OpMode, VPValue *>`
			`VPlanSlp::getBest(OpMode Mode, VPValue *Last,`
			`SmallPtrSetImpl<VPValue *> &Candidates,`
			`VPInterleavedAccessInfo &IAI) {`
			`assert((Mode == OpMode::Load \|\| Mode == OpMode::Opcode) &&`
			`"Currently we only handle load and commutative opcodes");`
			`LLVM_DEBUG(dbgs() << " getBest\n");`

			`SmallVector<VPValue *, 4> BestCandidates;`
			`LLVM_DEBUG(dbgs() << " Candidates for "`
			`<< *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");`
			`for (auto *Candidate : Candidates) {`
			`auto *LastI = cast<VPInstruction>(Last);`
			`auto *CandidateI = cast<VPInstruction>(Candidate);`
			`if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {`
			`LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()`
			`<< " ");`
			`BestCandidates.push_back(Candidate);`
			`}`
			`}`
			`LLVM_DEBUG(dbgs() << "\n");`

			`if (BestCandidates.empty())`
			`return {OpMode::Failed, nullptr};`

			`if (BestCandidates.size() == 1)`
			`return {Mode, BestCandidates[0]};`

			`VPValue *Best = nullptr;`
			`unsigned BestScore = 0;`
			`for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {`
			`unsigned PrevScore = ~0u;`
			`bool AllSame = true;`

			`// FIXME: Avoid visiting the same operands multiple times.`
			`for (auto *Candidate : BestCandidates) {`
			`unsigned Score = getLAScore(Last, Candidate, Depth, IAI);`
			`if (PrevScore == ~0u)`
			`PrevScore = Score;`
			`if (PrevScore != Score)`
			`AllSame = false;`
			`PrevScore = Score;`

			`if (Score > BestScore) {`
			`BestScore = Score;`
			`Best = Candidate;`
			`}`
			`}`
			`if (!AllSame)`
			`break;`
			`}`
			`LLVM_DEBUG(dbgs() << "Found best "`
			`<< *cast<VPInstruction>(Best)->getUnderlyingInstr()`
			`<< "\n");`
			`Candidates.erase(Best);`

			`return {Mode, Best};`
			`}`

			`SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {`
			`SmallVector<MultiNodeOpTy, 4> FinalOrder;`
			`SmallVector<OpMode, 4> Mode;`
			`FinalOrder.reserve(MultiNodeOps.size());`
			`Mode.reserve(MultiNodeOps.size());`

			`LLVM_DEBUG(dbgs() << "Reordering multinode\n");`

			`for (auto &Operands : MultiNodeOps) {`
			`FinalOrder.push_back({Operands.first, {Operands.second[0]}});`
			`if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==`
			`Instruction::Load)`
			`Mode.push_back(OpMode::Load);`
			`else`
			`Mode.push_back(OpMode::Opcode);`
			`}`

			`for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {`
			`LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");`
			`SmallPtrSet<VPValue *, 4> Candidates;`
			`LLVM_DEBUG(dbgs() << " Candidates ");`
			`for (auto Ops : MultiNodeOps) {`
			`LLVM_DEBUG(`
			`dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()`
			`<< " ");`
			`Candidates.insert(Ops.second[Lane]);`
			`}`
			`LLVM_DEBUG(dbgs() << "\n");`

			`for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {`
			`LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");`
			`if (Mode[Op] == OpMode::Failed)`
			`continue;`

			`VPValue *Last = FinalOrder[Op].second[Lane - 1];`
			`std::pair<OpMode, VPValue *> Res =`
			`getBest(Mode[Op], Last, Candidates, IAI);`
			`if (Res.second)`
			`FinalOrder[Op].second.push_back(Res.second);`
			`else`
			`// TODO: handle this case`
			`FinalOrder[Op].second.push_back(markFailed());`
			`}`
			`}`

			`return FinalOrder;`
			`}`

			`void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {`
			`dbgs() << " Ops: ";`
			`for (auto Op : Values) {`
			`if (auto *VPInstr = cast_or_null<VPInstruction>(Op))`
			`if (auto *Instr = VPInstr->getUnderlyingInstr()) {`
			`dbgs() << *Instr << " \| ";`
			`continue;`
			`}`
			`dbgs() << " nullptr \| ";`
			`}`
			`dbgs() << "\n";`
			`}`

			`VPInstruction VPlanSlp::buildGraph(ArrayRef<VPValue > Values) {`
			`assert(!Values.empty() && "Need some operands!");`

			`// If we already visited this instruction bundle, re-use the existing node`
			`auto I = BundleToCombined.find(to_vector<4>(Values));`
			`if (I != BundleToCombined.end()) {`
			`#ifndef NDEBUG`
			`// Check that the resulting graph is a tree. If we re-use a node, this means`
			`// its values have multiple users. We only allow this, if all users of each`
			`// value are the same instruction.`
			`for (auto *V : Values) {`
			`auto UI = V->user_begin();`
			`auto FirstUser = UI++;`
			`while (UI != V->user_end()) {`
			`assert(*UI == FirstUser && "Currently we only support SLP trees.");`
			`UI++;`
			`}`
			`}`
			`#endif`
			`return I->second;`
			`}`

			`// Dump inputs`
			`LLVM_DEBUG({`
			`dbgs() << "buildGraph: ";`
			`dumpBundle(Values);`
			`});`

			`if (!areVectorizable(Values))`
			`return markFailed();`

			`assert(getOpcode(Values) && "Opcodes for all values must match");`
			`unsigned ValuesOpcode = getOpcode(Values).getValue();`

			`SmallVector<VPValue *, 4> CombinedOperands;`
			`if (areCommutative(Values)) {`
			`bool MultiNodeRoot = !MultiNodeActive;`
			`MultiNodeActive = true;`
			`for (auto &Operands : getOperands(Values)) {`
			`LLVM_DEBUG({`
			`dbgs() << " Visiting Commutative";`
			`dumpBundle(Operands);`
			`});`

			`auto OperandsOpcode = getOpcode(Operands);`
			`if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {`
			`LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");`
			`CombinedOperands.push_back(buildGraph(Operands));`
			`} else {`
			`LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");`
			`// Create dummy VPInstruction, which will we replace later by the`
			`// re-ordered operand.`
			`VPInstruction *Op = new VPInstruction(0, {});`
			`CombinedOperands.push_back(Op);`
			`MultiNodeOps.emplace_back(Op, Operands);`
			`}`
			`}`

			`if (MultiNodeRoot) {`
			`LLVM_DEBUG(dbgs() << "Reorder \n");`
			`MultiNodeActive = false;`

			`auto FinalOrder = reorderMultiNodeOps();`

			`MultiNodeOps.clear();`
			`for (auto &Ops : FinalOrder) {`
			`VPInstruction *NewOp = buildGraph(Ops.second);`
			`Ops.first->replaceAllUsesWith(NewOp);`
			`for (unsigned i = 0; i < CombinedOperands.size(); i++)`
			`if (CombinedOperands[i] == Ops.first)`
			`CombinedOperands[i] = NewOp;`
			`delete Ops.first;`
			`Ops.first = NewOp;`
			`}`
			`LLVM_DEBUG(dbgs() << "Found final order\n");`
			`}`
			`} else {`
			`LLVM_DEBUG(dbgs() << " NonCommuntative\n");`
			`if (ValuesOpcode == Instruction::Load)`
			`for (VPValue *V : Values)`
			`CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));`
			`else`
			`for (auto &Operands : getOperands(Values))`
			`CombinedOperands.push_back(buildGraph(Operands));`
			`}`

			`unsigned Opcode;`
			`switch (ValuesOpcode) {`
			`case Instruction::Load:`
			`Opcode = VPInstruction::SLPLoad;`
			`break;`
			`case Instruction::Store:`
			`Opcode = VPInstruction::SLPStore;`
			`break;`
			`default:`
			`Opcode = ValuesOpcode;`
			`break;`
			`}`

			`if (!CompletelySLP)`
			`return markFailed();`

			`assert(CombinedOperands.size() > 0 && "Need more some operands");`
			`auto *VPI = new VPInstruction(Opcode, CombinedOperands);`
			`VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());`

			`LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "`
			`<< *cast<VPInstruction>(Values[0]) << "\n");`
			`addCombined(Values, VPI);`
			`return VPI;`
			`}`