reference, declarationdefinition
definition → references, declarations, derived classes, virtual overrides
reference to multiple definitions → definitions
unreferenced
    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This post-linking pass replaces the function pointer of enqueued
// block kernel with a global variable (runtime handle) and adds
// "runtime-handle" attribute to the enqueued block kernel.
//
// In LLVM CodeGen the runtime-handle metadata will be translated to
// RuntimeHandle metadata in code object. Runtime allocates a global buffer
// for each kernel with RuntimeHandel metadata and saves the kernel address
// required for the AQL packet into the buffer. __enqueue_kernel function
// in device library knows that the invoke function pointer in the block
// literal is actually runtime handle and loads the kernel address from it
// and put it into AQL packet for dispatching.
//
// This cannot be done in FE since FE cannot create a unique global variable
// with external linkage across LLVM modules. The global variable with internal
// linkage does not work since optimization passes will try to replace loads
// of the global variable with its initialization value.
//
// It also identifies the kernels directly or indirectly enqueues kernels
// and adds "calls-enqueue-kernel" function attribute to them, which will
// be used to determine whether to emit runtime metadata for the kernel
// enqueue related hidden kernel arguments.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-lower-enqueued-block"

using namespace llvm;

namespace {

/// Lower enqueued blocks.
class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
public:
  static char ID;

  explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}

private:
  bool runOnModule(Module &M) override;
};

} // end anonymous namespace

char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;

char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
    AMDGPUOpenCLEnqueuedBlockLowering::ID;

INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
                "Lower OpenCL enqueued blocks", false, false)

ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
  return new AMDGPUOpenCLEnqueuedBlockLowering();
}

/// Collect direct or indrect callers of \p F and save them
/// to \p Callers.
static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
  for (auto U : F->users()) {
    if (auto *CI = dyn_cast<CallInst>(&*U)) {
      auto *Caller = CI->getParent()->getParent();
      if (Callers.insert(Caller).second)
        collectCallers(Caller, Callers);
    }
  }
}

/// If \p U is instruction or constant, collect functions which directly or
/// indirectly use it.
static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) {
  if (auto *I = dyn_cast<Instruction>(U)) {
    auto *F = I->getParent()->getParent();
    if (Funcs.insert(F).second)
      collectCallers(F, Funcs);
    return;
  }
  if (!isa<Constant>(U))
    return;
  for (auto UU : U->users())
    collectFunctionUsers(&*UU, Funcs);
}

bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
  DenseSet<Function *> Callers;
  auto &C = M.getContext();
  bool Changed = false;
  for (auto &F : M.functions()) {
    if (F.hasFnAttribute("enqueued-block")) {
      if (!F.hasName()) {
        SmallString<64> Name;
        Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
                                   M.getDataLayout());
        F.setName(Name);
      }
      LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
      auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
      auto T = ArrayType::get(Type::getInt64Ty(C), 2);
      auto *GV = new GlobalVariable(
          M, T,
          /*isConstant=*/false, GlobalValue::ExternalLinkage,
          /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
          /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
          AMDGPUAS::GLOBAL_ADDRESS,
          /*isExternallyInitialized=*/false);
      LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');

      for (auto U : F.users()) {
        auto *UU = &*U;
        if (!isa<ConstantExpr>(UU))
          continue;
        collectFunctionUsers(UU, Callers);
        auto *BitCast = cast<ConstantExpr>(UU);
        auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
        BitCast->replaceAllUsesWith(NewPtr);
        F.addFnAttr("runtime-handle", RuntimeHandle);
        F.setLinkage(GlobalValue::ExternalLinkage);
        Changed = true;
      }
    }
  }

  for (auto F : Callers) {
    if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;
    F->addFnAttr("calls-enqueue-kernel");
    LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
  }
  return Changed;
}