auto模式示例

范围

这个文档列出了一些使用manual模式和auto模式对比的kernel用例。

TADD


#include <pto/pto-inst.hpp>
#include <pto/common/constants.hpp>

using namespace pto;

AICORE void runTAdd(__gm__ float __out__ *out, __gm__ float __in__ *src0, __gm__ float __in__ *src1) {
    using DynShapeDim5 = Shape<1, 1, 1, 64, 64>;
    using DynStridDim5 = Stride<1, 1, 1, 64, 1>;
    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
    using TileData = Tile<TileType::Vec, float, 64, 64, BLayout::RowMajor, 64, 64>;
    TileData src0Tile(64, 64);
    TileData src1Tile(64, 64);
    TileData dstTile(64, 64);

    GlobalData src0Global(src0);
    GlobalData src1Global(src1);
    GlobalData dstGlobal(out);

    TLOAD(src0Tile, src0Global);
    TLOAD(src1Tile, src1Global);

    TADD(dstTile, src0Tile, src1Tile);

    TSTORE(dstGlobal, dstTile);
}

然而在manual模式看上去像这样:


#include <pto/pto-inst.hpp>
#include <pto/common/constants.hpp>

using namespace pto;

AICORE void runTAdd(__gm__ float __out__ *out, __gm__ float __in__ *src0, __gm__ float __in__ *src1) {
    using DynShapeDim5 = Shape<1, 1, 1, 64, 64>;
    using DynStridDim5 = Stride<1, 1, 1, 64, 1>;
    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
    using TileData = Tile<TileType::Vec, float, 64, 64, BLayout::RowMajor, 64, 64>;
    TileData src0Tile(64, 64);
    TileData src1Tile(64, 64);
    TileData dstTile(64, 64);

    /* TAssign only in manual mode */
    TASSIGN(src0Tile, 0x0);
    TASSIGN(src1Tile, 0x10000);
    TASSIGN(dstTile, 0x20000);

    GlobalData src0Global(src0);
    GlobalData src1Global(src1);
    GlobalData dstGlobal(out);

    /* event model only in manual mode */
    Event<Op::TLOAD, Op::TADD> event0;
    Event<Op::TADD, Op::TSTORE_VEC> event1;

    TLOAD(src0Tile, src0Global);
    event0 = TLOAD(src1Tile, src1Global);
    event1 = TADD(dstTile, src0Tile, src1Tile, event0);
    TSTORE(dstGlobal, dstTile, event1);
}

TMATMUL


#include <pto/pto-inst.hpp>
#include <pto/common/constants.hpp>

using namespace pto;

template <typename cType, typename aType, typename bType, typename fbType, typename l0cType, int M, int K, int N,
          int ValidM, int ValidK, int ValidN>
__global__ AICORE void runTMatMul(__gm__ cType *out, __gm__ aType *src0, __gm__ bType *src1, __gm__ fbType *src2)
{
    using GlobalDataSrc0 = GlobalTensor<aType, pto::Shape<1, 1, 1, ValidM, ValidK>,
                                        pto::Stride<ValidM * ValidK, ValidM * ValidK, ValidM * ValidK, ValidK, 1>>;
    using GlobalDataSrc1 = GlobalTensor<bType, pto::Shape<1, 1, 1, ValidK, ValidN>,
                                        pto::Stride<ValidK * ValidN, ValidK * ValidN, ValidK * ValidN, ValidN, 1>>;
    using GlobalDataSrc2 =
        GlobalTensor<fbType, pto::Shape<1, 1, 1, 1, ValidN>, pto::Stride<ValidN, ValidN, ValidN, ValidN, 1>>;
    using GlobalDataOut = GlobalTensor<cType, pto::Shape<1, 1, 1, ValidM, ValidN>,
                                       pto::Stride<ValidM * ValidN, ValidM * ValidN, ValidM * ValidN, ValidN, 1>>;

    GlobalDataSrc0 src0Global(src0);
    GlobalDataSrc1 src1Global(src1);
    GlobalDataSrc2 src2Global(src2);
    GlobalDataOut dstGlobal(out);

    using TileMatAData = Tile<TileType::Mat, aType, M, K, BLayout::ColMajor, ValidM, ValidK, SLayout::RowMajor, 512>;
    using TileMatBData = Tile<TileType::Mat, bType, K, N, BLayout::ColMajor, ValidK, ValidN, SLayout::RowMajor, 512>;
    using TileMatFbData = Tile<TileType::Mat, fbType, 1, N, BLayout::RowMajor, 1, ValidN, SLayout::NoneBox>;

    using LeftTile = TileLeft<aType, M, K, ValidM, ValidK>;
    using RightTile = TileRight<bType, K, N, ValidK, ValidN>;
    using AccTile = TileAcc<l0cType, M, N, ValidM, ValidN>;

    using FbTile = Tile<TileType::Scaling, fbType, 1, N, BLayout::RowMajor, 1, ValidN, SLayout::NoneBox>;

    TileMatAData aMatTile;
    TileMatBData bMatTile;
    TileMatFbData fbMatTile;

    LeftTile aTile;
    RightTile bTile;
    AccTile cTile;
    FbTile fbTile;

    TLOAD(aMatTile, src0Global);
    TLOAD(bMatTile, src1Global);
    TLOAD(fbMatTile, src2Global);

    /**************************TMOV & TMATMUL**************************/
    TMOV(aTile, aMatTile);
    TMOV(bTile, bMatTile);
    TMATMUL(cTile, aTile, bTile);
    TMOV(fbTile, fbMatTile);

    /********************************TSTORE****************************/
    TSTORE_FP<AccTile, GlobalDataOut, FbTile>(dstGlobal, cTile, fbTile);
}

在manual模式下:


#include <pto/pto-inst.hpp>
#include <pto/common/constants.hpp>

using namespace pto;

template <typename cType, typename aType, typename bType, typename fbType, typename l0cType, int M, int K, int N,
          int ValidM, int ValidK, int ValidN>
__global__ AICORE void runTMatMul(__gm__ cType *out, __gm__ aType *src0, __gm__ bType *src1, __gm__ fbType *src2)
{
    using GlobalDataSrc0 = GlobalTensor<aType, pto::Shape<1, 1, 1, ValidM, ValidK>,
                                        pto::Stride<ValidM * ValidK, ValidM * ValidK, ValidM * ValidK, ValidK, 1>>;
    using GlobalDataSrc1 = GlobalTensor<bType, pto::Shape<1, 1, 1, ValidK, ValidN>,
                                        pto::Stride<ValidK * ValidN, ValidK * ValidN, ValidK * ValidN, ValidN, 1>>;
    using GlobalDataSrc2 =
        GlobalTensor<fbType, pto::Shape<1, 1, 1, 1, ValidN>, pto::Stride<ValidN, ValidN, ValidN, ValidN, 1>>;
    using GlobalDataOut = GlobalTensor<cType, pto::Shape<1, 1, 1, ValidM, ValidN>,
                                       pto::Stride<ValidM * ValidN, ValidM * ValidN, ValidM * ValidN, ValidN, 1>>;

    GlobalDataSrc0 src0Global(src0);
    GlobalDataSrc1 src1Global(src1);
    GlobalDataSrc2 src2Global(src2);
    GlobalDataOut dstGlobal(out);

    using TileMatAData = Tile<TileType::Mat, aType, M, K, BLayout::ColMajor, ValidM, ValidK, SLayout::RowMajor, 512>;
    using TileMatBData = Tile<TileType::Mat, bType, K, N, BLayout::ColMajor, ValidK, ValidN, SLayout::RowMajor, 512>;
    using TileMatFbData = Tile<TileType::Mat, fbType, 1, N, BLayout::RowMajor, 1, ValidN, SLayout::NoneBox>;

    using LeftTile = TileLeft<aType, M, K, ValidM, ValidK>;
    using RightTile = TileRight<bType, K, N, ValidK, ValidN>;
    using AccTile = TileAcc<l0cType, M, N, ValidM, ValidN>;

    using FbTile = Tile<TileType::Scaling, fbType, 1, N, BLayout::RowMajor, 1, ValidN, SLayout::NoneBox>;

    TileMatAData aMatTile;
    TileMatBData bMatTile;
    TileMatFbData fbMatTile;

    /* TAssign only in manual mode */
    TASSIGN(aMatTile, 0x0);
    TASSIGN(bMatTile, 0x10000);
    TASSIGN(fbMatTile, 0x20000);

    LeftTile aTile;
    RightTile bTile;
    AccTile cTile;
    FbTile fbTile;

    /* TAssign only in manual mode */
    TASSIGN(aTile, 0x0);
    TASSIGN(bTile, 0x0);
    TASSIGN(cTile, 0x0);
    TASSIGN(fbTile, 0x0);

    /* event model only in manual mode */
    Event<Op::TLOAD, Op::TMOV_M2L> evtLoad_Mov;
    Event<Op::TMOV_M2B, Op::TMATMUL> evtMov_Matmul;
    Event<Op::TMATMUL, Op::TMOV_M2S> evtMatmul_MovM2s;

    TLOAD(aMatTile, src0Global);
    TLOAD(bMatTile, src1Global);
    evtLoad_Mov = TLOAD(fbMatTile, src2Global);

    /**************************TMOV & TMATMUL**************************/
    TMOV(aTile, aMatTile, evtLoad_Mov);
    evtMov_Matmul = TMOV(bTile, bMatTile);
    evtMatmul_MovM2s = TMATMUL(cTile, aTile, bTile, evtMov_Matmul);
    TMOV(fbTile, fbMatTile, evtMatmul_MovM2s);

    /********************************TSTORE****************************/
    TSTORE_FP<AccTile, GlobalDataOut, FbTile>(dstGlobal, cTile, fbTile);
}