////////////////////////////////////////////////////////////////////////////////

// Primary template; all parameters except InstructionShape_ are elided in
// this excerpt.
template </* ... */ typename InstructionShape_, /* ... */>
class MmaVoltaTensorOpMultiplicandTileIterator;

////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for operand A multiplicands held in the congruous
/// shared-memory arrangement, 32 threads per warp.
template <typename Shape_, typename Element_, typename InstructionShape_,
          int OpDelta_>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand::kA, Element_,
    cutlass::layout::VoltaTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value>,
    InstructionShape_, OpDelta_, 32> {
 public:

  /// Shape of tile to load (concept: PitchLinearShape)
  using Shape = Shape_;

  /// Element type
  using Element = Element_;

  /// Layout of source tile
  using Layout = cutlass::layout::VoltaTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value>;

  /// Shape of one matrix product operation (concept: GemmShape)
  using InstructionShape = InstructionShape_;

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// TensorRef type for loading element from a tensor
  using TensorRef = TensorRef<Element, Layout>;

  /// Index type
  using Index = typename TensorRef::Index;

  /// Long Index type
  using LongIndex = typename TensorRef::LongIndex;

  /// Coordinate for an element in the tensor
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {
    static_assert(
        !(Shape::kContiguous % InstructionShape::kContiguous),
        "Shape of warp-level Mma must be divisible by operator shape.");

    // ... (LdsShape, the shape of one LDS instruction, elided)

    /// Number and arrangement of LDS instructions
    using LdsIterations = layout::PitchLinearShape<
        InstructionShape::kStrided / LdsShape::kStrided,
        Shape::kContiguous / LdsShape::kContiguous>;
  };

 private:

  static_assert(kOpDelta == 1,
      "Alternative arrangements not supported at present.");

  /// Pointer type used for accesses
  using AccessType = AlignedArray<Element, Layout::kElementsPerAccess>;

  /// Number of internal pointers needed to track the iterator
  static int const kPointerCount = 2;

 public:

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

 private:

  /// Internal pointers; the two swizzle patterns alternate between strided steps
  AccessType const *pointer_[kPointerCount];

  /// Stride in units of AccessType
  Index stride_;

  /// Byte offset into the shared-memory tile
  Index byte_offset_;

 public:

  /// Default ctor constructs null iterator
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator() { }

  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ):
    stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    // Decompose the lane index: tid[4] selects the vector row, tid[2] the
    // vector column.
    int vec_row = (lane_id >> 4);        // tid[4]
    int vec_col = ((lane_id & 4) >> 2);  // tid[2]

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kPointerCount; ++i) {

      if (i == 1) {
        vec_row |= 2;
      }

      int access_contiguous_idx = (vec_col << 2) | ((lane_id & 3) ^ vec_row);
      int access_contiguous = access_contiguous_idx;
      int access_strided = vec_row;

      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
                    access_contiguous + access_strided * stride_;
    }
  }
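  // Note (annotation, not part of the original header): XOR-ing the low lane
  // bits with the vector row staggers each lane group's starting column, the
  // usual trick for keeping a warp's LDS accesses in distinct shared-memory
  // banks; the second pointer covers the companion rows at vec_row | 2.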
  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    byte_offset_ += offset * sizeof(Element);
    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    int contiguous_offset = tile_offset.contiguous();
    int strided_offset = tile_offset.strided();

    // An odd contiguous tile offset lands on the other swizzle pattern, so the
    // two internal pointers trade places.
    if (Shape::kContiguous == Policy::LdsShape::kContiguous) {
      if (contiguous_offset % 2) {
        AccessType const *tmp_pointer = pointer_[0];
        pointer_[0] = pointer_[1];
        pointer_[1] = tmp_pointer;
      }
      contiguous_offset = contiguous_offset / 2;
    }

    int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
                     Layout::kElementsPerAccess +
                 contiguous_offset * Shape::kContiguous;

    add_pointer_offset(offset);

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
    byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;
    return *this;
  }

  /// Moves the iterator backwards along the advance dimension
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator--() {
    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;
    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_byte_offset(frag, 0);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      Index byte_offset) const {

    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {

        int access_idx = c + s * Policy::LdsIterations::kContiguous;

        // Alternate between the two pointers from one strided step to the next.
        AccessType const *source_ptr = pointer_[s & 1] +
          Policy::LdsShape::kContiguous * c +
          Policy::LdsShape::kStrided * (s / 2) * stride_;

        char const *source_byte_ptr =
            reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;

        fetch_ptr[access_idx] =
            *(reinterpret_cast<AccessType const *>(source_byte_ptr));
      }
    }
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      Fragment &frag,
      Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(
      Fragment &frag,
      TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {

    Index pointer_offset =
        tile_offset.contiguous() * Shape::kContiguous /
            Layout::kElementsPerAccess +
        tile_offset.strided() * InstructionShape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    // no-op: the congruous arrangement carries no k-group state
  }
};
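////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not part of the original header): a host-side
// reproduction of the operand-A lane mapping computed by the constructor
// above. Only the arithmetic shown there is assumed; `stride` is a
// caller-chosen tile stride in AccessType units.

#include <cstdio>

inline void print_operand_a_lane_mapping(int stride) {
  for (int lane_id = 0; lane_id < 32; ++lane_id) {
    int vec_col = (lane_id & 4) >> 2;
    for (int i = 0; i < 2; ++i) {                  // kPointerCount == 2
      int vec_row = (lane_id >> 4) | (i == 1 ? 2 : 0);
      int access_contiguous = (vec_col << 2) | ((lane_id & 3) ^ vec_row);
      int access_strided = vec_row;
      std::printf("lane %2d ptr%d -> offset %3d (contiguous %2d, strided %d)\n",
                  lane_id, i, access_contiguous + access_strided * stride,
                  access_contiguous, access_strided);
    }
  }
}

////////////////////////////////////////////////////////////////////////////////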
////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for operand B multiplicands held in the congruous
/// shared-memory arrangement, 32 threads per warp.
template <typename Shape_, typename Element_, typename InstructionShape_,
          int OpDelta_>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand::kB, Element_,
    cutlass::layout::VoltaTensorOpMultiplicandBCongruous<
        sizeof_bits<Element_>::value>,
    InstructionShape_, OpDelta_, 32> {
 public:

  // ... (Shape/Element/Layout/TensorRef/Index/LongIndex/TensorCoord typedefs
  //      as in the operand A specialization)

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {
    static_assert(
        !(Shape::kContiguous % InstructionShape::kContiguous),
        "Shape of warp-level Mma must be divisible by operator shape.");

    // ... (LdsShape elided)

    /// Number and arrangement of LDS instructions; the axes are swapped
    /// relative to the operand A specialization.
    using LdsIterations = layout::PitchLinearShape<
        Shape::kContiguous / LdsShape::kContiguous,
        InstructionShape::kStrided / LdsShape::kStrided>;
  };

 private:

  static_assert(kOpDelta == 1,
      "Alternative arrangements not supported at present.");

  // ... (AccessType and Fragment as in the operand A specialization; a single
  //      pointer_ member replaces the pointer pair)

 public:

  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ):
    stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    // Strided index comes from tid[4:3]; the contiguous index XORs tid[4:3]
    // into tid[1:0].
    int access_strided = (lane_id >> 3) & 0x3;
    int access_contiguous = ((lane_id ^ (lane_id >> 3)) & 0x3);

    pointer_ = reinterpret_cast<AccessType const *>(ref.data()) +
               access_contiguous + access_strided * stride_;
  }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    byte_offset_ += offset * sizeof(Element);
    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    int contiguous_offset = tile_offset.contiguous();
    int strided_offset = tile_offset.strided();

    int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
                     Layout::kElementsPerAccess +
                 contiguous_offset * Shape::kContiguous;

    add_pointer_offset(offset);

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
    byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;
    return *this;
  }

  /// Moves the iterator backwards along the advance dimension
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator--() {
    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;
    return *this;
  }

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_byte_offset(frag, 0);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      Index byte_offset) const {

    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {

        int access_idx = c + s * Policy::LdsIterations::kContiguous;

        AccessType const *source_ptr = pointer_ +
          Policy::LdsShape::kContiguous / Layout::kElementsPerAccess * c +
          Policy::LdsShape::kStrided * s * stride_;

        char const *source_byte_ptr =
            reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;

        fetch_ptr[access_idx] =
            *(reinterpret_cast<AccessType const *>(source_byte_ptr));
      }
    }
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      Fragment &frag,
      Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(
      Fragment &frag,
      TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {

    Index pointer_offset =
        tile_offset.contiguous() * Shape::kContiguous /
            Layout::kElementsPerAccess +
        tile_offset.strided() * InstructionShape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    // no-op: the congruous arrangement carries no k-group state
  }
};
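////////////////////////////////////////////////////////////////////////////////
//
// Worked example (not part of the original header): the byte distance
// operator++ adds per k-group step. The numbers are assumptions chosen for
// illustration: half-precision elements (2 bytes), 128-bit accesses
// (Layout::kElementsPerAccess == 8), InstructionShape::kStrided == 4, and a
// 64-element shared-memory lead dimension, so stride_ == 64 / 8 == 8.

namespace volta_iterator_example {

constexpr int kElementsPerAccess = 8;             // assumed 128-bit accesses
constexpr int kInstructionStrided = 4;            // assumed InstructionShape::kStrided
constexpr int kStride = 64 / kElementsPerAccess;  // assumed lead dimension of 64
constexpr int kSizeofElement = 2;                 // half precision

// stride_ * InstructionShape::kStrided * sizeof(Element) * kElementsPerAccess
constexpr int kAdvanceBytes =
    kStride * kInstructionStrided * kSizeofElement * kElementsPerAccess;

static_assert(kAdvanceBytes == 512,
              "one k-group step == four 128-byte lines under these assumptions");

}  // namespace volta_iterator_example

////////////////////////////////////////////////////////////////////////////////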
////////////////////////////////////////////////////////////////////////////////

/// Wraps the pitch-linear operand A iterator for column-major matrices:
/// MatrixCoord (row, column) maps to pitch-linear (contiguous, strided).
template <typename Shape_, typename Element_, typename InstructionShape_,
          int OpDelta_>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand::kA, Element_,
    cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value>,
    InstructionShape_, OpDelta_, 32> {
 public:

  // ... (Shape/Element/Layout/TensorRef/Index/LongIndex/TensorCoord typedefs elided)

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Operand::kA,
      Element,
      layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
      layout::PitchLinearShape<InstructionShape::kRow,
                               InstructionShape::kColumn>,
      kOpDelta, kThreads>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

 private:

  /// Underlying tile iterator
  Base iterator_;

 public:

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ): iterator_({ref.data(), ref.stride()}, lane_id) { }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    iterator_.add_pointer_offset(offset);
    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
    return *this;
  }

  // ... (operator++ / operator-- forward to the underlying iterator)

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    iterator_.load(frag);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      Fragment &frag,
      Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  // ... (tile-offset load overloads elided)

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(
      frag,
      {tile_offset.contiguous(), tile_offset.strided()},
      byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }
};
////////////////////////////////////////////////////////////////////////////////

/// Wraps the pitch-linear operand B iterator for row-major matrices:
/// MatrixCoord (row, column) maps to pitch-linear (strided, contiguous).
template <typename Shape_, typename Element_, typename InstructionShape_,
          int OpDelta_>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand::kB, Element_,
    cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
        sizeof_bits<Element_>::value>,
    InstructionShape_, OpDelta_, 32> {
 public:

  /// Operand tag
  static Operand const kOperand = Operand::kB;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
      "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");

  // ... (Shape/Element/Layout/TensorRef/Index/LongIndex/TensorCoord typedefs elided)

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Operand::kB,
      Element,
      layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
      layout::PitchLinearShape<InstructionShape::kColumn,
                               InstructionShape::kRow>,
      kOpDelta, kThreads>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

 private:

  /// Underlying tile iterator
  Base iterator_;

 public:

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ): iterator_({ref.data(), ref.stride()}, lane_id) { }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    iterator_.add_pointer_offset(offset);
    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
    return *this;
  }

  // ... (operator++ / operator-- forward to the underlying iterator)

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    iterator_.load(frag);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      Fragment &frag,
      Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  // ... (tile-offset load overloads elided)

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(
      frag,
      {tile_offset.strided(), tile_offset.contiguous()},
      byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }
};
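////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not part of the original header): the two wrappers
// above only reorder coordinates before delegating to the pitch-linear
// iterator. Assuming plain (row, column) pairs, the conversions are:

#include <utility>

// Column-major operand A view: rows form the contiguous dimension, so
// add_tile_offset({row, column}) passes (contiguous, strided) through.
inline std::pair<int, int> column_major_to_pitch_linear(int row, int column) {
  return {row, column};
}

// Row-major operand B view: columns form the contiguous dimension, so
// add_tile_offset({column, row}) swaps the coordinates.
inline std::pair<int, int> row_major_to_pitch_linear(int row, int column) {
  return {column, row};
}

////////////////////////////////////////////////////////////////////////////////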
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator for the Volta tensor-op accumulator tile, which is held in
/// registers and interleaved across the quads of a warp.
template <
  typename Shape_,             /// Shape of tile to load (concept: MatrixShape)
  typename Element_,           /// Element type
  typename Layout_,            /// Layout of source tile
  typename InstructionShape_,  /// Shape of one matrix product operation (concept: MatrixShape)
  int OpDelta_                 /// Delta between *MMA operations
>
class MmaVoltaTensorOpAccumulatorTileIterator {
 public:

  // ... (Shape/Element/Layout/TensorRef/Index/LongIndex/TensorCoord typedefs elided)

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {

    // ... (InterleavedTile, the shape of the interleaved accumulator tile, elided)

    static_assert(!(Shape::kRow % InterleavedTile::kRow) &&
                  !(Shape::kColumn % InterleavedTile::kColumn),
      "Shape of warp-level Mma must be divisible by operator shape.");

    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
      "Layouts must be defined for logical MatrixCoord coordinate space.");

    /// Number of interleaved tiles covering the warp tile
    using TileIterations = MatrixShape<
        Shape::kRow / InterleavedTile::kRow,
        Shape::kColumn / InterleavedTile::kColumn>;

    /// Number of MMA operations per interleaved tile
    using MmaIterations =
        MatrixShape<InterleavedTile::kRow / InstructionShape::kM,
                    InterleavedTile::kColumn / InstructionShape::kN>;
  };

 private:

  // Each accumulator is produced in two partials of four elements each.
  static int const kElementsPerPartial = 4;

  /// Per-lane footprint of one partial: 2x2 for FP32, 1x4 for FP16
  using EleShapePerPatial = typename platform::conditional<
      platform::is_same<Element, float>::value,
      MatrixShape<2, 2>,
      MatrixShape<1, 4> >::type;

  static int const kElementsPerMma = 8;
  static int const kAccumulatorPatials = 2;
  using QuadShapePerPatialMma = MatrixShape<4, 4>;

 public:

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

 private:

  /// Reference to the accumulator tile
  TensorRef ref_;

 public:

  /// Default ctor constructs null iterator
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpAccumulatorTileIterator() { }

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpAccumulatorTileIterator(
    TensorRef const &ref,
    int lane_id
  ): ref_(ref) {

    int quad = (lane_id >> 2);
    int lane_in_quad = (lane_id & 3);
    int accum_m, accum_n;

    if (platform::is_same<Element, float>::value) {
      // Rows come from quad bits 2 and 0; columns from quad bit 1.
      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
                (lane_in_quad & 2);
    } else {
      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + lane_in_quad;
      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
    }

    MatrixCoord lane_offset(accum_m, accum_n);
    ref_.add_coord_offset(lane_offset);
  }
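  // Note (annotation, not part of the original header): each lane starts at
  // the coordinate computed above and owns two 4-element partials per MMA.
  // For FP32 accumulators, quad bits 2 and 0 pick one of four 8-row blocks,
  // quad bit 1 picks the 8-column half, and the two low lane bits select a
  // position inside the 2x2 per-partial footprint. For FP16, the four
  // elements of a partial sit consecutively in a single row.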
  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
    ref_.add_pointer_offset(offset);
    return *this;
  }

  // ... (add_tile_offset, operator++, operator-- elided)

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpAccumulatorTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpAccumulatorTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_pointer_offset(frag, 0);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(
      Fragment &frag,
      Index pointer_offset) const {

    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    CUTLASS_PRAGMA_UNROLL
    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
        CUTLASS_PRAGMA_UNROLL
        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
          CUTLASS_PRAGMA_UNROLL
          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

            int mma_accum_start =
                (((tile_n * Policy::TileIterations::kRow + tile_m) *
                      Policy::MmaIterations::kColumn + mma_n) *
                     Policy::MmaIterations::kRow + mma_m) *
                kElementsPerMma;

            CUTLASS_PRAGMA_UNROLL
            for (int p = 0; p < kAccumulatorPatials; ++p) {
              CUTLASS_PRAGMA_UNROLL
              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
                CUTLASS_PRAGMA_UNROLL
                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {

                  int accum_m = tile_m * Policy::InterleavedTile::kRow +
                                mma_m * QuadShapePerPatialMma::kRow + m * 2;
                  int accum_n = tile_n * Policy::InterleavedTile::kColumn +
                                mma_n * QuadShapePerPatialMma::kColumn +
                                p * Policy::InterleavedTile::kColumn / 2 + n;
                  int idx = mma_accum_start + p * kElementsPerPartial +
                            m * EleShapePerPatial::kColumn + n;

                  frag[idx] = offset_ref.at({accum_m, accum_n});
                }
              }
            }
          }
        }
      }
    }
  }

  /// Loads a fragment from memory with additional byte offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      Index byte_offset) const {
    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void load(
      Fragment &frag,
      TensorCoord const &tile_offset) const {
    load(frag, tile_offset, 0);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void load(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index pointer_offset) const {
    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) const {
    store_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory with additional pointer offset
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(
      Fragment const &frag,
      Index pointer_offset) const {

    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    // Same traversal as load_with_pointer_offset, with the copy reversed.
    CUTLASS_PRAGMA_UNROLL
    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
      CUTLASS_PRAGMA_UNROLL
      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
        CUTLASS_PRAGMA_UNROLL
        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
          CUTLASS_PRAGMA_UNROLL
          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

            int mma_accum_start =
                (((tile_n * Policy::TileIterations::kRow + tile_m) *
                      Policy::MmaIterations::kColumn + mma_n) *
                     Policy::MmaIterations::kRow + mma_m) *
                kElementsPerMma;

            CUTLASS_PRAGMA_UNROLL
            for (int p = 0; p < kAccumulatorPatials; ++p) {
              CUTLASS_PRAGMA_UNROLL
              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
                CUTLASS_PRAGMA_UNROLL
                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {

                  int accum_m = tile_m * Policy::InterleavedTile::kRow +
                                mma_m * QuadShapePerPatialMma::kRow + m * 2;
                  int accum_n = tile_n * Policy::InterleavedTile::kColumn +
                                mma_n * QuadShapePerPatialMma::kColumn +
                                p * Policy::InterleavedTile::kColumn / 2 + n;
                  int idx = mma_accum_start + p * kElementsPerPartial +
                            m * EleShapePerPatial::kColumn + n;

                  offset_ref.at({accum_m, accum_n}) = frag[idx];
                }
              }
            }
          }
        }
      }
    }
  }

  /// Stores a fragment to memory with additional byte offset
  CUTLASS_HOST_DEVICE
  void store_with_byte_offset(
      Fragment const &frag,
      Index byte_offset) const {
    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
  }

  /// Stores a fragment to memory with logical offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void store(
      Fragment const &frag,
      TensorCoord const &tile_offset) const {
    store(frag, tile_offset, 0);
  }

  /// Stores a fragment to memory with logical offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void store(
      Fragment const &frag,
      TensorCoord const &tile_offset,
      Index pointer_offset) const {
    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
  }
};
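////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not part of the original header): a host-side
// reproduction of the accumulator constructor's lane mapping, printing the
// starting (accum_m, accum_n) coordinate of every lane. is_fp32 selects
// between the two branches shown above; the constants mirror
// kElementsPerPartial and kAccumulatorPatials.

#include <cstdio>

inline void print_volta_accumulator_lane_mapping(bool is_fp32) {
  const int kElementsPerPartial = 4;
  const int kAccumulatorPatials = 2;
  for (int lane_id = 0; lane_id < 32; ++lane_id) {
    int quad = lane_id >> 2;
    int lane_in_quad = lane_id & 3;
    int accum_m, accum_n;
    if (is_fp32) {
      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
                (lane_in_quad & 2);
    } else {
      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + lane_in_quad;
      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
    }
    std::printf("lane %2d -> (%2d, %2d)\n", lane_id, accum_m, accum_n);
  }
}

////////////////////////////////////////////////////////////////////////////////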
////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for crosswise (KBlock-interleaved) multiplicands,
/// 32 threads per warp.
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int KBlock>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::VoltaTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, KBlock>,
    InstructionShape_, OpDelta_, 32> {
 public:

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for "
                "A or B operands to warp-level Mma.");

  /// KBlock size
  static int const kKBlock = KBlock;

  // ... (Shape/Element/Layout/TensorRef/Index/LongIndex/TensorCoord typedefs elided)

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal structure of iterator - made public to enable introspection
  struct Policy {

    /// Elements per access
    static int const kElementsPerAccess = 8;

    /// Contiguous elements per shared-memory line
    static int const kContiguousElementsPerLine = 4;

    // ... (LdsShape / LdsIterations elided)
  };

 private:

  static_assert(kOpDelta == 1,
      "Alternative arrangements not supported at present.");

  // ... (AccessType, Fragment, and members pointer_, stride_, line_size,
  //      byte_offset_, k_group_idx_ elided)

 public:

  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
        stride_(ref.stride(0) * Policy::kElementsPerAccess),
        line_size((ref.stride(0) * Policy::kContiguousElementsPerLine) /
                  Policy::kElementsPerAccess),
        k_group_idx_(0) {

    int quad = (lane_id / 4);
    int lane_in_quad = (lane_id % 4);
    int access_contiguous;

    if (kOperand == Operand::kA) {
      // Swizzle pattern for operand A.
      access_contiguous = ((quad & 0x4) << 1) + ((lane_in_quad) << 1) +
                            ((quad & 0x1) ^ ((quad & 0x4) >> 2));
    } else {
      // Swizzle pattern for operand B.
      access_contiguous = ((quad & 0x4) << 1) + (lane_in_quad << 1) +
                            ((quad & 0x2) >> 1 ^ ((quad & 0x4) >> 2));
    }

    byte_offset_ = access_contiguous *
                   sizeof(Element) * Policy::kElementsPerAccess;
  }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    byte_offset_ += offset * sizeof(Element);
    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    int contiguous_offset = tile_offset.contiguous();
    int strided_offset = tile_offset.strided();

    pointer_ += contiguous_offset *
                    (InstructionShape::kContiguous /
                     Policy::kContiguousElementsPerLine) *
                    line_size +
                strided_offset * Shape::kStrided / 2;

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {

    k_group_idx_ = (k_group_idx_ + 1) % 8;

    if (k_group_idx_ == 4 || k_group_idx_ == 0) {
      byte_offset_ ^= 1 * sizeof(Element) * Policy::kElementsPerAccess;
    }

    pointer_ += line_size;
    return *this;
  }
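  // Note (annotation, not part of the original header): the iterator cycles
  // through eight k-groups. Each increment advances one shared-memory line;
  // on entering either half of the cycle (k_group_idx_ == 0 or 4) the XOR
  // flips byte_offset_ by one access width, switching between the two
  // interleaved swizzle phases.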
  // ... (operator-- elided)

  /// Advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_byte_offset(frag, 0);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      Index byte_offset) const {

    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {

        int access_idx = c + s * Policy::LdsIterations::kContiguous;

        AccessType const *source_ptr = pointer_ +
          Policy::LdsShape::kContiguous * c * line_size +
          Policy::LdsShape::kStrided * s / 2;

        char const *source_byte_ptr =
            reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;

        fetch_ptr[access_idx] =
            *(reinterpret_cast<AccessType const *>(source_byte_ptr));

        // Swap the two 64-bit halves of the access when bit 1 of the k-group
        // index is set (k-groups 2-3 and 6-7).
        if (k_group_idx_ & 0x2) {
          uint64_t *low  = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2;
          uint64_t *high = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2 + 1;
          uint64_t tmp = *low;
          *low = *high;
          *high = tmp;
        }
      }
    }
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      Fragment &frag,
      Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(
      Fragment &frag,
      TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {

    Index pointer_offset = tile_offset.contiguous() *
                               InstructionShape::kContiguous /
                               Policy::kElementsPerAccess +
                           tile_offset.strided() * Shape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  /// Overrides the internal k-group index
  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    k_group_idx_ = k_group;
  }
};
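////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not part of the original header): a simulation of
// eight consecutive operator++ steps of the crosswise iterator, tracking the
// line index and the XOR toggle of byte_offset_. kAccessBytes stands in for
// sizeof(Element) * Policy::kElementsPerAccess (16 for half precision).

#include <cstdio>

inline void simulate_crosswise_k_groups() {
  const int kAccessBytes = 16;
  int k_group_idx = 0;
  int byte_offset = 0;
  int line = 0;
  for (int step = 0; step < 8; ++step) {
    k_group_idx = (k_group_idx + 1) % 8;
    if (k_group_idx == 4 || k_group_idx == 0) {
      byte_offset ^= kAccessBytes;
    }
    line += 1;  // pointer_ += line_size
    std::printf("step %d: k_group %d, line %d, byte_offset %2d\n",
                step, k_group_idx, line, byte_offset);
  }
}

////////////////////////////////////////////////////////////////////////////////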
////////////////////////////////////////////////////////////////////////////////

/// Wraps the crosswise iterator for column-major matrices: MatrixCoord
/// (row, column) maps to pitch-linear (contiguous, strided).
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int KBlock>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, KBlock>,
    InstructionShape_, OpDelta_, 32> {
 public:

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                "MmaTensorOpMultiplicandIterator may only be instantiated for "
                "A or B operands to warp-level Mma.");

  /// KBlock size
  static int const kKBlock = KBlock;

  // ... (Shape/Element/Layout/TensorRef/Index/LongIndex/TensorCoord typedefs elided)

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                 kKBlock>,
      layout::PitchLinearShape<InstructionShape::kRow,
                               InstructionShape::kColumn>,
      kOpDelta, kThreads>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

 private:

  /// Underlying tile iterator
  Base iterator_;

 public:

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    iterator_.add_pointer_offset(offset);
    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
    return *this;
  }

  // ... (operator++ / operator-- forward to the underlying iterator)

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    iterator_.load(frag);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      Fragment &frag,
      Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  // ... (tile-offset load overloads elided)

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(
        frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }
};
////////////////////////////////////////////////////////////////////////////////

/// Wraps the crosswise iterator for row-major matrices: MatrixCoord
/// (row, column) maps to pitch-linear (strided, contiguous).
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int KBlock>
class MmaVoltaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, KBlock>,
    InstructionShape_, OpDelta_, 32> {
 public:

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                "MmaTensorOpMultiplicandIterator may only be instantiated for "
                "A or B operands to warp-level Mma.");

  /// KBlock size
  static int const kKBlock = KBlock;

  // ... (Shape/Element/Layout/TensorRef/Index/LongIndex/TensorCoord typedefs elided)

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                 kKBlock>,
      layout::PitchLinearShape<InstructionShape::kColumn,
                               InstructionShape::kRow>,
      kOpDelta, kThreads>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads * 2>;

 private:

  /// Underlying tile iterator
  Base iterator_;

 public:

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    iterator_.add_pointer_offset(offset);
    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
    return *this;
  }

  // ... (operator++ / operator-- forward to the underlying iterator)

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  CUTLASS_DEVICE
  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    iterator_.load(frag);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      Fragment &frag,
      Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  // ... (tile-offset load overloads elided)

  /// Loads a fragment from memory with logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag,
      TensorCoord const &tile_offset,
      Index byte_offset) const {
    iterator_.load_with_byte_offset(
        frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
  }

  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }
};
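////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not part of the original header): the shape of a
// warp-level mainloop over these iterators. warp_mainloop_sketch and k_groups
// are hypothetical names; any concrete IteratorA/IteratorB choice must match
// one of the specializations above.

template <typename IteratorA, typename IteratorB>
CUTLASS_DEVICE void warp_mainloop_sketch(
    typename IteratorA::TensorRef ref_A,   // shared-memory tile of A
    typename IteratorB::TensorRef ref_B,   // shared-memory tile of B
    int lane_id, int k_groups) {

  IteratorA iter_A(ref_A, lane_id);
  IteratorB iter_B(ref_B, lane_id);

  typename IteratorA::Fragment frag_A;
  typename IteratorB::Fragment frag_B;

  for (int k = 0; k < k_groups; ++k) {
    iter_A.load(frag_A);   // each lane gathers its swizzled slice
    iter_B.load(frag_B);
    // ... issue mma operations on frag_A / frag_B ...
    ++iter_A;              // advance one k-group
    ++iter_B;
  }
}

////////////////////////////////////////////////////////////////////////////////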