namespace threadblock {

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for the congruous Volta tensor-op shared-memory
/// arrangement of pitch-linear memory.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
  Shape_, Element_,
  layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
  AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
    "Specialization for pitch-linear iterator may advance along the "
    "contiguous(rank=0) or strided(rank=1) dimension.");

  // ... (Shape/Element/Layout/Index/TensorRef aliases elided)

  static int const kAdvanceRank = AdvanceRank;

  struct Detail {

    /// Each access is 128 bits wide
    static int const kAccessSizeInBits = 128;

    static_assert(
      kAccessSizeInBits == sizeof_bits<Element>::value * Layout::kElementsPerAccess,
      "This iterator requires a policy whose access size is 128 bits");

    /// Two pointers are needed when strided iterations are interleaved
    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };

  /// Vectorized access type
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  /// Fragment holding this thread's portion of a tile
  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

 private:

  // ... (stride_ and byte_offset_ member declarations elided)

  /// One pointer per interleaved strided phase
  AccessType * pointer_[Detail::kPointerCount];

 public:

  /// Constructs the iterator from a tensor reference and a linear thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(
    TensorRef ref,
    int thread_id
  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {

      // ... (per-pointer thread offset computation elided)

      pointer_[i] = reinterpret_cast<AccessType *>(
        ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    }
  }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }

  /// Advances the iterator to the next tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    add_pointer_offset((kAdvanceRank
      ? Shape::kStrided * stride_ * Layout::kElementsPerAccess
      : Shape::kContiguous));
    return *this;
  }

  /// Adds a tile offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(
      coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess
    );
  }

  /// Loads a fragment with an additional offset in units of Element
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      // Alternate between the two pointers; the row term advances only
      // every other strided iteration
      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char const *access_byte_ptr =
          reinterpret_cast<char const *>(access_ptr + access_offset);

        frag_ptr[access_idx] =
          *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
      }
    }
  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment with an additional offset in units of Element
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(
    Fragment const &frag,
    Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
          c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
          vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char *access_byte_ptr =
          reinterpret_cast<char *>(access_ptr + access_offset);

        *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) =
          frag_ptr[access_idx];
      }
    }
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }
};
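////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not from the original source): the load/store loops
// above interleave two shared-memory pointers. For strided iteration s, the
// pointer index (s & 1) alternates while the row term (s & ~1) advances only
// every other iteration, so both pointers reuse the same base row. A minimal
// standalone demonstration of that index arithmetic; the iteration count is
// chosen purely for illustration:
//
#if 0
#include <cstdio>

int main() {
  int const kIterationsStrided = 4;  // illustrative value
  for (int s = 0; s < kIterationsStrided; ++s) {
    int pointer_idx = s & 1;   // 0, 1, 0, 1  (which pointer_[] to use)
    int stride_idx  = s & ~1;  // 0, 0, 2, 2  (row term advances in pairs)
    std::printf("s=%d: pointer_[%d], row term %d\n", s, pointer_idx, stride_idx);
  }
  return 0;
}
#endif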
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for the congruous Volta tensor-op arrangement of
/// column-major memory. Delegates to the pitch-linear iterator above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
  Shape_, Element_,
  layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
  AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
    "Specialization for column-major iterator may advance along the "
    "columns(rank=0) or rows(rank=1) dimension.");

  // ... (type aliases elided)

  static int const kAdvanceRank = AdvanceRank;

  /// Underlying pitch-linear iterator; the row dimension maps to the contiguous rank
  using UnderlyingIterator = RegularTileIterator<
    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
    layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
    (kAdvanceRank == 0 ? 0 : 1),
    ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  UnderlyingIterator iterator_;

 public:

  CUTLASS_HOST_DEVICE
  RegularTileIterator(
    TensorRef ref,
    int thread_id
  ): iterator_({ref.data(), ref.stride()}, thread_id) { }

  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

  // ... (operator++/operator-- elided)

  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(
    Fragment const &frag,
    Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for the congruous Volta tensor-op arrangement of
/// row-major memory. Delegates to the pitch-linear iterator above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
  Shape_, Element_,
  layout::RowMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
  AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
    "Specialization for row-major iterator may advance along the "
    "columns(rank=0) or rows(rank=1) dimension.");

  // ... (type aliases elided)

  static int const kAdvanceRank = AdvanceRank;

  /// Underlying pitch-linear iterator; the column dimension maps to the contiguous rank
  using UnderlyingIterator = RegularTileIterator<
    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
    layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
    (kAdvanceRank == 0 ? 1 : 0),
    ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  UnderlyingIterator iterator_;

 public:

  CUTLASS_HOST_DEVICE
  RegularTileIterator(
    TensorRef ref,
    int thread_id
  ): iterator_({ref.data(), ref.stride()}, thread_id) { }

  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

  // ... (operator++/operator-- elided)

  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(
    Fragment const &frag,
    Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
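////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not from the original source): the column-major and
// row-major wrappers above only reorder matrix coordinates into pitch-linear
// (contiguous, strided) order before delegating. Coord2D below is a
// hypothetical stand-in for the CUTLASS coordinate types:
//
#if 0
#include <cstdio>

struct Coord2D { int contiguous, strided; };  // hypothetical stand-in

// Column-major: the row index is the contiguous coordinate.
Coord2D from_column_major(int row, int column) { return {row, column}; }

// Row-major: the column index is the contiguous coordinate.
Coord2D from_row_major(int row, int column) { return {column, row}; }

int main() {
  Coord2D a = from_column_major(2, 5);  // contiguous=2, strided=5
  Coord2D b = from_row_major(2, 5);     // contiguous=5, strided=2
  std::printf("cm: (%d,%d)  rm: (%d,%d)\n",
              a.contiguous, a.strided, b.contiguous, b.strided);
  return 0;
}
#endif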
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for the congruous Volta tensor-op arrangement of
/// pitch-linear memory for operand B.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
  Shape_, Element_,
  layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
  AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
    "Specialization for pitch-linear iterator may advance along the "
    "contiguous(rank=0) or strided(rank=1) dimension.");

  // ... (type aliases elided)

  static int const kAdvanceRank = AdvanceRank;

  struct Detail {

    /// Each access is 128 bits wide
    static int const kAccessSizeInBits = 128;

    static_assert(
      kAccessSizeInBits == sizeof_bits<Element>::value * Layout::kElementsPerAccess,
      "This iterator requires a policy whose access size is 128 bits");

    /// Two pointers are needed when strided iterations are interleaved
    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };

  /// Vectorized access type
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  /// Fragment holding this thread's portion of a tile
  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

 private:

  // ... (stride_ and byte_offset_ member declarations elided)

  /// One pointer per interleaved strided phase
  AccessType * pointer_[Detail::kPointerCount];

 public:

  /// Constructs the iterator from a tensor reference and a linear thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(
    TensorRef ref,
    int thread_id
  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {

      // ... (per-pointer thread offset computation elided)

      pointer_[i] = reinterpret_cast<AccessType *>(
        ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    }
  }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }

  /// Advances the iterator to the next tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    add_pointer_offset((kAdvanceRank
      ? Shape::kStrided * stride_ * Layout::kElementsPerAccess
      : Shape::kContiguous));
    return *this;
  }

  /// Adds a tile offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(
      coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess
    );
  }

  /// Loads a fragment with an additional offset in units of Element
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      // Alternate between the two pointers; the row term advances only
      // every other strided iteration
      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
            vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char const *access_byte_ptr =
          reinterpret_cast<char const *>(access_ptr + access_offset);

        frag_ptr[access_idx] =
          *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
      }
    }
  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment with an additional offset in units of Element
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(
    Fragment const &frag,
    Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[s & 1];
      int stride_idx = (s & ~1);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
          c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
          vec_pointer_offset;

        int access_idx = c + s * ThreadMap::Iterations::kContiguous;

        char *access_byte_ptr =
          reinterpret_cast<char *>(access_ptr + access_offset);

        *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) =
          frag_ptr[access_idx];
      }
    }
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }
};
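////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not from the original source): the iterators above
// track the same logical offset in three units. add_pointer_offset() converts
// an element offset to bytes (byte_offset_), while load/store convert an
// element offset to vectorized AccessType units (vec_pointer_offset). The
// element size and access width below are illustrative values consistent with
// the 128-bit accesses asserted above:
//
#if 0
#include <cstdio>

int main() {
  int const kSizeofElement = 2;         // e.g. a 16-bit element type
  int const kElementsPerAccess = 8;     // 128 bits / 16 bits per element
  long const pointer_offset = 64;       // offset in units of elements

  long byte_offset = pointer_offset * kSizeofElement;             // 128 bytes
  long vec_pointer_offset = pointer_offset / kElementsPerAccess;  // 8 vectors

  std::printf("%ld bytes, %ld vector accesses\n",
              byte_offset, vec_pointer_offset);
  return 0;
}
#endif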
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for the congruous Volta tensor-op arrangement of
/// column-major memory for operand B. Delegates to the pitch-linear iterator above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
  Shape_, Element_,
  layout::ColumnMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
  AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
    "Specialization for column-major iterator may advance along the "
    "columns(rank=0) or rows(rank=1) dimension.");

  // ... (type aliases elided)

  static int const kAdvanceRank = AdvanceRank;

  /// Underlying pitch-linear iterator; the row dimension maps to the contiguous rank
  using UnderlyingIterator = RegularTileIterator<
    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
    layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
    (kAdvanceRank == 0 ? 0 : 1),
    ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  UnderlyingIterator iterator_;

 public:

  CUTLASS_HOST_DEVICE
  RegularTileIterator(
    TensorRef ref,
    int thread_id
  ): iterator_({ref.data(), ref.stride()}, thread_id) { }

  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

  // ... (operator++/operator-- elided)

  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(
    Fragment const &frag,
    Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for the congruous Volta tensor-op arrangement of
/// row-major memory for operand B. Delegates to the pitch-linear iterator above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
  Shape_, Element_,
  layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
  AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
    "Specialization for row-major iterator may advance along the "
    "columns(rank=0) or rows(rank=1) dimension.");

  // ... (type aliases elided)

  static int const kAdvanceRank = AdvanceRank;

  /// Underlying pitch-linear iterator; the column dimension maps to the contiguous rank
  using UnderlyingIterator = RegularTileIterator<
    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
    layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
    (kAdvanceRank == 0 ? 1 : 0),
    ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  UnderlyingIterator iterator_;

 public:

  CUTLASS_HOST_DEVICE
  RegularTileIterator(
    TensorRef ref,
    int thread_id
  ): iterator_({ref.data(), ref.stride()}, thread_id) { }

  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

  // ... (operator++/operator-- elided)

  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(
    Fragment const &frag,
    Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
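////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not from the original source): how operator++ advances
// one tile in the congruous specializations. Advancing along the contiguous
// rank moves Shape::kContiguous elements; advancing along the strided rank
// moves Shape::kStrided whole rows, where stride_ is measured in AccessType
// vectors (hence the kElementsPerAccess factor). All values are illustrative:
//
#if 0
#include <cstdio>

int main() {
  int const kShapeContiguous = 64, kShapeStrided = 8;
  int const kElementsPerAccess = 8;
  long const stride = 16;  // leading dimension in AccessType vectors

  for (int advance_rank = 0; advance_rank < 2; ++advance_rank) {
    long advance = advance_rank
        ? kShapeStrided * stride * kElementsPerAccess  // strided: whole rows
        : kShapeContiguous;                            // contiguous: tile width
    std::printf("rank %d advances %ld elements\n", advance_rank, advance);
  }
  return 0;
}
#endif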
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for the crosswise Volta tensor-op shared-memory
/// arrangement of pitch-linear memory.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
    Shape_, Element_,
    layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                               Shape_::kContiguous>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  // ... (Shape/Element aliases elided)

  using Layout = layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                 Shape::kContiguous>;
  static int const kAdvanceRank = AdvanceRank;

  // ... (index aliases elided)

  struct Detail {

    /// Two pointers are needed when strided iterations are interleaved
    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);

    /// Number of layout accesses needed to cover one thread-map access
    static int const kIterationsPerAccess =
        ThreadMap::kElementsPerAccess / Layout::kElementsPerAccess;

    /// Number of contiguous elements sharing one swizzled line
    static int const kContiguousElementsPerLine = 4;
  };

  /// Vectorized access type
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  /// Fragment holding this thread's portion of a tile
  using Fragment =
      Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;

 private:

  // ... (line_size and byte_offset_ member declarations elided)

  /// One pointer per interleaved strided phase
  AccessType *pointer_[Detail::kPointerCount];

 public:

  /// Constructs the iterator from a tensor reference and a linear thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : line_size(ref.stride(0) * Detail::kContiguousElementsPerLine /
                  Layout::kElementsPerAccess),
        byte_offset_(0) {

    TensorCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {

      // Offset of this thread within the threadblock tile for pointer i
      TensorCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          TensorCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    }
  }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }

  /// Advances the iterator to the next tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    add_pointer_offset(Shape::kContiguous * line_size);
    return *this;
  }

  /// Adds a tile offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset((coord.contiguous() * (Shape::kContiguous / Layout::kElementsPerAccess) *
                       line_size + coord.strided() * Shape::kStrided) *
                       Layout::kElementsPerAccess);
  }

  /// Loads a fragment with an additional offset in units of Element
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      // Pointer parity flips every pair of strided iterations
      AccessType *access_ptr = pointer_[(s & 1) ^ (s / 2)];

      access_ptr += 16 * (s / 2);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        CUTLASS_PRAGMA_UNROLL
        for (int i = 0; i < Detail::kIterationsPerAccess; ++i) {

          int access_offset =
            c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
            vec_pointer_offset + i * line_size;

          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
            Detail::kIterationsPerAccess + i;

          char const *access_byte_ptr =
            reinterpret_cast<char const *>(access_ptr + access_offset);

          frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
              access_byte_ptr + byte_offset_);
        }
      }
    }
  }

  /// Loads a fragment
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  /// Stores a fragment with an additional offset in units of Element
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = pointer_[(s & 1) ^ ((s >> 1) & 1)];

      access_ptr += 16 * (s / 2);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        CUTLASS_PRAGMA_UNROLL
        for (int i = 0; i < Detail::kIterationsPerAccess; ++i) {

          int access_offset =
            c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
            vec_pointer_offset + i * line_size;

          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
            Detail::kIterationsPerAccess + i;

          char *access_byte_ptr =
            reinterpret_cast<char *>(access_ptr + access_offset);

          *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) =
              frag_ptr[access_idx];
        }
      }
    }
  }

  /// Stores a fragment
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
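////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not from the original source): the crosswise load/store
// select a pointer with (s & 1) ^ ((s >> 1) & 1), which is equivalent to
// (s & 1) ^ (s / 2) for s < 4. The pointer parity flips every pair of strided
// iterations, while each pair also hops the base pointer forward by 16
// AccessType vectors:
//
#if 0
#include <cstdio>

int main() {
  for (int s = 0; s < 4; ++s) {
    int pointer_idx = (s & 1) ^ ((s >> 1) & 1);  // 0, 1, 1, 0
    int vector_hop  = 16 * (s / 2);              // 0, 0, 16, 16
    std::printf("s=%d: pointer_[%d] + %d vectors\n", s, pointer_idx, vector_hop);
  }
  return 0;
}
#endif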
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for the crosswise Volta tensor-op arrangement of
/// column-major memory. Delegates to the pitch-linear iterator above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<Shape_, Element_,
                          layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Shape_::kRow>,
                          AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  // ... (type aliases elided)

  static int const kAdvanceRank = AdvanceRank;

  /// Underlying pitch-linear iterator; the row dimension maps to the contiguous rank
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                 Shape::kRow>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  UnderlyingIterator iterator_;

 public:

  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

  // ... (operator++/operator-- elided)

  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for the crosswise Volta tensor-op arrangement of
/// row-major memory. Delegates to the pitch-linear iterator above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<Shape_, Element_,
                          layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Shape_::kColumn>,
                          AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  // ... (type aliases elided)

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  /// Underlying pitch-linear iterator; the column dimension maps to the contiguous rank
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                                 Shape::kColumn>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  UnderlyingIterator iterator_;

 public:

  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

  // ... (operator++/operator-- elided)

  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};

} // namespace threadblock
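////////////////////////////////////////////////////////////////////////////////
//
// Illustrative sketch (not from the original source): instantiating one of the
// specializations above. The shape and thread-map parameters are assumptions
// chosen for illustration; whether a given combination satisfies the
// access-size static_asserts depends on the element type and thread map:
//
#if 0
#include "cutlass/numeric_types.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/transform/pitch_linear_thread_map.h"
#include "cutlass/transform/threadblock/regular_tile_iterator.h"
#include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h"

using Element = cutlass::half_t;
using Shape = cutlass::layout::PitchLinearShape<64, 8>;  // illustrative tile

// Illustrative thread map: 32 threads with 8-element (128-bit) accesses.
using ThreadMap = cutlass::transform::PitchLinearWarpRakedThreadMap<
    Shape, 32, cutlass::layout::PitchLinearShape<8, 4>, 8>;

// Template argument deduction selects the congruous pitch-linear
// specialization defined in this header (advance rank 1 = strided).
using Iterator = cutlass::transform::threadblock::RegularTileIterator<
    Shape, Element,
    cutlass::layout::VoltaTensorOpMultiplicandCongruous<
        cutlass::sizeof_bits<Element>::value>,
    1, ThreadMap>;
#endif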
 