Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions aiter/configs/a8w8_blockscale_tuned_gemm.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,84 @@ cu_num,M,N,K,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio
256,20480,512,7168,cktile,11,0,106.7764,a8w8_blockscale_cktile_192x256x128_4x2x1_16x16x128_intrawave_0x1x0_1,1407.84,1605.62,0.0
256,128,1024,4096,ck,8,0,13.7599,a8w8_blockscale_1x128x128_256x16x64x256_16x16_16x16_1x1_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,78.03,361.97,0.0
256,128,4096,1280,ck,7,0,7.4194,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,180.9,870.06,0.0

64,16,512,7168,ck,5,0,21.0677,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,5.57,180.42,0.0
64,32,512,7168,ck,7,0,21.2326,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,11.06,185.19,0.0
64,64,512,7168,ck,5,0,21.5607,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,21.79,194.53,0.0
64,128,512,7168,ck,7,0,22.3868,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,41.97,210.78,0.0
64,256,512,7168,ck,5,0,25.4464,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,73.84,226.64,0.0
64,512,512,7168,ck,5,0,37.5173,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,100.17,209.62,0.0
64,1024,512,7168,ck,5,0,70.283,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,106.94,171.57,0.0
64,1536,512,7168,ck,4,0,83.7976,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,134.54,193.95,0.0
64,2048,512,7168,ck,4,0,110.0607,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,136.58,185.78,0.0
64,4096,512,7168,ck,4,0,217.6474,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,138.14,171.03,0.0
64,8192,512,7168,ck,4,0,420.1874,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,143.1,168.45,0.0
64,16384,512,7168,ck,4,0,847.0249,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,141.98,162.79,0.0
64,20480,512,7168,ck,4,0,1063.6592,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,141.33,161.18,0.0
64,16,576,7168,ck,7,0,27.5569,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,4.79,154.66,0.0
64,32,576,7168,ck,7,0,27.8319,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,9.49,157.91,0.0
64,64,576,7168,ck,7,0,27.8765,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,18.96,167.21,0.0
64,128,576,7168,ck,5,0,31.8357,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,33.2,163.14,0.0
64,256,576,7168,ck,5,0,42.258,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,50.02,148.11,0.0
64,512,576,7168,ck,5,0,66.3617,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,63.71,126.41,0.0
64,1024,576,7168,ck,5,0,122.2805,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,69.15,103.44,0.0
64,1536,576,7168,ck,5,0,174.2874,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,72.77,97.01,0.0
64,2048,576,7168,ck,5,0,227.9122,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,74.2,92.88,0.0
64,4096,576,7168,ck,5,0,443.798,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,76.21,86.09,0.0
64,8192,576,7168,ck,5,0,882.2476,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,76.67,81.93,0.0
64,16384,576,7168,ck,5,0,1705.0677,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,79.35,82.37,0.0
64,20480,576,7168,ck,5,0,2175.1492,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,77.75,80.23,0.0
64,128,1024,4096,ck,5,0,17.9983,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,59.66,276.73,0.0
64,16,1536,7168,ck,7,0,25.3706,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,13.89,440.43,0.0
64,32,1536,7168,ck,7,0,25.5001,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,27.63,444.62,0.0
64,64,1536,7168,ck,5,0,33.3487,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,42.26,349.8,0.0
64,128,1536,7168,ck,5,0,36.524,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,77.17,337.33,0.0
64,256,1536,7168,ck,4,0,54.0994,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,104.2,251.97,0.0
64,512,1536,7168,ck,4,0,91.6311,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,123.04,177.37,0.0
64,1024,1536,7168,ck,4,0,163.4359,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,137.97,131.52,0.0
64,1536,1536,7168,ck,4,0,240.0444,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,140.9,111.39,0.0
64,2048,1536,7168,ck,4,0,315.9935,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,142.72,101.21,0.0
64,4096,1536,7168,ck,4,0,617.6339,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,146.03,85.74,0.0
64,8192,1536,7168,ck,4,0,1202.2416,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,150.04,78.93,0.0
64,16384,1536,7168,ck,4,0,2443.3914,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,147.65,73.17,0.0
64,20480,1536,7168,ck,4,0,3101.2909,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,145.41,71.17,0.0
64,128,4096,1280,ck,5,0,17.2714,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,77.71,373.76,0.0
64,16,4608,7168,ck,7,0,62.3389,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,16.96,534.05,0.0
64,32,4608,7168,ck,5,0,65.7963,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,32.13,509.97,0.0
64,64,4608,7168,ck,5,0,72.0885,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,58.65,472.73,0.0
64,128,4608,7168,ck,5,0,94.9716,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,89.03,369.87,0.0
64,256,4608,7168,ck,4,0,127.4667,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,132.67,292.03,0.0
64,512,4608,7168,ck,4,0,240.8659,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,140.42,171.96,0.0
64,1024,4608,7168,ck,4,0,466.6247,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,144.97,106.74,0.0
64,1536,4608,7168,ck,4,0,686.5133,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,147.8,84.77,0.0
64,2048,4608,7168,ck,4,0,913.8289,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,148.05,72.86,0.0
64,4096,4608,7168,ck,4,0,1829.1106,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,147.93,54.75,0.0
64,8192,4608,7168,ck,4,0,3699.2334,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,146.29,45.21,0.0
64,16384,4608,7168,ck,4,0,7372.5394,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,146.81,40.89,0.0
64,20480,4608,7168,ck,4,0,9277.5367,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,145.83,39.73,0.0
64,16,7168,256,ck,5,0,7.0275,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,8.36,294.34,0.0
64,16,7168,2304,ck,5,0,30.0406,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,17.59,558.62,0.0
64,32,7168,256,ck,5,0,7.5259,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,15.6,305.87,0.0
64,32,7168,2304,ck,5,0,32.0259,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,33.0,532.31,0.0
64,64,7168,256,ck,7,0,8.5336,a8w8_blockscale_1x128x128_256x16x128x256_16x16_16x16_1x2_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,27.52,324.47,0.0
64,64,7168,2304,ck,5,0,35.3908,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,59.73,496.74,0.0
64,128,7168,256,ck,5,0,10.2,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,46.06,363.02,0.0
64,128,7168,2304,ck,5,0,44.4092,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,95.2,419.85,0.0
64,256,7168,256,ck,5,0,16.5328,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,56.83,336.94,0.0
64,256,7168,2304,ck,4,0,70.939,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,119.2,292.86,0.0
64,512,7168,256,ck,5,0,30.8461,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,60.92,301.69,0.0
64,512,7168,2304,ck,4,0,136.0203,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,124.33,184.05,0.0
64,1024,7168,256,ck,5,0,58.1541,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,64.62,288.5,0.0
64,1024,7168,2304,ck,4,0,251.6477,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,134.41,133.34,0.0
64,1536,7168,256,ck,5,0,85.6123,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,65.85,283.23,0.0
64,1536,7168,2304,ck,4,0,373.3302,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,135.9,112.7,0.0
64,2048,7168,256,ck,5,0,113.2028,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,66.4,280.2,0.0
64,2048,7168,2304,ck,4,0,498.9962,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,135.56,101.39,0.0
64,4096,7168,256,ck,5,0,203.805,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,73.76,302.27,0.0
64,4096,7168,2304,ck,4,0,982.7803,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,137.66,86.16,0.0
64,8192,7168,256,ck,5,0,415.4376,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,72.37,292.16,0.0
64,8192,7168,2304,ck,4,0,1966.7933,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,137.58,77.71,0.0
64,16384,7168,256,ck,5,0,831.8664,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,72.28,289.6,0.0
64,16384,7168,2304,ck,4,0,4046.2315,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,133.75,71.46,0.0
64,20480,7168,256,ck,5,0,1051.0383,a8w8_blockscale_1x128x128_256x16x128x128_8x16_16x16_1x2_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,71.51,286.08,0.0
64,20480,7168,2304,ck,4,0,5103.8124,a8w8_blockscale_1x128x128_256x16x256x128_8x16_16x16_1x4_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1,132.54,70.01,0.0
Loading
Loading