/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
; generated by igemm_codegen.py (d52258ea47284c75ea1745530f85c9049c5f98cb)
;
.macro .v_u32_div v_q, v_n, v_d, v_tmp4, s_tmp4
    v_cvt_f32_u32     v[\v_tmp4+0],   v[\v_d]
    v_rcp_f32         v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_f32         v[\v_tmp4+0],   0x4f800000, v[\v_tmp4+0]
    v_cvt_u32_f32     v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_lo_u32      v[\v_tmp4+1],   v[\v_d],      v[\v_tmp4+0]
    v_mul_hi_u32      v[\v_tmp4+2],   v[\v_d],      v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+3],   vcc, 0,     v[\v_tmp4+1]
    v_cmp_ne_i32      s[\s_tmp4:\s_tmp4+1], 0,          v[\v_tmp4+2]
    v_cndmask_b32     v[\v_tmp4+1],   v[\v_tmp4+3],   v[\v_tmp4+1],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+1],   v[\v_tmp4+1],   v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_add_co_u32      v[\v_tmp4+0],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_cndmask_b32     v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_tmp4+2],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_n]
    v_mul_lo_u32      v[\v_tmp4+1],   v[\v_tmp4+0],   v[\v_d]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_n],      v[\v_tmp4+1]
    v_cmp_ge_u32      s[\s_tmp4:\s_tmp4+1], v[\v_n],      v[\v_tmp4+1]
    v_cmp_ge_u32      s[\s_tmp4+2:\s_tmp4+3], v[\v_tmp4+2],   v[\v_d]
    v_add_co_u32      v[\v_tmp4+2],   vcc, 1, v[\v_tmp4+0]
    s_and_b64         s[\s_tmp4+2:\s_tmp4+3], s[\s_tmp4:\s_tmp4+1], s[\s_tmp4+2:\s_tmp4+3]
    v_add_co_u32      v[\v_tmp4+1],   vcc, -1,    v[\v_tmp4+0]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+0],   v[\v_tmp4+2],      s[\s_tmp4+2:\s_tmp4+3]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+1],   v[\v_tmp4+2],      s[\s_tmp4:\s_tmp4+1]
    v_cmp_ne_i32      vcc,          0,          v[\v_d]
    v_cndmask_b32     v[\v_q],      -1,         v[\v_tmp4+2],      vcc
.endm

.macro .v_u32_div_vs v_q, v_n, s_d, v_tmp4, s_tmp4
    v_cvt_f32_u32     v[\v_tmp4+0],   s[\s_d]
    v_rcp_f32         v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_f32         v[\v_tmp4+0],   0x4f800000, v[\v_tmp4+0]
    v_cvt_u32_f32     v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_lo_u32      v[\v_tmp4+1],   s[\s_d],      v[\v_tmp4+0]
    v_mul_hi_u32      v[\v_tmp4+2],   s[\s_d],      v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+3],   vcc, 0,     v[\v_tmp4+1]
    v_cmp_ne_i32      s[\s_tmp4:\s_tmp4+1], 0,          v[\v_tmp4+2]
    v_cndmask_b32     v[\v_tmp4+1],   v[\v_tmp4+3],   v[\v_tmp4+1],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+1],   v[\v_tmp4+1],   v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_add_co_u32      v[\v_tmp4+0],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_cndmask_b32     v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_tmp4+2],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_n]
    v_mul_lo_u32      v[\v_tmp4+1],   s[\s_d],     v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_n],      v[\v_tmp4+1]
    v_cmp_ge_u32      s[\s_tmp4:\s_tmp4+1], v[\v_n],      v[\v_tmp4+1]
    v_cmp_le_u32      s[\s_tmp4+2:\s_tmp4+3],  s[\s_d],    v[\v_tmp4+2]
    v_add_co_u32      v[\v_tmp4+2],   vcc, 1, v[\v_tmp4+0]
    s_and_b64         s[\s_tmp4+2:\s_tmp4+3], s[\s_tmp4:\s_tmp4+1], s[\s_tmp4+2:\s_tmp4+3]
    v_add_co_u32      v[\v_tmp4+1],   vcc, -1,    v[\v_tmp4+0]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+0],   v[\v_tmp4+2],      s[\s_tmp4+2:\s_tmp4+3]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+1],   v[\v_tmp4+2],      s[\s_tmp4:\s_tmp4+1]
    v_cmp_ne_i32      vcc,          s[\s_d],   0
    v_cndmask_b32     v[\v_q],      -1,         v[\v_tmp4+2],      vcc
.endm

.macro .v_u32_div_ss v_q, s_n, s_d, v_tmp4, s_tmp4
    v_cvt_f32_u32     v[\v_tmp4+0],   s[\s_d]
    v_rcp_f32         v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_f32         v[\v_tmp4+0],   0x4f800000, v[\v_tmp4+0]
    v_cvt_u32_f32     v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_lo_u32      v[\v_tmp4+1],   s[\s_d],      v[\v_tmp4+0]
    v_mul_hi_u32      v[\v_tmp4+2],   s[\s_d],      v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+3],   vcc, 0,     v[\v_tmp4+1]
    v_cmp_ne_i32      s[\s_tmp4:\s_tmp4+1], 0,          v[\v_tmp4+2]
    v_cndmask_b32     v[\v_tmp4+1],   v[\v_tmp4+3],   v[\v_tmp4+1],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+1],   v[\v_tmp4+1],   v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_add_co_u32      v[\v_tmp4+0],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_cndmask_b32     v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_tmp4+2],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+0],   s[\s_n],   v[\v_tmp4+0]
    v_mul_lo_u32      v[\v_tmp4+1],   s[\s_d],     v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        s[\s_n],      v[\v_tmp4+1]
    v_cmp_ge_u32      s[\s_tmp4:\s_tmp4+1], s[\s_n],      v[\v_tmp4+1]
    v_cmp_le_u32      s[\s_tmp4+2:\s_tmp4+3],  s[\s_d],    v[\v_tmp4+2]
    v_add_co_u32      v[\v_tmp4+2],   vcc, 1, v[\v_tmp4+0]
    s_and_b64         s[\s_tmp4+2:\s_tmp4+3], s[\s_tmp4:\s_tmp4+1], s[\s_tmp4+2:\s_tmp4+3]
    v_add_co_u32      v[\v_tmp4+1],   vcc, -1,    v[\v_tmp4+0]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+0],   v[\v_tmp4+2],      s[\s_tmp4+2:\s_tmp4+3]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+1],   v[\v_tmp4+2],      s[\s_tmp4:\s_tmp4+1]
    v_cmp_ne_i32      vcc,          s[\s_d],   0
    v_cndmask_b32     v[\v_q],      -1,         v[\v_tmp4+2],      vcc
.endm

.macro .v_u32_div_rem v_r, v_q, v_n, v_d, v_tmp4, s_tmp4
    .v_u32_div \v_q, \v_n, \v_d, \v_tmp4, \s_tmp4
    v_mul_lo_u32 v[\v_tmp4], v[\v_d], v[\v_q]
    v_sub_u32 v[\v_r], v[\v_n], v[\v_tmp4]
.endm

.macro .v_u32_div_rem_vs v_r, v_q, v_n, s_d, v_tmp4, s_tmp4
    .v_u32_div_vs \v_q, \v_n, \s_d, \v_tmp4, \s_tmp4
    v_mul_lo_u32 v[\v_tmp4], s[\s_d], v[\v_q]
    v_sub_u32 v[\v_r], v[\v_n], v[\v_tmp4]
.endm

.macro .v_u32_div_rem_ss s_r, s_q, s_n, s_d, v_q, v_tmp4, s_tmp4
    .v_u32_div_ss \v_q, \s_n, \s_d, \v_tmp4, \s_tmp4
    v_readfirstlane_b32 s[\s_q], v[\v_q]
    s_mul_i32 s[\s_tmp4], s[\s_d], s[\s_q]
    s_sub_i32 s[\s_r], s[\s_n], s[\s_tmp4]
.endm

.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
    s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
    s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
    s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
    .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
    s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
    s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
    v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
    v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
    v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
    .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
    v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
    v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

.macro .v_clear_acc_c a, num
    _a = \a
    .rept \num
        v_accvgpr_write_b32 a[_a], 0
        _a = _a + 1
    .endr
.endm

.macro .v_clear_nc vid, num
    _v = \vid
    .rept \num
        v_mov_b32 v[_v], 0
        _v = _v + 1
    .endr
.endm

;----------------------------------------------------------
; starting of kernel igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32
; tensor_layout              : 'nchw'
; gemm_m_per_block           : 64
; gemm_n_per_block           : 32
; gemm_k_per_block           : 16
; wave_tile_m                : 32
; wave_step_m                : 1
; wave_repeat_m              : 1
; wave_tile_n                : 8
; wave_step_n                : 2
; wave_repeat_n              : 1
; wave_tile_k                : 4
; tensor_a_thread_lengths    : [1, 2, 1, 2]
; tensor_a_cluster_lengths   : [1, 8, 1, 32]
; tensor_b_thread_lengths    : [1, 2, 1, 1]
; tensor_b_cluster_lengths   : [1, 8, 1, 32]
; direction                  : 'wrw'
; precision                  : 'fp16'
; nxb                        : 4
; nxe                        : 0
; 
; block_size                 : 256
; lds_total                  : 8192
; 
.set k_p_in, 0
.set k_p_wei, 8
.set k_p_out, 16
.set k_hi, 24
.set k_wi, 28
.set k_n, 32
.set k_k, 36
.set k_c, 40
.set k_ho, 44
.set k_wo, 48
.set k_stride_h, 52
.set k_stride_w, 56
.set k_dilation_h, 60
.set k_dilation_w, 64
.set k_pad_h, 68
.set k_pad_w, 72
.set k_y, 76
.set k_x, 80
.set k_gemm_k_global_split, 84
.set k_group, 88
.set k_pack_0, 92
.set k_end, 96

.set s_ka, 0
.set s_bx, 2
.set s_p_in, 4
.set s_p_wei, 8
.set s_p_out, 12
.set s_hi, 16
.set s_wi, 17
.set s_n, 18
.set s_k, 19
.set s_c, 20
.set s_ho, 21
.set s_wo, 22
.set s_stride_h, 23
.set s_stride_w, 24
.set s_dilation_h, 25
.set s_dilation_w, 26
.set s_pad_h, 27
.set s_pad_w, 28
.set s_y, 29
.set s_x, 30
.set s_gemmk_split, 31
.set s_group, 32
.set s_ho_padded, 33
.set s_out_stride_k, 34
.set s_hoxwo, 31
.set s_out_stride_n, 35
.set s_in_stride_c, 36
.set s_in_stride_n, 37
.set s_wei_stride_c, 38
.set s_wei_stride_k, 39
.set s_out_stride_n_n1, 40
.set s_in_stride_n_n1, 41
.set s_move_slice_n_n1, 42
.set s_move_slice_n_dsho, 43
.set s_move_slice_n_dswo, 44
.set s_block_gtc_ik, 45
.set s_block_gtc_ic0, 46
.set s_block_gtc_ic1e, 47
.set s_block_gtc_in, 48
.set s_block_gtc_ig, 49
.set s_knum, 1
.set s_gemm_k_num_n1, 0
.set s_kitr, 3
.set s_in_offset, 50
.set s_out_offset, 50
.set s_sub_n, 50
.set s_k_padded, 51
.set s_tmp, 52
.set s_end, 58

.set v_c, 0  ; coalescing:8, needed:0, resuable:27
.set v_a, 0
.set v_b, 4
.set v_gld_a, 12
.set v_gld_b, 14
.set v_sst_a_os, 15
.set v_sst_b_os, 16
.set v_sld_a_os, 17
.set v_sld_b_os, 18
.set v_in_ihi, 19
.set v_in_iwi, 20
.set v_in_os, 21
.set v_in_os_base, 22
.set v_out_iho, 23
.set v_out_iwo, 24
.set v_out_os, 25
.set v_out_os_base, 26
.set v_co_sst, 27
.set v_co_sld, 28
.set v_wei_flag, 29
.set v_wei_os, 30
.set v_gtc_ik1, 31
.set v_move_slice_n_in1, 32
.set v_move_slice_n_idsho, 33
.set v_move_slice_n_idswo, 34
.set v_wei_iy, 35
.set v_wei_ix, 36
.set v_gtc_ic0, 37
.set v_gtc_ic1e, 38
.set v_gtc_ik0, 39
.set v_gtc_ic1, 40
.set v_gtc_in0, 41
.set v_out_flag_prev, 41
.set v_gtc_in1b, 42
.set v_gtc_in1, 43
.set v_gemm_in, 44
.set v_gemm_im, 45
.set v_wei_ic0, 46
.set v_wei_ic1e, 47
.set v_wei_ic1, 48
.set v_co_sub_m_index, 49
.set v_co_sub_n_index, 50
.set v_cur_k, 51
.set v_cur_c, 52
.set v_in_flag_with_wi, 53
.set v_tmp, 54
.set v_end, 60

.set a_c, 0
.set a_end, 8

.text
.globl igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32
.p2align 8
.type igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32,@function
igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32:
    ; unmerge_sub_n:4, unmerge_sub_n1:4, unmerge_sub_c:32, unmerge_sub_c1:32
    ; gemm_m_unmerge_cluster:0, gemm_n_unmerge_cluster:0, gemm_k_unmerge_cluster:0
    s_load_dwordx2  s[s_p_in+0:s_p_in+1],       s[s_ka+0:s_ka+1],    0+k_p_in
    s_load_dwordx2  s[s_p_wei+0:s_p_wei+1],      s[s_ka+0:s_ka+1],    0+k_p_wei
    s_load_dwordx2  s[s_p_out+0:s_p_out+1],      s[s_ka+0:s_ka+1],    0+k_p_out
    s_load_dwordx16 s[s_hi+0:s_hi+15],        s[s_ka+0:s_ka+1],    0+k_hi
    s_load_dwordx2  s[s_group+0:s_group+1],      s[s_ka+0:s_ka+1],    0+k_group

    ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_gtc_in1b], 7, v[v_tmp]
    v_lshlrev_b32 v[v_gtc_in1b], 1, v[v_gtc_in1b]
    v_lshrrev_b32 v[v_tmp], 3, v[v_tmp]
    v_mov_b32 v[v_gtc_in0], 0
    v_and_b32 v[v_gtc_ic1e], 31, v[v_tmp]
    v_lshrrev_b32 v[v_tmp], 5, v[v_tmp]
    v_mov_b32 v[v_gtc_ic0], 0

    ; output, thread(n0,n1b,k0,k1): 1x2x1x2, cluster(n0,n1b,k0,k1) 1x8x1x32
    v_lshrrev_b32 v[v_tmp], 3, v0
    v_and_b32 v[v_gtc_ik1], 31, v[v_tmp]
    v_lshlrev_b32 v[v_gtc_ik1], 1, v[v_gtc_ik1]
    v_lshrrev_b32 v[v_tmp], 5, v[v_tmp]
    v_mov_b32 v[v_gtc_ik0], 0

    s_mov_b32 s[s_p_in+3], 0x27000
    s_mov_b32 s[s_p_wei+2], 0xffffffff
    s_mov_b32 s[s_p_wei+3], 0x27000
    s_mov_b32 s[s_p_out+3], 0x27000
    s_waitcnt lgkmcnt(0)

    ; calculate index
    s_mul_i32 s[s_in_stride_c],      s[s_hi],       s[s_wi]
    s_mul_i32 s[s_tmp],  s[s_in_stride_c], s[s_c]
    s_mul_i32 s[s_in_stride_n],      s[s_group],        s[s_tmp]
    s_mul_i32 s[s_wei_stride_c],       s[s_y],       s[s_x]
    s_mul_i32 s[s_wei_stride_k],       s[s_c],        s[s_wei_stride_c]
    s_mul_i32 s[s_out_stride_k],      s[s_ho],        s[s_wo]
    s_mul_i32 s[s_tmp],  s[s_out_stride_k],  s[s_k]
    s_mul_i32 s[s_out_stride_n],      s[s_group],        s[s_tmp]
    ; config for weight range
    s_mul_i32 s[s_p_out+2], s[s_out_stride_n], s[s_n]
    s_lshl_b32 s[s_p_out+2], s[s_p_out+2], 1
    ; n1b transform
    .v_u32_div_rem_vs v_tmp+4, v_move_slice_n_in1, v_gtc_in1b, s_out_stride_k, v_tmp, s_tmp
    .v_u32_div_rem_vs v_move_slice_n_idswo, v_move_slice_n_idsho, v_tmp+4, s_wo, v_tmp, s_tmp

    ; pad gemm_m if needed
    s_add_u32 s[s_tmp], 63, s[s_k]
    s_lshr_b32 s[s_tmp], s[s_tmp], 6
    s_lshl_b32 s[s_k_padded], s[s_tmp], 6

    ; add block i_n
    ; gemm_m_per_block:64, gemm_n_per_block:32
    s_lshr_b32 s[0], s[s_wei_stride_k], 5

    s_lshr_b32 s[s_tmp], s[s_k], 6
    s_mul_i32 s[1], s[0], s[s_tmp]
    s_lshl_b32 s[3], s[1], s[s_gemmk_split]
    .v_u32_div_rem_ss s_tmp+4, s_block_gtc_ig, s_bx, 3, v_tmp+5, v_tmp, s_tmp
    .v_u32_div_rem_ss s_bx, s_block_gtc_in, s_tmp+4, 1, v_tmp+5, v_tmp, s_tmp
    ; config input and output n offset
    s_lshr_b32 s[s_sub_n], s[s_n], s[s_gemmk_split]
    s_mul_i32 s[s_block_gtc_in], s[s_block_gtc_in], s[s_sub_n]
    .v_u32_div_rem_ss s_tmp+4, s_tmp+5, s_bx, 0, v_tmp+5, v_tmp, s_tmp
    ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im
    s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+5], 6
    s_mov_b32 s[0], s[s_wei_stride_c] ; total number of c1e
    .v_u32_div_rem_ss s_block_gtc_ic1e, s_block_gtc_ic0, s_tmp+4, 0, v_tmp+5, v_tmp, s_tmp
    s_lshl_b32 s[s_block_gtc_ic1e], s[s_block_gtc_ic1e], 5

    ; c1e transform
    v_add_u32 v[v_tmp+5], s[s_block_gtc_ic1e], v[v_gtc_ic1e]
    .v_u32_div_rem_vs v_tmp+4, v_gtc_ic1, v_tmp+5, s_wei_stride_c, v_tmp, s_tmp
    .v_u32_div_rem_vs v_wei_ix, v_wei_iy, v_tmp+4, s_x, v_tmp, s_tmp

    v_mul_u32_u24 v[v_tmp], s[s_dilation_h], v[v_wei_iy]
    v_mul_u32_u24 v[v_tmp+1], s[s_dilation_w], v[v_wei_ix]
    v_sub_i32 v[v_wei_iy], v[v_tmp], s[s_pad_h]
    v_sub_i32 v[v_wei_ix], v[v_tmp+1], s[s_pad_w]
    ; ihi = iho * s_stride_h + iy * s_dilation_h - s_pad_h,   here make sure iy <- iy * s_dilation_h - s_pad_h before hand
    ; iwi = iwo * s_stride_w + ix * s_dilation_w - s_pad_w,   here make sure ix <- ix * s_dilation_w - s_pad_w before hand
    v_mul_lo_u32 v[v_tmp], s[s_stride_h], v[v_move_slice_n_idsho]
    v_add_i32 v[v_in_ihi], v[v_tmp], v[v_wei_iy]
    v_mul_lo_u32 v[v_tmp+1], s[s_stride_w], v[v_move_slice_n_idswo]
    v_add_i32 v[v_in_iwi], v[v_tmp+1], v[v_wei_ix]
    ; calculate input offset
    ; set input range
    s_mul_i32 s[s_p_in+2], s[s_in_stride_n], s[s_n]
    s_lshl_b32 s[s_p_in+2], s[s_p_in+2], 1
    s_mul_i32 s[s_tmp+5], s[s_c], s[s_in_stride_c]
    s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+5]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+5]
    s_sub_u32 s[s_p_in+2], s[s_p_in+2], s[s_tmp]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic0], 6
    s_mul_i32 s[s_tmp], s[s_in_stride_c], s[s_tmp+3]
    s_mul_hi_u32 s[s_tmp+1], s[s_in_stride_c], s[s_tmp+3]
    s_sub_u32 s[s_p_in+2], s[s_p_in+2], s[s_tmp]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]

    v_mov_b32 v[v_tmp], v[v_move_slice_n_in1]
    v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_in]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_n], v[v_tmp]
    v_mov_b32 v[v_tmp+1], v[v_gtc_ic1]
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_c], v[v_tmp+1]
    v_add_lshl_u32 v[v_in_os_base], v[v_tmp], v[v_tmp+1], 1
    ; from hi, wi, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], s[s_wi], v[v_in_ihi], v[v_in_iwi]
    v_lshl_add_u32 v[v_in_os], v[v_tmp], 1, v[v_in_os_base]

    s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1

    
    ; load input
    buffer_load_dword v[v_gld_b+0], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0

    ; calculate out offset
    ; set output range
    s_mul_i32 s[s_p_out+2], s[s_out_stride_n], s[s_n]
    s_lshl_b32 s[s_p_out+2], s[s_p_out+2], 1
    s_mul_i32 s[s_tmp+2], s[s_k], s[s_out_stride_k]
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
    s_sub_u32 s[s_p_out+2], s[s_p_out+2], s[s_tmp]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]

    v_mov_b32 v[v_tmp], v[v_gtc_ik1]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_k], v[v_tmp+5]
    v_mov_b32 v[v_tmp+1], v[v_move_slice_n_in1]
    v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_in]
    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_tmp+1]
    v_add_lshl_u32 v[v_out_os_base], v[v_tmp], v[v_tmp+1], 1
    ; from ho, wo, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], s[s_wo], v[v_move_slice_n_idsho], v[v_move_slice_n_idswo]
    v_lshl_add_u32 v[v_out_os], v[v_tmp], 1, v[v_out_os_base]

    s_lshl_b32 s[s_out_stride_n], s[s_out_stride_n], 1
    s_lshl_b32 s[s_out_stride_k], s[s_out_stride_k], 1

    
    ; load output
    buffer_load_dword v[v_gld_a+0], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0
    buffer_load_dword v[v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_stride_k] offen offset:0

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get source matrix gemm index
    v_and_b32 v[v_gemm_in], 3, v[v_tmp+5]           ; block_n index 
    v_and_b32 v[v_gemm_im], 3, v[v_tmp+5]           ; block_m index 
    v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5]
    v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5]          ; block_n_per_wave index
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 1], 7, v[v_tmp+5]          ; block_m_per_wave index
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im]
    v_lshrrev_b32 v[v_tmp+5], 3, v[v_tmp+5]
    v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5]  ; waves_per_n index
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 4, v[v_gemm_in]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5]  ; waves_per_m index
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im]

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get dst matrix gemm index
    v_and_b32 v[v_tmp+0], 3, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5]
    v_and_b32 v[v_tmp+2], 1, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp+3], 7, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 3, v[v_tmp+5]
    v_mov_b32 v[v_co_sst], v[v_tmp+0]
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 2, v[v_co_sst]
    v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+3]
    v_and_b32 v[v_tmp+0], 1, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst]
    v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld]

    ; LDS store, in: n0,n1b,c0,c1e: 1x2x1x1, 1x8x1x32, order:4
    v_lshlrev_b32 v[v_tmp], 2, v[v_gtc_ic1e]
    v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_in1b]
    v_lshl_add_u32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp]
    v_and_b32 v[v_tmp+1], 3, v[v_gtc_in1b]
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+1]
    v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
    v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os]

    ; LDS store, out: n0,n1b,k0,k1: 1x2x1x2, 1x8x1x32, order:0
    v_lshlrev_b32 v[v_tmp], 2, v[v_gtc_ik1]
    v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_in1b]
    v_lshl_add_u32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp]
    v_and_b32 v[v_tmp+1], 3, v[v_gtc_in1b]
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+1]
    v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp]

    ; LDS load
    v_lshlrev_b32 v[v_sld_b_os], 3, v[v_gemm_in]
    v_lshlrev_b32 v[v_sld_a_os], 3, v[v_gemm_im]
    v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os]

    v_mov_b32 v[v_gemm_in], v[v_co_sst]
    v_mov_b32 v[v_gemm_im], v[v_co_sld]
    ; init_co_lds_offset for xdlops
    v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
    v_and_b32 v[v_tmp], 0, v[v_tmp]   ; thread id of lanegroup_m_per_cluster
    v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
    v_lshrrev_b32 v[v_tmp+1], 2, v[v_gemm_im]
    v_and_b32 v[v_tmp+1], 7  , v[v_tmp+1]   ; thread id of block_m_per_lanegroup
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+1], 2, v[v_co_sst]
    v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im]  ; thread id of waves_per_m
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst]
    v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst]
    v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in]   ; implicit transpose with m granularity:4 while store
    v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1]
    v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst]
    v_lshlrev_b32 v[v_co_sld], 3, v[0]
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28]
    ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:8, n_mv:2
    ; nd_stride:[1, 8, 1, 1, 1, 2, 1]
    v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0]   ; get tid along m
    v_and_b32 v[v_tmp+0], 7, v[v_co_sub_m_index]                   ; => x_ml
    v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0]      ; => accumulate x_ml
    v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index]
    ; init_co_sub_n_index xdlops
    v_and_b32 v[v_co_sub_n_index], 31, v[0]

    ; weight offset
    s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k]
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
    s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
    s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]
    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic0], 6
    s_mul_i32 s[s_tmp], s[s_wei_stride_c], s[s_tmp+3]
    s_mul_hi_u32 s[s_tmp+1], s[s_wei_stride_c], s[s_tmp+3]
    s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
    s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]

    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1
    s_mul_i32 s[s_tmp], s[s_wei_stride_k], s[s_tmp+3]
    s_mul_hi_u32 s[s_tmp+1], s[s_wei_stride_k], s[s_tmp+3]
    s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
    s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]

    ; compute v_co_sub_n_index along c0 x c1e : 1x32
    v_and_b32 v[v_wei_ic1e], 31, v[v_co_sub_n_index]     ; => C1E
    ;   compute from n1b
    v_add_u32 v[v_tmp+5], s[s_block_gtc_ic1e], v[v_wei_ic1e]
    .v_u32_div_rem_vs v_tmp+4, v_wei_ic1, v_tmp+5, s_wei_stride_c, v_tmp, s_tmp

    ; add wei_ic0, wei_ic1
    v_mul_lo_u32 v[v_wei_os], s[s_wei_stride_c], v[v_wei_ic1]
    ; add i_k
    v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_co_sub_m_index]
    v_add_u32 v[v_wei_os], v[v_wei_os], v[v_tmp]
    ; add y, x
    v_add_u32 v[v_wei_os], v[v_wei_os], v[v_tmp+4]
    v_lshlrev_b32 v[v_wei_os], 1, v[v_wei_os]
    ; move slice stride
    s_mul_i32 s[s_hoxwo], s[s_ho], s[s_wo]
    s_mov_b32 s[0], 16
    .v_u32_div_rem_ss s_tmp+4, s_move_slice_n_n1, 0, s_hoxwo, v_tmp+4, v_tmp, s_tmp
    .v_u32_div_rem_ss s_move_slice_n_dswo, s_move_slice_n_dsho, s_tmp+4, s_wo, v_tmp+4, v_tmp, s_tmp

    s_lshr_b32 s[s_tmp], s[s_out_stride_n], 1
    s_lshr_b32 s[s_tmp+1], s[s_in_stride_n], 1
    s_mul_i32 s[s_in_stride_n_n1], s[s_move_slice_n_n1], s[s_tmp+1]  ; might be 0 or larger
    s_mul_i32 s[s_out_stride_n_n1], s[s_move_slice_n_n1], s[s_tmp]  ; might be 0 or larger
    s_lshl_b32 s[s_in_stride_n_n1], s[s_in_stride_n_n1], 1
    s_lshl_b32 s[s_out_stride_n_n1], s[s_out_stride_n_n1], 1
    
    
    s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1
    s_mov_b32 s[s_gemm_k_num_n1], 4
    s_mul_i32 s[s_knum], s[s_hoxwo], s[s_sub_n]

    ; start MFMA loop, 32x8 wave tile with 1x1 repeat, 1x2 step
    s_waitcnt vmcnt(2)
    ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] 

    s_waitcnt vmcnt(0)
    ds_write2_b32 v[v_sst_a_os], v[v_gld_a], v[v_gld_a+1], offset0:0, offset1:2

    .v_clear_acc_c a_c, 8
    ; make sure acc WAR harzard, at least 1 nop for src_c
    s_sub_i32 s[s_kitr], s[s_knum], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_end

    v_xor_b32 v[v_sst_b_os], 0x1000, v[v_sst_b_os] ; switch double buffer b store
    v_xor_b32 v[v_sst_a_os], 0x1000, v[v_sst_a_os] ; switch double buffer a store
    v_add_u32 v[v_move_slice_n_idswo], s[s_move_slice_n_dswo], v[v_move_slice_n_idswo]
    v_cmpx_le_u32 vcc, s[s_wo], v[v_move_slice_n_idswo]
    v_subrev_u32 v[v_move_slice_n_idswo], s[s_wo], v[v_move_slice_n_idswo]
    v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho]
    s_mov_b64 exec, -1

    v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho]
    v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho]
    v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho]
    v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base]
    v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base]
    s_mov_b64 exec, -1

    v_add_u32 v[v_in_os_base], s[s_in_stride_n_n1], v[v_in_os_base]
    v_add_u32 v[v_out_os_base], s[s_out_stride_n_n1], v[v_out_os_base]
    ; from hi, wi, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], s[s_wi], v[v_move_slice_n_idsho], v[v_move_slice_n_idswo]
    v_lshl_add_u32 v[v_in_os], v[v_tmp], 1, v[v_in_os_base]
    ; from ho, wo, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], s[s_wo], v[v_move_slice_n_idsho], v[v_move_slice_n_idswo]
    v_lshl_add_u32 v[v_out_os], v[v_tmp], 1, v[v_out_os_base]
L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_body:
    ; load input
    buffer_load_dword v[v_gld_b+0], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0
    ; load output
    buffer_load_dword v[v_gld_a+0], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0
    buffer_load_dword v[v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_stride_k] offen offset:0
    ; do fma accumulate with unroll 16
    s_waitcnt lgkmcnt(0)
    s_barrier

    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:8
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:32, offset1:40
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3]     ; repeat:0x0, step:0x0, num_a_c:4
    v_add_u32 v[v_move_slice_n_idswo], s[s_move_slice_n_dswo], v[v_move_slice_n_idswo]
    v_cmpx_le_u32 vcc, s[s_wo], v[v_move_slice_n_idswo]
    v_subrev_u32 v[v_move_slice_n_idswo], s[s_wo], v[v_move_slice_n_idswo]
    v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho]
    s_mov_b64 exec, -1
    v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7]     ; repeat:0x0, step:0x1, num_a_c:4
    v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho]
    v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho]
    v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho]
    v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base]
    v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base]
    s_mov_b64 exec, -1
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:64, offset1:72
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+3]     ; repeat:0x0, step:0x0, num_a_c:4
    v_add_u32 v[v_in_os_base], s[s_in_stride_n_n1], v[v_in_os_base]
    v_add_u32 v[v_out_os_base], s[s_out_stride_n_n1], v[v_out_os_base]
    v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+4:a_c+7]     ; repeat:0x0, step:0x1, num_a_c:4
    v_mad_u32_u24 v[v_tmp], s[s_wi], v[v_move_slice_n_idsho], v[v_move_slice_n_idswo]
    v_lshl_add_u32 v[v_in_os], v[v_tmp], 1, v[v_in_os_base]
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:96, offset1:104
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3]     ; repeat:0x0, step:0x0, num_a_c:4
    v_mad_u32_u24 v[v_tmp], s[s_wo], v[v_move_slice_n_idsho], v[v_move_slice_n_idswo]
    v_lshl_add_u32 v[v_out_os], v[v_tmp], 1, v[v_out_os_base]
    v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7]     ; repeat:0x0, step:0x1, num_a_c:4
    s_waitcnt lgkmcnt(0)
    v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+3]     ; repeat:0x0, step:0x0, num_a_c:4
    v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+4:a_c+7]     ; repeat:0x0, step:0x1, num_a_c:4
    ; k iteration : 8
    s_waitcnt lgkmcnt(0)
    s_waitcnt vmcnt(2)
    ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] 
    s_waitcnt vmcnt(0)
    ds_write2_b32 v[v_sst_a_os], v[v_gld_a], v[v_gld_a+1], offset0:0, offset1:2
    
    v_xor_b32 v[v_sld_b_os], 4096, v[v_sld_b_os] ; switch double buffer b load
    v_xor_b32 v[v_sld_a_os], 4096, v[v_sld_a_os] ; switch double buffer a load
    s_sub_i32 s[s_kitr], s[s_kitr], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_finishing
    v_xor_b32 v[v_sst_b_os], 4096, v[v_sst_b_os] ; switch double buffer b store
    v_xor_b32 v[v_sst_a_os], 4096, v[v_sst_a_os] ; switch double buffer a store
    s_branch L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_body

L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_finishing:
L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_end:
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:8
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:32, offset1:40
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3]     ; repeat:0x0, step:0x0, num_a_c:4
    v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7]     ; repeat:0x0, step:0x1, num_a_c:4
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:64, offset1:72
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+3]     ; repeat:0x0, step:0x0, num_a_c:4
    v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+4:a_c+7]     ; repeat:0x0, step:0x1, num_a_c:4
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:96, offset1:104
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3]     ; repeat:0x0, step:0x0, num_a_c:4
    v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7]     ; repeat:0x0, step:0x1, num_a_c:4
    s_waitcnt lgkmcnt(0)
    v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+3]     ; repeat:0x0, step:0x0, num_a_c:4
    v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+4:a_c+7]     ; repeat:0x0, step:0x1, num_a_c:4
    s_nop 3
    ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:32, wt_n:8, ws:4, r_m:1, r_n:1, s_m:1, s_n:2 | 4x4x4, lanegroup_m_tcbw:4x1x1x1, lanegroup_n_tcbw:1x4x1x1
    ; coalescing_groups:1, num_dword_per_group:8
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28]
    ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:8, n_mv:2
    ; nd_stride:[1, 8, 1, 1, 1, 2, 1]
    ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+1]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+2]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+3]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1]
    v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3]
    ds_write_b64 v[v_co_sst], v[v_c:v_c+1]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+4]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+5]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+6]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+7]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5]
    v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7]
    ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:64   ; idword:8(0,8),  0x8 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] 
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048
    ;   store to global, m index start from 0, m0:0, m1:0
    s_mov_b32 s[s_tmp], 0   ; i_m:0(i_m0:0,i_m1:0)
    v_add_u32 v[v_cur_k], s[s_block_gtc_ik], v[v_co_sub_m_index]
    v_mov_b32 v[v_tmp], v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b32 s[s_tmp], s[s_wei_stride_k]   ; i_m:1(i_m0:0,i_m1:1)
    v_add_u32 v[v_tmp], 1, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 2, s[s_wei_stride_k]   ; i_m:2(i_m0:0,i_m1:2)
    v_add_u32 v[v_tmp], 2, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 3, s[s_wei_stride_k]   ; i_m:3(i_m0:0,i_m1:3)
    v_add_u32 v[v_tmp], 3, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 32, s[s_wei_stride_k]   ; i_m:32(i_m0:0,i_m1:32)
    v_add_u32 v[v_tmp], 32, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 33, s[s_wei_stride_k]   ; i_m:33(i_m0:0,i_m1:33)
    v_add_u32 v[v_tmp], 33, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 34, s[s_wei_stride_k]   ; i_m:34(i_m0:0,i_m1:34)
    v_add_u32 v[v_tmp], 34, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 35, s[s_wei_stride_k]   ; i_m:35(i_m0:0,i_m1:35)
    v_add_u32 v[v_tmp], 35, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]

L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_out:
    s_endpgm
.rodata
.p2align 6
.amdhsa_kernel igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32
    .amdhsa_group_segment_fixed_size 8192
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_vgpr_workitem_id 0
    .amdhsa_next_free_vgpr 60
    .amdhsa_next_free_sgpr 58
    .amdhsa_ieee_mode 0
    .amdhsa_dx10_clamp 0
.end_amdhsa_kernel

.amdgpu_metadata
---
amdhsa.version: [ 1, 0 ]
amdhsa.kernels:
  - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32
    .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32.kd
    .sgpr_count: 64
    .vgpr_count: 60
    .kernarg_segment_align: 8
    .kernarg_segment_size: 96
    .group_segment_fixed_size: 8192
    .private_segment_fixed_size: 0
    .wavefront_size: 64
    .reqd_workgroup_size : [256, 1, 1]
    .max_flat_workgroup_size: 256
    .args:
    - { .name: p_in      , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
    - { .name: p_wei     , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_out     , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: hi        , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
    - { .name: wi        , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
    - { .name: n_         , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
    - { .name: k         , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
    - { .name: c         , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
    - { .name: ho        , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
    - { .name: wo        , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
    - { .name: stride_h  , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
    - { .name: stride_w  , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_h, .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_w, .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
    - { .name: pad_h     , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
    - { .name: pad_w     , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
    - { .name: y_         , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
    - { .name: x         , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
    - { .name: gemm_k_global_split, .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
    - { .name: group     , .size: 4, .offset:  88, .value_kind: by_value, .value_type: i32}
    - { .name: __pack_0  , .size: 4, .offset:  92, .value_kind: by_value, .value_type: i32}
...
.end_amdgpu_metadata
