/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
; generated by igemm_codegen.py (a3229bb2a2624f0dc8e4fbd118817745667e66ac)
;
.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
    s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
    s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
    s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
    .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
    s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
    s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
    v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
    v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
    v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
    .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
    v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
    v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

.macro .v_clear_acc_c a, num
    _a = \a
    .rept \num
        v_accvgpr_write_b32 a[_a], 0
        _a = _a + 1
    .endr
.endm

.macro .v_clear_nc vid, num
    _v = \vid
    .rept \num
        v_mov_b32 v[_v], 0
        _v = _v + 1
    .endr
.endm

;----------------------------------------------------------
; starting of kernel igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16
; tensor_layout              : nchw
; gemm_m_per_block           : 64
; gemm_n_per_block           : 256
; gemm_k_per_block           : 16
; wave_tile_m                : 16
; wave_step_m                : 1
; wave_repeat_m              : 2
; wave_tile_n                : 64
; wave_step_n                : 1
; wave_repeat_n              : 2
; wave_tile_k                : 4
; tensor_a_thread_lengths    : [1, 1, 4, 1]
; tensor_a_cluster_lengths   : [1, 16, 1, 16]
; tensor_b_thread_lengths    : [1, 1, 16, 1]
; tensor_b_cluster_lengths   : [1, 16, 1, 16]
; direction                  : bwd
; precision                  : fp16
; nxb                        : 1
; nxe                        : 1
; 
; block_size                 : 256
; lds_total                  : 16384
; 
.set k_p_in, 0
.set k_p_wei, 8
.set k_p_out, 16
.set k_hi, 24
.set k_wi, 28
.set k_n, 32
.set k_k, 36
.set k_c, 40
.set k_ho, 44
.set k_wo, 48
.set k_stride_h, 52
.set k_stride_w, 56
.set k_dilation_h, 60
.set k_dilation_w, 64
.set k_pad_h, 68
.set k_pad_w, 72
.set k_y, 76
.set k_x, 80
.set k_dtile_iy, 84
.set k_dtile_ix, 88
.set k_dtile_dy, 92
.set k_dtile_dx, 96
.set k_dtile_y, 100
.set k_dtile_x, 104
.set k_dtile_h, 108
.set k_dtile_w, 112
.set k_dslice_y, 116
.set k_dslice_x, 120
.set k_dslice_h, 124
.set k_dslice_w, 128
.set k_dslice_h_left, 132
.set k_dslice_w_left, 136
.set k_group, 140
.set k_magic_0, 144
.set k_magic_1, 148
.set k_magic_2, 152
.set k_magic_3, 156
.set k_magic_4, 160
.set k_magic_5, 164
.set k_magic_6, 168
.set k_shift_pack_0, 172
.set k_shift_pack_1, 176
.set k__pack_0, 180
.set k_end, 184

.set s_ka, 0
.set s_bx, 2
.set s_p_in, 4
.set s_p_wei, 8
.set s_p_out, 12
.set s_hi, 16
.set s_wi, 17
.set s_n, 18
.set s_k, 19
.set s_c, 20
.set s_ho, 21
.set s_wo, 22
.set s_stride_h, 23
.set s_stride_w, 24
.set s_dilation_h, 25
.set s_dilation_w, 26
.set s_pad_h, 27
.set s_pad_w, 28
.set s_y, 29
.set s_x, 30
.set s_dtile_iy, 31
.set s_dtile_ix, 32
.set s_dtile_dy, 33
.set s_dtile_dx, 34
.set s_dtile_y, 35
.set s_dtile_x, 36
.set s_dtile_h, 37
.set s_dtile_w, 38
.set s_dslice_y, 39
.set s_dslice_x, 40
.set s_dslice_h, 41
.set s_dslice_w, 42
.set s_dslice_h_left, 43
.set s_dslice_w_left, 44
.set s_group, 45
.set s_out_stride_k, 37
.set s_out_stride_k0, 46
.set s_out_stride_n, 38
.set s_out_stride_n0, 47
.set s_in_stride_c, 48
.set s_in_stride_n, 45
.set s_wei_stride_c, 49
.set s_wei_stride_c0, 50
.set s_wei_stride_k, 51
.set s_wei_stride_k0, 52
.set s_stride_dslice_hw, 41
.set s_stride_dslice_yx, 29
.set s_dslice_dim_b, 41
.set s_out_stride_k_k1, 23
.set s_wei_stride_k_k1, 25
.set s_out_stride_k_k0_k1_diff, 24
.set s_wei_stride_k_k0_k1_diff, 26
.set s_move_slice_k_k1, 27
.set s_move_slice_k_dsy, 43
.set s_move_slice_k_dsx, 44
.set s_block_gtc_ig, 53
.set s_block_gtc_ic, 54
.set s_block_gtc_in0, 55
.set s_block_gtc_in1b, 56
.set s_knum, 1
.set s_gemm_k_num_k1, 2
.set s_gemm_k_num_dsy, 39
.set s_gemm_k_num_dsx, 40
.set s_dtile_dy_neg, 33
.set s_dtile_dx_neg, 34
.set s_kitr, 3
.set s_out_offset, 57
.set s_wei_offset, 71
.set s_magic_0, 54
.set s_magic_1, 55
.set s_magic_2, 14
.set s_magic_3, 15
.set s_magic_4, 3
.set s_magic_5, 10
.set s_magic_6, 11
.set s_shift_pack_0, 6
.set s_shift_pack_1, 7
.set s_tmp, 74
.set s_end, 80

.set v_c, 0  ; coalescing:32, needed:2, resuable:44
.set v_a, 2
.set v_b, 10
.set v_gld_a, 18
.set v_gld_b, 22
.set v_sst_a_os, 38
.set v_sst_b_os, 39
.set v_sld_a_os, 40
.set v_sld_b_os, 41
.set v_out_iho, 42
.set v_out_iwo, 43
.set v_out_dslice_ih, 44
.set v_out_dslice_iw, 45
.set v_out_os, 46
.set v_out_os_base, 47
.set v_wei_iy, 48
.set v_wei_ix, 49
.set v_dtile_iy, 50
.set v_dtile_ix, 51
.set v_wei_os, 52
.set v_wei_os_base, 53
.set v_out_flag, 54
.set v_co_sst, 55
.set v_co_sld, 56
.set v_in_flag, 57
.set v_in_os, 58
.set v_gtc_ik1, 59
.set v_gtc_dslice_iy, 60
.set v_gtc_dslice_ix, 61
.set v_move_slice_k_ik1, 59
.set v_move_slice_k_idsy, 60
.set v_move_slice_k_idsx, 61
.set v_gtc_ic0, 62
.set v_gtc_ic1, 63
.set v_gtc_ik0, 64
.set v_gtc_ik1e, 65
.set v_gtc_in0, 66
.set v_gtc_in1b, 67
.set v_gtc_in1, 68
.set v_gemm_in, 69
.set v_gemm_im, 70
.set v_in_in0, 71
.set v_in_in1b, 72
.set v_in_in1, 73
.set v_in_ihi, 74
.set v_in_iwi, 75
.set v_in_dslice_ih, 76
.set v_in_dslice_iw, 77
.set v_co_sub_m_index, 78
.set v_co_sub_n_index, 79
.set v_tmp, 80
.set v_end, 86

.set a_c, 0
.set a_end, 64

.text
.globl igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16
.p2align 8
.type igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16,@function
igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16:
    ; unmerge_sub_k:16, unmerge_sub_k1:16, unmerge_sub_n:256, unmerge_sub_n1:16
    ; gemm_m_unmerge_cluster:0, gemm_n_unmerge_cluster:0, gemm_k_unmerge_cluster:0
    s_load_dwordx2  s[s_p_in+0:s_p_in+1],       s[s_ka+0:s_ka+1],    0+k_p_in
    s_load_dwordx2  s[s_p_wei+0:s_p_wei+1],      s[s_ka+0:s_ka+1],    0+k_p_wei
    s_load_dwordx2  s[s_p_out+0:s_p_out+1],      s[s_ka+0:s_ka+1],    0+k_p_out
    s_load_dwordx16 s[s_hi+0:s_hi+15],        s[s_ka+0:s_ka+1],    0+k_hi
    s_load_dwordx8  s[s_dtile_ix+0:s_dtile_ix+7],   s[s_ka+0:s_ka+1],    0+k_dtile_ix
    s_load_dwordx4  s[s_dslice_x+0:s_dslice_x+3],   s[s_ka+0:s_ka+1],    0+k_dslice_x
    s_load_dwordx2  s[s_dslice_w_left+0:s_dslice_w_left+1],   s[s_ka+0:s_ka+1],    0+k_dslice_w_left
    s_load_dwordx2 s[s_magic_0+0:s_magic_0+1],   s[s_ka+0:s_ka+1],    0+k_magic_0
    s_load_dwordx2 s[s_magic_2+0:s_magic_2+1],   s[s_ka+0:s_ka+1],    0+k_magic_2
    s_load_dword   s[s_magic_4],   s[s_ka+0:s_ka+1],    0+k_magic_4
    s_load_dwordx2 s[s_magic_5+0:s_magic_5+1],   s[s_ka+0:s_ka+1],    0+k_magic_5
    s_load_dwordx2 s[s_shift_pack_0+0:s_shift_pack_0+1],   s[s_ka+0:s_ka+1],    0+k_shift_pack_0

    ; output, thread(k0,k1e,n0,n1b): 1x1x16x1, cluster(k0,k1e,n0,n1b): 1x16x1x16
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_gtc_in1b], 15, v[v_tmp]
    v_lshrrev_b32 v[v_tmp], 4, v[v_tmp]
    v_mov_b32 v[v_gtc_in0], 0
    v_and_b32 v[v_gtc_ik1e], 15, v[v_tmp]
    v_lshrrev_b32 v[v_tmp], 4, v[v_tmp]
    v_mov_b32 v[v_gtc_ik0], 0

    ; wei, thread(k0,k1e,c0,c1): 1x1x4x1, cluster(k0,k1e,c0,c1): 1x16x1x16
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_gtc_ic1], 15, v[v_tmp]
    v_lshrrev_b32 v[v_tmp], 4, v[v_tmp]
    v_mov_b32 v[v_gtc_ic0], 0

    s_waitcnt lgkmcnt(0)

    ; calculate index ...

    ; initialize the strides
    s_mul_i32 s[s_out_stride_k],      s[s_ho],       s[s_wo]
    s_mul_i32 s[s_tmp],      s[s_k],       s[s_out_stride_k]
    s_mul_i32 s[s_out_stride_n],      s[s_group],        s[s_tmp]
    s_mul_i32 s[s_in_stride_c],       s[s_hi],       s[s_wi]
    s_mul_i32 s[s_tmp],       s[s_c],        s[s_in_stride_c]
    s_mul_i32 s[s_in_stride_n],       s[s_group],        s[s_tmp]
    s_mul_i32 s[s_wei_stride_c],      s[s_y],        s[s_x]
    s_mul_i32 s[s_wei_stride_k],      s[s_c],        s[s_wei_stride_c]
    s_mul_i32 s[s_stride_dslice_hw],  s[s_dslice_h], s[s_dslice_w]
    s_mul_i32 s[s_stride_dslice_yx],  s[s_dslice_y], s[s_dslice_x]
    ; pad b into multiplier of nxb
    s_mov_b32 s[s_dslice_dim_b], s[s_stride_dslice_hw]
    s_lshl_b32 s[s_out_stride_n0], s[s_out_stride_n], 4
    s_lshl_b32 s[s_wei_stride_c0], s[s_wei_stride_c], 4
    s_mul_i32 s[s_dtile_dy_neg], -1, s[s_dtile_dy]
    s_mul_i32 s[s_dtile_dx_neg], -1, s[s_dtile_dx]

    ; k1e transform
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_gtc_ik1,v_gtc_ik1e,s_magic_0,s_tmp+3,s_stride_dslice_yx,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_gtc_dslice_ix,v_gtc_dslice_iy,v_tmp+4,s_magic_1,s_tmp+3,s_dslice_x,v_tmp

    ; gemm_m_per_block:64, gemm_n_per_block:256
    s_mul_i32 s[s_tmp], s[s_dslice_dim_b], s[s_n]
    s_mul_i32 s[s_tmp+1], s[s_tmp], s[s_c]
    s_lshr_b32 s[0], s[s_tmp+1], 14
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_2,s_tmp+3,0,s_tmp
    s_mov_b32 s[s_bx], s[s_tmp+4]
    s_mul_i32 s[s_tmp], s[s_dslice_dim_b], s[s_n]
    s_lshr_b32 s[0], s[s_tmp], 8
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
    .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_3,s_tmp+3,0,s_tmp
    s_mov_b64 s[0:1], s[s_magic_0+0:s_magic_0+1]
    ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im
    s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+5], 6
    s_mov_b32 s[s_tmp+5], s[s_dslice_dim_b] ; total number of n1b
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8
    .mdiv_u32_rem_ss s_block_gtc_in1b,s_block_gtc_in0,s_tmp+4,s_magic_4,s_tmp+3,s_tmp+5,s_tmp
    s_lshl_b32 s[s_block_gtc_in1b], s[s_block_gtc_in1b], 4
    s_lshl_b32 s[s_block_gtc_in0], s[s_block_gtc_in0], 4

    ; n1b transform
    v_add_u32 v[v_tmp+5], s[s_block_gtc_in1b], v[v_gtc_in1b]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_gtc_in1,v_tmp+5,s_magic_5,s_tmp+3,s_dslice_dim_b,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_out_dslice_iw,v_out_dslice_ih,v_tmp+4,s_magic_6,s_tmp+3,s_dslice_w,v_tmp

    ; iHTildaLeft, iWTildaLeft
    v_add_u32 v[v_out_dslice_ih], s[s_dslice_h_left], v[v_out_dslice_ih]
    v_add_u32 v[v_out_dslice_iw], s[s_dslice_w_left], v[v_out_dslice_iw]
    ; dslice_y,dslice_h -> oh, dslice_x,dslice_w -> ow
    v_mad_i32_i24 v[v_out_iho], s[s_dtile_dy_neg], v[v_gtc_dslice_iy], v[v_out_dslice_ih]
    v_mad_i32_i24 v[v_out_iwo], s[s_dtile_dx_neg], v[v_gtc_dslice_ix], v[v_out_dslice_iw]

    s_mov_b64 s[2:3], s[s_magic_5+0:s_magic_5+1]
    ; calculate output offset
    s_mul_i32 s[s_tmp+5], s[s_k], s[s_out_stride_k]
    s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+5]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+5]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]
    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_in0], 5
    s_mul_i32 s[s_tmp], s[s_out_stride_n], s[s_tmp+3]
    s_mul_hi_u32 s[s_tmp+1], s[s_out_stride_n], s[s_tmp+3]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]

    v_mov_b32 v[v_tmp], v[v_gtc_ik1]
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_k], v[v_tmp]
    v_mov_b32 v[v_tmp+1], v[v_gtc_in1]
    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_tmp+1]
    v_add_lshl_u32 v[v_out_os_base], v[v_tmp], v[v_tmp+1], 1
    ; from ho, wo, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], s[s_wo], v[v_out_iho], v[v_out_iwo]
    v_lshl_add_u32 v[v_out_os], v[v_tmp], 1, v[v_out_os_base]
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho]
    v_cndmask_b32 v[v_out_flag], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo]
    v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc

    s_lshl_b32 s[s_out_stride_n0], s[s_out_stride_n0], 1

    s_mul_i32 s[s_out_offset+0], 2, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+1], 3, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+2], 4, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+3], 5, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+4], 6, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+5], 7, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+6], 8, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+7], 9, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+8], 10, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+9], 11, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+10], 12, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+11], 13, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+12], 14, s[s_out_stride_n0]
    s_mul_i32 s[s_out_offset+13], 15, s[s_out_stride_n0]
    s_mov_b32 s[s_p_out+2], 0xffffffff
    s_mov_b32 s[s_p_out+3], 0x27000
    ; load output
    .v_clear_nc v_gld_b, 16
    v_cmp_eq_u32 vcc, 1, v[v_out_flag]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_load_short_d16 v[v_gld_b+0], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0
    buffer_load_short_d16 v[v_gld_b+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_stride_n0] offen offset:0
    buffer_load_short_d16 v[v_gld_b+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+0] offen offset:0
    buffer_load_short_d16 v[v_gld_b+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+1] offen offset:0
    buffer_load_short_d16 v[v_gld_b+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+2] offen offset:0
    buffer_load_short_d16 v[v_gld_b+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+3] offen offset:0
    buffer_load_short_d16 v[v_gld_b+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+4] offen offset:0
    buffer_load_short_d16 v[v_gld_b+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+5] offen offset:0
    buffer_load_short_d16 v[v_gld_b+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+6] offen offset:0
    buffer_load_short_d16 v[v_gld_b+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+7] offen offset:0
    buffer_load_short_d16 v[v_gld_b+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+8] offen offset:0
    buffer_load_short_d16 v[v_gld_b+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+9] offen offset:0
    buffer_load_short_d16 v[v_gld_b+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+10] offen offset:0
    buffer_load_short_d16 v[v_gld_b+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+11] offen offset:0
    buffer_load_short_d16 v[v_gld_b+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+12] offen offset:0
    buffer_load_short_d16 v[v_gld_b+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+13] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]

    ; calculate wei offset
    s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k]
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
    s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
    s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]
    v_mov_b32 v[v_dtile_iy], s[s_dtile_iy]
    v_mov_b32 v[v_dtile_ix], s[s_dtile_ix]
    ; calculate y, x in original wei from variance by slice window of gemm-k
    v_mad_u32_u24 v[v_wei_iy], s[s_dtile_y], v[v_gtc_dslice_iy], v[v_dtile_iy]
    v_mad_u32_u24 v[v_wei_ix], s[s_dtile_x], v[v_gtc_dslice_ix], v[v_dtile_ix]
    v_mov_b32 v[v_tmp], v[v_gtc_ic1]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_wei_stride_c], v[v_tmp+5]
    v_mov_b32 v[v_tmp+1], v[v_gtc_ik1]
    v_mul_lo_u32 v[v_tmp+1], s[s_wei_stride_k], v[v_tmp+1]
    v_add_lshl_u32 v[v_wei_os_base], v[v_tmp], v[v_tmp+1], 1
    ; from y, x, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], v[v_wei_iy], s[s_x], v[v_wei_ix]
    v_lshl_add_u32 v[v_wei_os], v[v_tmp], 1, v[v_wei_os_base]

    s_lshl_b32 s[s_wei_stride_c0], s[s_wei_stride_c0], 1

    s_mul_i32 s[s_wei_offset+0], 2, s[s_wei_stride_c0]
    s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_c0]
    s_mov_b32 s[s_p_wei+2], 0xffffffff
    s_mov_b32 s[s_p_wei+3], 0x27000
    ; load weight
    .v_clear_nc v_gld_a, 4
    buffer_load_short_d16 v[v_gld_a+0], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
    buffer_load_short_d16 v[v_gld_a+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_c0] offen offset:0
    buffer_load_short_d16 v[v_gld_a+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+0] offen offset:0
    buffer_load_short_d16 v[v_gld_a+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get source matrix gemm index
    v_and_b32 v[v_gemm_in], 15, v[v_tmp+5]           ; block_n index 
    v_and_b32 v[v_gemm_im], 15, v[v_tmp+5]           ; block_m index 
    v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5]
    v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5]          ; block_n_per_wave index
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 4, v[v_gemm_in]
    v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5]
    v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5]  ; waves_per_n index
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5]  ; waves_per_m index
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 4, v[v_gemm_im]

    ; LDS store, out: k0,k1e,n0,n1b: 1x1x16x1, 1x16x1x16, order:4
    v_lshlrev_b32 v[v_tmp], 2, v[v_gtc_in1b]
    v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ik1e]
    v_lshl_add_u32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp]
    v_and_b32 v[v_tmp+1], 3, v[v_gtc_ik1e]
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+1]
    v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
    v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os]

    ; LDS store, wei: k0,k1e,c0,c1: 1x1x4x1, 1x16x1x16, order:0
    v_lshlrev_b32 v[v_tmp], 2, v[v_gtc_ic1]
    v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ik1e]
    v_lshl_add_u32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp]
    v_and_b32 v[v_tmp+1], 3, v[v_gtc_ik1e]
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+1]
    v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp]

    ; LDS load
    v_lshlrev_b32 v[v_sld_b_os], 3, v[v_gemm_in]
    v_lshlrev_b32 v[v_sld_a_os], 3, v[v_gemm_im]
    v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os]

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get dst matrix gemm index
    v_and_b32 v[v_tmp+0], 15, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5]
    v_and_b32 v[v_tmp+1], 3, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5]
    v_mov_b32 v[v_gemm_in], v[v_tmp+0]
    v_lshlrev_b32 v[v_gemm_im], 2, v[v_tmp+1]
    v_and_b32 v[v_tmp+0], 1, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp+0], 6, v[v_gemm_in]
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp+1], 4, v[v_gemm_im]

    ; init_co_lds_offset for xdlops
    v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
    v_and_b32 v[v_tmp], 3, v[v_tmp]   ; thread id of lanegroup_m_per_cluster
    v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
    v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im]  ; thread id of waves_per_m
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst]
    v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst]
    v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in]   ; implicit transpose with m granularity:4 while store
    v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1]
    v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst]
    v_lshlrev_b32 v[v_co_sld], 3, v[0]
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0]
    ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2
    ; nd_stride:[4, 1, 1, 1, 1, 2, 1]
    v_mov_b32 v[v_co_sub_m_index], 0
    ; init_co_sub_n_index xdlops
    v_and_b32 v[v_co_sub_n_index], 255, v[0]

    ; input offset
    s_mul_i32 s[s_tmp+2], s[s_c], s[s_in_stride_c]
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_in0], 5
    s_mul_i32 s[s_tmp], s[s_in_stride_n], s[s_tmp+3]
    s_mul_hi_u32 s[s_tmp+1], s[s_in_stride_n], s[s_tmp+3]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]

    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1
    s_mul_i32 s[s_tmp], s[s_in_stride_c], s[s_tmp+3]
    s_mul_hi_u32 s[s_tmp+1], s[s_in_stride_c], s[s_tmp+3]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]

    ; compute v_co_sub_n_index along n0 x n1b : 16x16
    v_and_b32 v[v_in_in1b], 15, v[v_co_sub_n_index]     ; => N1B
    v_lshrrev_b32 v[v_in_in0], 4, v[v_co_sub_n_index]  ; => N0
    ;   compute from n1b
    v_add_u32 v[v_tmp+5], s[s_block_gtc_in1b], v[v_in_in1b]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_in_in1,v_tmp+5,2,s_tmp+3,s_dslice_dim_b,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_in_dslice_iw,v_in_dslice_ih,v_tmp+4,3,s_tmp+3,s_dslice_w,v_tmp

    v_add_u32 v[v_in_dslice_ih], s[s_dslice_h_left], v[v_in_dslice_ih]
    v_add_u32 v[v_in_dslice_iw], s[s_dslice_w_left], v[v_in_dslice_iw]

    ; dslice_h,dslice_y -> hip,  dslice_w,dslicw_x -> wip
    s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_dilation_h]
    v_mul_lo_u32 v[v_tmp], s[s_stride_h], v[v_in_dslice_ih]
    v_add_u32 v[v_tmp], s[s_tmp], v[v_tmp]
    s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w]
    v_mul_lo_u32 v[v_tmp+1], s[s_stride_w], v[v_in_dslice_iw]
    v_add_u32 v[v_tmp+1], s[s_tmp+1], v[v_tmp+1]
    ; v_tmp: hip, v_tmp+1: wip

    ; hip->h, wip->w
    v_sub_i32 v[v_in_ihi], v[v_tmp], s[s_pad_h]
    v_sub_i32 v[v_in_iwi], v[v_tmp+1], s[s_pad_w]

    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc

    ; add in_in0, in_in1
    v_lshl_or_b32 v[v_tmp+1], v[v_in_in0], 4, v[v_in_in1]
    v_mul_lo_u32 v[v_in_os], s[s_in_stride_n], v[v_tmp+1]
    ; add i_c
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_c], v[v_co_sub_m_index]
    v_add_u32 v[v_in_os], v[v_in_os], v[v_tmp]
    ; add hi, wi
    v_mul_lo_u32 v[v_tmp+1], s[s_wi], v[v_in_ihi]
    v_add3_u32 v[v_in_os], v[v_in_os], v[v_tmp+1], v[v_in_iwi]
    v_lshlrev_b32 v[v_in_os], 1, v[v_in_os]

    ; move slice stride
    s_mov_b32 s[s_tmp+5], 16
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8
    .mdiv_u32_rem_ss s_tmp+4,s_move_slice_k_k1,s_tmp+5,0,s_tmp+3,s_stride_dslice_yx,s_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_ss s_move_slice_k_dsx,s_move_slice_k_dsy,s_tmp+4,1,s_tmp+3,s_dslice_x,s_tmp

    s_mov_b32 s[s_p_in+2], 0xffffffff
    s_mov_b32 s[s_p_in+3], 0x27000
    s_mul_i32 s[s_out_stride_k_k0_k1_diff], 0, s[s_out_stride_k]
    s_mul_i32 s[s_wei_stride_k_k0_k1_diff], 0, s[s_wei_stride_k]
    s_mul_i32 s[s_out_stride_k_k1], s[s_move_slice_k_k1], s[s_out_stride_k]  ; might be 0 or larger
    s_mul_i32 s[s_wei_stride_k_k1], s[s_move_slice_k_k1], s[s_wei_stride_k]  ; might be 0 or larger
    s_lshl_b32 s[s_out_stride_k_k1], s[s_out_stride_k_k1], 1
    s_lshl_b32 s[s_wei_stride_k_k1], s[s_wei_stride_k_k1], 1
    s_lshl_b32 s[s_out_stride_k_k0_k1_diff], s[s_out_stride_k_k0_k1_diff], 1
    s_lshl_b32 s[s_wei_stride_k_k0_k1_diff], s[s_wei_stride_k_k0_k1_diff], 1
    s_lshl_b32 s[s_out_stride_k], s[s_out_stride_k], 1
    s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1
    s_lshl_b32 s[s_in_stride_c], s[s_in_stride_c], 1
    s_mov_b32 s[s_gemm_k_num_k1], 16
    s_mul_i32 s[s_knum], s[s_stride_dslice_yx], s[s_k]

    ; start MFMA loop, 16x64 wave tile with 2x2 repeat, 1x1 step
    s_waitcnt vmcnt(4)
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] 
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:256
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+4] offset:512
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+5] offset:640
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+6] offset:768
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+7] offset:896
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+8] offset:1024
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+9] offset:1152
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+10] offset:1280
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+11] offset:1408
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+12] offset:1536
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+13] offset:1664
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+14] offset:1792
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+15] offset:1920

    s_waitcnt vmcnt(0)
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] 
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384

    .v_clear_acc_c a_c, 64
    ; make sure acc WAR harzard, at least 1 nop for src_c
    s_sub_i32 s[s_kitr], s[s_knum], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16_mfma_end

    ; move slice window by unroll-k along gemm-k
    v_add_u32 v[v_move_slice_k_idsx], s[s_move_slice_k_dsx], v[v_move_slice_k_idsx]
    v_cmpx_le_u32 vcc, s[s_gemm_k_num_dsx], v[v_move_slice_k_idsx]
    v_subrev_u32 v[v_move_slice_k_idsx], s[s_gemm_k_num_dsx], v[v_move_slice_k_idsx]
    v_add_u32 v[v_move_slice_k_idsy], 1, v[v_move_slice_k_idsy]
    s_mov_b64 exec, -1

    v_add_u32 v[v_move_slice_k_idsy], s[s_move_slice_k_dsy], v[v_move_slice_k_idsy]
    v_cmpx_le_u32 vcc, s[s_gemm_k_num_dsy], v[v_move_slice_k_idsy]
    v_subrev_u32 v[v_move_slice_k_idsy], s[s_gemm_k_num_dsy], v[v_move_slice_k_idsy]
    v_add_u32 v[v_move_slice_k_ik1], 1, v[v_move_slice_k_ik1]
    v_add_u32 v[v_out_os_base], s[s_out_stride_k], v[v_out_os_base]
    v_add_u32 v[v_wei_os_base], s[s_wei_stride_k], v[v_wei_os_base]
    s_mov_b64 exec, -1

    v_add_u32 v[v_move_slice_k_ik1], s[s_move_slice_k_k1], v[v_move_slice_k_ik1]
    v_add_u32 v[v_out_os_base], s[s_out_stride_k_k1], v[v_out_os_base]
    v_add_u32 v[v_wei_os_base], s[s_wei_stride_k_k1], v[v_wei_os_base]
    v_cmpx_le_u32 vcc, s[s_gemm_k_num_k1], v[v_move_slice_k_ik1]
    v_subrev_u32 v[v_move_slice_k_ik1], s[s_gemm_k_num_k1], v[v_move_slice_k_ik1]
    v_add_u32 v[v_out_os_base], s[s_out_stride_k_k0_k1_diff], v[v_out_os_base]
    v_add_u32 v[v_wei_os_base], s[s_wei_stride_k_k0_k1_diff], v[v_wei_os_base]
    s_mov_b64 exec, -1

    ; dslice_y,dslice_h -> oh, dslice_x,dslice_w -> ow
    v_mad_i32_i24 v[v_out_iho], s[s_dtile_dy_neg], v[v_move_slice_k_idsy], v[v_out_dslice_ih]
    v_mad_i32_i24 v[v_out_iwo], s[s_dtile_dx_neg], v[v_move_slice_k_idsx], v[v_out_dslice_iw]
    ; from ho, wo, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], s[s_wo], v[v_out_iho], v[v_out_iwo]
    v_lshl_add_u32 v[v_out_os], v[v_tmp], 1, v[v_out_os_base]
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho]
    v_cndmask_b32 v[v_out_flag], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo]
    v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc
    ; calculate y, x in original wei from variance by slice window of gemm-k
    v_mad_u32_u24 v[v_wei_iy], s[s_dtile_y], v[v_move_slice_k_idsy], v[v_dtile_iy]
    v_mad_u32_u24 v[v_wei_ix], s[s_dtile_x], v[v_move_slice_k_idsx], v[v_dtile_ix]
    ; from y, x, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], v[v_wei_iy], s[s_x], v[v_wei_ix]
    v_lshl_add_u32 v[v_wei_os], v[v_tmp], 1, v[v_wei_os_base]
    s_waitcnt lgkmcnt(0)
    s_barrier
L_igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16_mfma_body:
    ; do fma accumulate with unroll 16
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] 
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    .v_clear_nc v_gld_b, 16
    ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0
    ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0
    s_waitcnt lgkmcnt(3)
    v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    v_cmp_eq_u32 vcc, 1, v[v_out_flag]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_load_short_d16 v[v_gld_b+0], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0
    buffer_load_short_d16 v[v_gld_b+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_stride_n0] offen offset:0
    buffer_load_short_d16 v[v_gld_b+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+0] offen offset:0
    buffer_load_short_d16 v[v_gld_b+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+1] offen offset:0
    buffer_load_short_d16 v[v_gld_b+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+2] offen offset:0
    buffer_load_short_d16 v[v_gld_b+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+3] offen offset:0
    buffer_load_short_d16 v[v_gld_b+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+4] offen offset:0
    buffer_load_short_d16 v[v_gld_b+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+5] offen offset:0
    buffer_load_short_d16 v[v_gld_b+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+6] offen offset:0
    buffer_load_short_d16 v[v_gld_b+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+7] offen offset:0
    buffer_load_short_d16 v[v_gld_b+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+8] offen offset:0
    buffer_load_short_d16 v[v_gld_b+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+9] offen offset:0
    buffer_load_short_d16 v[v_gld_b+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+10] offen offset:0
    buffer_load_short_d16 v[v_gld_b+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+11] offen offset:0
    buffer_load_short_d16 v[v_gld_b+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+12] offen offset:0
    buffer_load_short_d16 v[v_gld_b+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+13] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1
    ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1
    s_waitcnt lgkmcnt(4)
    v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    .v_clear_nc v_gld_a, 4
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0
    v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    buffer_load_short_d16 v[v_gld_a+0], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    buffer_load_short_d16 v[v_gld_a+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_c0] offen offset:0
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    buffer_load_short_d16 v[v_gld_a+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+0] offen offset:0
    buffer_load_short_d16 v[v_gld_a+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0
    v_add_u32 v[v_move_slice_k_idsx], s[s_move_slice_k_dsx], v[v_move_slice_k_idsx]
    v_cmpx_le_u32 vcc, s[s_gemm_k_num_dsx], v[v_move_slice_k_idsx]
    v_subrev_u32 v[v_move_slice_k_idsx], s[s_gemm_k_num_dsx], v[v_move_slice_k_idsx]
    v_add_u32 v[v_move_slice_k_idsy], 1, v[v_move_slice_k_idsy]
    s_mov_b64 exec, -1
    v_add_u32 v[v_move_slice_k_idsy], s[s_move_slice_k_dsy], v[v_move_slice_k_idsy]
    v_cmpx_le_u32 vcc, s[s_gemm_k_num_dsy], v[v_move_slice_k_idsy]
    v_subrev_u32 v[v_move_slice_k_idsy], s[s_gemm_k_num_dsy], v[v_move_slice_k_idsy]
    v_add_u32 v[v_move_slice_k_ik1], 1, v[v_move_slice_k_ik1]
    v_add_u32 v[v_out_os_base], s[s_out_stride_k], v[v_out_os_base]
    v_add_u32 v[v_wei_os_base], s[s_wei_stride_k], v[v_wei_os_base]
    s_mov_b64 exec, -1
    ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    v_add_u32 v[v_move_slice_k_ik1], s[s_move_slice_k_k1], v[v_move_slice_k_ik1]
    v_add_u32 v[v_out_os_base], s[s_out_stride_k_k1], v[v_out_os_base]
    v_add_u32 v[v_wei_os_base], s[s_wei_stride_k_k1], v[v_wei_os_base]
    v_cmpx_le_u32 vcc, s[s_gemm_k_num_k1], v[v_move_slice_k_ik1]
    v_subrev_u32 v[v_move_slice_k_ik1], s[s_gemm_k_num_k1], v[v_move_slice_k_ik1]
    v_add_u32 v[v_out_os_base], s[s_out_stride_k_k0_k1_diff], v[v_out_os_base]
    v_add_u32 v[v_wei_os_base], s[s_wei_stride_k_k0_k1_diff], v[v_wei_os_base]
    s_mov_b64 exec, -1
    v_mad_i32_i24 v[v_out_iho], s[s_dtile_dy_neg], v[v_move_slice_k_idsy], v[v_out_dslice_ih]
    v_mad_i32_i24 v[v_out_iwo], s[s_dtile_dx_neg], v[v_move_slice_k_idsx], v[v_out_dslice_iw]
    ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0
    v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mad_u32_u24 v[v_tmp], s[s_wo], v[v_out_iho], v[v_out_iwo]
    v_lshl_add_u32 v[v_out_os], v[v_tmp], 1, v[v_out_os_base]
    v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho]
    v_cndmask_b32 v[v_out_flag], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo]
    v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc
    ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1
    ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1
    v_mad_u32_u24 v[v_wei_iy], s[s_dtile_y], v[v_move_slice_k_idsy], v[v_dtile_iy]
    v_mad_u32_u24 v[v_wei_ix], s[s_dtile_x], v[v_move_slice_k_idsx], v[v_dtile_ix]
    v_mad_u32_u24 v[v_tmp], v[v_wei_iy], s[s_x], v[v_wei_ix]
    v_lshl_add_u32 v[v_wei_os], v[v_tmp], 1, v[v_wei_os_base]
    s_waitcnt lgkmcnt(0)
    s_barrier
    s_waitcnt vmcnt(4)
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+0]
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:256
    v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+4] offset:512
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+5] offset:640
    v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+6] offset:768
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+7] offset:896
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+8] offset:1024
    v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+9] offset:1152
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+10] offset:1280
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+11] offset:1408
    v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+12] offset:1536
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+13] offset:1664
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+14] offset:1792
    v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+15] offset:1920
    s_waitcnt vmcnt(0)
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+0]
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128
    v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384
    s_sub_i32 s[s_kitr], s[s_kitr], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16_mfma_finishing
    v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    s_waitcnt lgkmcnt(0)
    s_barrier
    s_branch L_igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16_mfma_body
L_igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16_mfma_finishing:
    v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
L_igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16_mfma_end:
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] 
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256
    ; k iteration : 0
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0
    ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0

    s_waitcnt lgkmcnt(3)
    v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1
    ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1

    s_waitcnt lgkmcnt(4)
    v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0

    v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1

    ; k iteration : 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0

    v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1
    ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1

    ; k iteration : 14
    s_waitcnt lgkmcnt(6)
    v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16

    s_waitcnt lgkmcnt(4)
    v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16

    v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    ; k iteration : 15
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16

    s_waitcnt lgkmcnt(1)
    v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16

    s_waitcnt lgkmcnt(0)
    v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16

    v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16

    s_nop 9
    ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:16, wt_n:64, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4
    ; coalescing_groups:2, num_dword_per_group:32
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0]
    ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2
    ; nd_stride:[4, 1, 1, 1, 1, 2, 1]
    ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+1]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+2]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+3]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1]
    v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3]
    ds_write_b64 v[v_co_sst], v[v_c:v_c+1]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+4]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+5]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+6]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+7]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5]
    v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7]
    ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128   ; idword:16(0,16),  0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:1
    v_accvgpr_read_b32 v[v_c+8], a[a_c+8]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+9]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+10]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+11]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9]
    v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11]
    ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:256   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:2
    v_accvgpr_read_b32 v[v_c+12], a[a_c+12]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+13]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+14]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+15]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13]
    v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15]
    ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:384   ; idword:48(0,48),  0x48 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:3
    v_accvgpr_read_b32 v[v_c+16], a[a_c+16]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+17]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+18]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+19]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    v_pack_b32_f16 v[v_c+16], v[v_c+16], v[v_c+17]
    v_pack_b32_f16 v[v_c+17], v[v_c+18], v[v_c+19]
    ds_write_b64 v[v_co_sst], v[v_c+16:v_c+16+1] offset:1024   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+20]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+21]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+22]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+23]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    v_pack_b32_f16 v[v_c+20], v[v_c+20], v[v_c+21]
    v_pack_b32_f16 v[v_c+21], v[v_c+22], v[v_c+23]
    ds_write_b64 v[v_co_sst], v[v_c+20:v_c+20+1] offset:1152   ; idword:144(0,144),  0x144 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:1
    v_accvgpr_read_b32 v[v_c+24], a[a_c+24]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+25]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+26]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+27]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    v_pack_b32_f16 v[v_c+24], v[v_c+24], v[v_c+25]
    v_pack_b32_f16 v[v_c+25], v[v_c+26], v[v_c+27]
    ds_write_b64 v[v_co_sst], v[v_c+24:v_c+24+1] offset:1280   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:2
    v_accvgpr_read_b32 v[v_c+28], a[a_c+28]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+29]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+30]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+31]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    v_pack_b32_f16 v[v_c+28], v[v_c+28], v[v_c+29]
    v_pack_b32_f16 v[v_c+29], v[v_c+30], v[v_c+31]
    ds_write_b64 v[v_co_sst], v[v_c+28:v_c+28+1] offset:1408   ; idword:176(0,176),  0x176 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:3
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] 
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144
    ds_read_b64 v[v_c+8:v_c+8+1], v[v_co_sld] offset:8192
    ds_read_b64 v[v_c+10:v_c+10+1], v[v_co_sld] offset:10240
    ds_read_b64 v[v_c+12:v_c+12+1], v[v_co_sld] offset:12288
    ds_read_b64 v[v_c+14:v_c+14+1], v[v_co_sld] offset:14336
    v_cmpx_eq_u32 vcc, 1, v[v_in_flag]
    ;   store to global, m index start from 0, m0:0, m1:0
    s_mov_b32 s[s_tmp], 0   ; i_m:0(i_m0:0,i_m1:0)
    s_waitcnt lgkmcnt(7)
    buffer_store_short v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b32 s[s_tmp], s[s_in_stride_c]   ; i_m:1(i_m0:0,i_m1:1)
    buffer_store_short_d16_hi v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 2, s[s_in_stride_c]   ; i_m:2(i_m0:0,i_m1:2)
    buffer_store_short v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 3, s[s_in_stride_c]   ; i_m:3(i_m0:0,i_m1:3)
    buffer_store_short_d16_hi v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 4, s[s_in_stride_c]   ; i_m:4(i_m0:0,i_m1:4)
    s_waitcnt lgkmcnt(6)
    buffer_store_short v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 5, s[s_in_stride_c]   ; i_m:5(i_m0:0,i_m1:5)
    buffer_store_short_d16_hi v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 6, s[s_in_stride_c]   ; i_m:6(i_m0:0,i_m1:6)
    buffer_store_short v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 7, s[s_in_stride_c]   ; i_m:7(i_m0:0,i_m1:7)
    buffer_store_short_d16_hi v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 8, s[s_in_stride_c]   ; i_m:8(i_m0:0,i_m1:8)
    s_waitcnt lgkmcnt(5)
    buffer_store_short v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 9, s[s_in_stride_c]   ; i_m:9(i_m0:0,i_m1:9)
    buffer_store_short_d16_hi v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 10, s[s_in_stride_c]   ; i_m:10(i_m0:0,i_m1:10)
    buffer_store_short v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 11, s[s_in_stride_c]   ; i_m:11(i_m0:0,i_m1:11)
    buffer_store_short_d16_hi v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 12, s[s_in_stride_c]   ; i_m:12(i_m0:0,i_m1:12)
    s_waitcnt lgkmcnt(4)
    buffer_store_short v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 13, s[s_in_stride_c]   ; i_m:13(i_m0:0,i_m1:13)
    buffer_store_short_d16_hi v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 14, s[s_in_stride_c]   ; i_m:14(i_m0:0,i_m1:14)
    buffer_store_short v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 15, s[s_in_stride_c]   ; i_m:15(i_m0:0,i_m1:15)
    buffer_store_short_d16_hi v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 16, s[s_in_stride_c]   ; i_m:16(i_m0:1,i_m1:0)
    s_waitcnt lgkmcnt(3)
    buffer_store_short v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 17, s[s_in_stride_c]   ; i_m:17(i_m0:1,i_m1:1)
    buffer_store_short_d16_hi v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 18, s[s_in_stride_c]   ; i_m:18(i_m0:1,i_m1:2)
    buffer_store_short v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 19, s[s_in_stride_c]   ; i_m:19(i_m0:1,i_m1:3)
    buffer_store_short_d16_hi v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 20, s[s_in_stride_c]   ; i_m:20(i_m0:1,i_m1:4)
    s_waitcnt lgkmcnt(2)
    buffer_store_short v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 21, s[s_in_stride_c]   ; i_m:21(i_m0:1,i_m1:5)
    buffer_store_short_d16_hi v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 22, s[s_in_stride_c]   ; i_m:22(i_m0:1,i_m1:6)
    buffer_store_short v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 23, s[s_in_stride_c]   ; i_m:23(i_m0:1,i_m1:7)
    buffer_store_short_d16_hi v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 24, s[s_in_stride_c]   ; i_m:24(i_m0:1,i_m1:8)
    s_waitcnt lgkmcnt(1)
    buffer_store_short v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 25, s[s_in_stride_c]   ; i_m:25(i_m0:1,i_m1:9)
    buffer_store_short_d16_hi v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 26, s[s_in_stride_c]   ; i_m:26(i_m0:1,i_m1:10)
    buffer_store_short v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 27, s[s_in_stride_c]   ; i_m:27(i_m0:1,i_m1:11)
    buffer_store_short_d16_hi v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 28, s[s_in_stride_c]   ; i_m:28(i_m0:1,i_m1:12)
    s_waitcnt lgkmcnt(0)
    buffer_store_short v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 29, s[s_in_stride_c]   ; i_m:29(i_m0:1,i_m1:13)
    buffer_store_short_d16_hi v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 30, s[s_in_stride_c]   ; i_m:30(i_m0:1,i_m1:14)
    buffer_store_short v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 31, s[s_in_stride_c]   ; i_m:31(i_m0:1,i_m1:15)
    buffer_store_short_d16_hi v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
    ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+32]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+33]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+34]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+35]
    v_cvt_f16_f32_e32 v[v_c], v[v_c]
    v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1]
    v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2]
    v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3]
    v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1]
    v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3]
    ds_write_b64 v[v_co_sst], v[v_c:v_c+1]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+36]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+37]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+38]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+39]
    v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5]
    v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6]
    v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7]
    v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5]
    v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7]
    ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128   ; idword:16(0,16),  0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:1
    v_accvgpr_read_b32 v[v_c+8], a[a_c+40]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+41]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+42]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+43]
    v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9]
    v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10]
    v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11]
    v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9]
    v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11]
    ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:256   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:2
    v_accvgpr_read_b32 v[v_c+12], a[a_c+44]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+45]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+46]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+47]
    v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13]
    v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14]
    v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15]
    v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13]
    v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15]
    ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:384   ; idword:48(0,48),  0x48 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:3
    v_accvgpr_read_b32 v[v_c+16], a[a_c+48]
    v_accvgpr_read_b32 v[v_c+17], a[a_c+49]
    v_accvgpr_read_b32 v[v_c+18], a[a_c+50]
    v_accvgpr_read_b32 v[v_c+19], a[a_c+51]
    v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17]
    v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18]
    v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19]
    v_pack_b32_f16 v[v_c+16], v[v_c+16], v[v_c+17]
    v_pack_b32_f16 v[v_c+17], v[v_c+18], v[v_c+19]
    ds_write_b64 v[v_co_sst], v[v_c+16:v_c+16+1] offset:1024   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+20], a[a_c+52]
    v_accvgpr_read_b32 v[v_c+21], a[a_c+53]
    v_accvgpr_read_b32 v[v_c+22], a[a_c+54]
    v_accvgpr_read_b32 v[v_c+23], a[a_c+55]
    v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21]
    v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22]
    v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23]
    v_pack_b32_f16 v[v_c+20], v[v_c+20], v[v_c+21]
    v_pack_b32_f16 v[v_c+21], v[v_c+22], v[v_c+23]
    ds_write_b64 v[v_co_sst], v[v_c+20:v_c+20+1] offset:1152   ; idword:144(0,144),  0x144 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:1
    v_accvgpr_read_b32 v[v_c+24], a[a_c+56]
    v_accvgpr_read_b32 v[v_c+25], a[a_c+57]
    v_accvgpr_read_b32 v[v_c+26], a[a_c+58]
    v_accvgpr_read_b32 v[v_c+27], a[a_c+59]
    v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25]
    v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26]
    v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27]
    v_pack_b32_f16 v[v_c+24], v[v_c+24], v[v_c+25]
    v_pack_b32_f16 v[v_c+25], v[v_c+26], v[v_c+27]
    ds_write_b64 v[v_co_sst], v[v_c+24:v_c+24+1] offset:1280   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:2
    v_accvgpr_read_b32 v[v_c+28], a[a_c+60]
    v_accvgpr_read_b32 v[v_c+29], a[a_c+61]
    v_accvgpr_read_b32 v[v_c+30], a[a_c+62]
    v_accvgpr_read_b32 v[v_c+31], a[a_c+63]
    v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29]
    v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30]
    v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31]
    v_pack_b32_f16 v[v_c+28], v[v_c+28], v[v_c+29]
    v_pack_b32_f16 v[v_c+29], v[v_c+30], v[v_c+31]
    ds_write_b64 v[v_co_sst], v[v_c+28:v_c+28+1] offset:1408   ; idword:176(0,176),  0x176 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:3
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] 
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144
    ds_read_b64 v[v_c+8:v_c+8+1], v[v_co_sld] offset:8192
    ds_read_b64 v[v_c+10:v_c+10+1], v[v_co_sld] offset:10240
    ds_read_b64 v[v_c+12:v_c+12+1], v[v_co_sld] offset:12288
    ds_read_b64 v[v_c+14:v_c+14+1], v[v_co_sld] offset:14336
    v_cmpx_eq_u32 vcc, 1, v[v_in_flag]
    ;   store to global, m index start from 32, m0:2, m1:0
    s_mul_i32 s[s_tmp], 32, s[s_in_stride_c]   ; i_m:32(i_m0:2,i_m1:0)
    s_waitcnt lgkmcnt(7)
    buffer_store_short v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 33, s[s_in_stride_c]   ; i_m:33(i_m0:2,i_m1:1)
    buffer_store_short_d16_hi v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 34, s[s_in_stride_c]   ; i_m:34(i_m0:2,i_m1:2)
    buffer_store_short v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 35, s[s_in_stride_c]   ; i_m:35(i_m0:2,i_m1:3)
    buffer_store_short_d16_hi v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 36, s[s_in_stride_c]   ; i_m:36(i_m0:2,i_m1:4)
    s_waitcnt lgkmcnt(6)
    buffer_store_short v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 37, s[s_in_stride_c]   ; i_m:37(i_m0:2,i_m1:5)
    buffer_store_short_d16_hi v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 38, s[s_in_stride_c]   ; i_m:38(i_m0:2,i_m1:6)
    buffer_store_short v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 39, s[s_in_stride_c]   ; i_m:39(i_m0:2,i_m1:7)
    buffer_store_short_d16_hi v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 40, s[s_in_stride_c]   ; i_m:40(i_m0:2,i_m1:8)
    s_waitcnt lgkmcnt(5)
    buffer_store_short v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 41, s[s_in_stride_c]   ; i_m:41(i_m0:2,i_m1:9)
    buffer_store_short_d16_hi v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 42, s[s_in_stride_c]   ; i_m:42(i_m0:2,i_m1:10)
    buffer_store_short v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 43, s[s_in_stride_c]   ; i_m:43(i_m0:2,i_m1:11)
    buffer_store_short_d16_hi v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 44, s[s_in_stride_c]   ; i_m:44(i_m0:2,i_m1:12)
    s_waitcnt lgkmcnt(4)
    buffer_store_short v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 45, s[s_in_stride_c]   ; i_m:45(i_m0:2,i_m1:13)
    buffer_store_short_d16_hi v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 46, s[s_in_stride_c]   ; i_m:46(i_m0:2,i_m1:14)
    buffer_store_short v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 47, s[s_in_stride_c]   ; i_m:47(i_m0:2,i_m1:15)
    buffer_store_short_d16_hi v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 48, s[s_in_stride_c]   ; i_m:48(i_m0:3,i_m1:0)
    s_waitcnt lgkmcnt(3)
    buffer_store_short v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 49, s[s_in_stride_c]   ; i_m:49(i_m0:3,i_m1:1)
    buffer_store_short_d16_hi v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 50, s[s_in_stride_c]   ; i_m:50(i_m0:3,i_m1:2)
    buffer_store_short v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 51, s[s_in_stride_c]   ; i_m:51(i_m0:3,i_m1:3)
    buffer_store_short_d16_hi v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 52, s[s_in_stride_c]   ; i_m:52(i_m0:3,i_m1:4)
    s_waitcnt lgkmcnt(2)
    buffer_store_short v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 53, s[s_in_stride_c]   ; i_m:53(i_m0:3,i_m1:5)
    buffer_store_short_d16_hi v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 54, s[s_in_stride_c]   ; i_m:54(i_m0:3,i_m1:6)
    buffer_store_short v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 55, s[s_in_stride_c]   ; i_m:55(i_m0:3,i_m1:7)
    buffer_store_short_d16_hi v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 56, s[s_in_stride_c]   ; i_m:56(i_m0:3,i_m1:8)
    s_waitcnt lgkmcnt(1)
    buffer_store_short v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 57, s[s_in_stride_c]   ; i_m:57(i_m0:3,i_m1:9)
    buffer_store_short_d16_hi v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 58, s[s_in_stride_c]   ; i_m:58(i_m0:3,i_m1:10)
    buffer_store_short v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 59, s[s_in_stride_c]   ; i_m:59(i_m0:3,i_m1:11)
    buffer_store_short_d16_hi v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 60, s[s_in_stride_c]   ; i_m:60(i_m0:3,i_m1:12)
    s_waitcnt lgkmcnt(0)
    buffer_store_short v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 61, s[s_in_stride_c]   ; i_m:61(i_m0:3,i_m1:13)
    buffer_store_short_d16_hi v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 62, s[s_in_stride_c]   ; i_m:62(i_m0:3,i_m1:14)
    buffer_store_short v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mul_i32 s[s_tmp], 63, s[s_in_stride_c]   ; i_m:63(i_m0:3,i_m1:15)
    buffer_store_short_d16_hi v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
    s_mov_b64 exec, -1
L_igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16_out:
    s_endpgm
.rodata
.p2align 6
.amdhsa_kernel igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16
    .amdhsa_group_segment_fixed_size 16384
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_vgpr_workitem_id 0
    .amdhsa_next_free_vgpr 86
    .amdhsa_next_free_sgpr 80
    .amdhsa_ieee_mode 0
    .amdhsa_dx10_clamp 0
.end_amdhsa_kernel

.amdgpu_metadata
---
amdhsa.version: [ 1, 0 ]
amdhsa.kernels:
  - .name: igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16
    .symbol: igemm_bwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x16x1_1x16x1x16.kd
    .sgpr_count: 86
    .vgpr_count: 86
    .kernarg_segment_align: 8
    .kernarg_segment_size: 184
    .group_segment_fixed_size: 16384
    .private_segment_fixed_size: 0
    .wavefront_size: 64
    .reqd_workgroup_size : [256, 1, 1]
    .max_flat_workgroup_size: 256
    .args:
    - { .name: p_in      , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
    - { .name: p_wei     , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_out     , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: hi        , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
    - { .name: wi        , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
    - { .name: n_         , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
    - { .name: k         , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
    - { .name: c         , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
    - { .name: ho        , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
    - { .name: wo        , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
    - { .name: stride_h  , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
    - { .name: stride_w  , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_h, .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_w, .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
    - { .name: pad_h     , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
    - { .name: pad_w     , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
    - { .name: y_         , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
    - { .name: x         , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_iy  , .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_ix  , .size: 4, .offset:  88, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_dy  , .size: 4, .offset:  92, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_dx  , .size: 4, .offset:  96, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_y   , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_x   , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_h   , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
    - { .name: dtile_w   , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_y  , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_x  , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_h  , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_w  , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
    - { .name: group     , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
    - { .name: magic_0   , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
    - { .name: magic_1   , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
    - { .name: magic_2   , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
    - { .name: magic_3   , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
    - { .name: magic_4   , .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
    - { .name: magic_5   , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
    - { .name: magic_6   , .size: 4, .offset: 168, .value_kind: by_value, .value_type: i32}
    - { .name: shift_pack_0, .size: 4, .offset: 172, .value_kind: by_value, .value_type: i32}
    - { .name: shift_pack_1, .size: 4, .offset: 176, .value_kind: by_value, .value_type: i32}
    - { .name: _pack_0   , .size: 4, .offset: 180, .value_kind: by_value, .value_type: i32}
...
.end_amdgpu_metadata
