
/******************************************/
/* Begin Kernel                           */
/******************************************/
.amdgcn_target "amdgcn-amd-amdhsa--gfx950"
.text
.protected Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950
.globl Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950
.p2align 8
.type Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950,@function
.section .rodata,#alloc
.p2align 6
/* Kernel descriptor for the entry point declared above. The register and LDS */
/* budgets here must stay in sync with the "Num VGPR / AccVGPR / SGPR" summary */
/* and with the .amdgpu_metadata block further down in this file. */
.amdhsa_kernel Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950
  .amdhsa_user_sgpr_kernarg_segment_ptr 1 // kernarg pointer is read from s[0:1] (sgprKernArgAddress)
  .amdhsa_accum_offset 248 // accvgpr offset: equals the arch VGPR count (Num VGPR = 248)
  .amdhsa_next_free_vgpr 504 // vgprs: 248 arch VGPRs + 256 AccVGPRs
  .amdhsa_next_free_sgpr 88 // sgprs
  .amdhsa_group_segment_fixed_size 133120 // lds bytes (0x20800, the same value written to m0 at label_common_kernel_entry)
  .amdhsa_private_segment_fixed_size 0
  .amdhsa_system_sgpr_workgroup_id_x 1
  .amdhsa_system_sgpr_workgroup_id_y 1
  .amdhsa_system_sgpr_workgroup_id_z 1
  .amdhsa_system_vgpr_workitem_id 0
  .amdhsa_float_denorm_mode_32 3
  .amdhsa_float_denorm_mode_16_64 3
  .amdhsa_user_sgpr_count 13 // s0..s12 are user SGPRs; the workgroup ids are read from s13/s14/s15 at label_common_kernel_entry
  .amdhsa_user_sgpr_kernarg_preload_length 11 // 11 kernarg dwords; the preload entry path reads them from s2..s12
  .amdhsa_user_sgpr_kernarg_preload_offset 0 // preloading starts at kernarg byte 0 (the "Gemm info" dword)
.end_amdhsa_kernel
.text
/* Num VGPR   =248 */
/* Num AccVGPR=256 */
/* Num SGPR   =88 */

/******************************************/
/* Optimizations and Config:              */
/******************************************/
/* ThreadTile= 32 x 8 */
/* SubGroup= 8 x 32 */
/* VectorWidthA=8 */
/* VectorWidthB=8 */
/* GlobalReadVectorWidthA=8, GlobalReadVectorWidthB=8 */
/* DirectToLdsA=True */
/* DirectToLdsB=True */
/* UseSgprForGRO=1 */
.amdgpu_metadata
---
custom.config:
  InternalSupportParams:
    KernArgsVersion: 2
  ProblemType:
      OperationType: GEMM
      DataType: b
      DestDataType: b
      ComputeDataType: s
      HighPrecisionAccumulate: True
      TransposeA: 1
      TransposeB: 0
      UseBeta: True
      Batched: True
      Activation: False
amdhsa.version:
  - 1
  - 1
amdhsa.kernels:
  - .name: Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950
    .symbol: 'Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950.kd'
    .language:                   OpenCL C
    .language_version:
      - 2
      - 0
    .args:
      - .name:            Gemm info
        .size:            4
        .offset:          0
        .value_kind:      by_value
        .value_type:      u32
      - .name:            kernel info0
        .size:            4
        .offset:          4
        .value_kind:      by_value
        .value_type:      u32
      - .name:            kernel info1
        .size:            4
        .offset:          8
        .value_kind:      by_value
        .value_type:      u32
      - .name:            numWG
        .size:            4
        .offset:          12
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree0
        .size:            4
        .offset:          16
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree1
        .size:            4
        .offset:          20
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree2
        .size:            4
        .offset:          24
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesSum0
        .size:            4
        .offset:          28
        .value_kind:      by_value
        .value_type:      u32
      - .name:            D
        .size:            8
        .offset:          32
        .value_kind:      global_buffer
        .value_type:      bf16
        .address_space:   generic
      - .name:            C
        .size:            8
        .offset:          40
        .value_kind:      global_buffer
        .value_type:      bf16
        .address_space:   generic
      - .name:            A
        .size:            8
        .offset:          48
        .value_kind:      global_buffer
        .value_type:      bf16
        .address_space:   generic
      - .name:            B
        .size:            8
        .offset:          56
        .value_kind:      global_buffer
        .value_type:      bf16
        .address_space:   generic
      - .name:            strideD0
        .size:            4
        .offset:          64
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideD1
        .size:            4
        .offset:          68
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideC0
        .size:            4
        .offset:          72
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideC1
        .size:            4
        .offset:          76
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideA0
        .size:            4
        .offset:          80
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideA1
        .size:            4
        .offset:          84
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideB0
        .size:            4
        .offset:          88
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideB1
        .size:            4
        .offset:          92
        .value_kind:      by_value
        .value_type:      u32
      - .name:            alpha
        .size:            4
        .offset:          96
        .value_kind:      by_value
        .value_type:      f32
      - .name:            beta
        .size:            4
        .offset:          100
        .value_kind:      by_value
        .value_type:      f32
    .group_segment_fixed_size:   133120
    .kernarg_segment_align:      8
    .kernarg_segment_size:       104
    .max_flat_workgroup_size:    256
    .private_segment_fixed_size: 0
    .sgpr_count:                 88
    .sgpr_spill_count:           0
    .vgpr_count:                 248
    .vgpr_spill_count:           0
    .wavefront_size:             64
...
.end_amdgpu_metadata
/* Kernel entry symbol (the one named by the .globl/.protected/.type directives */
/* and by the .amdhsa_kernel descriptor at the top of the file). The macro and */
/* .set definitions that follow emit no code; the first instruction is the */
/* s_load_dword under "Allocate Resources". */
Custom_Cijk_Alik_Bljk_BBS_BH_MT256x256x64_MI16x16x1_UserArgs_shortname1_gfx950:
label_ASM_Start:  /// Main body of the asm kernel
/* V_MAGIC_DIV: unsigned division by multiply-and-shift. */
/*   v[vgprDstIdx] = (high32(dividend * magicNumber) + low32(dividend * magicA)) >> magicShift */
/* Clobbers v[vgprDstIdx+1], which holds the high half of the first product. */
.macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req
    v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber      // hi  = high 32 bits of dividend * magicNumber
    v_mul_lo_u32 v[\vgprDstIdx], \dividend, \magicA             // dst = low 32 bits of dividend * magicA
    v_add_u32 v[\vgprDstIdx], v[\vgprDstIdx], v[\vgprDstIdx+1]  // dst += hi
    v_lshrrev_b32 v[\vgprDstIdx], \magicShift, v[\vgprDstIdx]   // dst >>= magicShift
.endm

/******************************************/
/* VGPR Assignments                       */
/******************************************/
/* Symbolic VGPR indices, used as v[vgprName] throughout the kernel. */
/* ValuC range: [0-0), serializedStore enabled */
/* The ValuC range is empty, so index 0 is shared with vgprGlobalReadOffsetA below. */
.set vgprValuC, 0
/* ValuA/B   Xn=PLR buffer idx,  In=InnerUnroll idx */
/* NOTE: the address-setup code also uses v4..v13 as scratch; presumably that */
/* happens before the ValuA/ValuB registers starting at vgprBase are live. */
.set vgprBase, 4
.set vgprGlobalReadOffsetA, 0
.set vgprGlobalReadOffsetB, 1
.set vgprLocalReadAddrA, 2
.set vgprLocalReadAddrB, 3
.set vgprLocalReadSwapAddrA, 132
.set vgprLocalReadSwapAddrB, 133
.set vgprSerial, 134

/******************************************/
/* VGPR Macro Assignments                 */
/******************************************/
/* Resolved indices: ValuA X0 = v4, ValuA X1 = v36, ValuB X0 = v68, ValuB X1 = v100. */
.set vgprValuA_X0_I0_BASE, vgprBase+0
.set vgprValuB_X0_I0_BASE, vgprBase+64
.set vgprValuA_X0_I0, vgprValuA_X0_I0_BASE+0
.set vgprValuA_X1_I0, vgprValuA_X0_I0_BASE+32
.set vgprValuB_X0_I0, vgprValuB_X0_I0_BASE+0
.set vgprValuB_X1_I0, vgprValuB_X0_I0_BASE+32

/******************************************/
/* SGPR Assignments                       */
/******************************************/
/* Symbolic SGPR indices, used as s[sgprName]. s24..s45 mirror the kernel */
/* argument layout after the 16-byte common header: the argument loads below */
/* fill s[24:39], s[40:43] and s[44:45] from byte offsets 0, 64 and 80. */
.set sgprKernArgAddress, 0
.set sgprWorkGroup0, 2
.set sgprWorkGroup1, 3
.set sgprWorkGroup2, 4
.set sgprArgType, 5
.set sgprGSUSumIdx, 6
.set sgprGSULog2BpeC, 8
.set sgprGSULog2BpeD, 9
.set sgprStaggerU, 10
.set sgprWGM, 11
.set sgprLoopCounterL, 12
.set sgprOrigLoopCounter, 13
.set sgprSrdD, 16
.set sgprSrdC, 20
.set sgprNumWorkGroups0, 14
.set sgprNumWorkGroups1, 15
.set sgprSizesFree, 24
.set sgprSizesSum, 27
.set sgprAddressD, 28
.set sgprAddressC, 30
.set sgprAddressA, 32
.set sgprAddressB, 34
.set sgprStridesD, 36
.set sgprStridesC, 38
.set sgprStridesA, 40
.set sgprStridesB, 42
.set sgprAlpha, 44
.set sgprBeta, 45
.set sgprLocalWriteAddrA, 46
.set sgprLocalWriteAddrB, 47
.set sgprSwapA, 48
.set sgprSwapB, 49
.set sgprGSU, 50

/* Size Assignments */
/* Aliases into the sgprSizesFree / sgprSizesSum ranges (I, J, K free sizes; L summation size). */
.set sgprSizeI, sgprSizesFree+0
.set sgprSizeJ, sgprSizesFree+1
.set sgprSizeK, sgprSizesFree+2
.set sgprSizeL, sgprSizesSum+0

/* Stride Assignments */
/* const* entries are compile-time unit strides; the sgpr* entries alias the */
/* two-dword stride ranges defined above. */
.set constStrideD0I, 1
.set sgprStrideD1J, sgprStridesD+0
.set sgprStrideDK, sgprStridesD+1
.set constStrideC0I, 1
.set sgprStrideC1J, sgprStridesC+0
.set sgprStrideCK, sgprStridesC+1
.set constStrideAL, 1
.set sgprStrideA0I, sgprStridesA+0
.set sgprStrideAK, sgprStridesA+1
.set constStrideBL, 1
.set sgprStrideB1J, sgprStridesB+0
.set sgprStrideBK, sgprStridesB+1

/* Tile shape and element-size constants. MT0/MT1/DepthU match the */
/* MT256x256x64 part of the kernel name; Bpe* = 2 bytes per element, */
/* Bpe*Log2 = the corresponding shift amount. */
.set MT0, 256
.set MT1, 256
.set DepthU, 64
.set BpeA, 2
.set BpeALog2, 1
.set BpeB, 2
.set BpeBLog2, 1
.set BpeAGR, 2
.set BpeAGRLog2, 1
.set BpeBGR, 2
.set BpeBGRLog2, 1
/* Number of elements to shift-left SRD */
/* 8 elements * 2 bytes = the 16 bytes subtracted from AddressA/AddressB before */
/* the alpha check, and the 0x8 added back in GLOBAL_OFFSET_A / GLOBAL_OFFSET_B. */
.set SrdShiftLeftA, 8
.set SrdShiftLeftB, 8
/* 2GB limit - set offsets to -1 to exceed this and clamp */
.set BufferLimit, 0xffffffff
.set BufferOOB, 0x80000000

/******************************************/
/* Bits 127:96 of SRD.                    */
/* hex: 0x20000                           */
/* dst_sel_x (3b): 0                      */
/* dst_sel_y (3b): 0                      */
/* dst_sel_z (3b): 0                      */
/* dst_sel_w (3b): 0                      */
/* num_format (3b): 0                     */
/* data_format (4b): 4                    */
/* user_vm_enable (1b): 0                 */
/* user_vm_mode (1b): 0                   */
/* index_stride (2b): 0                   */
/* add_tid_enable (1b): 0                 */
/* _unusedA (3b): 0                       */
/* nv (1b): 0                             */
/* _unusedB (2b): 0                       */
/* type (2b): 0                           */
/******************************************/
/* Fields are listed LSB first: the four dst_sel fields and num_format occupy */
/* bits [14:0], so data_format = 4 lands at bits [18:15] -> 4 << 15 = 0x20000. */
.set Srd127_96, 0x20000

/* Global Offset A */
/* v[vgprAddr] = (v[vgprOffsetL] + StrideA0I * v[vgprOffset0I] + 8) << 1, a byte offset. */
/* Clobbers v[vgprTmp] and vcc (carry-out of the add). */
.macro GLOBAL_OFFSET_A vgprAddr:req, vgprOffsetL:req, vgprOffset0I:req, vgprTmp:req
    v_mul_lo_u32 v[\vgprTmp], s[sgprStrideA0I], v[\vgprOffset0I]   // tmp = StrideA0I * offset0I (low 32 bits)
    v_add_co_u32 v[\vgprAddr], vcc, v[\vgprOffsetL], v[\vgprTmp]   // addr = offsetL + tmp
    v_add_u32 v[\vgprAddr], 8, v[\vgprAddr]                        // + 8 elements of pre-pad for the pointer shift
    v_lshlrev_b32 v[\vgprAddr], 1, v[\vgprAddr]                    // elements -> bytes (2 bytes per element)
.endm

/* Global Offset B */
/* v[vgprAddr] = (v[vgprOffsetL] + StrideB1J * v[vgprOffset1J] + 8) << 1, a byte offset. */
/* Clobbers v[vgprTmp] and vcc (carry-out of the add). */
.macro GLOBAL_OFFSET_B vgprAddr:req, vgprOffsetL:req, vgprOffset1J:req, vgprTmp:req
    v_mul_lo_u32 v[\vgprTmp], s[sgprStrideB1J], v[\vgprOffset1J]   // tmp = StrideB1J * offset1J (low 32 bits)
    v_add_co_u32 v[\vgprAddr], vcc, v[\vgprOffsetL], v[\vgprTmp]   // addr = offsetL + tmp
    v_add_u32 v[\vgprAddr], 8, v[\vgprAddr]                        // + 8 elements of pre-pad for the pointer shift
    v_lshlrev_b32 v[\vgprAddr], 1, v[\vgprAddr]                    // elements -> bytes (2 bytes per element)
.endm

/******************************************/
/* Allocate Resources                     */
/******************************************/

/* Non-preload entry path. Reads the four common header dwords (kernarg bytes */
/* 0..15) with scalar loads, then either loads the remaining arguments from */
/* the kernarg segment (arg type 0) or replaces the kernarg pointer with the */
/* 64-bit address stored at kernarg byte 16 (any other arg type). */
/* NOTE(review): on the label_HBMArgs path nothing in the visible code loads */
/* s24..s45 (sizes, addresses, strides, alpha/beta) - confirm where that happens. */

/* Load num of Gemms */
s_load_dword s51, s[sgprKernArgAddress:sgprKernArgAddress+1], 0

/* Load packed kernel args (StaggerU/GSU) */
s_load_dword s53, s[sgprKernArgAddress:sgprKernArgAddress+1], 4

/* Load WGM data */
s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8

/* Load num of WGs */
s_load_dword s54, s[sgprKernArgAddress:sgprKernArgAddress+1], 12
s_waitcnt lgkmcnt(0)                               // wait for the four header dwords
s_lshr_b32 s52, s51, 0x1e                          // arg type = bits [31:30] of the first dword
s_and_b32 s51, 0x3fffffff, s51                     // number of gemms = bits [29:0]
s_cmp_eq_u32 s52, 0                                // arg type 0: arguments live in the kernarg segment
s_cbranch_scc0 label_HBMArgs                       // arg type != 0: fetch the argument address instead
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // skip the 16-byte common header
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 // propagate carry into the high dword

/* Load Kernel Args */
s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // byte 0: SizesFree, SizesSum, AddressD/C/A/B, StridesD, StridesC
s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // byte 64: StridesA, StridesB
s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // byte 80: Alpha, Beta
s_waitcnt lgkmcnt(0)                               // wait for the kernel args loaded above
s_branch label_LoadArgsEnd
label_HBMArgs:

/* Load address of kernel arguments */
s_load_dwordx2 s[sgprKernArgAddress:sgprKernArgAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 16
s_waitcnt lgkmcnt(0)                               // wait for args to load
label_LoadArgsEnd:
s_branch label_common_kernel_entry                 // jump over the padding and the preload entry path

/* pad 37 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */
/* NOTE(review): a hand count of the encodings above (8 bytes per s_load, 8 bytes */
/* for the s_and_b32 with a 32-bit literal, 4 bytes otherwise) gives 0x104 bytes */
/* before label_Preload_Offset_Start, which would put offset 0x100 on the last */
/* s_nop below. That looks harmless, but confirm with a disassembly. */
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
/* Preload entry path. The first 11 kernarg dwords are taken from s2..s12 */
/* instead of being loaded: */
/*   s2 = Gemm info, s3 = kernel info0, s4 = kernel info1, s5 = numWG, */
/*   s6..s8 = SizesFree0..2, s9 = SizesSum0, s10:s11 = D, s12 = low dword of C. */
/* Only the arguments beyond those 11 dwords are fetched with scalar loads. */
label_Preload_Offset_Start:
s_and_b32 s51, 0x3fffffff, s2                      // number of gemms = bits [29:0] of Gemm info
s_lshr_b32 s52, s2, 0x1e                           // arg type = bits [31:30] of Gemm info
s_mov_b32 s53, s3                                  // packed kernel info0 (unpacked at label_common_kernel_entry)
s_cmp_eq_u32 s52, 0                                // arg type 0: arguments live in the kernarg segment
s_cbranch_scc0 label_Preload_HBMArgs
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // skip the 16-byte common header
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0 // propagate carry into the high dword

/* Load Kernel Args */
/* No s_waitcnt follows these loads here; in the visible code the loaded */
/* registers are first read after the s_waitcnt lgkmcnt(0) that precedes the */
/* 1D -> 3D workgroup remap. */
s_load_dword s31, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // byte 28: high dword of AddressC (low dword comes from s12)
s_load_dwordx8 s[32:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // byte 32: AddressA, AddressB, StridesD, StridesC
s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // byte 64: StridesA, StridesB
s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // byte 80: Alpha, Beta
s_mov_b64 s[24:25], s[6:7]                         // SizesFree0, SizesFree1
s_mov_b64 s[26:27], s[8:9]                         // SizesFree2, SizesSum0
s_mov_b64 s[28:29], s[10:11]                       // AddressD
s_mov_b32 s30, s12                                 // low dword of AddressC
s_branch label_Preload_LoadArgsEnd
label_Preload_HBMArgs:
s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // kernarg bytes 16..23 hold the address of the kernel arguments
label_Preload_LoadArgsEnd:
s_mov_b32 s[sgprWGM], s4                           // kernel info1 -> WGM (overwrites s11 only after s[10:11] was copied or is unused)
s_mov_b32 s54, s5                                  // number of workgroups
/* Both entry paths arrive here with: s51 = number of gemms, s52 = arg type, */
/* s53 = packed kernel info0, s54 = number of workgroups, s[sgprWGM] = kernel info1. */
label_common_kernel_entry:  /// for both preload/non-preload common code
s_mov_b32 s[sgprWorkGroup0+0], s13                 // workgroup id x -> s2
s_mov_b32 s[sgprWorkGroup0+1], s14                 // workgroup id y -> s3
s_mov_b32 s[sgprWorkGroup0+2], s15                 // workgroup id z -> s4
s_and_b32 s[sgprStaggerU], s53, 0xffff0000         // StaggerU fields = upper 16 bits of kernel info0 ...
s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10  // ... moved down to bits [15:0]
s_and_b32 s[sgprGSU], s53, 0xffff                  // GSU config + GSU = lower 16 bits of kernel info0
s_mov_b32 s[sgprArgType], s52                      // keep the arg type; s52 is reused as scratch below
s_mov_b32 m0, 0x20800                              // LDS clamp at 133120 bytes
v_mov_b32 v[vgprSerial], v0                        // thread serial id
  
/* remap workgroup to XCCs */
/* Rewrites WorkGroup0 in place. Scratch: s57..s61, v10, v11, vcc-free. */
/* With wg = WorkGroup0, nWG = s54, mask = (1 << log2xcc) - 1: */
/*   log2xcc = ff1(WGM >> 16);  cu = WGM >> 22 */
/*   skip unless log2xcc > 0 and wg < (nWG >> log2xcc) << log2xcc */
/*   cu == 0: wg = (wg >> log2xcc) + (wg & mask) * (nWG >> log2xcc) */
/*   cu != 0: wg = (wg / cu) * cu + ((wg % cu) >> log2xcc) */
/*                 + (wg & mask) * ((wg > (nWG / cu) * cu ? nWG - (nWG / cu) * cu : cu) >> log2xcc) */
/* The divisions use a float reciprocal estimate followed by exec-masked fix-ups. */
s_lshr_b32 s60, s[sgprWGM], 0x10                   // Get WGMXCC
s_ff1_i32_b32 s60, s60                             // Get log(WGMXCC)
s_lshr_b32 s61, s[sgprWGM], 0x16                   // Get CU_Count
/* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */
s_cmp_gt_i32 s60, 0
s_cbranch_scc0 label_skip_WGMXCC
/* only remap WGs in the range */
s_lshr_b32 s57, s54, s60                           // limit = (nWG >> log2xcc) ...
s_lshl_b32 s57, s57, s60                           // ... << log2xcc, i.e. nWG rounded down to a multiple of WGMXCC
s_cmp_ge_u32 s[sgprWorkGroup0], s57
s_cbranch_scc1 label_skip_WGMXCC                   // wg >= limit: leave it unchanged
s_cmp_eq_u32 s61, 0                                // CU_Count == 0 ?
s_cbranch_scc0 label_XCCG_nonzero
s_lshr_b32 s57, s[sgprWorkGroup0], s60             // s57 = wg >> log2xcc
s_bfm_b32 s58, s60, 0                              // s58 = mask = (1 << log2xcc) - 1
s_and_b32 s58, s[sgprWorkGroup0], s58              // s58 = wg & mask
s_lshr_b32 s59, s54, s60                           // s59 = nWG >> log2xcc
s_mul_i32 s58, s58, s59                            // s58 = (wg & mask) * (nWG >> log2xcc)
s_add_u32 s[sgprWorkGroup0], s57, s58              // wg = s57 + s58
s_branch label_skip_WGMXCC
label_XCCG_nonzero:
/* temp0 = (wg//CU_Count)*CU_Count */
v_cvt_f32_u32 v10, s61                             // v10 = float(cu)
v_rcp_iflag_f32 v10, v10                           // v10 = 1.0 / cu (estimate)
v_cvt_f32_u32 v11, s[sgprWorkGroup0]               // v11 = float(wg)
v_mul_f32 v10, v10, v11                            // v10 = wg / cu (estimate)
v_cvt_u32_f32 v10, v10                             // q = quotient estimate
v_mul_u32_u24 v11, v10, s61                        // v11 = q * cu (24-bit multiply)
v_sub_u32 v11, s[sgprWorkGroup0], v11              // r = wg - q * cu
v_cmpx_eq_u32 exec, v11, s61                       // lanes where r == cu (q is one too small)
v_add_u32 v10, 1, v10                              // q += 1
v_mov_b32 v11, 0                                   // r = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s61                       // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
v_mul_u32_u24 v11, v10, s61                        // re-calculate remainder
v_sub_u32 v11, s[sgprWorkGroup0], v11              // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s57, v10                       // quotient
v_readfirstlane_b32 s58, v11                       // remainder
s_mul_i32 s57, s57, s61                            // temp0 = (wg / cu) * cu
/* temp1 = (wg%CU_Count)//WGMXCC */
s_lshr_b32 s58, s58, s60
/* temp0 = temp0 + temp1 */
s_add_u32 s57, s57, s58
/* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */
v_cvt_f32_u32 v10, s61                             // v10 = float(cu)
v_rcp_iflag_f32 v10, v10                           // v10 = 1.0 / cu (estimate)
v_cvt_f32_u32 v11, s54                             // v11 = float(nWG)
v_mul_f32 v10, v10, v11                            // v10 = nWG / cu (estimate)
v_cvt_u32_f32 v10, v10                             // q = quotient estimate
v_mul_u32_u24 v11, v10, s61                        // v11 = q * cu (24-bit multiply)
v_sub_u32 v11, s54, v11                            // r = nWG - q * cu
v_cmpx_eq_u32 exec, v11, s61                       // lanes where r == cu (q is one too small)
v_add_u32 v10, 1, v10                              // q += 1
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s61                       // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s58, v10                       // quotient
s_mul_i32 s58, s58, s61                            // s58 = (nWG / cu) * cu
s_sub_u32 s59, s54, s58                            // s59 = nWG - (nWG / cu) * cu
s_cmp_gt_u32 s[sgprWorkGroup0], s58                // wg > (nWG / cu) * cu ?
s_cselect_b32 s58, s59, s61                        // yes: s59 (the leftover count), no: cu
s_lshr_b32 s58, s58, s60                           // >> log2xcc
s_bfm_b32 s59, s60, 0                              // s59 = mask = (1 << log2xcc) - 1
s_and_b32 s59, s[sgprWorkGroup0], s59              // s59 = wg & mask
s_mul_i32 s58, s58, s59                            // temp1
/* WorkGroup0 = temp0 + temp1 */
s_add_u32 s[sgprWorkGroup0], s57, s58
label_skip_WGMXCC:  /// skip WGMXCC if no enough WGs to remap
/* Disabled override that would force WorkGroup0 to 0; kept commented out: */
//s_mov_b32 s[sgprWorkGroup0], 0 

/* init: add vgpr [4...136) to pool */
/* init: add vgpr [0...0) to pool */
/* init: add agpr [0...256) to pool */

/******************************************/
/* Local Read Addresses                   */
/******************************************/
/* Computes the per-thread LDS read byte offsets for A and B into */
/* v[vgprLocalReadAddrA] / v[vgprLocalReadAddrB], plus the XOR masks used to */
/* toggle between the two LDS buffers. Scratch: v4..v8, s53. */

/* local read addresses: tile assignments a/b */
/* lr0I */
v_and_b32 v5, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v4, 15, v5                               // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v4, 6, v4                            // 1. N offset: nOffset = nIdx * nStride(64)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v4, 3, v4                            // 4. apply VectorWidth: bnOffset = bnOffset * vw(8)
v_lshrrev_b32 v5, 4, v5                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshl_add_u32 v4, v5, 3, v4                       // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v8, 6, v[vgprSerial]                 // 7. wave id: v8 = tid / 64
v_and_b32 v8, 1, v8                                // 7. keep bit 0 of the wave id: v8 = (tid / 64) & 1
v_lshl_add_u32 v4, v8, 13, v4                      // 7. wave offset: v4 += v8 * 8192 -> final local read offset (in elements)
/* lr1J */
v_and_b32 v6, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v5, 15, v6                               // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v5, 6, v5                            // 1. N offset: nOffset = nIdx * nStride(64)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v5, 3, v5                            // 4. apply VectorWidth: bnOffset = bnOffset * vw(8)
v_lshrrev_b32 v6, 4, v6                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshl_add_u32 v5, v6, 3, v5                       // 5. K offset: lrKOffset = kIdx * mStride(8); 6. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v7, 7, v[vgprSerial]                 // 7. wave pair id: v7 = tid / 128
v_and_b32 v7, 1, v7                                // 7. keep bit 0: v7 = (tid / 128) & 1
v_lshl_add_u32 v5, v7, 13, v5                      // 7. wave offset: v5 += v7 * 8192 -> final local read offset (in elements)

/* local read addresses: final offsets a */
v_lshrrev_b32 v6, 6, v[vgprSerial]                 // 6 = Serial / 64
v_lshrrev_b32 v6, 2, v6                            // LSU offset: v6 = (Serial / 64) / 4
s_mov_b32 s53, 64                                  // LSU offset: stride = lsuStride(64) when umlds==True
v_mul_lo_u32 v6, s53, v6                           // LSU offset: lsuoffset = v6 * 64
v_add_lshl_u32 v[vgprLocalReadAddrA], v6, v4, 0x1  // Final Offset: offset = (lro0+lsuoffset)*bpeDS
v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA]        // Final Offset: v7 = offset / 1024 (number of whole 1024-byte blocks)
v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 4, v[vgprLocalReadAddrA] // Final Offset: add 16 bytes of padding per 1024-byte block

/* local read addresses: final offsets b */
v_lshrrev_b32 v4, 6, v[vgprSerial]                 // 4 = Serial / 64
v_lshrrev_b32 v4, 2, v4                            // LSU offset: v4 = (Serial / 64) / 4
                                                   // LSU offset: s53 still holds lsuStride(64) from the A computation above
v_mul_lo_u32 v4, s53, v4                           // LSU offset: lsuoffset = v4 * 64
v_add_lshl_u32 v[vgprLocalReadAddrB], v4, v5, 0x1  // Final Offset: offset = (lro1+lsuoffset)*bpeDS
v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB]        // Final Offset: v6 = offset / 1024 (number of whole 1024-byte blocks)
v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 4, v[vgprLocalReadAddrB] // Final Offset: add 16 bytes of padding per 1024-byte block

/* local read addresses: declare addresses a */
/* N/A */

/* local read addresses: declare addresses b */
v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x8200, v[vgprLocalReadAddrB+0] //  += LdsOffsetB (0x8200 = 33280 bytes); also writes vcc
v_add_u32 v[vgprLocalReadSwapAddrA], 66560, v[vgprLocalReadAddrA] // Calculate starting lds addr of second buffer
v_xor_b32 v[vgprLocalReadSwapAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // xor both lds buffer offsets to enable swapping
v_add_u32 v[vgprLocalReadSwapAddrB], 66560, v[vgprLocalReadAddrB] // Calculate starting lds addr of second buffer
v_xor_b32 v[vgprLocalReadSwapAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // xor both lds buffer offsets to enable swapping

/******************************************/
/* Local Write Addresses                  */
/******************************************/
/* Computes the LDS write byte offsets for A and B, copies the first lane's */
/* value into s[sgprLocalWriteAddrA/B], and derives the XOR masks s[sgprSwapA/B] */
/* for toggling between the two LDS buffers (66560 bytes apart). */
/* Scratch: v4..v10, v12, vcc. */
/* LVCA = 8 */
/* v5 = A-unroll = serial%LVCA */
v_lshrrev_b32 v4, 3, v[vgprSerial]                 // 4 = Serial / 8
v_and_b32 v5, 7, v[vgprSerial]                     // 5 = Serial % 8
/* unroll *= glvw */
v_lshlrev_b32 v5, 3, v5                            // v5 = v5 * 8
v_mov_b32 v8, v5                                   // copy for GlobalSplitU
/* LVCB = 8 */
/* v7 = B-unroll = serial%LVCB */
v_lshrrev_b32 v6, 3, v[vgprSerial]                 // 6 = Serial / 8
v_and_b32 v7, 7, v[vgprSerial]                     // 7 = Serial % 8
/* unroll *= glvw */
v_lshlrev_b32 v7, 3, v7                            // v7 = v7 * 8
v_mov_b32 v9, v7                                   // copy for GlobalSplitU
/* lwaUnrollAssignmentA = v8 */
/* lwaUnrollAssignmentB = v9 */

/* local write addresses: first offset a */
v_mul_u32_u24 v10, 0x40, v4                        // v10 = (Serial / 8) * 64
v_add_lshl_u32 v10, v8, v10, 0x1                   // lwFOA = (lwAA + lwAL*(DepthU+PAD))*bpeDS
v_lshrrev_b32 v12, 10, v10                         // v12 = offset / 1024 (number of whole 1024-byte blocks)
v_lshl_add_u32 v10, v12, 4, v10                    // add 16 bytes of padding per 1024-byte block
s_nop 0                                            // 1 wait states required before reading vgpr by lane
v_readfirstlane_b32 s[sgprLocalWriteAddrA], v10    // Copy lds write address VGPR to SGPR
s_nop 0                                            // 1 wait states
s_add_u32 s[sgprSwapA], s[sgprLocalWriteAddrA], 66560 // Calculate starting lds addr of second buffer
s_xor_b32 s[sgprSwapA], s[sgprSwapA], s[sgprLocalWriteAddrA] // xor both lds buffer offsets to enable swapping

/* local write addresses: first offset b */
v_mul_u32_u24 v10, 0x40, v6                        // v10 = (Serial / 8) * 64
v_add_lshl_u32 v10, v9, v10, 0x1                   // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpeDS
v_lshrrev_b32 v12, 10, v10                         // v12 = offset / 1024 (number of whole 1024-byte blocks)
v_lshl_add_u32 v10, v12, 4, v10                    // add 16 bytes of padding per 1024-byte block
v_add_co_u32 v10, vcc, 0x8200, v10                 // += LDS_OFFSET_B (0x8200 = 33280 bytes); also writes vcc
s_nop 0                                            // 1 wait states required before reading vgpr by lane
v_readfirstlane_b32 s[sgprLocalWriteAddrB], v10    // Copy lds write address VGPR to SGPR
s_nop 0                                            // 1 wait states
s_add_u32 s[sgprSwapB], s[sgprLocalWriteAddrB], 66560 // Calculate starting lds addr of second buffer
s_xor_b32 s[sgprSwapB], s[sgprSwapB], s[sgprLocalWriteAddrB] // xor both lds buffer offsets to enable swapping
/* NumWorkGroups0 = ceil(SizesFree0 / MT0), NumWorkGroups1 = ceil(SizesFree1 / MT1). */
/* Each uses a float reciprocal quotient estimate and adds 1 when the remainder */
/* is non-zero. Scratch: v10..v13, vcc. */
v_mov_b32 v12, MT0                                 // v12 = MT0 (divisor, held in a VGPR)
v_mov_b32 v11, s[sgprSizesFree+0]                  // set Free0 size
v_cvt_f32_u32 v10, v12                             // v10 = float(MT0)
v_rcp_iflag_f32 v10, v10                           // v10 = 1.0 / MT0 (estimate)
v_cvt_f32_u32 v13, v11                             // v13 = float(size)
v_mul_f32 v10, v10, v13                            // v10 = size / MT0 (estimate)
v_cvt_u32_f32 v10, v10                             // q = quotient estimate
v_mul_u32_u24 v13, v10, v12                        // v13 = q * MT0 (24-bit multiply)
v_sub_u32 v13, v11, v13                            // r = size - q * MT0
v_cmp_ne_u32 vcc, v13, 0                           // vcc = (r != 0)
v_addc_co_u32 v10, vcc, v10, 0, vcc                // ceil: q += 1 when r != 0
v_mov_b32 v12, MT1                                 // v12 = MT1 (divisor, held in a VGPR)
v_mov_b32 v11, s[sgprSizesFree+1]                  // set Free1 size
v_readfirstlane_b32 s[sgprNumWorkGroups0], v10     // set back to numWorkGroup0
v_cvt_f32_u32 v10, v12                             // v10 = float(MT1)
v_rcp_iflag_f32 v10, v10                           // v10 = 1.0 / MT1 (estimate)
v_cvt_f32_u32 v13, v11                             // v13 = float(size)
v_mul_f32 v10, v10, v13                            // v10 = size / MT1 (estimate)
v_cvt_u32_f32 v10, v10                             // q = quotient estimate
v_mul_u32_u24 v13, v10, v12                        // v13 = q * MT1 (24-bit multiply)
v_sub_u32 v13, v11, v13                            // r = size - q * MT1
v_cmp_ne_u32 vcc, v13, 0                           // vcc = (r != 0)
v_addc_co_u32 v10, vcc, v10, 0, vcc                // ceil: q += 1 when r != 0
s_nop 0                                            // 1 wait states
v_readfirstlane_b32 s[sgprNumWorkGroups1], v10     // set back to numWorkGroup1
s_waitcnt lgkmcnt(0)                               // wait for 44/0 bytes of kern args (the preload path issues its loads without a wait)

/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */
/* With gsu = GSU & 0x3fff and idx = WorkGroup0 on entry: */
/*   wg2 = idx / (numWG0 * numWG1 * gsu) */
/*   idx = idx - numWG1 * numWG0 * wg2 * gsu */
/*   wg1 = idx / numWG0 */
/*   wg0 = idx - wg1 * numWG0 */
/* The divisions use a float reciprocal estimate plus exec-masked fix-ups */
/* (not a magic-number multiply). Scratch: s52, s53, v10, v11. */
/* wg2 = idxWG012 / (numWG0*numWG1*GSU) */
s_mul_i32 s52, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1]
s_and_b32 s53, s[sgprGSU], 0x3fff                  // Restore GSU
s_mul_i32 s52, s52, s53                            // s52 = numWG0 * numWG1 * gsu (divisor)
v_cvt_f32_u32 v10, s52                             // v10 = float(divisor)
v_rcp_iflag_f32 v10, v10                           // v10 = 1.0 / divisor (estimate)
v_cvt_f32_u32 v11, s[sgprWorkGroup0]               // v11 = float(idx)
v_mul_f32 v10, v10, v11                            // v10 = idx / divisor (estimate)
v_cvt_u32_f32 v10, v10                             // q = quotient estimate
v_mul_u32_u24 v11, v10, s52                        // v11 = q * divisor (24-bit multiply)
v_sub_u32 v11, s[sgprWorkGroup0], v11              // r = idx - q * divisor
v_cmpx_eq_u32 exec, v11, s52                       // lanes where r == divisor (q is one too small)
v_add_u32 v10, 1, v10                              // q += 1
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s52                       // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s52, v10                       // quotient
s_mov_b32 s[sgprWorkGroup2], s52
/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 * GSU */
s_mul_i32 s52, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0]
s_mul_i32 s52, s52, s[sgprWorkGroup2]
s_mul_i32 s52, s52, s53                            // s53 still holds gsu
s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52
/* wg1 = idxWG01 / numWG0 */
v_cvt_f32_u32 v10, s[sgprNumWorkGroups0]           // v10 = float(numWG0)
v_rcp_iflag_f32 v10, v10                           // v10 = 1.0 / numWG0 (estimate)
v_cvt_f32_u32 v11, s[sgprWorkGroup0]               // v11 = float(idx)
v_mul_f32 v10, v10, v11                            // v10 = idx / numWG0 (estimate)
v_cvt_u32_f32 v10, v10                             // q = quotient estimate
v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups0]      // v11 = q * numWG0 (24-bit multiply)
v_sub_u32 v11, s[sgprWorkGroup0], v11              // r = idx - q * numWG0
v_cmpx_eq_u32 exec, v11, s[sgprNumWorkGroups0]     // lanes where r == numWG0 (q is one too small)
v_add_u32 v10, 1, v10                              // q += 1
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s[sgprNumWorkGroups0]     // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s52, v10                       // quotient
s_mov_b32 s[sgprWorkGroup1], s52
/* wg0 = idxWG01 - wg1 * numWG0 */
s_mul_i32 s52, s[sgprWorkGroup1], s[sgprNumWorkGroups0]
s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s52

/* SGPR assignments for the global-read phase. These indices overlap s51..s61, */
/* which the setup code above used as scratch, so they are only valid from */
/* this point on. */
.set sgprSrdA, 52
.set sgprSrdB, 56
.set sgprShadowLimitA, 60
.set sgprShadowLimitB, 62
.set sgprStaggerUIter, 51
.set sgprWrapUA, 64
.set sgprWrapUB, 66
.set sgprGlobalReadIncsA, 68
.set sgprGlobalReadIncsB, 69
.set sgprScalarGlobalReadOffsetA, 70
.set sgprScalarGlobalReadOffsetB, 77
/* Move the A and B base addresses back by 16 bytes (64-bit subtract with borrow). */
/* 16 bytes = SrdShiftLeftA/B (8 elements) * 2 bytes per element. */
s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift
s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift
s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift
s_subb_u32 s[sgprAddressB+1], s[sgprAddressB+1], 0 // pre-pad to make room for possible pointer shift

/* Short circuit condition if Alpha == 0, then sumDims=0 */
/* Zeroes the summation size (SizesSum+0, i.e. SizeL) when alpha compares equal to 0.0. */
v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0                // s[Alpha] == 0.0f ?
s_cbranch_vccz label_AlphaNonZero                  // vcc == 0 -> alpha != 0: keep the summation size
s_mov_b32 s[sgprSizesSum+0], 0                     // Set summation dim=0 if Alpha == 0
label_AlphaNonZero:

/******************************************/
/* Begin setupNewTile                     */
/******************************************/

/* global read addresses: work-group */
/* graWorkGroup mapping */
/* With gsu = GSU & 0x3fff: */
/*   gsu == 1            -> branch to label_GSU, nothing to split */
/*   GSU & 0x4000 != 0   -> branch to label_GSUWGMRR */
/*   otherwise           -> GSUSumIdx = WorkGroup1 % gsu; WorkGroup1 = WorkGroup1 / gsu */
/* Scratch: s84, v10, v11. */
s_and_b32 s84, s[sgprGSU], 0x3fff                  // Restore GSU
s_cmp_eq_u32 s84, 1                                // GSU == 1 ?
s_cbranch_scc1 label_GSU                           // branch if GSU == 1
// GSU-not-WGMapRR :nwg1 = (size1J + MT1J - 1) / MT1J;
s_and_b32 s84, s[sgprGSU], 0x4000                  // SCC = (GSUWGMRR == 1) ?
s_cbranch_scc1 label_GSUWGMRR                      // branch if GSUWGMRR == 1
s_and_b32 s84, s[sgprGSU], 0x3fff                  // Restore GSU
v_cvt_f32_u32 v10, s84                             // v10 = float(gsu)
v_rcp_iflag_f32 v10, v10                           // v10 = 1.0 / gsu (estimate)
v_cvt_f32_u32 v11, s[sgprWorkGroup1]               // v11 = float(WorkGroup1)
v_mul_f32 v10, v10, v11                            // v10 = WorkGroup1 / gsu (estimate)
v_cvt_u32_f32 v10, v10                             // q = quotient estimate
v_mul_u32_u24 v11, v10, s84                        // v11 = q * gsu (24-bit multiply)
v_sub_u32 v11, s[sgprWorkGroup1], v11              // r = WorkGroup1 - q * gsu
v_cmpx_eq_u32 exec, v11, s84                       // lanes where r == gsu (q is one too small)
v_add_u32 v10, 1, v10                              // q += 1
v_mov_b32 v11, 0                                   // r = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s84                       // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
v_mul_u32_u24 v11, v10, s84                        // re-calculate remainder
v_sub_u32 v11, s[sgprWorkGroup1], v11              // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup1], v10         // quotient
v_readfirstlane_b32 s[sgprGSUSumIdx], v11          // remainder
s_branch label_GSUWGMRR_End
label_GSUWGMRR:
v_cvt_f32_u32 v10, s[sgprNumWorkGroups1]           // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1]
v_rcp_iflag_f32 v10, v10                           // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1]
v_cvt_f32_u32 v11, s[sgprWorkGroup1]               // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1]
v_mul_f32 v10, v10, v11                            // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1]
v_cvt_u32_f32 v10, v10                             // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1]
v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups1]      // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1]
v_sub_u32 v11, s[sgprWorkGroup1], v11              // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1]
v_cmpx_eq_u32 exec, v11, s[sgprNumWorkGroups1]     // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1]
v_add_u32 v10, 1, v10                              // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1]
v_mov_b32 v11, 0                                   // s[sgprWorkGroup1] = s[sgprWorkGroup1] % s[sgprNumWorkGroups1]
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s[sgprNumWorkGroups1]     // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
v_mul_u32_u24 v11, v10, s[sgprNumWorkGroups1]      // re-calculate remainder
v_sub_u32 v11, s[sgprWorkGroup1], v11              // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprGSUSumIdx], v10          // quotient
v_readfirstlane_b32 s[sgprWorkGroup1], v11         // remainder
label_GSUWGMRR_End:
s_mov_b32 s[sgprGSULog2BpeC], 1
s_mov_b32 s[sgprGSULog2BpeD], 2
s_branch label_GSU_End
label_GSU:
s_mov_b64 s[sgprGSUSumIdx:sgprGSUSumIdx+1], 0      // Set GSUSumIdx to 0
s_mov_b32 s[sgprGSULog2BpeC], 1
s_mov_b32 s[sgprGSULog2BpeD], 1
label_GSU_End:
/* Work-group remapping controlled by WGM (sign-extended from its low 16 bits).
 *   WGM > 1       : handled at label_WGMPositive
 *   WGM == 0 or 1 : no remapping, jump straight to label_WGM
 *   WGM < 0       : this path, using |WGM| and grouping along WorkGroup0:
 *       blockId  (s86) = wg0 / WGM
 *       wgSerial (s87) = (wg0 % WGM) * NumWorkGroups1 + wg1
 *       width    (s84) = WGM for full blocks; for blockId >= NumWorkGroups0 / WGM it is
 *                        NumWorkGroups0 % WGM (or WGM when that remainder is 0)
 *       wg1 = wgSerial / width
 *       wg0 = wgSerial % width + blockId * WGM
 * Each division uses the float-reciprocal estimate followed by +1 / -1 correction
 * under a temporary exec mask (v10 = quotient, v11 = remainder). */
s_sext_i32_i16 s[sgprWGM], s[sgprWGM]              // Restore WGM
s_cmp_gt_i32 s[sgprWGM], 1                         // WGM > 1 ?
s_cbranch_scc1 label_WGMPositive                   // branch if WGM > 1
s_cmp_ge_i32 s[sgprWGM], 0                         // WGM >= 0 ?
s_cbranch_scc1 label_WGM                           // branch if WGM >= 0
s_abs_i32 s[sgprWGM], s[sgprWGM]                   // abs(WGM)
/* blockId = wg0 / WGM */
v_cvt_f32_u32 v10, s[sgprWGM]                      // float(divisor = WGM)
v_rcp_iflag_f32 v10, v10                           // 1 / divisor
v_cvt_f32_u32 v11, s[sgprWorkGroup0]               // float(dividend = wg0)
v_mul_f32 v10, v10, v11                            // dividend * (1 / divisor)
v_cvt_u32_f32 v10, v10                             // estimated quotient
v_mul_u32_u24 v11, v10, s[sgprWGM]                 // quotient * divisor
v_sub_u32 v11, s[sgprWorkGroup0], v11              // remainder
v_cmpx_eq_u32 exec, v11, s[sgprWGM]                // lanes where remainder == divisor
v_add_u32 v10, 1, v10                              // quotient + 1
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s[sgprWGM]                // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s86, v10                       // quotient
s_mul_i32 s87, s86, s[sgprWGM]                     // quotient * non-magic divisor
s_sub_u32 s87, s[sgprWorkGroup0], s87              // WorkGroup0=remainder
s_mul_i32 s87, s87, s[sgprNumWorkGroups1]          // (wg0 % WGM)*NumWorkGroups1
s_add_u32 s87, s87, s[sgprWorkGroup1]              // wgSerial = wg1 + (wg0 % WGM)*NumWorkGroups1
/* numFullBlocks (s84) = NumWorkGroups0 / WGM, s85 = NumWorkGroups0 % WGM */
v_cvt_f32_u32 v10, s[sgprWGM]                      // float(divisor = WGM)
v_rcp_iflag_f32 v10, v10                           // 1 / divisor
v_cvt_f32_u32 v11, s[sgprNumWorkGroups0]           // float(dividend = NumWorkGroups0)
v_mul_f32 v10, v10, v11                            // dividend * (1 / divisor)
v_cvt_u32_f32 v10, v10                             // estimated quotient
v_mul_u32_u24 v11, v10, s[sgprWGM]                 // quotient * divisor
v_sub_u32 v11, s[sgprNumWorkGroups0], v11          // remainder
v_cmpx_eq_u32 exec, v11, s[sgprWGM]                // lanes where remainder == divisor
v_add_u32 v10, 1, v10                              // quotient + 1
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s[sgprWGM]                // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s84, v10                       // quotient
s_mul_i32 s85, s[sgprWGM], s84                     // quotient * non-magic divisor
s_sub_u32 s85, s[sgprNumWorkGroups0], s85          // NumWorkGroups0=remainder
s_cmp_eq_u32 s85, 0                                // remainder == 0 ?
s_cmov_b32 s85, s[sgprWGM]                         // remainder = WGM if remainder == 0
s_cmp_ge_u32 s86, s84                              // blockId >= numFullBlocks ?
s_cselect_b32 s84, s85, s[sgprWGM]                 // width = remainder for the last block, else WGM
/* wg1 = wgSerial / width, wg0 = wgSerial % width */
v_cvt_f32_u32 v10, s84                             // float(divisor = width)
v_rcp_iflag_f32 v10, v10                           // 1 / divisor
v_cvt_f32_u32 v11, s87                             // float(dividend = wgSerial)
v_mul_f32 v10, v10, v11                            // dividend * (1 / divisor)
v_cvt_u32_f32 v10, v10                             // estimated quotient
v_mul_u32_u24 v11, v10, s84                        // quotient * divisor
v_sub_u32 v11, s87, v11                            // remainder
v_cmpx_eq_u32 exec, v11, s84                       // lanes where remainder == divisor
v_add_u32 v10, 1, v10                              // quotient + 1
v_mov_b32 v11, 0                                   // remainder = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s84                       // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
v_mul_u32_u24 v11, v10, s84                        // re-calculate remainder
v_sub_u32 v11, s87, v11                            // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup1], v10         // quotient
v_readfirstlane_b32 s[sgprWorkGroup0], v11         // remainder (overwritten by the scalar recomputation on the next two lines)
s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s84 // quotient * non-magic divisor
s_sub_u32 s[sgprWorkGroup0], s87, s[sgprWorkGroup0] // WorkGroup0=remainder
s_mul_i32 s86, s86, s[sgprWGM]                     // blockId * WGM
s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s86 // wg0 += blockId * WGM
s_branch label_WGM
/* WGM > 1: same remapping as the negative-WGM path with the roles of the two
 * work-group indices swapped (grouping along WorkGroup1):
 *       blockId  (s86) = wg1 / WGM
 *       wgSerial (s87) = (wg1 % WGM) * NumWorkGroups0 + wg0
 *       width    (s84) = WGM for full blocks; for blockId >= NumWorkGroups1 / WGM it is
 *                        NumWorkGroups1 % WGM (or WGM when that remainder is 0)
 *       wg0 = wgSerial / width
 *       wg1 = wgSerial % width + blockId * WGM
 * Each division uses the float-reciprocal estimate followed by +1 / -1 correction
 * under a temporary exec mask (v10 = quotient, v11 = remainder). */
label_WGMPositive:
/* blockId = wg1 / WGM */
v_cvt_f32_u32 v10, s[sgprWGM]                      // float(divisor = WGM)
v_rcp_iflag_f32 v10, v10                           // 1 / divisor
v_cvt_f32_u32 v11, s[sgprWorkGroup1]               // float(dividend = wg1)
v_mul_f32 v10, v10, v11                            // dividend * (1 / divisor)
v_cvt_u32_f32 v10, v10                             // estimated quotient
v_mul_u32_u24 v11, v10, s[sgprWGM]                 // quotient * divisor
v_sub_u32 v11, s[sgprWorkGroup1], v11              // remainder
v_cmpx_eq_u32 exec, v11, s[sgprWGM]                // lanes where remainder == divisor
v_add_u32 v10, 1, v10                              // quotient + 1
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s[sgprWGM]                // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s86, v10                       // quotient
s_mul_i32 s87, s86, s[sgprWGM]                     // quotient * non-magic divisor
s_sub_u32 s87, s[sgprWorkGroup1], s87              // WorkGroup1=remainder
s_mul_i32 s87, s87, s[sgprNumWorkGroups0]          // (wg1 % WGM)*NumWorkGroups0
s_add_u32 s87, s87, s[sgprWorkGroup0]              // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0
/* numFullBlocks (s84) = NumWorkGroups1 / WGM, s85 = NumWorkGroups1 % WGM */
v_cvt_f32_u32 v10, s[sgprWGM]                      // float(divisor = WGM)
v_rcp_iflag_f32 v10, v10                           // 1 / divisor
v_cvt_f32_u32 v11, s[sgprNumWorkGroups1]           // float(dividend = NumWorkGroups1)
v_mul_f32 v10, v10, v11                            // dividend * (1 / divisor)
v_cvt_u32_f32 v10, v10                             // estimated quotient
v_mul_u32_u24 v11, v10, s[sgprWGM]                 // quotient * divisor
v_sub_u32 v11, s[sgprNumWorkGroups1], v11          // remainder
v_cmpx_eq_u32 exec, v11, s[sgprWGM]                // lanes where remainder == divisor
v_add_u32 v10, 1, v10                              // quotient + 1
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s[sgprWGM]                // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s84, v10                       // quotient
s_mul_i32 s85, s[sgprWGM], s84                     // quotient * non-magic divisor
s_sub_u32 s85, s[sgprNumWorkGroups1], s85          // NumWorkGroups1=remainder
s_cmp_eq_u32 s85, 0                                // remainder == 0 ?
s_cmov_b32 s85, s[sgprWGM]                         // remainder = WGM if remainder == 0
s_cmp_ge_u32 s86, s84                              // blockId >= numFullBlocks ?
s_cselect_b32 s84, s85, s[sgprWGM]                 // width = remainder for the last block, else WGM
/* wg0 = wgSerial / width, wg1 = wgSerial % width */
v_cvt_f32_u32 v10, s84                             // float(divisor = width)
v_rcp_iflag_f32 v10, v10                           // 1 / divisor
v_cvt_f32_u32 v11, s87                             // float(dividend = wgSerial)
v_mul_f32 v10, v10, v11                            // dividend * (1 / divisor)
v_cvt_u32_f32 v10, v10                             // estimated quotient
v_mul_u32_u24 v11, v10, s84                        // quotient * divisor
v_sub_u32 v11, s87, v11                            // remainder
v_cmpx_eq_u32 exec, v11, s84                       // lanes where remainder == divisor
v_add_u32 v10, 1, v10                              // quotient + 1
v_mov_b32 v11, 0                                   // remainder = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v11, s84                       // overflow happened in remainder
v_sub_u32 v10, v10, 1                              // quotient - 1
v_mul_u32_u24 v11, v10, s84                        // re-calculate remainder
v_sub_u32 v11, s87, v11                            // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup0], v10         // quotient
v_readfirstlane_b32 s[sgprWorkGroup1], v11         // remainder (overwritten by the scalar recomputation on the next two lines)
s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s84 // quotient * non-magic divisor
s_sub_u32 s[sgprWorkGroup1], s87, s[sgprWorkGroup1] // WorkGroup1=remainder
s_mul_i32 s86, s86, s[sgprWGM]                     // blockId * WGM
s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s86 // wg1 += blockId * WGM
label_WGM:

/* global read addresses: tile offset assignment a */
/* graTileAssignmentA = v4 */

/* global read addresses: tile offset assignment b */
/* graTileAssignmentB = v6 */

/* global read addresses: unroll assignment a */
/* v5 */

/* global read addresses: unroll assignment b */
/* v7 */

/* global read addresses: other free assignments */
/* s[sgprWorkGroup2] */

/* global read addresses: tile offsets a */

/* global read addresses: tile offsets b */

/* global read addresses: unroll offsets a */

/* global read addresses: unroll offsets b */

/* global read addresses: final offsets a */
/* GLOBAL_OFFSET_A is a macro defined outside this section; it is invoked once to fill
 * vgprGlobalReadOffsetA+0 (arguments 5, 4, 10 are VGPR indices - presumably unroll
 * index, tile index and a temporary; confirm against the macro definition).
 * The seven scalar offsets that follow are k * 32 * StrideA0I elements for k = 1..7,
 * each shifted left by 1 to convert elements to bytes. */
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+0,  5,  4, 10 // gROA_0_0_0_0
s_mul_i32 s[sgprScalarGlobalReadOffsetA+0], s[sgprStrideA0I], 32 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetA+0], s[sgprScalarGlobalReadOffsetA+0], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+1], s[sgprStrideA0I], 64 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetA+1], s[sgprScalarGlobalReadOffsetA+1], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+2], s[sgprStrideA0I], 96 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetA+2], s[sgprScalarGlobalReadOffsetA+2], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+3], s[sgprStrideA0I], 128 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetA+3], s[sgprScalarGlobalReadOffsetA+3], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+4], s[sgprStrideA0I], 160 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetA+4], s[sgprScalarGlobalReadOffsetA+4], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+5], s[sgprStrideA0I], 192 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetA+5], s[sgprScalarGlobalReadOffsetA+5], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+6], s[sgprStrideA0I], 224 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetA+6], s[sgprScalarGlobalReadOffsetA+6], 0x1 // scalar offset *= bytes/element

/* global read addresses: final offsets b */
/* GLOBAL_OFFSET_B is a macro defined outside this section; it is invoked once to fill
 * vgprGlobalReadOffsetB+0 (arguments 7, 6, 10 are VGPR indices - presumably unroll
 * index, tile index and a temporary; confirm against the macro definition).
 * The seven scalar offsets that follow are k * 32 * StrideB1J elements for k = 1..7,
 * each shifted left by 1 to convert elements to bytes. */
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+0,  7,  6, 10 // gROB_0_0_0_0
s_mul_i32 s[sgprScalarGlobalReadOffsetB+0], s[sgprStrideB1J], 32 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetB+0], s[sgprScalarGlobalReadOffsetB+0], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetB+1], s[sgprStrideB1J], 64 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetB+1], s[sgprScalarGlobalReadOffsetB+1], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetB+2], s[sgprStrideB1J], 96 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetB+2], s[sgprScalarGlobalReadOffsetB+2], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetB+3], s[sgprStrideB1J], 128 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetB+3], s[sgprScalarGlobalReadOffsetB+3], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetB+4], s[sgprStrideB1J], 160 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetB+4], s[sgprScalarGlobalReadOffsetB+4], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetB+5], s[sgprStrideB1J], 192 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetB+5], s[sgprScalarGlobalReadOffsetB+5], 0x1 // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim)
s_lshl_b32 s[sgprScalarGlobalReadOffsetB+6], s[sgprScalarGlobalReadOffsetB+6], 0x1 // scalar offset *= bytes/element

/* global read addresses: addresses a */
/* max read offset = size[n] * stride[n-1] */
/* Builds the A resource descriptor s[sgprSrdA+0..+3] and the 64-bit shadow limit.
 *   tileStart (s[86:87], elements) = WorkGroup0 * 256 * StrideA0I + gsuOffset
 *   gsuOffset (s[84:85]) = 64 * GSUSumIdx                       when the GSUC flag is clear
 *                        = 64 * (iterations owned by lower GSUSumIdx values) when it is set
 *   ShadowLimitA (bytes) = (1 + constStrideAL*(SizeL-1) + StrideA0I*(SizeI-1) - tileStart) * 2 + 16
 *   SrdA+2 = low dword of ShadowLimitA when its high dword is 0, otherwise BufferLimit
 *   SrdA base = AddressA + (tileStart + StrideAK * WorkGroup2) * 2
 * The limit is taken before the WorkGroup2 term is added to tileStart.
 * constStrideAL, BufferLimit and Srd127_96 are symbols defined outside this section.
 * NOTE(review): the high half of WorkGroup0 * 256 written to s87 by the first s_mul_hi_u32
 * is overwritten by the second one without being read, so only the low 32 bits of
 * WorkGroup0 * 256 contribute to tileStart. */
s_mul_hi_u32 s87, s[sgprWorkGroup0], 256           // WorkGroup[01] * MT
s_mul_i32 s86, s[sgprWorkGroup0], 256              // WorkGroup[01] * MT
s_mul_hi_u32 s87, s86, s[sgprStrideA0I]            // tlu=0, scaled tile-offset by stride
s_mul_i32 s86, s86, s[sgprStrideA0I]               // tlu=0, scaled tile-offset by stride
s_and_b32 s84, s[sgprGSU], 0x8000                  // SCC = (GSUC == 1) ?
s_cbranch_scc1 label_GSUC_A                        // branch if GSUC == 1
s_mul_hi_u32 s85, 64, s[sgprGSUSumIdx]             // gsuOffset = DepthU*GSUSumIdx
s_mul_i32 s84, 64, s[sgprGSUSumIdx]                // gsuOffset = DepthU*GSUSumIdx
s_branch label_GSUC_A_End
label_GSUC_A:
/* GSUC path: split SizesSum / 64 iterations over GSU groups.
 * quotient -> LoopCounterL, remainder -> GSUSumIdx+1 (float-reciprocal division with
 * +1 / -1 correction under a temporary exec mask; v4 = quotient, v5 = remainder). */
s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // s[LoopCounterL] = s[sgprSizesSum] / 64
s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff   // Restore GSU
v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1]               // float(divisor = GSU)
v_rcp_iflag_f32 v4, v4                             // 1 / divisor
v_cvt_f32_u32 v5, s[sgprLoopCounterL]              // float(dividend = LoopCounterL)
v_mul_f32 v4, v4, v5                               // dividend * (1 / divisor)
v_cvt_u32_f32 v4, v4                               // estimated quotient
v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1]           // quotient * divisor
v_sub_u32 v5, s[sgprLoopCounterL], v5              // remainder
v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1]         // lanes where remainder == divisor
v_add_u32 v4, 1, v4                                // quotient + 1
v_mov_b32 v5, 0                                    // remainder = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1]         // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1]           // re-calculate remainder
v_sub_u32 v5, s[sgprLoopCounterL], v5              // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprLoopCounterL], v4        // quotient
v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5         // remainder
s_mul_i32 s85, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx
s_add_u32 s84, 1, s[sgprLoopCounterL]              // quotient+1
s_add_u32 s85, s85, s[sgprGSUSumIdx+1]             // quotient*GSUSumIdx+remainder
s_mul_i32 s84, s84, s[sgprGSUSumIdx]               // (quotient+1)*GSUSumIdx
s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1]  // gsuSumIdx < numIterPerWgRemainder
s_cselect_b32 s84, s84, s85                        // (quotient+1)*GSUSumIdx if needed
s_mul_hi_u32 s85, s84, 64                          // gsuOffset = DepthU*accumulatedNumOfLoopCounterL
s_mul_i32 s84, s84, 64                             // gsuOffset = DepthU*accumulatedNumOfLoopCounterL
label_GSUC_A_End:
s_add_u32 s86, s86, s84                            // accum GsuOffset term to tilestart
s_addc_u32 s87, s87, s85                           // accum GsuOffset term to tilestart
s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size
s_sub_u32 s84, s[sgprSizeL], 1                     // (size-1)
s_mul_hi_u32 s85, constStrideAL, s84               // stride x (size-1)
s_mul_i32 s84, constStrideAL, s84                  // stride x (size-1)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size
s_sub_u32 s84, s[sgprSizeI], 1                     // (size-1)
s_mul_hi_u32 s85, s[sgprStrideA0I], s84            // stride x (size-1)
s_mul_i32 s84, s[sgprStrideA0I], s84               // stride x (size-1)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // sum tensor size
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // sum tensor size
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s86 // sub tileStart
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s87 // sub tileStart
s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 0x1 // Set limit to use bytes
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
s_mul_hi_u32 s85, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG
s_mul_i32 s84, s[sgprStrideAK], s[sgprWorkGroup2]  // Stride*WG
s_add_u32 s86, s86, s84                            // accum wg term to tilestart
s_addc_u32 s87, s87, s85                           // accum wg term to tilestart
s_lshl_b64 s[86:87], s[86:87], 1                   // tileStart *= BPE
s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s86    // SRD base = Address+ tileStart0
s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s87   // SRD base = Address+ tileStart1
s_mov_b32 s[sgprSrdA+3], Srd127_96                 // Set bits 127_96 in SRD

/* global read addresses: addresses b */
/* max read offset = size[n] * stride[n-1] */
/* Builds the B resource descriptor s[sgprSrdB+0..+3] and the 64-bit shadow limit.
 *   tileStart (s[86:87], elements) = WorkGroup1 * 256 * StrideB1J + gsuOffset
 *   gsuOffset (s[84:85]) = 64 * GSUSumIdx                       when the GSUC flag is clear
 *                        = 64 * (iterations owned by lower GSUSumIdx values) when it is set
 *   ShadowLimitB (bytes) = (1 + constStrideBL*(SizeL-1) + StrideB1J*(SizeJ-1) - tileStart) * 2 + 16
 *   SrdB+2 = low dword of ShadowLimitB when its high dword is 0, otherwise BufferLimit
 *   SrdB base = AddressB + (tileStart + StrideBK * WorkGroup2) * 2
 * The limit is taken before the WorkGroup2 term is added to tileStart.
 * constStrideBL, BufferLimit and Srd127_96 are symbols defined outside this section.
 * NOTE(review): the high half of WorkGroup1 * 256 written to s87 by the first s_mul_hi_u32
 * is overwritten by the second one without being read, so only the low 32 bits of
 * WorkGroup1 * 256 contribute to tileStart. */
s_mul_hi_u32 s87, s[sgprWorkGroup1], 256           // WorkGroup[01] * MT
s_mul_i32 s86, s[sgprWorkGroup1], 256              // WorkGroup[01] * MT
s_mul_hi_u32 s87, s86, s[sgprStrideB1J]            // tlu=0, scaled tile-offset by stride
s_mul_i32 s86, s86, s[sgprStrideB1J]               // tlu=0, scaled tile-offset by stride
s_and_b32 s84, s[sgprGSU], 0x8000                  // SCC = (GSUC == 1) ?
s_cbranch_scc1 label_GSUC_B                        // branch if GSUC == 1
s_mul_hi_u32 s85, 64, s[sgprGSUSumIdx]             // gsuOffset = DepthU*GSUSumIdx
s_mul_i32 s84, 64, s[sgprGSUSumIdx]                // gsuOffset = DepthU*GSUSumIdx
s_branch label_GSUC_B_End
label_GSUC_B:
/* GSUC path: split SizesSum / 64 iterations over GSU groups.
 * quotient -> LoopCounterL, remainder -> GSUSumIdx+1 (float-reciprocal division with
 * +1 / -1 correction under a temporary exec mask; v4 = quotient, v5 = remainder). */
s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // s[LoopCounterL] = s[sgprSizesSum] / 64
s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff   // Restore GSU
v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1]               // float(divisor = GSU)
v_rcp_iflag_f32 v4, v4                             // 1 / divisor
v_cvt_f32_u32 v5, s[sgprLoopCounterL]              // float(dividend = LoopCounterL)
v_mul_f32 v4, v4, v5                               // dividend * (1 / divisor)
v_cvt_u32_f32 v4, v4                               // estimated quotient
v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1]           // quotient * divisor
v_sub_u32 v5, s[sgprLoopCounterL], v5              // remainder
v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1]         // lanes where remainder == divisor
v_add_u32 v4, 1, v4                                // quotient + 1
v_mov_b32 v5, 0                                    // remainder = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1]         // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1]           // re-calculate remainder
v_sub_u32 v5, s[sgprLoopCounterL], v5              // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprLoopCounterL], v4        // quotient
v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5         // remainder
s_mul_i32 s85, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx
s_add_u32 s84, 1, s[sgprLoopCounterL]              // quotient+1
s_add_u32 s85, s85, s[sgprGSUSumIdx+1]             // quotient*GSUSumIdx+remainder
s_mul_i32 s84, s84, s[sgprGSUSumIdx]               // (quotient+1)*GSUSumIdx
s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1]  // gsuSumIdx < numIterPerWgRemainder
s_cselect_b32 s84, s84, s85                        // (quotient+1)*GSUSumIdx if needed
s_mul_hi_u32 s85, s84, 64                          // gsuOffset = DepthU*accumulatedNumOfLoopCounterL
s_mul_i32 s84, s84, 64                             // gsuOffset = DepthU*accumulatedNumOfLoopCounterL
label_GSUC_B_End:
s_add_u32 s86, s86, s84                            // accum GsuOffset term to tilestart
s_addc_u32 s87, s87, s85                           // accum GsuOffset term to tilestart
s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size
s_sub_u32 s84, s[sgprSizeL], 1                     // (size-1)
s_mul_hi_u32 s85, constStrideBL, s84               // stride x (size-1)
s_mul_i32 s84, constStrideBL, s84                  // stride x (size-1)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size
s_sub_u32 s84, s[sgprSizeJ], 1                     // (size-1)
s_mul_hi_u32 s85, s[sgprStrideB1J], s84            // stride x (size-1)
s_mul_i32 s84, s[sgprStrideB1J], s84               // stride x (size-1)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // sum tensor size
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // sum tensor size
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s86 // sub tileStart
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s87 // sub tileStart
s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], 0x1 // Set limit to use bytes
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
s_mul_hi_u32 s85, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG
s_mul_i32 s84, s[sgprStrideBK], s[sgprWorkGroup2]  // Stride*WG
s_add_u32 s86, s86, s84                            // accum wg term to tilestart
s_addc_u32 s87, s87, s85                           // accum wg term to tilestart
s_lshl_b64 s[86:87], s[86:87], 1                   // tileStart *= BPE
s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s86    // SRD base = Address+ tileStart0
s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s87   // SRD base = Address+ tileStart1
s_mov_b32 s[sgprSrdB+3], Srd127_96                 // Set bits 127_96 in SRD

/* global read addresses: increments a */
/* Per-iteration byte increment for A: DepthU*BpeAGR when the GSUC flag (0x8000) is set,
 * otherwise GSU*DepthU*BpeAGR. The s_and_b32 into s84 is executed for its SCC side
 * effect (SCC = result != 0), which the s_cselect_b32 consumes; it must stay directly
 * after the s_mul_i32, which leaves SCC untouched. DepthU and BpeAGR are assembler
 * symbols defined outside this section. */
s_and_b32 s85, s[sgprGSU], 0x3fff                  // Restore GSU
s_mul_i32 s85, s85, DepthU*BpeAGR                  // GSU*DepthU*Bpe
s_and_b32 s84, s[sgprGSU], 0x8000                  // SCC = (GSUC == 1) ?
s_cselect_b32 s[sgprGlobalReadIncsA+0], DepthU*BpeAGR, s85 // incrA (unrollIdx)

/* global read addresses: increments b */
/* Same selection for B, using BpeBGR. */
s_and_b32 s85, s[sgprGSU], 0x3fff                  // Restore GSU
s_mul_i32 s85, s85, DepthU*BpeBGR                  // GSU*DepthU*Bpe
s_and_b32 s84, s[sgprGSU], 0x8000                  // SCC = (GSUC == 1) ?
s_cselect_b32 s[sgprGlobalReadIncsB+0], DepthU*BpeBGR, s85 // incrB (unrollIdx)
/* declare loop num iterations */
/* LoopCounterL = SizesSum[0] / 64. When GSU != 1 the count is divided by GSU
 * (quotient -> LoopCounterL, remainder -> GSUSumIdx+1) and work-groups whose GSUSumIdx
 * is below the remainder take one extra iteration. The result is copied to
 * OrigLoopCounter. The division is the float-reciprocal estimate with +1 / -1
 * correction under a temporary exec mask (v4 = quotient, v5 = remainder). */
s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum+0], 6 // s[sgprLoopCounterL] = s[sgprSizesSum+0] / 64
s_and_b32 s84, s[sgprGSU], 0x3fff                  // Restore GSU
s_cmp_eq_u32 s84, 1                                // GSU == 1 ?
s_cbranch_scc1 label_GSU_1                         // branch if GSU == 1
s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff   // Restore GSU
v_cvt_f32_u32 v4, s[sgprGSUSumIdx+1]               // float(divisor = GSU)
v_rcp_iflag_f32 v4, v4                             // 1 / divisor
v_cvt_f32_u32 v5, s[sgprLoopCounterL]              // float(dividend = LoopCounterL)
v_mul_f32 v4, v4, v5                               // dividend * (1 / divisor)
v_cvt_u32_f32 v4, v4                               // estimated quotient
v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1]           // quotient * divisor
v_sub_u32 v5, s[sgprLoopCounterL], v5              // remainder
v_cmpx_eq_u32 exec, v5, s[sgprGSUSumIdx+1]         // lanes where remainder == divisor
v_add_u32 v4, 1, v4                                // quotient + 1
v_mov_b32 v5, 0                                    // remainder = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s[sgprGSUSumIdx+1]         // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
v_mul_u32_u24 v5, v4, s[sgprGSUSumIdx+1]           // re-calculate remainder
v_sub_u32 v5, s[sgprLoopCounterL], v5              // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprLoopCounterL], v4        // quotient
v_readfirstlane_b32 s[sgprGSUSumIdx+1], v5         // remainder
s_add_u32 s84, 1, s[sgprLoopCounterL]              // tmp<-numIterMyWg+1
s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1]  // gsuSumIdx < numIterPerWgRemainder
s_cmov_b32 s[sgprLoopCounterL], s84                // numIterMyWg++ if needed
label_GSU_1:
s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter
/* Decode the packed StaggerU argument and derive the stagger start iteration.
 *   bits 12:8  (0x1f00) -> s86, used as the shift amount ("StaggerUStride")
 *   bits 15:13 (0xe000) -> s87, mapping selector (kept in place, i.e. 0x0000/0x2000/...)
 *   bits  7:0  (0x00ff) -> StaggerU itself
 * The loop halves s84 (starting from StaggerU) until OrigLoopCounter >= (s84 << s86);
 * it terminates at the latest when s84 reaches 0. StaggerUIter is then set to the mask
 * s84 - 1 (or 0 when s84 == 0), ANDed with the selected stagger input and shifted left
 * by s86.
 *
 * NOTE(review): as written, every test in the mapping chain branches to the NEXT label
 * when the compare is EQUAL (s_cbranch_scc1). Consequences visible from this code:
 *   - s87 == 0      : jumps to label_StaggerUMapping_1, fails the 0x2000 compare there,
 *                     and uses WorkGroup1 as the stagger input.
 *   - s87 != 0      : falls through the first test and uses WorkGroup0.
 *   - label_StaggerUMapping_2, _3 and _4 can only be entered when s87 equals two
 *     different constants in a row, so they are unreachable.
 * The s84 accumulated in the label_StaggerUMapping_3 body is also not initialised by
 * that body (it still holds the value left by the loop above). This looks like inverted
 * branch conditions in the generator; confirm the intended mapping before changing it,
 * since altering it changes the memory access order of the kernel. */
s_and_b32 s86, s[sgprStaggerU], 0x1f00             // extract stride-shift field
s_lshr_b32 s86, s86, 0x8                           // s86 = StaggerUStride shift amount
s_and_b32 s87, s[sgprStaggerU], 0xe000             // s87 = mapping selector (unshifted)
s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff   // keep only the StaggerU value
s_mov_b32 s84, s[sgprStaggerU]                     // init staggerU
label_beginStaggerUIter:
s_lshl_b32 s85, s84, s86                           // shift by StaggerUStride
s_cmp_ge_u32 s[sgprOrigLoopCounter], s85           // loopCount >= current shift Count
s_cbranch_scc1 label_endStaggerUIter               // jump to end
s_lshr_b32 s84, s84, 1                             // step down to smaller stagger
s_branch label_beginStaggerUIter                   // jump to begin
label_endStaggerUIter:
s_sub_u32 s85, s84, 1                              // staggerU mask
s_cmp_ge_u32 s84, 1                                // if current staggerU >= 1
s_cselect_b32 s[sgprStaggerUIter], s85, 0          // set Mask
s_cmp_eq_u32 s87, 0x0                              // selector == 0 ?
s_cbranch_scc1 label_StaggerUMapping_1             // equal -> next test (see NOTE above)
s_mov_b32 s84, s[sgprWorkGroup0]                   // stagger input = WorkGroup0
s_branch label_staggerInputEnd
label_StaggerUMapping_1:
s_cmp_eq_u32 s87, 0x2000                           // selector == 1 ?
s_cbranch_scc1 label_StaggerUMapping_2             // equal -> next test (see NOTE above)
s_mov_b32 s84, s[sgprWorkGroup1]                   // stagger input = WorkGroup1
s_branch label_staggerInputEnd
label_StaggerUMapping_2:
s_cmp_eq_u32 s87, 0x4000                           // selector == 2 ?
s_cbranch_scc1 label_StaggerUMapping_3             // equal -> next test (see NOTE above)
s_mov_b32 s84, -0x1                                // stagger input = all ones (mask passes through)
s_branch label_staggerInputEnd
label_StaggerUMapping_3:
s_cmp_eq_u32 s87, 0x6000                           // selector == 3 ?
s_cbranch_scc1 label_StaggerUMapping_4             // equal -> next test (see NOTE above)
s_mul_i32 s85, s[sgprNumWorkGroups0], s[sgprWorkGroup1] // NumWorkGroups0 * wg1
s_add_u32 s84, s84, s85                            // s84 += NumWorkGroups0 * wg1
s_add_u32 s84, s84, s[sgprWorkGroup0]              // s84 += wg0
s_branch label_staggerInputEnd
label_StaggerUMapping_4:
s_cmp_eq_u32 s87, 0x8000                           // selector == 4 ?
s_cbranch_scc1 label_staggerInputEnd               // equal -> keep s84 as is
s_mov_b32 s84, -0x1                                // stagger input = all ones (mask passes through)
s_branch label_staggerInputEnd
label_staggerInputEnd:
s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s84 // Compute actual stagger start for this tile
s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s86 // shift by StaggerUStride

/* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */
/* For A, then identically for B:
 *   s[85:84] = StaggerUIter * GlobalReadIncs (64-bit signed product)
 *   WrapU    = GlobalReadIncs - LoopCounterL * GlobalReadIncs (64-bit);
 *              this is the increment selected on the wrap iteration
 *   Srd base += s[85:84]; ShadowLimit -= s[85:84]
 *   Srd+2 (limit word) = ShadowLimit+0 when the high word of ShadowLimit is 0,
 *              else BufferLimit */
s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset
s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset
s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration
s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1]     // remove one iteration
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */
s_mul_hi_i32 s85, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset
s_mul_i32 s84, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset
s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration
s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1]     // remove one iteration
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // StaggerUIter += 2 (note: an add, not a subtract); it is later compared against the loop counter to detect the wrap iteration
/* local read addresses: init pointers a */

/* localReadInitPointers */
/* local read addresses: init pointers b */

/* localReadInitPointers */

/* prefetch: global -> local */
/* First-tile prefetch. Issues 8 buffer loads for A and 8 for B, all with the
 * `lds` modifier and M0 carrying the LDS write address; M0 advances by 4160
 * bytes between consecutive loads. The B loads are interleaved with the
 * accumulator initialisation below. Nothing in this section waits on vmcnt,
 * so all 16 loads are still outstanding when it ends. */
s_cmp_eq_u32 s[sgprLoopCounterL], 0                // numIter == 0?
s_cbranch_scc1 label_ShadowInitStart               // skip to ShadowInitStart iter b/c numIter==0

s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> LDS 0_0_7_0

s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

// Interleave Init C
// Step 1: zero acc0..acc15 one register at a time.
v_accvgpr_write acc0, 0                            // initC
v_accvgpr_write acc1, 0                            // initC
v_accvgpr_write acc2, 0                            // initC
v_accvgpr_write acc3, 0                            // initC
v_accvgpr_write acc4, 0                            // initC
v_accvgpr_write acc5, 0                            // initC
v_accvgpr_write acc6, 0                            // initC
v_accvgpr_write acc7, 0                            // initC
v_accvgpr_write acc8, 0                            // initC
v_accvgpr_write acc9, 0                            // initC
v_accvgpr_write acc10, 0                           // initC
v_accvgpr_write acc11, 0                           // initC
v_accvgpr_write acc12, 0                           // initC
v_accvgpr_write acc13, 0                           // initC
v_accvgpr_write acc14, 0                           // initC
v_accvgpr_write acc15, 0                           // initC

// Step 2: zero v[6:9] so they can serve as all-zero MFMA inputs.
v_mov_b64 v[6:7], 0
v_mov_b64 v[8:9], 0

// Step 3: each MFMA computes 0*0 + acc[0:15] into a different 16-register
// group, so 15 instructions clear acc[16:255] instead of 240 single writes.
v_mfma_f32_32x32x16_bf16 acc[16:31], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[32:47], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[48:63], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[64:79], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[80:95], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[96:111], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[112:127], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[128:143], v[6:9], v[6:9], acc[0:15]

buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

v_mfma_f32_32x32x16_bf16 acc[144:159], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[160:175], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[176:191], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[192:207], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[208:223], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[224:239], v[6:9], v[6:9], acc[0:15]
v_mfma_f32_32x32x16_bf16 acc[240:255], v[6:9], v[6:9], acc[0:15]

buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> LDS 0_0_7_0

/* global read inc A loopL */
/* Advance the A and B SRDs past the tile just prefetched. The increment is
 * WrapU when StaggerUIter == LoopCounterL + 1, otherwise GlobalReadIncs with a
 * zero high word. ShadowLimit is reduced by the same amount and the SRD limit
 * word is refreshed from it. s84/s85 hold the increment; s86 is reused here as
 * scratch for LoopCounterL + 1. */
s_add_u32 s86, s[sgprLoopCounterL], 1              // remove pf(1)
s_cmp_eq_u32 s[sgprStaggerUIter], s86              // Is this wrapIter? (pf)
s_cselect_b32 s84, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
s_cselect_b32 s85, s[sgprWrapUA+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* global read inc B loopL */
s_add_u32 s86, s[sgprLoopCounterL], 1              // remove pf(1)
s_cmp_eq_u32 s[sgprStaggerUIter], s86              // Is this wrapIter? (pf)
s_cselect_b32 s84, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
s_cselect_b32 s85, s[sgprWrapUB+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32

/******************************************/
/* End setupNewTile                       */
/******************************************/
/* Build the C and D buffer descriptors (SRDs) for this workgroup's output.
 * Word 2 is set to BufferOOB and word 3 to Srd127_96 (both defined elsewhere).
 * The base address is AddressC/AddressD plus two byte offsets:
 *   (MT1 * WorkGroup1) * Stride1J  and  WorkGroup2 * StrideK,
 * each shifted left by GSULog2BpeC / GSULog2BpeD.
 * When (GSU & 0x3fff) != 1, SrdD also receives a per-GSUSumIdx offset.
 * s84..s87 are scratch. Also the branch target for numIter == 0. */
label_ShadowInitStart:
s_mov_b64 s[sgprSrdD+0:sgprSrdD+0+1], s[sgprAddressD+0:sgprAddressD+0+1] // init SRD base address (words 0-1 are overwritten again below)
s_mov_b32 s[sgprSrdD+2], BufferOOB
s_mov_b32 s[sgprSrdD+3], Srd127_96                 // Set bits 127_96 in post-loop SRD

s_mov_b64 s[sgprSrdC+0:sgprSrdC+0+1], s[sgprAddressC+0:sgprAddressC+0+1] // init SRD base address (words 0-1 are overwritten again below)
s_mov_b32 s[sgprSrdC+2], BufferOOB
s_mov_b32 s[sgprSrdC+3], Srd127_96                 // Set bits 127_96 in post-loop SRD


s_mul_i32 s86, MT1, s[sgprWorkGroup1]              // <- wg1*MT1
s_mul_hi_u32 s85, s86, s[sgprStrideC1J]            // ScaleC s86 by Stride
s_mul_i32 s84, s86, s[sgprStrideC1J]               // ScaleC s86 by Stride
s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeC]  // scale by bpe
s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s84    // add lo to SRD
s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s85   // add hi to SRD
s_mul_hi_u32 s85, s86, s[sgprStrideD1J]            // ScaleD s86 by Stride
s_mul_i32 s84, s86, s[sgprStrideD1J]               // ScaleD s86 by Stride
s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeD]  // scale by bpe
s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s84    // add lo to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s85   // add hi to SRD

s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride
s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideCK]  // ScaleC s[sgprWorkGroup2] by Stride
s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeC]  // scale by bpe
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s84        // add lo to SRD
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s85       // add hi to SRD
s_mul_hi_u32 s85, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride
s_mul_i32 s84, s[sgprWorkGroup2], s[sgprStrideDK]  // ScaleD s[sgprWorkGroup2] by Stride
s_lshl_b64 s[84:85], s[84:85], s[sgprGSULog2BpeD]  // scale by bpe
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s84        // add lo to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s85       // add hi to SRD

s_and_b32 s84, s[sgprGSU], 0x3fff                  // extract the GSU value (low 14 bits)
s_cmp_eq_u32 s84, 1                                // GSU == 1 ?
s_cbranch_scc1 label_GSU_2                         // branch if GSU == 1
// GSU output buffer offset added to SrdD, in elements:
//   Free0*GSUSumIdx + ((Free1-1)*GSUSumIdx)*StrideC1J + ((Free2-1)*GSUSumIdx)*StrideCK
// then shifted left by 2 (4 bytes per element).
// Note: the (FreeN-1)*GSUSumIdx products keep only their low 32 bits (s_mul_i32).
s_mul_hi_u32 s85, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0
s_mul_i32 s84, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0
s_sub_u32 s86, s[sgprSizesFree+1], 1               // Free1
s_mul_i32 s86, s86, s[sgprGSUSumIdx]               // Free1
s_mul_hi_u32 s87, s86, s[sgprStrideC1J]            // Free1
s_mul_i32 s86, s86, s[sgprStrideC1J]               // Free1
s_add_u32 s84, s84, s86                            // Free1
s_addc_u32 s85, s85, s87                           // Free1
s_sub_u32 s86, s[sgprSizesFree+2], 1               // Free2
s_mul_i32 s86, s86, s[sgprGSUSumIdx]               // Free2
s_mul_hi_u32 s87, s86, s[sgprStrideCK]             // Free2
s_mul_i32 s86, s86, s[sgprStrideCK]                // Free2
s_add_u32 s84, s84, s86                            // Free2
s_addc_u32 s85, s85, s87                           // Free2
s_lshl_b64 s[84:85], s[84:85], 2                   // scale by bpe
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s84        // add lo GSU offset to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s85       // add hi GSU offset to SRD
label_GSU_2:
// Rebind the two symbols to UNDEF (defined elsewhere); presumably this marks
// the registers as no longer holding these values -- confirm against the
// definition of UNDEF.
.set sgprGSULog2BpeC, UNDEF
.set sgprAddressC, UNDEF

s_cmp_eq_u32 s[sgprLoopCounterL], 0                // numIter == 0?

/* after InitC, skip to end of prefetch last iter if numIter==0 */
/* The jump is done with s_getpc_b64 / s_setpc_b64 rather than a plain
 * s_branch: the target address is built in s[84:85] by adding the
 * assembler-resolved offset of label_PrefetchGlobalLastIterEnd to the PC.
 * s84..s86 are clobbered on the taken path. */
s_cbranch_scc0 label_NoBranch_T8JHFHKM7BO5OHXW     // numIter != 0: fall into the normal prefetch path
s_getpc_b64 s[84:85]                               // addr of next instr
s_add_i32 s86, label_PrefetchGlobalLastIterEnd, 4  // target branch offset
s_add_u32 s84, s84, s86                            // add target branch offset
s_addc_u32 s85, s85, 0                             // add high and carry
s_setpc_b64 s[84:85]                               // branch to label_PrefetchGlobalLastIterEnd
label_NoBranch_T8JHFHKM7BO5OHXW:

/* local write a */

/* local write b */

/* Second-tile prefetch (PGR=2) followed by the local-read prefetch of tile 0.
 *
 * vmcnt accounting (buffer loads complete in issue order):
 *   tile 0 issued earlier : 8 A loads, then 8 B loads
 *   tile 1 issued here    : 8 A loads, then 8 B loads (only if LoopCounterL != 1)
 * With both tiles in flight (32 loads), vmcnt(24) means tile-0 A has landed in
 * LDS and vmcnt(16) means tile-0 B has landed.
 *
 * When LoopCounterL == 1 the tile-1 loads are skipped, so only 16 loads are
 * outstanding and vmcnt(24)/vmcnt(16) are already satisfied without waiting.
 * That path therefore drains with vmcnt(0) first, otherwise the ds_reads below
 * could read LDS before tile 0 has arrived. */

/* local write swap a */
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR

/* local write swap b */
s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR
s_cmp_eq_u32 s[sgprLoopCounterL], 0x1              // PGR=2 but only 1 loop
s_cbranch_scc1 label_skipPGR2                      // PGR=2 but only 1 loop
s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address

buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> LDS 0_0_7_0

s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address

buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> LDS 0_0_7_0


/* local write swap a */
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR

/* local write swap b */
s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR

s_branch label_PGR2Tile1Issued                     // tile 1 is in flight: use the staged waits below

label_skipPGR2:
s_waitcnt vmcnt(0)                                 // only tile 0 (16 loads) is in flight: drain it, vmcnt(24)/vmcnt(16) would not wait

label_PGR2Tile1Issued:

s_waitcnt vmcnt(24)                                // 32 in flight -> tile-0 A loads done (no-op after the drain above)
s_barrier

/* local read prefetch a */
ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0

s_waitcnt vmcnt(16)                                // 32 in flight -> tile-0 B loads done (no-op after the drain above)
s_barrier

/* local read prefetch b */
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0

s_waitcnt lgkmcnt(0)                               // all 16 local reads above have returned
  
/******************************************/
/* Unrolled Loop(s) - Begin               */
/******************************************/
/* Loop entry dispatch on the remaining iteration count:
 *   LoopCounterL == 1 -> label_toPGR1
 *   LoopCounterL <= 2 -> label_LoopEndL (after the test above, this means 2
 *                        on the fall-through path)
 *   otherwise         -> fall through into the unrolled loop body */
label_openLoopL:
s_cmp_eq_u32 s[sgprLoopCounterL], 0x1              // exactly one iteration?
s_cbranch_scc1 label_toPGR1                        // PGR=2 but only 1 loop, toPGR1
s_cmp_le_u32 s[sgprLoopCounterL], 0x2              // LoopCounterL <= 2 ?
s_cbranch_scc1 label_LoopEndL                      // do not enter LoopL
  

// MAIN LOOP MACRO - Shared code between Even/Odd simds
.macro MAINLOOP isOdd

/*  mfmaIndex:0  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0]
ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0
/*  mfmaIndex:1  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0]
/* global read inc A loopL */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
s_cselect_b32 s84, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
/*  mfmaIndex:2  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0]
ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0  
/*  mfmaIndex:3  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0]
s_cselect_b32 s85, s[sgprWrapUA+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s84        // gra SRD += inc(lower)
/*  mfmaIndex:4  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0]
ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0
/*  mfmaIndex:5  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0]
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s85       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s84 // limit -= inc)
/*  mfmaIndex:6  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0]
ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0
/*  mfmaIndex:7  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0]
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
/*  mfmaIndex:8  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0]
ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0
/*  mfmaIndex:9  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0]
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
/* global read inc B loopL */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0]
ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0
/*  mfmaIndex:11  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0]
s_cselect_b32 s84, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
s_cselect_b32 s85, s[sgprWrapUB+1], 0              // incUpper <- ?
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0]
ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0]
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s84        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s85       // gra SRD += inc(upper)
/*  mfmaIndex:14  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0]
ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0
/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0]
s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s84 // limit -= inc)
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0]
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s85 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
/*  mfmaIndex:17  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0]
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0]
s_waitcnt lgkmcnt(0) // wait for A local reads
/*  mfmaIndex:21  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0]
s_barrier

/* MFMAs 22-36, assembled in one of two schedules selected by the macro argument isOdd.               */
/* Both branches contain the same work: 15 MFMAs, five A buffer loads with the lds modifier           */
/* (scalar offsets 0 and ScalarGlobalReadOffsetA+0..+3), five m0 bumps of 4160 bytes, and five        */
/* B local reads into the X1 registers. Only the rotation inside each group of three MFMAs differs:   */
/*   even waves: A load, m0 bump, B local read                                                        */
/*   odd  waves: B local read, A load, m0 bump                                                        */
/* NOTE(review): the rotation presumably staggers LDS and global traffic between waves - confirm.     */
/* The buffer loads carry the lds modifier (DirectToLdsA=True), so data goes to LDS at the address    */
/* held in m0, not to a VGPR; each m0 bump must therefore stay between two consecutive loads.         */
.if \isOdd == 0
////////////////////////////////////////////////////////////////////// EVEN WAVES
/*  mfmaIndex:22  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_0_0
/*  mfmaIndex:23  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
/*  mfmaIndex:24  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:25  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_1_0
/*  mfmaIndex:26  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
/*  mfmaIndex:27  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:28  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_2_0
/*  mfmaIndex:29  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
/*  mfmaIndex:30  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:31  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_3_0
/*  mfmaIndex:32  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
/*  mfmaIndex:33  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:34  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_4_0
/*  mfmaIndex:35  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
/*  mfmaIndex:36  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]
ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0
  
.else
////////////////////////////////////////////////////////////////////// ODD WAVES
/* Same instructions as the even branch, rotated: B local read first, then A load, then m0 bump. */
/*  mfmaIndex:22  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:23  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_0_0
/*  mfmaIndex:24  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:25  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]
ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:26  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_1_0

/*  mfmaIndex:27  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:28  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:29  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_2_0

/*  mfmaIndex:30  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:31  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:32  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_3_0

/*  mfmaIndex:33  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:34  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0
  
/*  mfmaIndex:35  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_4_0
  
/*  mfmaIndex:36  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
  
.endif ////////////////////////////////////////////////////////////////////// END branch
  

/* Straight-line segment shared by even and odd waves: MFMAs 37-51 on the X0 operands, with the   */
/* last three B local reads for buffer=1 (eIdx 5, 6, 7 into ValuB_X1 +20, +24, +28) in between.   */
/* It ends by draining the outstanding LDS reads and synchronizing the waves at s_barrier.        */
/*  mfmaIndex:37  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0]
ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0
/*  mfmaIndex:39  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0]
ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0
/*  mfmaIndex:41  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0]
ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0
  /*  mfmaIndex:43  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0]
  /*  mfmaIndex:47  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0]
s_waitcnt lgkmcnt(0) // wait for all outstanding LDS reads (the B local reads above) before the barrier
/*  mfmaIndex:51  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0]
s_barrier // waves synchronize here before the next group of direct-to-LDS loads

/* MFMAs 52-66, assembled in one of two schedules selected by the macro argument isOdd.               */
/* Both branches contain the same work, in the same relative order:                                   */
/*   - the remaining three A direct-to-LDS loads (ScalarGlobalReadOffsetA+4..+6) with m0 bumps,       */
/*   - m0 retargeted to the B LDS write address,                                                      */
/*   - the first two B direct-to-LDS loads (scalar offsets 0 and ScalarGlobalReadOffsetB+0),          */
/*   - the A LDS write address toggled by XOR with sgprSwapA.                                         */
/* MFMAs 52-63 finish the X0 operand set; MFMAs 64-66 start on the X1 operand set.                    */
/* The odd schedule is the even schedule delayed by exactly one MFMA slot.                            */
.if \isOdd == 0
////////////////////////////////////////////////////////////////////// EVEN WAVES  
/*  mfmaIndex:52  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_5_0
/*  mfmaIndex:53  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
/*  mfmaIndex:54  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_6_0
/*  mfmaIndex:56  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
/*  mfmaIndex:57  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0]

/*  mfmaIndex:58  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_7_0, last A load of this group

/*  mfmaIndex:59  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address (B), A loads are all issued

/*  mfmaIndex:60  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]

/*  mfmaIndex:61  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_0_0

/*  mfmaIndex:62  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:63  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0]

// Iteration one: MFMAs from here on read the X1 operand registers

/*  mfmaIndex:64  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_1_0

/*  mfmaIndex:65  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
/* local write swap offsets a */
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR: XOR with sgprSwapA toggles the A LDS write address

/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0]
  
.else
////////////////////////////////////////////////////////////////////// ODD WAVES
/* Same instructions as the even branch, each memory or scalar operation placed one MFMA later. */
/*  mfmaIndex:52  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0]

/*  mfmaIndex:53  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_5_0

/*  mfmaIndex:54  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:55  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0]

/*  mfmaIndex:56  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_6_0
  
/*  mfmaIndex:57  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
  
/*  mfmaIndex:58  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]


/*  mfmaIndex:59  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_7_0, last A load of this group

/*  mfmaIndex:60  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]
s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address (B), A loads are all issued
  
/*  mfmaIndex:61  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0]


/*  mfmaIndex:62  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_0_0

/*  mfmaIndex:63  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
  
// Iteration one: MFMAs from here on read the X1 operand registers

/*  mfmaIndex:64  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0]


/*  mfmaIndex:65  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 , lds // G -> LDS via m0 (direct-to-LDS) 0_0_1_0
  
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
/* local write swap offsets a */
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR: XOR with sgprSwapA toggles the A LDS write address
  
.endif
  

/* Straight-line segment shared by even and odd waves: MFMAs 67-82, back to back with no memory   */
/* or scalar instructions in between. They accumulate into acc[12:75] using the X1 operand set:   */
/* ValuB_X1 +0, +4 and +8, each paired with the A fragments at ValuA_X1 +0..+28.                  */
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0]


.if \isOdd == 0
  
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0]
/* local read swap offsets a */
v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk
/* local read swap offsets b */
v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk
  
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 , lds // G -> Reg 0_0_2_0

/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 , lds // G -> Reg 0_0_3_0

/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 , lds // G -> Reg 0_0_4_0

/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]

  
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
s_waitcnt vmcnt(13)                                // wait for previous set of global reads

/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
s_barrier

/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0

  
/*  mfmaIndex:94  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:95  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0


/*  mfmaIndex:96  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 , lds // G -> Reg 0_0_5_0

/*  mfmaIndex:97  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0


/*  mfmaIndex:98  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line


/*  mfmaIndex:99  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]


/*  mfmaIndex:100  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 , lds // G -> Reg 0_0_6_0


/*  mfmaIndex:101  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0]

.else

/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0]

/* local read swap offsets a */
v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk
/* local read swap offsets b */
v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk

/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 , lds // G -> Reg 0_0_2_0

  
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 , lds // G -> Reg 0_0_3_0

/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 nt, lds // G -> Reg 0_0_4_0

/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]


/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]

  
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
s_waitcnt vmcnt(13)                                // wait for previous set of global reads

/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
s_barrier

/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0

  
/*  mfmaIndex:94  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line

/*  mfmaIndex:95  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 nt, lds // G -> Reg 0_0_5_0


/*  mfmaIndex:96  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0


/*  mfmaIndex:97  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0


/*  mfmaIndex:98  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line


/*  mfmaIndex:99  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 nt, lds // G -> Reg 0_0_6_0

/*  mfmaIndex:100  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]

/*  mfmaIndex:101  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0]  

.endif
  

/*  mfmaIndex:102  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0]
ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
s_add_u32 m0, m0, 4160                             // Move LDS write address to next line
  
/*  mfmaIndex:103  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0]
ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0



/*  mfmaIndex:104  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0]
ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0

  
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0]
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0]
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0]



/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0]

/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0]
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0


/*  mfmaIndex:110  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0]


/*  mfmaIndex:111  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0]



/*  mfmaIndex:112  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0]
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
  
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0]

  

/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0]

ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0  
  

/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0]

  
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0]


/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0]
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0  
  

/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0]


/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0]

  
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0]
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
  
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0]


.if \isOdd == 0

/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]
  
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 , lds // G -> Reg 0_0_7_0

.else

/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
  
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 nt, lds // G -> Reg 0_0_7_0
  
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]

.endif

  
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0]
/* local write swap offsets b */
s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR
s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL
/*  mfmaIndex:126  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0]
s_cmp_eq_i32 s[sgprLoopCounterL], 0x2              // counterL==2
s_waitcnt lgkmcnt(0)
  
/*  mfmaIndex:127  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0]
.endm



// Main-loop dispatch: each wave picks one of two copies of the unrolled loop
// body (MAINLOOP 0 / MAINLOOP 1) based on a single bit read from HW_ID.
// EVEN SIMDID takes WVLoop0 path, ODD SIMDID takes other path  
// hwreg(HW_REG_HW_ID, 4, 1) selects a 1-bit field at bit offset 4, so s86 ends
// up holding 0 or 1. Presumably bit 4 is the low bit of SIMD_ID on this target
// -- confirm against the gfx950 HW_ID register layout.
// NOTE(review): s86 is a raw register number rather than an sgpr* symbol;
// confirm it does not alias a named SGPR that is live at this point.
s_getreg_b32 s86, hwreg(HW_REG_HW_ID, 4, 1)
// Mask left disabled: the field extracted above is already one bit wide.
//s_and_b32 s86, s86, 1
s_cmp_eq_u32 s86, 0                                 // SCC = 1 when the bit is clear
s_cbranch_scc0 WVLoop1                              // bit set -> take the second loop copy
  
/******************************************/
/* Unrolled Loop 1/1 - Begin (Even SIMD)  */
/******************************************/
WVLoop0:
label_LoopBeginL0:
// Expand the loop body with argument 0 (the macro body tests "isOdd == 0").
MAINLOOP 0
/* closeLoop loopL finalLoop=1 tailLoop=0 */
// SCC at this point is the result of the macro's trailing
// "s_cmp_eq_i32 s[sgprLoopCounterL], 0x2", issued after the counter decrement;
// only s_waitcnt and v_mfma follow it inside the macro. SCC == 0 means the
// decremented counter is not yet 2, so the loop body runs again.
s_cbranch_scc0 label_LoopBeginL0                    // restart LoopL
s_branch label_LoopEndL                             // loop finished: skip over the second copy

/******************************************/
/* Unrolled Loop 1/1 - Begin (Odd SIMD)   */
/******************************************/  
WVLoop1:
label_LoopBeginL1:
// Same macro expanded with argument 1, which selects the ".else" arms of the
// isOdd conditionals (a different interleaving of loads among the MFMAs in the
// portion of the macro visible here).
MAINLOOP 1
/* closeLoop loopL finalLoop=1 tailLoop=0 */
// Same exit test as the first copy; on exit this copy falls through to
// label_LoopEndL.
s_cbranch_scc0 label_LoopBeginL1                    // restart LoopL

// Both loop copies rejoin here.
label_LoopEndL:

/* Before NLL: Check VGPR.checkin for INT8 LW */

/******************************************/
/* Ord. NoGlobalLoadLoop - Begin          */
/******************************************/

/* iter 0 (reset local read pointers iteration)  (swap local read pointers iteration)  */
/*  grEndMfmaIndex:6, lwStartMfmaIndex:25, lwEndMfmaIndex:105  */
/*  numMfmaForLR:20, syncPlrMfmaIndex:107  */

/*  mfmaIndex:0  */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0]
ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:1  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0]


/*  mfmaIndex:2  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0]
ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:3  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0]


/*  mfmaIndex:4  */  
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0]
ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:5  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0]

/*  mfmaIndex:6  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0]
ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:7  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0]

/*  mfmaIndex:8  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0]
ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0


/*  mfmaIndex:9  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0]

/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0]
ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:11  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0]

/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0]
ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0]

/*  mfmaIndex:14  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0]
ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0]

/*  mfmaIndex:16  */
/* localReadsVacancy: latencyLeft 1 */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0]
ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0

  /*  mfmaIndex:17  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:18  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0]
ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0

  /*  mfmaIndex:19  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0]
ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0

  /*  mfmaIndex:21  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:22  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0


  /*  mfmaIndex:23  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:24  */
/* schedule remaining localreads for one buffer scheduling */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0


  /*  mfmaIndex:25  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:26  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]
ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0


  /*  mfmaIndex:27  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:28  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0


  /*  mfmaIndex:29  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0

  /*  mfmaIndex:31  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:32  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:36  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:37  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:39  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:47  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:51  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:52  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:53  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:54  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:56  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:57  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:58  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:59  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:60  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:61  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:62  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:63  */
/* localReadsVacancy: latencyLeft 5 */
/* Last MFMA slot of iter 0. The two VALU XORs below are issued in this slot, */
/* ahead of the final MFMA (acc[252:255]); that MFMA does not read the        */
/* registers they modify.                                                     */

/* local read swap offsets a */
/* LocalReadAddrA ^= LocalReadSwapAddrA. XOR with the same mask twice restores */
/* the previous value, so each execution toggles the A local-read base between */
/* two addresses (assuming LocalReadSwapAddrA itself is left unchanged, which  */
/* is not visible from here).                                                  */
v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk

/* local read swap offsets b */
/* Same toggle for the B local-read base. */
v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk

/* local read init pointers a */
/* (no instructions are emitted for this section here) */

/* localReadInitPointers */

/* local read init pointers b */
/* (no instructions are emitted for this section here) */

/* localReadInitPointers */
/* Final MFMA of iter 0: B slice +28 x A slice +28 of the X0 buffers, accumulated in place into acc[252:255]. */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0]
/* Scheduler bookkeeping for the iteration that just ended (informational only): */
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */

/* iter 1 (swap and reset local write pointers iteration)  */
/*  grEndMfmaIndex:6, lwStartMfmaIndex:25, lwEndMfmaIndex:105  */
/*  numMfmaForLR:20, syncPlrMfmaIndex:107  */
/* First part of iter 1: mfmaIndex 64..104, all operating on the X1 register   */
/* buffers (vgprValuA_X1_I0 / vgprValuB_X1_I0). No local-read or local-write   */
/* instruction is interleaved in this range.                                   */
/*                                                                             */
/* Operand layout of every MFMA below:                                         */
/*   dst/src2 = the same 4-register acc tile (accumulate in place)             */
/*   src0     = 4-VGPR slice of ValuB_X1 at offset 4*j                         */
/*   src1     = 4-VGPR slice of ValuA_X1 at offset 4*i                         */
/*   acc tile = acc[32*j + 4*i : 32*j + 4*i + 3]                               */
/* i.e. for each B slice j the A slices i = 0..7 are swept in order.           */
/*  mfmaIndex:64  */
/* Block until this wave's LGKM counter reaches zero before the first MFMA     */
/* reads the X1 buffers. The loads being waited on are presumably the ds_reads */
/* into X1 issued earlier in the loop body (not visible in this excerpt).      */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
/* B slice +0 x A slices +0..+28 -> acc[0:31] */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:65  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0]
/* B slice +4 x A slices +0..+28 -> acc[32:63] */
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0]
/* B slice +8 x A slices +0..+28 -> acc[64:95] */
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
/* B slice +12 x A slices +0..+28 -> acc[96:127] */
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:94  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:95  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
/* B slice +16 x A slices +0..+28 -> acc[128:159] */
/*  mfmaIndex:96  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:97  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:98  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:99  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:100  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:101  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:102  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:103  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0]
/* B slice +20, first A slice (+0) -> acc[160:163]; the rest of this B slice follows the sync point */
/*  mfmaIndex:104  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0]
/* Synchronisation point of iter 1, spread over mfmaIndex 105..107 so that the */
/* MFMA stream keeps issuing around the wait and the barrier.                  */
/*  mfmaIndex:105  */

v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0]
/* Block until this wave has no outstanding vector-memory operations. The file */
/* header lists DirectToLdsA/B=True, so these are the global loads that land   */
/* directly in LDS.                                                            */
s_waitcnt vmcnt(0)                                 // wait for global reads with lds

/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:107  */
/* Workgroup barrier at the index announced as syncPlrMfmaIndex:107 in the     */
/* iter 1 header. It follows the vmcnt(0) wait above and precedes the ds_reads */
/* that start at mfmaIndex 108, presumably so that every wave's direct-to-LDS  */
/* data is in place before any wave reads it back.                             */
s_barrier
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0]
/* mfmaIndex 108..123: one ds_read_b128 is issued ahead of each MFMA.          */
/*   - The MFMAs keep consuming the X1 buffers (ValuA_X1 / ValuB_X1).          */
/*   - The reads fill the X0 buffers (ValuA_X0 / ValuB_X0), 4 VGPRs per read,  */
/*     so the reads and the MFMAs in this range touch disjoint registers.      */
/*   - Read order: A+0, B+0, then A+4..A+28, then B+4..B+28.                   */
/*   - LDS byte offsets step by 128 per slice (0, 128, ..., 896) from          */
/*     LocalReadAddrA / LocalReadAddrB.                                        */
/* No s_waitcnt lgkmcnt for these 16 reads appears in this range.              */
/*  mfmaIndex:108  */
ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:109  */
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0]
/* Remaining A slices (+4..+28) of the X0 buffer */
/*  mfmaIndex:110  */
ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:111  */
ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:112  */
ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:113  */
ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:114  */
ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:115  */
ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:116  */
ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0]
/* Remaining B slices (+4..+28) of the X0 buffer */
/*  mfmaIndex:117  */
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:118  */
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:119  */
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:120  */
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:121  */
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:122  */
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:123  */
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
/* Tail of iter 1: the last four MFMAs (B slice +28 x A slices +16..+28 of the */
/* X1 buffers -> acc[240:255]) with nothing interleaved.                       */
/* NOTE: the 16 ds_read_b128 into the X0 buffers issued at mfmaIndex 108..123  */
/* are not waited on anywhere in this iteration; whatever consumes ValuA_X0 /  */
/* ValuB_X0 next has to wait on lgkmcnt before reading them.                   */
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:126  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:127  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0]
/* Scheduler bookkeeping for the iteration that just ended (informational only): */
/* numPrefetchIter=1 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */
label_toPGR1:
/* Gate 1: only GSU == 1 continues below; anything else goes to label_GSU_3. */
/* s8 is used as a scratch SGPR.                                             */
s_and_b32 s8, 0x3fff, s[sgprGSU]                   // s8 = low 14 bits of the GSU argument
s_cmpk_eq_u32 s8, 0x1                              // SCC = (GSU == 1)
s_cbranch_scc0 label_GSU_3                         // GSU != 1 -> leave

/******************************************/
/* Opt. NoLoadLoop - Begin                */
/******************************************/
/* Gate 2: the optimized no-load loop is entered only when                  */
/*   Beta == 0, Alpha == 1.0 (bit-exact compare against the inline 1.0),    */
/*   and this workgroup's tile has no partial edge in either dimension.     */
/* Any failed check jumps to label_OptNLL_End. s84/s85 are scratch SGPRs.   */
s_cmp_eq_u32 s[sgprBeta], 0                        // SCC = (Beta == 0)
s_cbranch_scc0 label_OptNLL_End                    // Beta != 0 -> skip OptNLL

s_cmp_eq_u32 s[sgprAlpha], 1.0                     // SCC = (Alpha bits == 1.0)
s_cbranch_scc0 label_OptNLL_End                    // Alpha != 1.0 -> skip OptNLL

/* Dimension 0: remainder of SizeI modulo the 256-wide tile, relevant only for the last workgroup along 0. */
s_and_b32 s84, s[sgprSizeI], 0xff                  // s84 = SizeI % 256
s_add_u32 s85, s[sgprNumWorkGroups0], -1           // s85 = nwg0 - 1
s_cmp_le_u32 s85, s[sgprWorkGroup0]                // SCC = (nwg0-1 <= wg0), i.e. wg0 is the last one
s_cselect_b32 s84, s84, 0                          // rMT0 = last wg ? remainder : 0
s_cmp_lg_u32 s84, 0                                // SCC = (rMT0 != 0)
s_cbranch_scc1 label_OptNLL_End                    // partial tile -> edges required
/* Dimension 1: same test with SizeJ / NumWorkGroups1 / WorkGroup1. */
s_and_b32 s84, s[sgprSizeJ], 0xff                  // s84 = SizeJ % 256
s_add_u32 s85, s[sgprNumWorkGroups1], -1           // s85 = nwg1 - 1
s_cmp_le_u32 s85, s[sgprWorkGroup1]                // SCC = (nwg1-1 <= wg1), i.e. wg1 is the last one
s_cselect_b32 s84, s84, 0                          // rMT1 = last wg ? remainder : 0
s_cmp_lg_u32 s84, 0                                // SCC = (rMT1 != 0)
s_cbranch_scc1 label_OptNLL_End                    // partial tile -> edges required


  
/*  mfmaIndex:0  */
/* First MFMA of the optimized no-load loop. It consumes ValuA_X0 / ValuB_X0, */
/* which on the fall-through path are filled by the ds_read_b128 instructions */
/* issued at mfmaIndex 108..123 of the preceding iteration. No lgkmcnt wait   */
/* is visible between those reads and this point, so wait here before the    */
/* first use instead of relying on the intervening MFMA/scalar latency.       */
/* NOTE(review): if every path into label_toPGR1 already waits elsewhere this */
/* is redundant but harmless (the counter is then already zero).              */
s_waitcnt lgkmcnt(0)                               // wait for prior local reads into ValuA_X0/ValuB_X0
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0]
/* Start fetching the second half (buffer=1, byte offset 64) of the A tile into the X1 buffer. */
ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:1  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0]

/*  mfmaIndex:2  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0]
ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:3  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0]

/*  mfmaIndex:4  */  
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0]
ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:5  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0]

/*  mfmaIndex:6  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0]
ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:7  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0]

/*  mfmaIndex:8  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0]
ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:9  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0]

/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0]
ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:11  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0]

/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0]
ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0]

/*  mfmaIndex:14  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0]
ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0]

/*  mfmaIndex:16  */
/* localReadsVacancy: latencyLeft 1 */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0]
ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:17  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:18  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0]
ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:19  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0]
ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:21  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:22  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:23  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:24  */
/* schedule remaining localreads for one buffer scheduling */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:25  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:26  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]
ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:27  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:28  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:29  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0

/*  mfmaIndex:31  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:32  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:36  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:37  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:39  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:47  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:51  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:52  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:53  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:54  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:56  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:57  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:58  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:59  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:60  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:61  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:62  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:63  */
/* localReadsVacancy: latencyLeft 5 */

v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0]

/* iter 1 (last unrolled loop) */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63  */
/*  numMfmaForLR:20, syncPlrMfmaIndex:107  */
/* Section overview: 64 v_mfma_f32_16x16x32_bf16 instructions (mfmaIndex 64..127).         */
/* MFMA number k (k = 0..63) accumulates in place into acc[4k:4k+3], so every register of  */
/* acc[0:255] is updated exactly once in this section.                                     */
/* First source operand  = 4 VGPRs starting at vgprValuB_X1_I0 + 4*(k/8).                  */
/* Second source operand = 4 VGPRs starting at vgprValuA_X1_I0 + 4*(k%8).                  */
/* The B offset therefore advances once per group of 8 MFMAs while the A offset cycles     */
/* through 0,4,...,28 inside each group. Only the s_waitcnt below is mixed in with the     */
/* MFMAs; no loads or stores are interleaved in this section.                              */
/*  mfmaIndex:64  */
/* Wait until lgkmcnt reaches 0 before the first MFMA that consumes the X1 operand         */
/* registers; ds_read_b128 loads into vgprValuB_X1_I0 are issued earlier in the file.      */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
/* B group 0: first source = vgprValuB_X1_I0+0..+3, destinations acc[0:31] */
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:65  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0]
/* B group 1: first source = vgprValuB_X1_I0+4..+7, destinations acc[32:63] */
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0]
/* B group 2: first source = vgprValuB_X1_I0+8..+11, destinations acc[64:95] */
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
/* B group 3: first source = vgprValuB_X1_I0+12..+15, destinations acc[96:127] */
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:94  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:95  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
/* B group 4: first source = vgprValuB_X1_I0+16..+19, destinations acc[128:159] */
/*  mfmaIndex:96  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:97  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:98  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:99  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:100  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:101  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:102  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:103  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0]
/* B group 5: first source = vgprValuB_X1_I0+20..+23, destinations acc[160:191] */
/*  mfmaIndex:104  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:110  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:111  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0]
/* B group 6: first source = vgprValuB_X1_I0+24..+27, destinations acc[192:223] */
/*  mfmaIndex:112  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0]
/* B group 7: first source = vgprValuB_X1_I0+28..+31, destinations acc[224:255] */
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:126  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:127  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=8 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
/* Both labels below mark the same address: only comments separate them. */
label_toPGR1end_OptNLL:
/* Stores for OptNLL */
label_Summation_End_OptNLL:
/* endSummation: add vgpr [0...132) to pool */
/* load store sgprs */

/* Mapping of Acc register -> C Vgpr register */
/* computeStoreVgprs */
/* Derives per-lane store coordinates from v[vgprSerial]; v0..v5 are overwritten.             */
/* Values left in registers at the end of this block:                                        */
/*   v0 = 256*WorkGroup0 + 8*((serial & 15) + 16*((serial >> 6) & 1))                         */
/*   v1 = 256*WorkGroup1 + 8*(16*(serial >> 7) + 4*((serial & 63) >> 4))                      */
/*   v2 = (v1 without the WorkGroup1 term) * StrideC1J                                        */
/*   v3 = (v1 without the WorkGroup1 term) * StrideD1J                                        */
/*   v4 = serial >> 6, v5 = serial & 15, s8 = 256*WorkGroup1 (scratch leftovers)              */
/* Order matters: v2/v3 must be computed before the workgroup offset is added to v1.          */
v_lshrrev_b32 v4, 6, v[vgprSerial]                 // v4 = serial >> 6 (serial / 64)
v_lshrrev_b32 v5, 1, v4                            // v5 = v4 >> 1 (v4 / 2)
v_mul_lo_u32 v5, 0x10, v5                          // v5 = 16 * (v4 / 2): wave coordination offset 1
v_and_b32 v1, 63, v[vgprSerial]                    // v1 = serial & 63 (serial % 64)
v_lshrrev_b32 v1, 4, v1                            // v1 = v1 >> 4 (v1 / 16)
v_lshlrev_b32 v1, 2, v1                            // v1 = v1 << 2 (v1 * 4): thread0 * continuous_output
v_add_lshl_u32 v1, v5, v1, 3                       // v1 = (v5 + v1) << 3: coordination 1 = vwB * (wave_id1 + tid1)
v_mul_lo_u32 v2, v1, s[sgprStrideC1J]              // v2 = v1 * StrideC1J (offset 1 for C); v1 has no workgroup term yet
v_mul_lo_u32 v3, v1, s[sgprStrideD1J]              // v3 = v1 * StrideD1J (offset 1 for D); v1 has no workgroup term yet
v_and_b32 v0, 1, v4                                // v0 = v4 & 1 (v4 % 2)
v_mul_lo_u32 v0, 0x10, v0                          // v0 = 16 * (v4 % 2): wave coordination offset 0
v_and_b32 v5, 15, v[vgprSerial]                    // v5 = serial & 15 (serial % 16); v5 is reused as scratch here
v_add_lshl_u32 v0, v5, v0, 3                       // v0 = (v5 + v0) << 3: coordination 0 = vwA * (wave_id0 + tid0)
s_mul_i32 s8, 256, s[sgprWorkGroup0]               // s8 = 256 * WorkGroup0 (wgp0 * MT0)
v_add_u32 v0, s8, v0                               // coord 0 = 8*((serial % 16) + 16*(wave % 2)) + 256*WorkGroup0
s_mul_i32 s8, 256, s[sgprWorkGroup1]               // s8 = 256 * WorkGroup1 (wgp1 * MT1); s8 is reused
v_add_u32 v1, s8, v1                               // coord 1 = 8*(4*((serial % 64) / 16) + 16*(wave / 2)) + 256*WorkGroup1
/* NOTE(review): the generator's original formula comments on the two adds above put the     */
/* (tid0/MI_m)*4 term on coord 0 and the (tid0%MI_m) term on coord 1, which looks swapped     */
/* relative to the instructions; the comments above follow the instructions. Confirm          */
/* against the generator.                                                                     */

/******************************************/
/* Global Write Elements                  */
/******************************************/
label_GW_B0_E0:

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=28 */
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
v_add_lshl_u32 v11, v3, v0, 0x1                    // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0
v_accvgpr_read_b32 v[vgprValuC+16], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+17], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+18], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+19], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+20], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+21], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+22], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+23], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+24], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+25], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+26], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+27], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+28], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+29], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+30], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+31], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+32], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+33], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+34], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+35], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+36], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+37], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+38], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+39], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+40], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+41], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+42], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+43], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+44], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+45], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+46], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+47], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+48], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+49], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+50], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+51], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+52], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+53], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+54], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+55], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+56], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+57], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+58], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+59], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+60], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+61], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+62], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+63], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+64], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+65], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+66], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+67], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+68], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+69], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+70], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+71], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+72], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+73], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+74], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+75], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+76], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+77], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+78], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+79], acc252         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+80], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+81], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+82], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+83], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+84], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+85], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+86], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+87], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+88], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+89], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+90], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+91], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+92], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+93], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+94], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+95], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+96], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+97], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+98], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+99], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+100], acc81         // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+101], acc85         // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+102], acc89         // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+103], acc93         // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+104], acc97         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+105], acc101        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+106], acc105        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+107], acc109        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+108], acc113        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+109], acc117        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+110], acc121        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+111], acc125        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+112], acc129        // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+113], acc133        // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+114], acc137        // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+115], acc141        // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+116], acc145        // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+117], acc149        // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+118], acc153        // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+119], acc157        // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+120], acc161        // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+121], acc165        // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+122], acc169        // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+123], acc173        // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+124], acc177        // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+125], acc181        // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+126], acc185        // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+127], acc189        // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+136], acc193        // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+137], acc197        // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+138], acc201        // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+139], acc205        // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+140], acc209        // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+141], acc213        // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+142], acc217        // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+143], acc221        // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+144], acc225        // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+145], acc229        // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+146], acc233        // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+147], acc237        // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+148], acc241        // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+149], acc245        // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+150], acc249        // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+151], acc253        // copy acc to vreg[127]
v_accvgpr_read_b32 v[vgprValuC+152], acc2          // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+153], acc6          // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+154], acc10         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+155], acc14         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+156], acc18         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+157], acc22         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+158], acc26         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+159], acc30         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+160], acc34         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+161], acc38         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+162], acc42         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+163], acc46         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+164], acc50         // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+165], acc54         // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+166], acc58         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+167], acc62         // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+168], acc66         // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+169], acc70         // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+170], acc74         // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+171], acc78         // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+172], acc82         // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+173], acc86         // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+174], acc90         // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+175], acc94         // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+176], acc98         // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+177], acc102        // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+178], acc106        // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+179], acc110        // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+180], acc114        // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+181], acc118        // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+182], acc122        // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+183], acc126        // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+184], acc130        // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+185], acc134        // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+186], acc138        // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+187], acc142        // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+188], acc146        // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+189], acc150        // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+190], acc154        // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+191], acc158        // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+192], acc162        // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+193], acc166        // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+194], acc170        // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+195], acc174        // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+196], acc178        // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+197], acc182        // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+198], acc186        // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+199], acc190        // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+200], acc194        // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+201], acc198        // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+202], acc202        // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+203], acc206        // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+204], acc210        // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+205], acc214        // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+206], acc218        // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+207], acc222        // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+208], acc226        // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+209], acc230        // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+210], acc234        // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+211], acc238        // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+212], acc242        // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+213], acc246        // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+214], acc250        // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+215], acc254        // copy acc to vreg[191]
v_accvgpr_read_b32 v[vgprValuC+216], acc3          // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+217], acc7          // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+218], acc11         // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+219], acc15         // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+220], acc19         // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+221], acc23         // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+222], acc27         // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+223], acc31         // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+224], acc35         // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+225], acc39         // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+226], acc43         // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+227], acc47         // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+228], acc51         // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+229], acc55         // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+230], acc59         // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+231], acc63         // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+232], acc67         // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+233], acc71         // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+234], acc75         // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+235], acc79         // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+236], acc83         // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+237], acc87         // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+238], acc91         // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+239], acc95         // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+240], acc99         // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+241], acc103        // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+242], acc107        // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+243], acc111        // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+244], acc115        // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+245], acc119        // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+246], acc123        // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+247], acc127        // copy acc to vreg[223]

/* apply mask, calc new C and issue writes */
/*
 * Converts the fp32 values held in v[vgprValuC+16 .. vgprValuC+247] to packed
 * bf16 and stores them to D. Each of the 28 groups below does the same thing:
 *   1. four v_cvt_pk_bf16_f32 instructions pack eight fp32 values into four
 *      dwords (eight bf16 values);
 *   2. except for the first group, the 64-bit base address in
 *      s[sgprSrdD+0 : sgprSrdD+1] is advanced by (StrideD1J << 1) bytes using
 *      s_add_u32 / s_addc_u32 (carry propagated through SCC);
 *   3. buffer_store_dwordx4 writes the four dwords through SrdD at the byte
 *      offset held in v11 (offen). v11 is not modified in this section.
 * Register indices 128..135 are not touched here: the groups go from
 * 120..127 directly to 136..143.
 *
 * NOTE(review): the conversion destinations are literal registers (v16, v17,
 *   ...) while the sources are vgprValuC-relative. Presumably vgprValuC == 0,
 *   so the packing is done in place - confirm against the register map.
 * NOTE(review): v8, v9 and v10 are loaded with constants below but are not
 *   read by any instruction in this section (the hardware
 *   v_cvt_pk_bf16_f32 is used instead). They may be read by code outside this
 *   section - verify before removing.
 * NOTE(review): s12 is recomputed from sgprStrideD1J before every row
 *   increment, although neither register is otherwise written in this
 *   section; the shift looks loop-invariant.
 */
v_mov_b32 v8, 0xffff0000                           // mask for packing two bfloat16 elements into 32 bits (not read in this section)
v_mov_b32 v9, 0x7fff0000                           // fp32 NaN (not read in this section)
v_mov_b32 v10, 0x7fff                              // rounding bias for bfloat16 (not read in this section)
/* row +0: stored at the SrdD base as it is on entry to this section */
v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +1 */
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +2 */
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +3 */
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +4 */
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +5 */
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +6 */
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +7 */
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +8 */
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[80:83], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +9 */
v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[88:91], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +10 */
v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[96:99], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +11 */
v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[104:107], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +12 */
v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[112:115], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +13 */
v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[120:123], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +14: register indices resume at 136 (128..135 are skipped) */
v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[136:139], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +15 */
v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[144:147], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +16 */
v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[152:155], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +17 */
v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], v[vgprValuC+161] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[160:163], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +18 */
v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[168:171], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +19 */
v_cvt_pk_bf16_f32 v176, v[vgprValuC+176], v[vgprValuC+177] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v177, v[vgprValuC+178], v[vgprValuC+179] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v178, v[vgprValuC+180], v[vgprValuC+181] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v179, v[vgprValuC+182], v[vgprValuC+183] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[176:179], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +20 */
v_cvt_pk_bf16_f32 v184, v[vgprValuC+184], v[vgprValuC+185] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v185, v[vgprValuC+186], v[vgprValuC+187] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v186, v[vgprValuC+188], v[vgprValuC+189] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v187, v[vgprValuC+190], v[vgprValuC+191] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[184:187], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +21 */
v_cvt_pk_bf16_f32 v192, v[vgprValuC+192], v[vgprValuC+193] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v193, v[vgprValuC+194], v[vgprValuC+195] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v194, v[vgprValuC+196], v[vgprValuC+197] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v195, v[vgprValuC+198], v[vgprValuC+199] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[192:195], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +22 */
v_cvt_pk_bf16_f32 v200, v[vgprValuC+200], v[vgprValuC+201] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v201, v[vgprValuC+202], v[vgprValuC+203] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v202, v[vgprValuC+204], v[vgprValuC+205] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v203, v[vgprValuC+206], v[vgprValuC+207] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[200:203], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +23 */
v_cvt_pk_bf16_f32 v208, v[vgprValuC+208], v[vgprValuC+209] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v209, v[vgprValuC+210], v[vgprValuC+211] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v210, v[vgprValuC+212], v[vgprValuC+213] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v211, v[vgprValuC+214], v[vgprValuC+215] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[208:211], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +24 */
v_cvt_pk_bf16_f32 v216, v[vgprValuC+216], v[vgprValuC+217] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v217, v[vgprValuC+218], v[vgprValuC+219] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v218, v[vgprValuC+220], v[vgprValuC+221] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v219, v[vgprValuC+222], v[vgprValuC+223] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[216:219], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +25 */
v_cvt_pk_bf16_f32 v224, v[vgprValuC+224], v[vgprValuC+225] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v225, v[vgprValuC+226], v[vgprValuC+227] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v226, v[vgprValuC+228], v[vgprValuC+229] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v227, v[vgprValuC+230], v[vgprValuC+231] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[224:227], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +26 */
v_cvt_pk_bf16_f32 v232, v[vgprValuC+232], v[vgprValuC+233] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v233, v[vgprValuC+234], v[vgprValuC+235] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v234, v[vgprValuC+236], v[vgprValuC+237] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v235, v[vgprValuC+238], v[vgprValuC+239] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[232:235], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row +27: last store of this section */
v_cvt_pk_bf16_f32 v240, v[vgprValuC+240], v[vgprValuC+241] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v241, v[vgprValuC+242], v[vgprValuC+243] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v242, v[vgprValuC+244], v[vgprValuC+245] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v243, v[vgprValuC+246], v[vgprValuC+247] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[240:243], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */
/******************************************/
/* Purpose: last store batch before the kernel ends.                      */
/*   1. Copy 32 fp32 accumulators (acc131, acc135, ..., acc255) into      */
/*      v[vgprValuC+16 .. vgprValuC+47].                                  */
/*   2. Pack neighbouring pairs to bf16 with v_cvt_pk_bf16_f32, giving    */
/*      four dwords (8 bf16 values) per output row.                       */
/*   3. For each of the 4 rows: advance the D buffer descriptor base by   */
/*      one row (StrideD1J * 2 bytes), then store the row with            */
/*      buffer_store_dwordx4 using the per-lane offset held in v11.       */
/* Reads : acc131..acc255 (step 4), v11, s[sgprStrideD1J], s[sgprSrdD:+3] */
/* Writes: v16..v47, s12, SCC, s[sgprSrdD+0..1], memory behind SrdD       */
/* NOTE(review): pack results and store operands use raw register numbers */
/* (v16.., v[16:19]) while the inputs are addressed as vgprValuC+N; this  */
/* is an in-place pack when vgprValuC is 0 - presumably the case, verify  */
/* against the register definitions earlier in the file.                  */
/* Changes vs. generated code (stored data and addresses are unchanged):  */
/*   - dropped the v8/v9/v10 constants of the software bf16 rounding      */
/*     path; nothing read them before s_endpgm                            */
/*   - the row increment s12 is computed once instead of before each row  */
/*   - dropped the s_branch to label_GW_End, which is the next instruction */

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
v_accvgpr_read_b32 v[vgprValuC+16], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+17], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+18], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+19], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+20], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+21], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+22], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+23], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+24], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+25], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+26], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+27], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+28], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+29], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+30], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+31], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+32], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+33], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+34], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+35], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+36], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+37], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+38], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+39], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+40], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+41], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+42], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+43], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+44], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+45], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+46], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+47], acc255         // copy acc to vreg[255]

/* calc new C and issue writes */
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE; same value for all four rows, so computed once

/* row vc1=28 */
v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper), consumes the carry (SCC) of the add above
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* row vc1=29 */
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* row vc1=30 */
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* row vc1=31 */
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* execution falls through to label_GW_End; no branch is needed */
label_GW_End:

s_endpgm                                           // Kernel End
/* Entry labels of the ordinary no-load loop. The instruction just before */
/* these labels is s_endpgm, so this code is only reached by a branch to  */
/* one of them.                                                           */
label_OptNLL_End:
label_GSU_3:

/******************************************/
/* Ord. NoLoadLoop - Begin                */
/******************************************/
/* Iteration 0 below issues 64 MFMAs (mfmaIndex 0..63) that accumulate    */
/* into acc[0:255] from the X0 operand registers. MFMA number k updates   */
/* acc[4k:4k+3] using the B fragment at vgprValuB_X0_I0 + 4*(k/8) and the */
/* A fragment at vgprValuA_X0_I0 + 4*(k%8).                               */
/* One ds_read_b128 is issued ahead of each of MFMAs 1..16; together they */
/* fill the X1 operand registers (8 reads for A, 8 for B, LDS offsets     */
/* 64 + 128*eIdx) that iteration 1 consumes. This iteration contains no   */
/* global reads and no LDS writes.                                        */
/* Instruction order and the s_waitcnt values are schedule-critical: do   */
/* not reorder without re-deriving the lgkmcnt thresholds.                */

/* iter 0 (last unrolled loop) */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63  */
/*  numMfmaForLR:20, syncPlrMfmaIndex:107  */
/*  mfmaIndex:0  */
/* Allows up to 7 LGKM operations to remain in flight; presumably these   */
/* are the last X0 B-fragment reads issued before this point - verify     */
/* against the code that branches here.                                   */
s_waitcnt lgkmcnt(7)                               // wait for prior local read local write old=0, new=7 newLW=0 newLR=7 for iteration == 0
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:1  */
ds_read_b128 v[vgprValuA_X1_I0+0:vgprValuA_X1_I0+0+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:2  */
ds_read_b128 v[vgprValuB_X1_I0+0:vgprValuB_X1_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:3  */
ds_read_b128 v[vgprValuA_X1_I0+4:vgprValuA_X1_I0+4+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:4  */
ds_read_b128 v[vgprValuA_X1_I0+8:vgprValuA_X1_I0+8+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:5  */
ds_read_b128 v[vgprValuA_X1_I0+12:vgprValuA_X1_I0+12+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:6  */
ds_read_b128 v[vgprValuA_X1_I0+16:vgprValuA_X1_I0+16+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:7  */
ds_read_b128 v[vgprValuA_X1_I0+20:vgprValuA_X1_I0+20+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:8  */
ds_read_b128 v[vgprValuA_X1_I0+24:vgprValuA_X1_I0+24+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0
/* Eight ds_reads have been issued in this iteration so far, so a count   */
/* of 8 leaves only those in flight. With LDS reads returning in issue    */
/* order, everything issued earlier has completed, including the          */
/* remaining X0 B fragments (B_X0+4 onward) used from the next MFMA on.   */
s_waitcnt lgkmcnt(8)                               // wait for prior local read local write
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:9  */
ds_read_b128 v[vgprValuA_X1_I0+28:vgprValuA_X1_I0+28+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:10  */
ds_read_b128 v[vgprValuB_X1_I0+4:vgprValuB_X1_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:11  */
ds_read_b128 v[vgprValuB_X1_I0+8:vgprValuB_X1_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:12  */
ds_read_b128 v[vgprValuB_X1_I0+12:vgprValuB_X1_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:13  */
ds_read_b128 v[vgprValuB_X1_I0+16:vgprValuB_X1_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:14  */
ds_read_b128 v[vgprValuB_X1_I0+20:vgprValuB_X1_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
ds_read_b128 v[vgprValuB_X1_I0+24:vgprValuB_X1_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=1 iui=0
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:16  */
ds_read_b128 v[vgprValuB_X1_I0+28:vgprValuB_X1_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=32 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=1 iui=0
/* Last X1 prefetch read; MFMAs 16..63 below run without further LDS traffic. */
/* localReadsVacancy: latencyLeft 1 */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:17  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:18  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:19  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:21  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:22  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:23  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:24  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:25  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:26  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:27  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:28  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:29  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:32  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:36  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:37  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:39  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:47  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:51  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:52  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:53  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:54  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:56  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:57  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:58  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:59  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:60  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:61  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+20+0+0:vgprValuA_X0_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:62  */
/* schedule remaining localreads for one buffer scheduling */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:63  */
/* localReadsVacancy: latencyLeft 5 */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+3], v[vgprValuA_X0_I0+28+0+0:vgprValuA_X0_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=8 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */

/* iter 1 (last unrolled loop) */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:63, lwEndMfmaIndex:63  */
/*  numMfmaForLR:20, syncPlrMfmaIndex:107  */
/*  mfmaIndex:64  */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
v_mfma_f32_16x16x32_bf16 acc[0:3], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:65  */
v_mfma_f32_16x16x32_bf16 acc[4:7], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_bf16 acc[8:11], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_bf16 acc[12:15], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_bf16 acc[16:19], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_bf16 acc[20:23], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_bf16 acc[24:27], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_bf16 acc[28:31], v[vgprValuB_X1_I0+0+0+0:vgprValuB_X1_I0+0+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_bf16 acc[32:35], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_bf16 acc[36:39], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_bf16 acc[40:43], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_bf16 acc[44:47], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_bf16 acc[48:51], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_bf16 acc[52:55], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
v_mfma_f32_16x16x32_bf16 acc[56:59], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
v_mfma_f32_16x16x32_bf16 acc[60:63], v[vgprValuB_X1_I0+4+0+0:vgprValuB_X1_I0+4+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_bf16 acc[64:67], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_bf16 acc[68:71], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_bf16 acc[72:75], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_bf16 acc[76:79], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_bf16 acc[80:83], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_bf16 acc[84:87], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_bf16 acc[88:91], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_bf16 acc[92:95], v[vgprValuB_X1_I0+8+0+0:vgprValuB_X1_I0+8+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_bf16 acc[96:99], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_bf16 acc[100:103], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_bf16 acc[104:107], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_bf16 acc[108:111], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_bf16 acc[112:115], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_bf16 acc[116:119], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:94  */
v_mfma_f32_16x16x32_bf16 acc[120:123], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:95  */
v_mfma_f32_16x16x32_bf16 acc[124:127], v[vgprValuB_X1_I0+12+0+0:vgprValuB_X1_I0+12+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:96  */
v_mfma_f32_16x16x32_bf16 acc[128:131], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:97  */
v_mfma_f32_16x16x32_bf16 acc[132:135], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:98  */
v_mfma_f32_16x16x32_bf16 acc[136:139], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:99  */
v_mfma_f32_16x16x32_bf16 acc[140:143], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:100  */
v_mfma_f32_16x16x32_bf16 acc[144:147], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:101  */
v_mfma_f32_16x16x32_bf16 acc[148:151], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:102  */
v_mfma_f32_16x16x32_bf16 acc[152:155], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:103  */
v_mfma_f32_16x16x32_bf16 acc[156:159], v[vgprValuB_X1_I0+16+0+0:vgprValuB_X1_I0+16+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:104  */
v_mfma_f32_16x16x32_bf16 acc[160:163], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_bf16 acc[164:167], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_bf16 acc[168:171], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_bf16 acc[172:175], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_bf16 acc[176:179], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_bf16 acc[180:183], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:110  */
v_mfma_f32_16x16x32_bf16 acc[184:187], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:111  */
v_mfma_f32_16x16x32_bf16 acc[188:191], v[vgprValuB_X1_I0+20+0+0:vgprValuB_X1_I0+20+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:112  */
v_mfma_f32_16x16x32_bf16 acc[192:195], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_bf16 acc[196:199], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_bf16 acc[200:203], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_bf16 acc[204:207], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_bf16 acc[208:211], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_bf16 acc[212:215], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_bf16 acc[216:219], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_bf16 acc[220:223], v[vgprValuB_X1_I0+24+0+0:vgprValuB_X1_I0+24+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_bf16 acc[224:227], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+3], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_bf16 acc[228:231], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+4+0+0:vgprValuA_X1_I0+4+0+0+3], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_bf16 acc[232:235], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+8+0+0:vgprValuA_X1_I0+8+0+0+3], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_bf16 acc[236:239], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+12+0+0:vgprValuA_X1_I0+12+0+0+3], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_bf16 acc[240:243], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+16+0+0:vgprValuA_X1_I0+16+0+0+3], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_bf16 acc[244:247], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+20+0+0:vgprValuA_X1_I0+20+0+0+3], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:126  */
v_mfma_f32_16x16x32_bf16 acc[248:251], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+24+0+0:vgprValuA_X1_I0+24+0+0+3], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:127  */
v_mfma_f32_16x16x32_bf16 acc[252:255], v[vgprValuB_X1_I0+28+0+0:vgprValuB_X1_I0+28+0+0+3], v[vgprValuA_X1_I0+28+0+0:vgprValuA_X1_I0+28+0+0+3], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=8 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
/* End of the summation phase. The three labels below all resolve to this same */
/* address (no instructions are emitted between them); the branches that       */
/* target them are outside this section.                                       */
label_toPGR1end_OrdNLL:
label_PrefetchGlobalLastIterEnd:

/* Tail: add ValuA/B vgpr buffer [4...132) to pool */

/* Tail: add address/G2L vgpr [132...132) to pool (empty range: nothing is released) */
label_Summation_End_S4FDBQ587JJL6NOU:
/* Rebind the symbols of the SGPRs used only by the loop / global-read phase   */
/* to UNDEF, so the names no longer refer to their former registers from here  */
/* on. NOTE(review): UNDEF is defined outside this section; presumably it is a */
/* sentinel that makes any later s[sgprXxx] use of these names fail to         */
/* assemble -- confirm against its definition.                                 */
.set sgprWGM, UNDEF
.set sgprLoopCounterL, UNDEF
.set sgprOrigLoopCounter, UNDEF
.set sgprAddressA, UNDEF
.set sgprAddressB, UNDEF
.set sgprStridesA, UNDEF
.set sgprStridesB, UNDEF
.set sgprStaggerUIter, UNDEF
.set sgprSrdA, UNDEF
.set sgprSrdB, UNDEF
.set sgprShadowLimitA, UNDEF
.set sgprShadowLimitB, UNDEF
.set sgprWrapUA, UNDEF
.set sgprWrapUB, UNDEF
.set sgprGlobalReadIncsA, UNDEF
.set sgprGlobalReadIncsB, UNDEF
.set sgprScalarGlobalReadOffsetA, UNDEF
.set sgprScalarGlobalReadOffsetB, UNDEF
/* load store sgprs */

/* Mapping of Acc register -> C Vgpr register */

/* not-LocalSplitU: global write indices */
/* computeStoreVgprs */
/* Derives this thread's output coordinates from vgprSerial and the workgroup ids. */
/* Results:                                                                        */
/*   v4 = coord 0 (includes WorkGroup0 * 256)                                      */
/*   v5 = coord 1 (includes WorkGroup1 * 256)                                      */
/*   v6 = tile-local coord 1 * StrideC1J                                           */
/*   v7 = tile-local coord 1 * StrideD1J                                           */
/* v6/v7 are computed before the workgroup offset is added to v5, so they hold     */
/* the offset within the tile only. v8, v9 and s8 are temporaries.                 */
v_lshrrev_b32 v8, 6, v[vgprSerial]                 // v8 = Serial / 64 (presumably the wave index)
v_lshrrev_b32 v9, 1, v8                            // v9 = v8 / 2
v_mul_lo_u32 v9, 0x10, v9                          // v9 = v9 * 16: wave coordination offset 1
v_and_b32 v5, 63, v[vgprSerial]                    // v5 = v[vgprSerial] % 64
v_lshrrev_b32 v5, 4, v5                            // v5 = v5 / 16
v_lshlrev_b32 v5, 2, v5                            // v5 = v5 * 4: thread0 * continuous_output
v_add_lshl_u32 v5, v9, v5, 3                       // v5 = (v9 + v5) * 8: coordination 1 = vwB * (wave_id1 + tid1)
v_mul_lo_u32 v6, v5, s[sgprStrideC1J]              // v6 = v5 * StrideC1J: C offset 1 (tile-local)
v_mul_lo_u32 v7, v5, s[sgprStrideD1J]              // v7 = v5 * StrideD1J: D offset 1 (tile-local)
v_and_b32 v4, 1, v8                                // v4 = v8 % 2
v_mul_lo_u32 v4, 0x10, v4                          // v4 = v4 * 16: wave coordination offset 0
v_and_b32 v9, 15, v[vgprSerial]                    // v9 = v[vgprSerial] % 16 (v9 reused as a temporary)
v_add_lshl_u32 v4, v9, v4, 3                       // v4 = (v9 + v4) * 8: coordination 0 = vwA * (wave_id0 + tid0)
s_mul_i32 s8, 256, s[sgprWorkGroup0]               // s8 = wgp0 * MT0 (MT0 = 256)
v_add_u32 v4, s8, v4                               // coord 0 += wgp0 * MT0
s_mul_i32 s8, 256, s[sgprWorkGroup1]               // s8 = wgp1 * MT1 (MT1 = 256)
v_add_u32 v5, s8, v5                               // coord 1 += wgp1 * MT1 (v6/v7 above are not updated)

/* not-LocalSplitU: global write */

/******************************************/
/* Global Write Elements                  */
/******************************************/
/* Select the global-write variant:                                             */
/*   1. GSU == 1                                   -> label_GSU_4               */
/*   2. last workgroup in dim 0 and SizeI % 256 != 0 -> label_GW_B0_E1_M (edge) */
/*   3. last workgroup in dim 1 and SizeJ % 256 != 0 -> label_GW_B0_E1_N (edge) */
/*   4. otherwise fall through to the non-edge path that follows.               */
/* Uses s8, s30, s31 as temporaries; each branch consumes the SCC produced by   */
/* the compare immediately before it.                                           */
s_and_b32 s8, s[sgprGSU], 0x3fff                   // s8 = low 14 bits of sgprGSU (the GSU value; upper bits masked off)
s_cmp_eq_u32 s8, 1                                 // GSU == 1 ?
s_cbranch_scc1 label_GSU_4                         // branch if GSU == 1
s_and_b32 s30, 255, s[sgprSizeI]                   // s30 = s[sgprSizeI] % 256
s_add_u32 s31, -0x1, s[sgprNumWorkGroups0]         // s31 = nwg0 - 1
s_cmp_ge_u32 s[sgprWorkGroup0], s31                // wg0 >= nwg0-1 ?
s_cselect_b32 s30, s30, 0                          // set rMT0: the remainder if wg0 is the last one, else 0
s_cmpk_gt_u32 s30, 0                               // rMT0 > 0
s_cbranch_scc1 label_GW_B0_E1_M                    // jump if edges required
s_and_b32 s30, 255, s[sgprSizeJ]                   // s30 = s[sgprSizeJ] % 256
s_add_u32 s31, -0x1, s[sgprNumWorkGroups1]         // s31 = nwg1 - 1
s_cmp_ge_u32 s[sgprWorkGroup1], s31                // wg1 >= nwg1-1
s_cselect_b32 s30, s30, 0                          // set rMT1: the remainder if wg1 is the last one, else 0
s_cmpk_gt_u32 s30, 0                               // rMT1 > 0
s_cbranch_scc1 label_GW_B0_E1_N                    // jump if edges required
label_GW_B0_E0_1:

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=26 */
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
v_add_lshl_u32 v15, v7, v4, 0x2                    // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4
v_accvgpr_read_b32 v[vgprValuC+24], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+25], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+26], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+27], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+28], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+29], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+30], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+31], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+32], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+33], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+34], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+35], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+36], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+37], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+38], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+39], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+40], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+41], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+42], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+43], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+44], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+45], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+46], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+47], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+48], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+49], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+50], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+51], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+52], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+53], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+54], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+55], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+56], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+57], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+58], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+59], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+60], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+61], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+62], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+63], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+64], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+65], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+66], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+67], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+68], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+69], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+70], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+71], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+72], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+73], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+74], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+75], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+76], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+77], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+78], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+79], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+80], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+81], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+82], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+83], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+84], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+85], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+86], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+87], acc252         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+88], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+89], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+90], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+91], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+92], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+93], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+94], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+95], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+96], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+97], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+98], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+99], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+100], acc49         // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+101], acc53         // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+102], acc57         // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+103], acc61         // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+104], acc65         // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+105], acc69         // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+106], acc73         // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+107], acc77         // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+108], acc81         // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+109], acc85         // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+110], acc89         // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+111], acc93         // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+112], acc97         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+113], acc101        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+114], acc105        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+115], acc109        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+116], acc113        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+117], acc117        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+118], acc121        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+119], acc125        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+120], acc129        // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+121], acc133        // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+122], acc137        // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+123], acc141        // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+124], acc145        // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+125], acc149        // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+126], acc153        // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+127], acc157        // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+136], acc161        // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+137], acc165        // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+138], acc169        // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+139], acc173        // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+140], acc177        // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+141], acc181        // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+142], acc185        // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+143], acc189        // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+144], acc193        // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+145], acc197        // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+146], acc201        // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+147], acc205        // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+148], acc209        // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+149], acc213        // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+150], acc217        // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+151], acc221        // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+152], acc225        // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+153], acc229        // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+154], acc233        // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+155], acc237        // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+156], acc241        // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+157], acc245        // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+158], acc249        // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+159], acc253        // copy acc to vreg[127]
v_accvgpr_read_b32 v[vgprValuC+160], acc2          // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+161], acc6          // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+162], acc10         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+163], acc14         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+164], acc18         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+165], acc22         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+166], acc26         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+167], acc30         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+168], acc34         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+169], acc38         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+170], acc42         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+171], acc46         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+172], acc50         // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+173], acc54         // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+174], acc58         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+175], acc62         // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+176], acc66         // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+177], acc70         // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+178], acc74         // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+179], acc78         // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+180], acc82         // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+181], acc86         // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+182], acc90         // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+183], acc94         // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+184], acc98         // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+185], acc102        // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+186], acc106        // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+187], acc110        // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+188], acc114        // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+189], acc118        // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+190], acc122        // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+191], acc126        // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+192], acc130        // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+193], acc134        // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+194], acc138        // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+195], acc142        // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+196], acc146        // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+197], acc150        // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+198], acc154        // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+199], acc158        // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+200], acc162        // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+201], acc166        // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+202], acc170        // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+203], acc174        // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+204], acc178        // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+205], acc182        // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+206], acc186        // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+207], acc190        // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+208], acc194        // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+209], acc198        // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+210], acc202        // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+211], acc206        // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+212], acc210        // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+213], acc214        // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+214], acc218        // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+215], acc222        // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+216], acc226        // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+217], acc230        // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+218], acc234        // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+219], acc238        // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+220], acc242        // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+221], acc246        // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+222], acc250        // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+223], acc254        // copy acc to vreg[191]
v_accvgpr_read_b32 v[vgprValuC+224], acc3          // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+225], acc7          // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+226], acc11         // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+227], acc15         // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+228], acc19         // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+229], acc23         // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+230], acc27         // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+231], acc31         // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+232], acc35         // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+233], acc39         // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+234], acc43         // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+235], acc47         // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+236], acc51         // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+237], acc55         // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+238], acc59         // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+239], acc63         // copy acc to vreg[207]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0)] */

/* apply mask, calc new C and issue writes */
buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[28:31], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[36:39], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[44:47], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[52:55], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[60:63], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[68:71], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[76:79], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[80:83], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[84:87], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[92:95], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[100:103], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[108:111], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[116:119], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[124:127], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[140:143], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[144:147], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[148:151], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[152:155], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[156:159], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[160:163], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[164:167], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[168:171], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[172:175], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[176:179], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[180:183], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[184:187], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[188:191], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[192:195], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[196:199], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[200:203], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[204:207], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[208:211], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[212:215], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[216:219], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[220:223], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[224:227], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[228:231], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[232:235], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[236:239], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */
/******************************************/
/* Overview of this batch (last batch before the jump to label_GW_End_1):   */
/*  1. Copy 48 accumulators (acc67, acc71, ..., acc255 - every 4th          */
/*     register) into v[vgprValuC+24] .. v[vgprValuC+71].                   */
/*  2. For each of the 6 elements listed above: add StrideD1J << 2 to       */
/*     SrdD+0 (carry propagated into SrdD+1), then write 8 dwords with two  */
/*     buffer_store_dwordx4 at byte offsets 0 and 16.                       */
/* v15 supplies the per-lane byte offset (offen); it is not written here.   */
/* NOTE(review): the stores name v[24:71] literally while the copies target */
/* v[vgprValuC+24..71]; the two agree only if vgprValuC == 0 - confirm      */
/* against the .set definitions earlier in the file.                        */

/* calc coords, apply mask, and issue loads (if necessary) */
/* No coordinate, mask or load instructions appear for this step in this    */
/* batch; only the accumulator copies below follow.                         */
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
/* The "vreg[N]" index in the comments below is not the destination VGPR    */
/* number. For every line of this batch N = (acc % 4) * 64 + acc / 4, e.g.  */
/* acc67 -> 3*64 + 16 = 208; presumably the generator's logical element     */
/* index - TODO confirm.                                                    */
v_accvgpr_read_b32 v[vgprValuC+24], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+25], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+26], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+27], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+28], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+29], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+30], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+31], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+32], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+33], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+34], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+35], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+36], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+37], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+38], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+39], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+40], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+41], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+42], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+43], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+44], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+45], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+46], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+47], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+48], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+49], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+50], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+51], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+52], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+53], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+54], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+55], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+56], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+57], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+58], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+59], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+60], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+61], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+62], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+63], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+64], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+65], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+66], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+67], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+68], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+69], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+70], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+71], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */
/* No scaling instructions follow this marker; the copied values are stored */
/* unchanged.                                                               */

/* apply mask, calc new C and issue writes */
/* Each group below first advances the descriptor base by one row and then  */
/* stores that row, so the base is already moved before the first store of  */
/* this batch.                                                              */
/* NOTE(review): s12 is rebuilt from StrideD1J before every row although no */
/* visible instruction in between writes s12 or StrideD1J - candidate for   */
/* hoisting; confirm there is no SGPR aliasing before changing it.          */
/* element 1 of 6 in header order: data v[24:31] */
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[28:31], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
/* element 2 of 6: data v[32:39] */
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[36:39], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
/* element 3 of 6: data v[40:47] */
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[44:47], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
/* element 4 of 6: data v[48:55] */
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[52:55], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
/* element 5 of 6: data v[56:63] */
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[60:63], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
/* element 6 of 6: data v[64:71] */
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[68:71], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
s_branch label_GW_End_1                            // jump to end
label_GW_B0_E1_N:

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=24 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Address setup for the 24 elements of this edge batch; no load            */
/* instructions appear in this phase. Register roles as used below:         */
/*   v4  - coord0, compared against SizeI (never written in this phase)     */
/*   v5  - coord1, compared against SizeJ; incremented by 1 per element     */
/*   v6  - cinRowPtr, advanced by StrideC1J per element (not otherwise      */
/*         read in this phase)                                              */
/*   v7  - coutRowPtrD, advanced by StrideD1J per element                   */
/*   v10 - BufferOOB, substituted for the offset of out-of-range lanes      */
/*   s[30:31], s[34:35] - lane masks: in0, then in0 && in1                  */
/* Per element: offset = (v7 + v4) << 2, kept only where the mask is set.   */
/* Resulting offsets for elements 0..23: v15, v128-v131, v135, v216-v233;   */
/* presumably consumed by the stores later in this batch (not visible in    */
/* this phase).                                                             */
/* NOTE(review): v4 and SizeI do not change in this phase, so the in0 mask  */
/* in s[30:31] is recomputed from the same inputs for every element.        */
v_mov_b32 v10, BufferOOB                           // v10 = BufferOOB constant (defined outside this chunk)
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v15, v7, v4, 0x2                    // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v15, v10, v15, s[34:35]              // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1 (vcc carry-out not read in this phase)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v128, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v128, v10, v128, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v129, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v129, v10, v129, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v130, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v130, v10, v130, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v131, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v131, v10, v131, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v135, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v135, v10, v135, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v216, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v216, v10, v216, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v217, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v217, v10, v217, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v218, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v218, v10, v218, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v219, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v219, v10, v219, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v220, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v220, v10, v220, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v221, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v221, v10, v221, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v222, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v222, v10, v222, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v223, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v223, v10, v223, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v224, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v224, v10, v224, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v225, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v225, v10, v225, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v226, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v226, v10, v226, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v227, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v227, v10, v227, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v228, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v228, v10, v228, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v229, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v229, v10, v229, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v230, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v230, v10, v230, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v231, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v231, v10, v231, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v232, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v232, v10, v232, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: move to the next vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v233, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v233, v10, v233, s[34:35]            // in-bounds lanes keep offset, others get BufferOOB
/* Copy this batch's accumulator values from AccVGPRs into ordinary VGPRs so
 * the buffer stores that follow can use them as store data.
 * 192 values are copied in three runs of 64, each walking the accumulator
 * file with a stride of 4:
 *   acc0, acc4, ..., acc252  -> v[vgprValuC+16  .. +79]
 *   acc1, acc5, ..., acc253  -> v[vgprValuC+80  .. +127], then +136 .. +151
 *   acc2, acc6, ..., acc254  -> v[vgprValuC+152 .. +215]
 * Destinations +128..+135 are skipped; the stores after this block use
 * v128-v131 and v135 as per-row offset registers.
 * The vreg[n] index in each trailing comment is the running element number,
 * not the destination register number.
 * NOTE(review): the stores name their data as literal v16.. registers, which
 * lines up with these destinations only if vgprValuC == 0; its definition is
 * outside this view -- confirm. */
v_accvgpr_read_b32 v[vgprValuC+16], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+17], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+18], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+19], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+20], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+21], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+22], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+23], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+24], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+25], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+26], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+27], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+28], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+29], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+30], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+31], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+32], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+33], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+34], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+35], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+36], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+37], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+38], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+39], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+40], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+41], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+42], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+43], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+44], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+45], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+46], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+47], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+48], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+49], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+50], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+51], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+52], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+53], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+54], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+55], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+56], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+57], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+58], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+59], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+60], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+61], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+62], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+63], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+64], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+65], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+66], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+67], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+68], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+69], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+70], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+71], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+72], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+73], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+74], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+75], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+76], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+77], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+78], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+79], acc252         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+80], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+81], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+82], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+83], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+84], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+85], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+86], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+87], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+88], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+89], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+90], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+91], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+92], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+93], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+94], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+95], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+96], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+97], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+98], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+99], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+100], acc81         // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+101], acc85         // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+102], acc89         // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+103], acc93         // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+104], acc97         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+105], acc101        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+106], acc105        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+107], acc109        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+108], acc113        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+109], acc117        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+110], acc121        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+111], acc125        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+112], acc129        // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+113], acc133        // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+114], acc137        // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+115], acc141        // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+116], acc145        // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+117], acc149        // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+118], acc153        // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+119], acc157        // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+120], acc161        // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+121], acc165        // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+122], acc169        // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+123], acc173        // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+124], acc177        // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+125], acc181        // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+126], acc185        // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+127], acc189        // copy acc to vreg[111]
/* destination jumps from +127 to +136 here: +128..+135 are not used for values */
v_accvgpr_read_b32 v[vgprValuC+136], acc193        // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+137], acc197        // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+138], acc201        // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+139], acc205        // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+140], acc209        // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+141], acc213        // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+142], acc217        // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+143], acc221        // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+144], acc225        // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+145], acc229        // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+146], acc233        // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+147], acc237        // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+148], acc241        // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+149], acc245        // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+150], acc249        // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+151], acc253        // copy acc to vreg[127]
v_accvgpr_read_b32 v[vgprValuC+152], acc2          // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+153], acc6          // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+154], acc10         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+155], acc14         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+156], acc18         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+157], acc22         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+158], acc26         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+159], acc30         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+160], acc34         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+161], acc38         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+162], acc42         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+163], acc46         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+164], acc50         // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+165], acc54         // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+166], acc58         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+167], acc62         // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+168], acc66         // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+169], acc70         // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+170], acc74         // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+171], acc78         // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+172], acc82         // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+173], acc86         // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+174], acc90         // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+175], acc94         // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+176], acc98         // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+177], acc102        // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+178], acc106        // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+179], acc110        // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+180], acc114        // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+181], acc118        // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+182], acc122        // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+183], acc126        // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+184], acc130        // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+185], acc134        // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+186], acc138        // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+187], acc142        // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+188], acc146        // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+189], acc150        // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+190], acc154        // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+191], acc158        // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+192], acc162        // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+193], acc166        // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+194], acc170        // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+195], acc174        // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+196], acc178        // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+197], acc182        // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+198], acc186        // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+199], acc190        // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+200], acc194        // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+201], acc198        // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+202], acc202        // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+203], acc206        // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+204], acc210        // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+205], acc214        // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+206], acc218        // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+207], acc222        // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+208], acc226        // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+209], acc230        // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+210], acc234        // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+211], acc238        // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+212], acc242        // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+213], acc246        // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+214], acc250        // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+215], acc254        // copy acc to vreg[191]

/* Store phase for the 24 rows of this batch.
 * No alpha-scaling instruction is emitted under the "rC *= alpha" marker
 * below; the values copied out of the accumulators are stored as they are.
 * Each row is written as 8 dwords via two buffer_store_dwordx4 (offset:0 and
 * offset:16) that share one per-row offset VGPR: v15, v128-v131, v135 and
 * v216-v233, in row order. Only the setup of v230-v233 is visible in this
 * chunk; the others are presumably produced by the same address-setup
 * pattern earlier -- confirm.
 * All stores go through the sgprSrdD buffer descriptor with the nt modifier.
 * NOTE(review): offsets are scaled by 4 bytes and 32-bit values are stored
 * unconverted, while the kernel metadata lists DestDataType b; presumably
 * this path writes fp32 data on purpose -- confirm against the generator. */
/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0)] */

/* apply mask, calc new C and issue writes */
buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[20:23], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[24:27], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[28:31], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[32:35], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[36:39], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[40:43], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[44:47], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[48:51], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[52:55], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[56:59], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[60:63], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[64:67], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[68:71], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[72:75], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[76:79], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[80:83], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[84:87], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[88:91], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[92:95], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[96:99], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[100:103], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[104:107], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[108:111], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[112:115], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[116:119], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[120:123], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[124:127], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[136:139], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[140:143], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[144:147], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[148:151], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[152:155], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[156:159], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[160:163], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[164:167], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[168:171], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[172:175], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[176:179], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[180:183], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[184:187], v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[188:191], v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[192:195], v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[196:199], v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[200:203], v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[204:207], v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[208:211], v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[212:215], v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */
/******************************************/

/* Overview of this batch (8 rows, vc1 = 24..31):
 *   1. Address setup: for each row, step coord1 (v5) and the C/D row pointers
 *      (v6/v7), bounds-test coord0 (v4) against SizeI and coord1 against
 *      SizeJ, and build a per-row byte offset for D in v15, v80..v86.
 *      Lanes that fail the test get v10 (BufferOOB) as their offset.
 *   2. Copy acc3, acc7, ..., acc255 (stride 4) into v[vgprValuC+16 .. +79].
 *   3. Store each row as two buffer_store_dwordx4 through sgprSrdD, then
 *      branch to label_GW_End_1.
 * v6 (cinRowPtr) is advanced every row but never read inside this batch, and
 * the carry written to vcc by v_add_co_u32 is not consumed here.
 * NOTE(review): BufferOOB is defined outside this view; presumably an offset
 * the buffer bounds check rejects, so clipped lanes write nothing -- confirm. */

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v10, BufferOOB                           // offset substituted for out-of-bounds lanes by the v_cndmask_b32 below
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v15, v7, v4, 0x2                    // byte offset for D store: (coutRowPtrD + coord0) << 2
v_cndmask_b32 v15, v10, v15, s[34:35]              // OOB clip: keep offset where in0 && in1, else v10
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v80, v7, v4, 0x2                    // byte offset for D store: (coutRowPtrD + coord0) << 2
v_cndmask_b32 v80, v10, v80, s[34:35]              // OOB clip: keep offset where in0 && in1, else v10
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v81, v7, v4, 0x2                    // byte offset for D store: (coutRowPtrD + coord0) << 2
v_cndmask_b32 v81, v10, v81, s[34:35]              // OOB clip: keep offset where in0 && in1, else v10
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v82, v7, v4, 0x2                    // byte offset for D store: (coutRowPtrD + coord0) << 2
v_cndmask_b32 v82, v10, v82, s[34:35]              // OOB clip: keep offset where in0 && in1, else v10
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v83, v7, v4, 0x2                    // byte offset for D store: (coutRowPtrD + coord0) << 2
v_cndmask_b32 v83, v10, v83, s[34:35]              // OOB clip: keep offset where in0 && in1, else v10
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v84, v7, v4, 0x2                    // byte offset for D store: (coutRowPtrD + coord0) << 2
v_cndmask_b32 v84, v10, v84, s[34:35]              // OOB clip: keep offset where in0 && in1, else v10
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v85, v7, v4, 0x2                    // byte offset for D store: (coutRowPtrD + coord0) << 2
v_cndmask_b32 v85, v10, v85, s[34:35]              // OOB clip: keep offset where in0 && in1, else v10
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v86, v7, v4, 0x2                    // byte offset for D store: (coutRowPtrD + coord0) << 2
v_cndmask_b32 v86, v10, v86, s[34:35]              // OOB clip: keep offset where in0 && in1, else v10
/* Copy acc3, acc7, ..., acc255 into v[vgprValuC+16 .. +79] as store data.
 * The vreg[n] index in each trailing comment is the running element number
 * (192..255), not the destination register number.
 * NOTE(review): the stores below name literal v16..v79, which matches these
 * destinations only if vgprValuC == 0 (defined outside this view) -- confirm. */
v_accvgpr_read_b32 v[vgprValuC+16], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+17], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+18], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+19], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+20], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+21], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+22], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+23], acc31          // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+24], acc35          // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+25], acc39          // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+26], acc43          // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+27], acc47          // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+28], acc51          // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+29], acc55          // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+30], acc59          // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+31], acc63          // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+32], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+33], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+34], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+35], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+36], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+37], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+38], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+39], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+40], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+41], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+42], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+43], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+44], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+45], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+46], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+47], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+48], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+49], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+50], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+51], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+52], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+53], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+54], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+55], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+56], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+57], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+58], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+59], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+60], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+61], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+62], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+63], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+64], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+65], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+66], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+67], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+68], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+69], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+70], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+71], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+72], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+73], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+74], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+75], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+76], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+77], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+78], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+79], acc255         // copy acc to vreg[255]

/* No alpha-scaling instruction is emitted under the marker below; the copied
 * values are stored as they are. */
/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */

/* apply mask, calc new C and issue writes */
/* Two dwordx4 stores per row (offset:0 and offset:16) share that row's offset
 * VGPR: v15 for row 24, v80..v86 for rows 25..31. */
buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[20:23], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[24:27], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[28:31], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[32:35], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[36:39], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[40:43], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[44:47], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[48:51], v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[52:55], v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[56:59], v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[60:63], v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[64:67], v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[68:71], v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
buffer_store_dwordx4 v[72:75], v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dwordx4 v[76:79], v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:16 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
s_branch label_GW_End_1                            // jump to end
label_GW_B0_E1_M:
/* Entry of the edge (bounds-checked) global-write path. The code that follows compares each */
/* element's coordinates against SizeI/SizeJ and replaces the store offset with BufferOOB when */
/* the element lies outside the matrix. */
/* NOTE(review): the label suffix presumably encodes beta=0 (B0) and edge=1 (E1) - confirm */
/* against the kernel generator. */

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=114 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Row vc1=0 of edge batch #0: build one clipped offset per element vc0=0..7. */
/* Register roles in this span: v4 = coord0, v5 = coord1, v7 = row base added to coord0, */
/* v8 = scratch (coord0 + vc0), v10 = BufferOOB substitute, s[30:31]/s[34:35] = lane masks. */
/* Results land in v129, v130, v131, v135..v139 (v132..v134 are skipped here; presumably */
/* reserved by code outside this view - confirm). */
/* Per element: offset = (v7 + coord0 + vc0) << 2, kept when both bounds checks pass, */
/* otherwise replaced by v10. */
/* NOTE(review): the per-line comment says "Cin addr", but the base register is v7, which */
/* later row-increment comments call coutRowPtrD; this looks like generator boilerplate wording. */
/* NOTE(review): the coord1 < SizeJ compare is re-issued for every element although v5 does */
/* not change within the row. */
v_mov_b32 v10, BufferOOB
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v129, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v129, v10, v129, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,1) */
/* The vcc carry-out of v_add_co_u32 is not consumed anywhere in this span. */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v130, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v130, v10, v130, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v131, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v131, v10, v131, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v135, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v135, v10, v135, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v136, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v136, v10, v136, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v137, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v137, v10, v137, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v138, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v138, v10, v138, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v139, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v139, v10, v139, s[34:35]            // LDD clip if OOB. offset
/* Row vc1=1 of edge batch #0: step coord1 (v5) by one and advance the row pointers v6/v7 by */
/* StrideC1J/StrideD1J, then build the clipped offsets for vc0=0..7 in v140..v147. */
/* Per element: offset = (v7 + coord0 + vc0) << 2, kept when coord0 < SizeI and coord1 < SizeJ, */
/* otherwise replaced by v10 (loaded with BufferOOB at the start of the batch). */
/* v6 is advanced here but not read anywhere in this span. */
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v140, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v140, v10, v140, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v141, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v141, v10, v141, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v142, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v142, v10, v142, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v143, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v143, v10, v143, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v144, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v144, v10, v144, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v145, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v145, v10, v145, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v146, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v146, v10, v146, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v147, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v147, v10, v147, s[34:35]            // LDD clip if OOB. offset
/* Row vc1=2 of edge batch #0: step coord1 (v5) by one and advance the row pointers v6/v7 by */
/* StrideC1J/StrideD1J, then build the clipped offsets for vc0=0..7 in v148..v155. */
/* Per element: offset = (v7 + coord0 + vc0) << 2, kept when coord0 < SizeI and coord1 < SizeJ, */
/* otherwise replaced by v10 (loaded with BufferOOB at the start of the batch). */
/* v6 is advanced here but not read anywhere in this span. */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v148, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v148, v10, v148, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v149, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v149, v10, v149, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v150, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v150, v10, v150, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v151, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v151, v10, v151, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v152, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v152, v10, v152, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v153, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v153, v10, v153, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v154, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v154, v10, v154, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v155, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v155, v10, v155, s[34:35]            // LDD clip if OOB. offset
/* Row vc1=3 of edge batch #0: step coord1 (v5) by one and advance the row pointers v6/v7 by */
/* StrideC1J/StrideD1J, then build the clipped offsets for vc0=0..7 in v156..v163. */
/* Per element: offset = (v7 + coord0 + vc0) << 2, kept when coord0 < SizeI and coord1 < SizeJ, */
/* otherwise replaced by v10 (loaded with BufferOOB at the start of the batch). */
/* v6 is advanced here but not read anywhere in this span. */
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v156, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v156, v10, v156, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v157, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v157, v10, v157, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v158, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v158, v10, v158, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v159, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v159, v10, v159, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v160, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v160, v10, v160, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v161, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v161, v10, v161, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v162, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v162, v10, v162, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v163, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v163, v10, v163, s[34:35]            // LDD clip if OOB. offset
/* Row vc1=4 of edge batch #0: step coord1 (v5) by one and advance the row pointers v6/v7 by */
/* StrideC1J/StrideD1J, then build the clipped offsets for vc0=0..7 in v164..v171. */
/* Per element: offset = (v7 + coord0 + vc0) << 2, kept when coord0 < SizeI and coord1 < SizeJ, */
/* otherwise replaced by v10 (loaded with BufferOOB at the start of the batch). */
/* v6 is advanced here but not read anywhere in this span. */
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v164, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v164, v10, v164, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v165, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v165, v10, v165, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v166, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v166, v10, v166, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v167, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v167, v10, v167, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v168, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v168, v10, v168, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v169, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v169, v10, v169, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v170, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v170, v10, v170, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v171, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v171, v10, v171, s[34:35]            // LDD clip if OOB. offset
/* Row vc1=5 of edge batch #0: step coord1 (v5) by one and advance the row pointers v6/v7 by */
/* StrideC1J/StrideD1J, then build the clipped offsets for vc0=0..7 in v172..v179. */
/* Per element: offset = (v7 + coord0 + vc0) << 2, kept when coord0 < SizeI and coord1 < SizeJ, */
/* otherwise replaced by v10 (loaded with BufferOOB at the start of the batch). */
/* v6 is advanced here but not read anywhere in this span. */
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v172, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v172, v10, v172, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v173, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v173, v10, v173, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v174, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v174, v10, v174, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v175, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v175, v10, v175, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v176, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v176, v10, v176, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v177, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v177, v10, v177, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v178, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v178, v10, v178, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v179, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v179, v10, v179, s[34:35]            // LDD clip if OOB. offset
/* Row vc1=6 of edge batch #0: step coord1 (v5) by one and advance the row pointers v6/v7 by */
/* StrideC1J/StrideD1J, then build the clipped offsets for vc0=0..7 in v180..v187. */
/* Per element: offset = (v7 + coord0 + vc0) << 2, kept when coord0 < SizeI and coord1 < SizeJ, */
/* otherwise replaced by v10 (loaded with BufferOOB at the start of the batch). */
/* v6 is advanced here but not read anywhere in this span. */
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v180, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v180, v10, v180, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v181, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v181, v10, v181, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v182, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v182, v10, v182, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v183, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v183, v10, v183, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v184, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v184, v10, v184, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v185, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v185, v10, v185, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v186, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v186, v10, v186, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v187, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v187, v10, v187, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
/* Row vc1=7: advance coord1 (v5) and the row pointers v6/v7 by one row, then build */
/* the per-lane offsets v188..v195 for vc0=0..7. For each element:                  */
/*   s[34:35] = (coord0 < SizeI) && (coord1 < SizeJ)   (s[30:31] is scratch)        */
/*   offset   = (v7 + coord0) << 2, or v10 in lanes where s[34:35] is clear         */
/* v8 holds coord0 (v4) + vc0 and is rewritten for every vc0 > 0.                   */
/* NOTE(review): v10 is written outside this chunk - presumably the OOB clip offset; confirm. */
/* NOTE(review): v6 is advanced but not read in this row - presumably used by C-read addressing elsewhere; confirm. */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr (v5) += 1, stepping to row vc1=7

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v188, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v188, v10, v188, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,7,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v189, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v189, v10, v189, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,7,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v190, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v190, v10, v190, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,7,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v191, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v191, v10, v191, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,7,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v192, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v192, v10, v192, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,7,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v193, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v193, v10, v193, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,7,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v194, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v194, v10, v194, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,7,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v195, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v195, v10, v195, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
/* Row vc1=8: advance coord1 (v5) and the row pointers v6/v7 by one row, then build */
/* the per-lane offsets v196..v203 for vc0=0..7. For each element:                  */
/*   s[34:35] = (coord0 < SizeI) && (coord1 < SizeJ)   (s[30:31] is scratch)        */
/*   offset   = (v7 + coord0) << 2, or v10 in lanes where s[34:35] is clear         */
/* v8 holds coord0 (v4) + vc0 and is rewritten for every vc0 > 0.                   */
/* NOTE(review): v10 is written outside this chunk - presumably the OOB clip offset; confirm. */
/* NOTE(review): v6 is advanced but not read in this row - presumably used by C-read addressing elsewhere; confirm. */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr (v5) += 1, stepping to row vc1=8

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v196, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v196, v10, v196, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,8,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v197, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v197, v10, v197, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,8,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v198, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v198, v10, v198, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,8,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v199, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v199, v10, v199, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,8,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v200, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v200, v10, v200, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,8,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v201, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v201, v10, v201, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,8,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v202, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v202, v10, v202, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,8,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v203, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v203, v10, v203, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
/* Row vc1=9: advance coord1 (v5) and the row pointers v6/v7 by one row, then build */
/* the per-lane offsets v204..v211 for vc0=0..7. For each element:                  */
/*   s[34:35] = (coord0 < SizeI) && (coord1 < SizeJ)   (s[30:31] is scratch)        */
/*   offset   = (v7 + coord0) << 2, or v10 in lanes where s[34:35] is clear         */
/* v8 holds coord0 (v4) + vc0 and is rewritten for every vc0 > 0.                   */
/* NOTE(review): v10 is written outside this chunk - presumably the OOB clip offset; confirm. */
/* NOTE(review): v6 is advanced but not read in this row - presumably used by C-read addressing elsewhere; confirm. */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr (v5) += 1, stepping to row vc1=9

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v204, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v204, v10, v204, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,9,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v205, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v205, v10, v205, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,9,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v206, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v206, v10, v206, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,9,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v207, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v207, v10, v207, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,9,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v208, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v208, v10, v208, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,9,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v209, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v209, v10, v209, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,9,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v210, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v210, v10, v210, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,9,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v211, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v211, v10, v211, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
/* Row vc1=10: advance coord1 (v5) and the row pointers v6/v7 by one row, then build */
/* the per-lane offsets v212..v219 for vc0=0..7. For each element:                   */
/*   s[34:35] = (coord0 < SizeI) && (coord1 < SizeJ)   (s[30:31] is scratch)         */
/*   offset   = (v7 + coord0) << 2, or v10 in lanes where s[34:35] is clear          */
/* v8 holds coord0 (v4) + vc0 and is rewritten for every vc0 > 0.                    */
/* NOTE(review): v10 is written outside this chunk - presumably the OOB clip offset; confirm. */
/* NOTE(review): v6 is advanced but not read in this row - presumably used by C-read addressing elsewhere; confirm. */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr (v5) += 1, stepping to row vc1=10

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v212, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v212, v10, v212, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,10,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v213, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v213, v10, v213, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,10,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v214, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v214, v10, v214, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,10,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v215, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v215, v10, v215, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,10,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v216, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v216, v10, v216, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,10,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v217, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v217, v10, v217, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,10,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v218, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v218, v10, v218, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,10,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v219, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v219, v10, v219, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
/* Row vc1=11: advance coord1 (v5) and the row pointers v6/v7 by one row, then build */
/* the per-lane offsets v220..v227 for vc0=0..7. For each element:                   */
/*   s[34:35] = (coord0 < SizeI) && (coord1 < SizeJ)   (s[30:31] is scratch)         */
/*   offset   = (v7 + coord0) << 2, or v10 in lanes where s[34:35] is clear          */
/* v8 holds coord0 (v4) + vc0 and is rewritten for every vc0 > 0.                    */
/* NOTE(review): v10 is written outside this chunk - presumably the OOB clip offset; confirm. */
/* NOTE(review): v6 is advanced but not read in this row - presumably used by C-read addressing elsewhere; confirm. */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr (v5) += 1, stepping to row vc1=11

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v220, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v220, v10, v220, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,11,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v221, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v221, v10, v221, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,11,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v222, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v222, v10, v222, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,11,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v223, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v223, v10, v223, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,11,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v224, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v224, v10, v224, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,11,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v225, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v225, v10, v225, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,11,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v226, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v226, v10, v226, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,11,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v227, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v227, v10, v227, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
/* Row vc1=12: advance coord1 (v5) and the row pointers v6/v7 by one row, then build */
/* the per-lane offsets v228..v235 for vc0=0..7. For each element:                   */
/*   s[34:35] = (coord0 < SizeI) && (coord1 < SizeJ)   (s[30:31] is scratch)         */
/*   offset   = (v7 + coord0) << 2, or v10 in lanes where s[34:35] is clear          */
/* v8 holds coord0 (v4) + vc0 and is rewritten for every vc0 > 0.                    */
/* NOTE(review): v10 is written outside this chunk - presumably the OOB clip offset; confirm. */
/* NOTE(review): v6 is advanced but not read in this row - presumably used by C-read addressing elsewhere; confirm. */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr (v5) += 1, stepping to row vc1=12

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v228, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v228, v10, v228, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,12,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v229, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v229, v10, v229, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,12,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v230, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v230, v10, v230, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,12,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v231, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v231, v10, v231, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,12,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v232, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v232, v10, v232, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,12,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v233, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v233, v10, v233, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,12,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v234, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v234, v10, v234, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,12,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v235, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v235, v10, v235, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
/* Row vc1=13: advance coord1 (v5) and the row pointers v6/v7 by one row, then build */
/* the per-lane offsets v236..v243 for vc0=0..7. For each element:                   */
/*   s[34:35] = (coord0 < SizeI) && (coord1 < SizeJ)   (s[30:31] is scratch)         */
/*   offset   = (v7 + coord0) << 2, or v10 in lanes where s[34:35] is clear          */
/* v8 holds coord0 (v4) + vc0 and is rewritten for every vc0 > 0.                    */
/* NOTE(review): v10 is written outside this chunk - presumably the OOB clip offset; confirm. */
/* NOTE(review): v6 is advanced but not read in this row - presumably used by C-read addressing elsewhere; confirm. */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr (v5) += 1, stepping to row vc1=13

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v236, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v236, v10, v236, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,13,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v237, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v237, v10, v237, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,13,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v238, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v238, v10, v238, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,13,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v239, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v239, v10, v239, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,13,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v240, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v240, v10, v240, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,13,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v241, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v241, v10, v241, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,13,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v242, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v242, v10, v242, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,13,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v243, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v243, v10, v243, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
/* Row vc1=14 (first two elements only): advance coord1 (v5) and the row pointers    */
/* v6/v7 by one row, then build the per-lane offsets v244..v245 for vc0=0..1.        */
/*   s[34:35] = (coord0 < SizeI) && (coord1 < SizeJ)   (s[30:31] is scratch)         */
/*   offset   = (v7 + coord0) << 2, or v10 in lanes where s[34:35] is clear          */
/* The instructions that follow this stretch are accumulator reads, so vc0=2..7 of   */
/* this row are not set up here.                                                     */
/* NOTE(review): v10 is written outside this chunk - presumably the OOB clip offset; confirm. */
/* NOTE(review): v6 is advanced but not read in this row - presumably used by C-read addressing elsewhere; confirm. */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr (v5) += 1, stepping to row vc1=14

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v244, v7, v4, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v244, v10, v244, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* (d1,vc1,d0,vc0)=(0,14,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v245, v7, v8, 0x2                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v245, v10, v245, s[34:35]            // LDD clip if OOB: keep offset when in-bounds, else v10
/* Copy the 114 accumulator values of edge batch #0 into v[vgprValuC+15 .. vgprValuC+128].
 * The AccVGPRs are walked with a stride of 4: acc0, acc4, ..., acc252 (64 values, vreg[0..63]),
 * then acc1, acc5, ..., acc197 (50 values, vreg[64..113]).
 * Destination k (0-based) is v[vgprValuC+15+k]; the trailing comments give k as vreg[k]. */
v_accvgpr_read_b32 v[vgprValuC+15], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+16], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+17], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+18], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+19], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+20], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+21], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+22], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+23], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+24], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+25], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+26], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+27], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+28], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+29], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+30], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+31], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+32], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+33], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+34], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+35], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+36], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+37], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+38], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+39], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+40], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+41], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+42], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+43], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+44], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+45], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+46], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+47], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+48], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+49], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+50], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+51], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+52], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+53], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+54], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+55], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+56], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+57], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+58], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+59], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+60], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+61], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+62], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+63], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+64], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+65], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+66], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+67], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+68], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+69], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+70], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+71], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+72], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+73], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+74], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+75], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+76], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+77], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+78], acc252         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+79], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+80], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+81], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+82], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+83], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+84], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+85], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+86], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+87], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+88], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+89], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+90], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+91], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+92], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+93], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+94], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+95], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+96], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+97], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+98], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+99], acc81          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+100], acc85         // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+101], acc89         // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+102], acc93         // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+103], acc97         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+104], acc101        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+105], acc105        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+106], acc109        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+107], acc113        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+108], acc117        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+109], acc121        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+110], acc125        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+111], acc129        // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+112], acc133        // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+113], acc137        // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+114], acc141        // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+115], acc145        // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+116], acc149        // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+117], acc153        // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+118], acc157        // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+119], acc161        // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+120], acc165        // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+121], acc169        // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+122], acc173        // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+123], acc177        // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+124], acc181        // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+125], acc185        // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+126], acc189        // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+127], acc193        // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+128], acc197        // copy acc to vreg[113]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), (0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1)] */

/* apply mask, calc new C and issue writes */
/* Issue the 114 edge stores of batch #0: one dword per element through the D buffer descriptor
 * (sgprSrdD), each with its own per-lane byte offset (offen, soffset 0, offset:0, nt).
 * Data registers are v15..v128; offset registers are v129..v131 followed by v135..v245.
 * No alpha/beta arithmetic or format conversion appears between the acc copy and these stores
 * in this span; the masking happened earlier by substituting the OOB offset into the address VGPRs.
 * NOTE(review): data operands are spelled as raw v15.. while the acc copy writes
 * v[vgprValuC+15]..; this relies on vgprValuC == 0 - confirm. */
buffer_store_dword v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v33, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v39, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v56, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v62, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v67, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v83, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v90, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v96, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v106, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v113, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state before following code rewrites VGPRs used by the stores above. NOTE(review): generator cites a dwordx4-store hazard, but these stores are dword - confirm the nop is still required
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Edge batch #1, row vc1=14, columns vc0=2..7: reload the OOB substitute into v10, then build
 * one per-lane store offset per element, reusing offset VGPRs v129..v131 and v135..v137.
 * Pattern per element: v8 = v4 + vc0; mask = (v8 < SizeI) && (v5 < SizeJ);
 * offset = (v7 + v8) << 2, replaced by v10 in lanes where the mask is clear.
 * No load instructions appear in this span despite the generator heading above.
 * NOTE(review): BufferOOB is defined elsewhere; presumably an offset the buffer descriptor
 * rejects so masked lanes store nothing - confirm. */
v_mov_b32 v10, BufferOOB
/* (d1,vc1,d0,vc0)=(0,14,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 + 2 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v129, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v129, v10, v129, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,14,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 + 3 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v130, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v130, v10, v130, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,14,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 + 4 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v131, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v131, v10, v131, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,14,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 + 5 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v135, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v135, v10, v135, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,14,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 + 6 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v136, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v136, v10, v136, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,14,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 + 7 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v137, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v137, v10, v137, s[34:35]            // keep offset where in-bounds, else take v10
/* Edge batch #1, row vc1=15, columns vc0=0..7: step to the next row, then build eight store
 * offsets into v138..v145.
 * The row step bumps coord1 (v5) by 1 and adds the C/D row strides to v6/v7.
 * v6 is only updated here; the offset math in this span reads v7 (D row pointer) only.
 * Pattern per element: mask = (coord0 < SizeI) && (v5 < SizeJ);
 * offset = (v7 + coord0) << 2, replaced by v10 in lanes where the mask is clear. */
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1 (next row; carry-out goes to vcc)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // C row pointer += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // D row pointer += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < SizeI (column 0 uses v4 directly)
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v138, v7, v4, 0x2                   // byte offset = (v7 + v4) << 2
v_cndmask_b32 v138, v10, v138, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,15,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 + 1 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v139, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v139, v10, v139, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,15,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 + 2 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v140, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v140, v10, v140, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,15,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 + 3 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v141, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v141, v10, v141, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,15,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 + 4 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v142, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v142, v10, v142, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,15,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 + 5 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v143, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v143, v10, v143, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,15,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 + 6 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v144, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v144, v10, v144, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,15,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 + 7 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v145, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v145, v10, v145, s[34:35]            // keep offset where in-bounds, else take v10
/* Edge batch #1, row vc1=16, columns vc0=0..4: step to the next row, then build five store
 * offsets into v146..v150 (the remaining columns of this row follow this span).
 * The row step bumps coord1 (v5) by 1 and adds the C/D row strides to v6/v7.
 * v6 is only updated here; the offset math in this span reads v7 (D row pointer) only.
 * Pattern per element: mask = (coord0 < SizeI) && (v5 < SizeJ);
 * offset = (v7 + coord0) << 2, replaced by v10 in lanes where the mask is clear. */
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1 (next row; carry-out goes to vcc)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // C row pointer += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // D row pointer += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < SizeI (column 0 uses v4 directly)
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v146, v7, v4, 0x2                   // byte offset = (v7 + v4) << 2
v_cndmask_b32 v146, v10, v146, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,16,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 + 1 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v147, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v147, v10, v147, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,16,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 + 2 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v148, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v148, v10, v148, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,16,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 + 3 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v149, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v149, v10, v149, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,16,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 + 4 (carry-out goes to vcc)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds mask = in0 && in1
v_add_lshl_u32 v150, v7, v8, 0x2                   // byte offset = (v7 + v8) << 2
v_cndmask_b32 v150, v10, v150, s[34:35]            // keep offset where in-bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,16,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v151, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v151, v10, v151, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,16,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v152, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v152, v10, v152, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,16,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v153, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v153, v10, v153, s[34:35]            // LDD clip if OOB. offset
/* Offset setup for row vc1=17, elements vc0=0..7; results are written to v154..v161. */
/* Row advance: v5 += 1, v6 += StrideC1J, v7 += StrideD1J. */
/* Per element: v8 = v4 + vc0 (vc0=0 reads v4 directly, no add); */
/*   s[34:35] = (v8 < SizeI) && (v5 < SizeJ)   per-lane in-bounds mask, s[30:31] is scratch; */
/*   dst = (v7 + v8) << 2, then dst = in-bounds ? dst : v10. */
/* v8, vcc, s[30:31] and s[34:35] are overwritten for every element, so statement order matters. */
/* v6 is advanced here but is not read by any offset computation in this row. */
/* NOTE(review): v10 is presumably a clip offset prepared earlier; its setup is not visible here -- confirm. */
/* NOTE(review): element comments say "Cin addr" but the base is v7, labelled coutRowPtrD on the row-advance line -- confirm the target buffer. */
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v154, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v154, v10, v154, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v155, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v155, v10, v155, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v156, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v156, v10, v156, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v157, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v157, v10, v157, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v158, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v158, v10, v158, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v159, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v159, v10, v159, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v160, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v160, v10, v160, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v161, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v161, v10, v161, s[34:35]            // LDD clip if OOB. offset
/* Offset setup for row vc1=18, elements vc0=0..7; results are written to v162..v169. */
/* Row advance: v5 += 1, v6 += StrideC1J, v7 += StrideD1J. */
/* Per element: v8 = v4 + vc0 (vc0=0 reads v4 directly, no add); */
/*   s[34:35] = (v8 < SizeI) && (v5 < SizeJ)   per-lane in-bounds mask, s[30:31] is scratch; */
/*   dst = (v7 + v8) << 2, then dst = in-bounds ? dst : v10. */
/* v8, vcc, s[30:31] and s[34:35] are overwritten for every element, so statement order matters. */
/* v6 is advanced here but is not read by any offset computation in this row. */
/* NOTE(review): v10 is presumably a clip offset prepared earlier; its setup is not visible here -- confirm. */
/* NOTE(review): element comments say "Cin addr" but the base is v7, labelled coutRowPtrD on the row-advance line -- confirm the target buffer. */
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v162, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v162, v10, v162, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v163, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v163, v10, v163, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v164, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v164, v10, v164, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v165, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v165, v10, v165, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v166, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v166, v10, v166, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v167, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v167, v10, v167, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v168, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v168, v10, v168, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v169, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v169, v10, v169, s[34:35]            // LDD clip if OOB. offset
/* Offset setup for row vc1=19, elements vc0=0..7; results are written to v170..v177. */
/* Row advance: v5 += 1, v6 += StrideC1J, v7 += StrideD1J. */
/* Per element: v8 = v4 + vc0 (vc0=0 reads v4 directly, no add); */
/*   s[34:35] = (v8 < SizeI) && (v5 < SizeJ)   per-lane in-bounds mask, s[30:31] is scratch; */
/*   dst = (v7 + v8) << 2, then dst = in-bounds ? dst : v10. */
/* v8, vcc, s[30:31] and s[34:35] are overwritten for every element, so statement order matters. */
/* v6 is advanced here but is not read by any offset computation in this row. */
/* NOTE(review): v10 is presumably a clip offset prepared earlier; its setup is not visible here -- confirm. */
/* NOTE(review): element comments say "Cin addr" but the base is v7, labelled coutRowPtrD on the row-advance line -- confirm the target buffer. */
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v170, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v170, v10, v170, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v171, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v171, v10, v171, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v172, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v172, v10, v172, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v173, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v173, v10, v173, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v174, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v174, v10, v174, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v175, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v175, v10, v175, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v176, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v176, v10, v176, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v177, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v177, v10, v177, s[34:35]            // LDD clip if OOB. offset
/* Offset setup for row vc1=20, elements vc0=0..7; results are written to v178..v185. */
/* Row advance: v5 += 1, v6 += StrideC1J, v7 += StrideD1J. */
/* Per element: v8 = v4 + vc0 (vc0=0 reads v4 directly, no add); */
/*   s[34:35] = (v8 < SizeI) && (v5 < SizeJ)   per-lane in-bounds mask, s[30:31] is scratch; */
/*   dst = (v7 + v8) << 2, then dst = in-bounds ? dst : v10. */
/* v8, vcc, s[30:31] and s[34:35] are overwritten for every element, so statement order matters. */
/* v6 is advanced here but is not read by any offset computation in this row. */
/* NOTE(review): v10 is presumably a clip offset prepared earlier; its setup is not visible here -- confirm. */
/* NOTE(review): element comments say "Cin addr" but the base is v7, labelled coutRowPtrD on the row-advance line -- confirm the target buffer. */
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v178, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v178, v10, v178, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v179, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v179, v10, v179, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v180, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v180, v10, v180, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v181, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v181, v10, v181, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v182, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v182, v10, v182, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v183, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v183, v10, v183, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v184, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v184, v10, v184, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v185, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v185, v10, v185, s[34:35]            // LDD clip if OOB. offset
/* Offset setup for row vc1=21, elements vc0=0..7; results are written to v186..v193. */
/* Row advance: v5 += 1, v6 += StrideC1J, v7 += StrideD1J. */
/* Per element: v8 = v4 + vc0 (vc0=0 reads v4 directly, no add); */
/*   s[34:35] = (v8 < SizeI) && (v5 < SizeJ)   per-lane in-bounds mask, s[30:31] is scratch; */
/*   dst = (v7 + v8) << 2, then dst = in-bounds ? dst : v10. */
/* v8, vcc, s[30:31] and s[34:35] are overwritten for every element, so statement order matters. */
/* v6 is advanced here but is not read by any offset computation in this row. */
/* NOTE(review): v10 is presumably a clip offset prepared earlier; its setup is not visible here -- confirm. */
/* NOTE(review): element comments say "Cin addr" but the base is v7, labelled coutRowPtrD on the row-advance line -- confirm the target buffer. */
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v186, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v186, v10, v186, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v187, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v187, v10, v187, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v188, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v188, v10, v188, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v189, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v189, v10, v189, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v190, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v190, v10, v190, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v191, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v191, v10, v191, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v192, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v192, v10, v192, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v193, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v193, v10, v193, s[34:35]            // LDD clip if OOB. offset
/* Offset setup for row vc1=22, elements vc0=0..7; results are written to v194..v201. */
/* Row advance: v5 += 1, v6 += StrideC1J, v7 += StrideD1J. */
/* Per element: v8 = v4 + vc0 (vc0=0 reads v4 directly, no add); */
/*   s[34:35] = (v8 < SizeI) && (v5 < SizeJ)   per-lane in-bounds mask, s[30:31] is scratch; */
/*   dst = (v7 + v8) << 2, then dst = in-bounds ? dst : v10. */
/* v8, vcc, s[30:31] and s[34:35] are overwritten for every element, so statement order matters. */
/* v6 is advanced here but is not read by any offset computation in this row. */
/* NOTE(review): v10 is presumably a clip offset prepared earlier; its setup is not visible here -- confirm. */
/* NOTE(review): element comments say "Cin addr" but the base is v7, labelled coutRowPtrD on the row-advance line -- confirm the target buffer. */
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v194, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v194, v10, v194, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v195, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v195, v10, v195, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v196, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v196, v10, v196, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v197, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v197, v10, v197, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v198, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v198, v10, v198, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v199, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v199, v10, v199, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v200, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v200, v10, v200, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v201, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v201, v10, v201, s[34:35]            // LDD clip if OOB. offset
/* Offset setup for row vc1=23, elements vc0=0..7; results are written to v202..v209. */
/* Row advance: v5 += 1, v6 += StrideC1J, v7 += StrideD1J. */
/* Per element: v8 = v4 + vc0 (vc0=0 reads v4 directly, no add); */
/*   s[34:35] = (v8 < SizeI) && (v5 < SizeJ)   per-lane in-bounds mask, s[30:31] is scratch; */
/*   dst = (v7 + v8) << 2, then dst = in-bounds ? dst : v10. */
/* v8, vcc, s[30:31] and s[34:35] are overwritten for every element, so statement order matters. */
/* v6 is advanced here but is not read by any offset computation in this row. */
/* NOTE(review): v10 is presumably a clip offset prepared earlier; its setup is not visible here -- confirm. */
/* NOTE(review): element comments say "Cin addr" but the base is v7, labelled coutRowPtrD on the row-advance line -- confirm the target buffer. */
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v202, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v202, v10, v202, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v203, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v203, v10, v203, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v204, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v204, v10, v204, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v205, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v205, v10, v205, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v206, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v206, v10, v206, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v207, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v207, v10, v207, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v208, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v208, v10, v208, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v209, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v209, v10, v209, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
/*
 * Store-offset setup for row vc1=24, columns vc0=0..7 (results in v210..v217).
 * Row advance (done once, before the first element):
 *   v5 += 1                 row coordinate
 *   v6 += StrideC1J         C row pointer (updated here but not read in this block)
 *   v7 += StrideD1J         D row pointer
 * Per element:
 *   column = v4 for vc0=0, otherwise v8 = v4 + vc0 (carry-out in vcc is not read)
 *   s[34:35] = (column < SizeI) && (v5 < SizeJ)
 *   vN = (v7 + column) << 2, then vN = in-bounds lane ? vN : v10
 * v8, s[30:31] and s[34:35] are scratch and are overwritten by each element.
 * NOTE(review): v10 is set outside this view; presumably an out-of-range offset
 * that makes the later buffer store a no-op for masked lanes - confirm.
 * NOTE(review): the trailing "Cin addr" wording is generator boilerplate; the
 * base register actually summed is v7 (the D row pointer).
 */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v210, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v210, v10, v210, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v211, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v211, v10, v211, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v212, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v212, v10, v212, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v213, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v213, v10, v213, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v214, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v214, v10, v214, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v215, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v215, v10, v215, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v216, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v216, v10, v216, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v217, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v217, v10, v217, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
/*
 * Store-offset setup for row vc1=25, columns vc0=0..7 (results in v218..v225).
 * Row advance (done once, before the first element):
 *   v5 += 1                 row coordinate
 *   v6 += StrideC1J         C row pointer (updated here but not read in this block)
 *   v7 += StrideD1J         D row pointer
 * Per element:
 *   column = v4 for vc0=0, otherwise v8 = v4 + vc0 (carry-out in vcc is not read)
 *   s[34:35] = (column < SizeI) && (v5 < SizeJ)
 *   vN = (v7 + column) << 2, then vN = in-bounds lane ? vN : v10
 * v8, s[30:31] and s[34:35] are scratch and are overwritten by each element.
 * NOTE(review): v10 is set outside this view; presumably an out-of-range offset
 * that makes the later buffer store a no-op for masked lanes - confirm.
 * NOTE(review): the trailing "Cin addr" wording is generator boilerplate; the
 * base register actually summed is v7 (the D row pointer).
 */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v218, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v218, v10, v218, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v219, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v219, v10, v219, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v220, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v220, v10, v220, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v221, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v221, v10, v221, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v222, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v222, v10, v222, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v223, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v223, v10, v223, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v224, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v224, v10, v224, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v225, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v225, v10, v225, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
/*
 * Store-offset setup for row vc1=26, columns vc0=0..7 (results in v226..v233).
 * Row advance (done once, before the first element):
 *   v5 += 1                 row coordinate
 *   v6 += StrideC1J         C row pointer (updated here but not read in this block)
 *   v7 += StrideD1J         D row pointer
 * Per element:
 *   column = v4 for vc0=0, otherwise v8 = v4 + vc0 (carry-out in vcc is not read)
 *   s[34:35] = (column < SizeI) && (v5 < SizeJ)
 *   vN = (v7 + column) << 2, then vN = in-bounds lane ? vN : v10
 * v8, s[30:31] and s[34:35] are scratch and are overwritten by each element.
 * NOTE(review): v10 is set outside this view; presumably an out-of-range offset
 * that makes the later buffer store a no-op for masked lanes - confirm.
 * NOTE(review): the trailing "Cin addr" wording is generator boilerplate; the
 * base register actually summed is v7 (the D row pointer).
 */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v226, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v226, v10, v226, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v227, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v227, v10, v227, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v228, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v228, v10, v228, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v229, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v229, v10, v229, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v230, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v230, v10, v230, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v231, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v231, v10, v231, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v232, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v232, v10, v232, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v233, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v233, v10, v233, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
/*
 * Store-offset setup for row vc1=27, columns vc0=0..7 (results in v234..v241).
 * Row advance (done once, before the first element):
 *   v5 += 1                 row coordinate
 *   v6 += StrideC1J         C row pointer (updated here but not read in this block)
 *   v7 += StrideD1J         D row pointer
 * Per element:
 *   column = v4 for vc0=0, otherwise v8 = v4 + vc0 (carry-out in vcc is not read)
 *   s[34:35] = (column < SizeI) && (v5 < SizeJ)
 *   vN = (v7 + column) << 2, then vN = in-bounds lane ? vN : v10
 * v8, s[30:31] and s[34:35] are scratch and are overwritten by each element.
 * NOTE(review): v10 is set outside this view; presumably an out-of-range offset
 * that makes the later buffer store a no-op for masked lanes - confirm.
 * NOTE(review): the trailing "Cin addr" wording is generator boilerplate; the
 * base register actually summed is v7 (the D row pointer).
 */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v234, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v234, v10, v234, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v235, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v235, v10, v235, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v236, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v236, v10, v236, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v237, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v237, v10, v237, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v238, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v238, v10, v238, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v239, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v239, v10, v239, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v240, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v240, v10, v240, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v241, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v241, v10, v241, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
/*
 * Store-offset setup for row vc1=28, columns vc0=0..3 only (results in
 * v242..v245); the address setup stops after (0,28,0,3) and the accumulator
 * copies follow immediately.
 * Row advance (done once, before the first element):
 *   v5 += 1                 row coordinate
 *   v6 += StrideC1J         C row pointer (updated here but not read in this block)
 *   v7 += StrideD1J         D row pointer
 * Per element:
 *   column = v4 for vc0=0, otherwise v8 = v4 + vc0 (carry-out in vcc is not read)
 *   s[34:35] = (column < SizeI) && (v5 < SizeJ)
 *   vN = (v7 + column) << 2, then vN = in-bounds lane ? vN : v10
 * v8, s[30:31] and s[34:35] are scratch and are overwritten by each element.
 * NOTE(review): v10 is set outside this view; presumably an out-of-range offset
 * that makes the later buffer store a no-op for masked lanes - confirm.
 * NOTE(review): the trailing "Cin addr" wording is generator boilerplate; the
 * base register actually summed is v7 (the D row pointer).
 */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v242, v7, v4, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v242, v10, v242, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v243, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v243, v10, v243, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v244, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v244, v10, v244, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v245, v7, v8, 0x2                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v245, v10, v245, s[34:35]            // LDD clip if OOB. offset
/*
 * Copy 114 accumulator registers into v[vgprValuC+15 .. vgprValuC+128], one
 * v_accvgpr_read_b32 per value, so they can be used as store data.
 * The acc index advances by 4 per copy and wraps twice:
 *   acc201..acc253 (14 values) -> acc2..acc254 (64 values) -> acc3..acc143 (36 values)
 * The bracketed number in each trailing comment runs 114..227.
 * NOTE(review): that number appears to be the batch element index vc1*8+vc0
 * (114 = 14*8+2, 227 = 28*8+3, matching the batchElements list that follows),
 * not a physical VGPR number - confirm against the generator.
 */
v_accvgpr_read_b32 v[vgprValuC+15], acc201         // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+16], acc205         // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+17], acc209         // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+18], acc213         // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+19], acc217         // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+20], acc221         // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+21], acc225         // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+22], acc229         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+23], acc233         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+24], acc237         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+25], acc241         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+26], acc245         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+27], acc249         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+28], acc253         // copy acc to vreg[127]
/* acc index wraps: continue with acc2, acc6, ... acc254 */
v_accvgpr_read_b32 v[vgprValuC+29], acc2           // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+30], acc6           // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+31], acc10          // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+32], acc14          // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+33], acc18          // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+34], acc22          // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+35], acc26          // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+36], acc30          // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+37], acc34          // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+38], acc38          // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+39], acc42          // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+40], acc46          // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+41], acc50          // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+42], acc54          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+43], acc58          // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+44], acc62          // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+45], acc66          // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+46], acc70          // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+47], acc74          // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+48], acc78          // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+49], acc82          // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+50], acc86          // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+51], acc90          // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+52], acc94          // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+53], acc98          // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+54], acc102         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+55], acc106         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+56], acc110         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+57], acc114         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+58], acc118         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+59], acc122         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+60], acc126         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+61], acc130         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+62], acc134         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+63], acc138         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+64], acc142         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+65], acc146         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+66], acc150         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+67], acc154         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+68], acc158         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+69], acc162         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+70], acc166         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+71], acc170         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+72], acc174         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+73], acc178         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+74], acc182         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+75], acc186         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+76], acc190         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+77], acc194         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+78], acc198         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+79], acc202         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+80], acc206         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+81], acc210         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+82], acc214         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+83], acc218         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+84], acc222         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+85], acc226         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+86], acc230         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+87], acc234         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+88], acc238         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+89], acc242         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+90], acc246         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+91], acc250         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+92], acc254         // copy acc to vreg[191]
/* acc index wraps again: continue with acc3, acc7, ... acc143 */
v_accvgpr_read_b32 v[vgprValuC+93], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+94], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+95], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+96], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+97], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+98], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+99], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+100], acc31         // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+101], acc35         // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+102], acc39         // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+103], acc43         // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+104], acc47         // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+105], acc51         // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+106], acc55         // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+107], acc59         // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+108], acc63         // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+109], acc67         // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+110], acc71         // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+111], acc75         // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+112], acc79         // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+113], acc83         // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+114], acc87         // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+115], acc91         // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+116], acc95         // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+117], acc99         // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+118], acc103        // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+119], acc107        // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+120], acc111        // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+121], acc115        // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+122], acc119        // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+123], acc123        // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+124], acc127        // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+125], acc131        // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+126], acc135        // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+127], acc139        // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+128], acc143        // copy acc to vreg[227]

/* rC *= alpha batchElements=[(0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3)] */

/* apply mask, calc new C and issue writes */
buffer_store_dword v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v33, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v39, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v56, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v62, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v67, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v83, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v90, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v96, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v106, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v113, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
buffer_store_dword v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */
/******************************************/

/*
 * Address/mask phase for the 28 elements listed above (vc1 = 28..31).
 * Per element the same six-instruction pattern is emitted:
 *   v8       = v4 + vc0                      (vc0 == 0 uses v4 directly)
 *   s[30:31] = coord0 < SizeI
 *   s[34:35] = (v5 < SizeJ) && s[30:31]      (per-lane in-bounds mask)
 *   vNN      = (v7 + coord0) << 2            (NN = 43..70, one VGPR per element)
 *   vNN      = mask ? vNN : v10              (v10 = BufferOOB)
 * On each row change v5 is incremented by 1 and v6 / v7 are advanced by
 * StrideC1J / StrideD1J. The generator's stock header below mentions loads,
 * but no load instruction is issued anywhere in this phase.
 * Scratch registers written here: v8, v10, vcc, s[30:31], s[34:35].
 */
/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v10, BufferOOB
/* (d1,vc1,d0,vc0)=(0,28,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = v4 + 4 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v43, v7, v8, 0x2                    // v43 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v43, v10, v43, s[34:35]              // v43 = mask ? v43 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,28,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = v4 + 5 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v44, v7, v8, 0x2                    // v44 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v44, v10, v44, s[34:35]              // v44 = mask ? v44 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,28,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = v4 + 6 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v45, v7, v8, 0x2                    // v45 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v45, v10, v45, s[34:35]              // v45 = mask ? v45 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,28,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = v4 + 7 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v46, v7, v8, 0x2                    // v46 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v46, v10, v46, s[34:35]              // v46 = mask ? v46 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: step to row vc1=29

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // v6 (cinRowPtr) += StrideC1J; v6 is not read again in this batch
v_add_u32 v7, v7, s[sgprStrideD1J]                 // v7 (coutRowPtrD) += StrideD1J: D row base for the new row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 (v4, vc0=0) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v47, v7, v4, 0x2                    // v47 = (v7 + v4) << 2: D row base + coord0, x4
v_cndmask_b32 v47, v10, v47, s[34:35]              // v47 = mask ? v47 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,29,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = v4 + 1 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v48, v7, v8, 0x2                    // v48 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v48, v10, v48, s[34:35]              // v48 = mask ? v48 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,29,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = v4 + 2 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v49, v7, v8, 0x2                    // v49 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v49, v10, v49, s[34:35]              // v49 = mask ? v49 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,29,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = v4 + 3 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v50, v7, v8, 0x2                    // v50 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v50, v10, v50, s[34:35]              // v50 = mask ? v50 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,29,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = v4 + 4 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v51, v7, v8, 0x2                    // v51 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v51, v10, v51, s[34:35]              // v51 = mask ? v51 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,29,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = v4 + 5 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v52, v7, v8, 0x2                    // v52 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v52, v10, v52, s[34:35]              // v52 = mask ? v52 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,29,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = v4 + 6 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v53, v7, v8, 0x2                    // v53 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v53, v10, v53, s[34:35]              // v53 = mask ? v53 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,29,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = v4 + 7 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v54, v7, v8, 0x2                    // v54 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v54, v10, v54, s[34:35]              // v54 = mask ? v54 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: step to row vc1=30

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // v6 (cinRowPtr) += StrideC1J; v6 is not read again in this batch
v_add_u32 v7, v7, s[sgprStrideD1J]                 // v7 (coutRowPtrD) += StrideD1J: D row base for the new row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 (v4, vc0=0) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v55, v7, v4, 0x2                    // v55 = (v7 + v4) << 2: D row base + coord0, x4
v_cndmask_b32 v55, v10, v55, s[34:35]              // v55 = mask ? v55 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,30,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = v4 + 1 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v56, v7, v8, 0x2                    // v56 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v56, v10, v56, s[34:35]              // v56 = mask ? v56 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,30,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = v4 + 2 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v57, v7, v8, 0x2                    // v57 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v57, v10, v57, s[34:35]              // v57 = mask ? v57 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,30,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = v4 + 3 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v58, v7, v8, 0x2                    // v58 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v58, v10, v58, s[34:35]              // v58 = mask ? v58 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,30,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = v4 + 4 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v59, v7, v8, 0x2                    // v59 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v59, v10, v59, s[34:35]              // v59 = mask ? v59 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,30,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = v4 + 5 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v60, v7, v8, 0x2                    // v60 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v60, v10, v60, s[34:35]              // v60 = mask ? v60 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,30,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = v4 + 6 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v61, v7, v8, 0x2                    // v61 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v61, v10, v61, s[34:35]              // v61 = mask ? v61 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,30,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = v4 + 7 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v62, v7, v8, 0x2                    // v62 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v62, v10, v62, s[34:35]              // v62 = mask ? v62 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: step to row vc1=31

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // v6 (cinRowPtr) += StrideC1J; v6 is not read again in this batch
v_add_u32 v7, v7, s[sgprStrideD1J]                 // v7 (coutRowPtrD) += StrideD1J: D row base for the new row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 (v4, vc0=0) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v63, v7, v4, 0x2                    // v63 = (v7 + v4) << 2: D row base + coord0, x4
v_cndmask_b32 v63, v10, v63, s[34:35]              // v63 = mask ? v63 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,31,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = v4 + 1 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v64, v7, v8, 0x2                    // v64 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v64, v10, v64, s[34:35]              // v64 = mask ? v64 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,31,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = v4 + 2 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v65, v7, v8, 0x2                    // v65 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v65, v10, v65, s[34:35]              // v65 = mask ? v65 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,31,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = v4 + 3 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v66, v7, v8, 0x2                    // v66 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v66, v10, v66, s[34:35]              // v66 = mask ? v66 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,31,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = v4 + 4 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v67, v7, v8, 0x2                    // v67 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v67, v10, v67, s[34:35]              // v67 = mask ? v67 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,31,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = v4 + 5 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v68, v7, v8, 0x2                    // v68 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v68, v10, v68, s[34:35]              // v68 = mask ? v68 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,31,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = v4 + 6 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v69, v7, v8, 0x2                    // v69 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v69, v10, v69, s[34:35]              // v69 = mask ? v69 : v10 (BufferOOB)
/* (d1,vc1,d0,vc0)=(0,31,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = v4 + 7 (coord0 + vc0)
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // mask = in0 && in1
v_add_lshl_u32 v70, v7, v8, 0x2                    // v70 = (v7 + v8) << 2: D row base + coord0, x4
v_cndmask_b32 v70, v10, v70, s[34:35]              // v70 = mask ? v70 : v10 (BufferOOB)
/*
 * Copy the 28 accumulator results of this batch into ordinary VGPRs
 * v[vgprValuC+15] .. v[vgprValuC+42]. The source registers step by 4:
 * acc147, acc151, ..., acc255. The "vreg[N]" tags are the generator's own
 * index labels (228..255); their meaning is not visible in this section.
 * NOTE(review): the stores that follow name the data registers v15..v42,
 * which matches these destinations only if vgprValuC is 0 - confirm against
 * the vgprValuC definition.
 */
v_accvgpr_read_b32 v[vgprValuC+15], acc147         // copy acc147 to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+16], acc151         // copy acc151 to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+17], acc155         // copy acc155 to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+18], acc159         // copy acc159 to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+19], acc163         // copy acc163 to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+20], acc167         // copy acc167 to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+21], acc171         // copy acc171 to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+22], acc175         // copy acc175 to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+23], acc179         // copy acc179 to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+24], acc183         // copy acc183 to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+25], acc187         // copy acc187 to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+26], acc191         // copy acc191 to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+27], acc195         // copy acc195 to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+28], acc199         // copy acc199 to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+29], acc203         // copy acc203 to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+30], acc207         // copy acc207 to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+31], acc211         // copy acc211 to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+32], acc215         // copy acc215 to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+33], acc219         // copy acc219 to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+34], acc223         // copy acc223 to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+35], acc227         // copy acc227 to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+36], acc231         // copy acc231 to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+37], acc235         // copy acc235 to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+38], acc239         // copy acc239 to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+39], acc243         // copy acc243 to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+40], acc247         // copy acc247 to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+41], acc251         // copy acc251 to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+42], acc255         // copy acc255 to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 28, 4), (0, 0, 28, 5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */
/* Note: the header above is the generator's stock section title; no multiply
 * instruction is emitted for this batch - the values are stored as copied. */

/* apply mask, calc new C and issue writes */
/*
 * Store phase: one dword per element. Data registers v15..v42 pair up with the
 * per-element offsets v43..v70 prepared earlier in this batch; an offset that
 * was replaced by BufferOOB targets the out-of-bounds value instead of a real
 * element. All stores go through the SrdD descriptor with `offen`
 * (VGPR-supplied offset), immediate offset 0 and the `nt` modifier.
 */
buffer_store_dword v15, v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=28, vc0=4)
buffer_store_dword v16, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=28, vc0=5)
buffer_store_dword v17, v45, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=28, vc0=6)
buffer_store_dword v18, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=28, vc0=7)
buffer_store_dword v19, v47, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=29, vc0=0)
buffer_store_dword v20, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=29, vc0=1)
buffer_store_dword v21, v49, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=29, vc0=2)
buffer_store_dword v22, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=29, vc0=3)
buffer_store_dword v23, v51, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=29, vc0=4)
buffer_store_dword v24, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=29, vc0=5)
buffer_store_dword v25, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=29, vc0=6)
buffer_store_dword v26, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=29, vc0=7)
buffer_store_dword v27, v55, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=30, vc0=0)
buffer_store_dword v28, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=30, vc0=1)
buffer_store_dword v29, v57, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=30, vc0=2)
buffer_store_dword v30, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=30, vc0=3)
buffer_store_dword v31, v59, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=30, vc0=4)
buffer_store_dword v32, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=30, vc0=5)
buffer_store_dword v33, v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=30, vc0=6)
buffer_store_dword v34, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=30, vc0=7)
buffer_store_dword v35, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=31, vc0=0)
buffer_store_dword v36, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=31, vc0=1)
buffer_store_dword v37, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=31, vc0=2)
buffer_store_dword v38, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=31, vc0=3)
buffer_store_dword v39, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=31, vc0=4)
buffer_store_dword v40, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=31, vc0=5)
buffer_store_dword v41, v69, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=31, vc0=6)
buffer_store_dword v42, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: element (vc1=31, vc0=7)
s_nop 0                                            // generator-emitted wait state after the store burst; its stock text cites dwordx4 stores, while the stores above are single-dword
/*
 * End of the global-write path.
 * The code directly above falls through into label_GW_End_1, so no explicit
 * s_branch to the label is emitted here (a branch to the very next
 * instruction only costs an issue slot). Other paths may still jump to the
 * label, which is why it is kept.
 *
 * From the label, control is transferred to label_KernelEnd with a
 * PC-relative long jump built from s_getpc_b64 / s_setpc_b64.
 * Clobbers: s[30:31], s32, SCC.
 */
label_GW_End_1:
s_getpc_b64 s[30:31]                               // s[30:31] = address of the next instruction (the s_add_i32 below)
s_add_i32 s32, label_KernelEnd, 4                  // s32 = branch offset to label_KernelEnd; the +4 presumably compensates for the literal sitting 4 bytes into this instruction - confirm against the assembler's fixup rules
s_add_u32 s30, s30, s32                            // low dword: PC.lo + offset, carry-out goes to SCC
s_addc_u32 s31, s31, 0                             // high dword: PC.hi + carry
s_setpc_b64 s[30:31]                               // jump to the computed address (label_KernelEnd)
/*
 * label_GSU_4: write-path dispatch.
 *   1. If Beta is non-zero, branch to label_GW_Beta_2.
 *   2. Otherwise compute the remainder of SizeI modulo 256 and keep it only
 *      when this workgroup's index along dimension 0 is >= NumWorkGroups0 - 1;
 *      a non-zero result branches to label_GW_B0_E1_M_1 (edge handling).
 *   3. Repeat the same test for SizeJ / WorkGroup1 / NumWorkGroups1, branching
 *      to label_GW_B0_E1_N_1.
 *   4. If neither test fires, execution falls through to the code that
 *      follows this block.
 * The constant 255 is the mask for a tile extent of 256 (the kernel name
 * carries MT256x256). Scratch: s30, s31, SCC.
 */
label_GSU_4:
s_cmpk_eq_u32 s[sgprBeta], 0                       // SCC = (Beta register bits == 0); integer compare. NOTE(review): a -0.0 bit pattern would count as non-zero - confirm this is intended
s_cbranch_scc0 label_GW_Beta_2                     // Beta != 0 -> take the Beta path

s_and_b32 s30, 255, s[sgprSizeI]                   // s30 = SizeI & 255 = SizeI % 256
s_add_u32 s31, -0x1, s[sgprNumWorkGroups0]         // s31 = NumWorkGroups0 - 1
s_cmp_ge_u32 s[sgprWorkGroup0], s31                // SCC = (wg0 >= nwg0 - 1)
s_cselect_b32 s30, s30, 0                          // rMT0 = SCC ? SizeI % 256 : 0
s_cmpk_gt_u32 s30, 0                               // SCC = (rMT0 > 0)
s_cbranch_scc1 label_GW_B0_E1_M_1                  // partial tile along dimension 0 -> edge path
s_and_b32 s30, 255, s[sgprSizeJ]                   // s30 = SizeJ & 255 = SizeJ % 256
s_add_u32 s31, -0x1, s[sgprNumWorkGroups1]         // s31 = NumWorkGroups1 - 1
s_cmp_ge_u32 s[sgprWorkGroup1], s31                // SCC = (wg1 >= nwg1 - 1)
s_cselect_b32 s30, s30, 0                          // rMT1 = SCC ? SizeJ % 256 : 0
s_cmpk_gt_u32 s30, 0                               // SCC = (rMT1 > 0)
s_cbranch_scc1 label_GW_B0_E1_N_1                  // partial tile along dimension 1 -> edge path
label_GW_B0_E0_2:

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=26 */
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
v_add_lshl_u32 v15, v7, v4, 0x1                    // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=4, coord0Vgpr=4
v_accvgpr_read_b32 v[vgprValuC+24], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+25], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+26], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+27], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+28], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+29], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+30], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+31], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+32], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+33], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+34], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+35], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+36], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+37], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+38], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+39], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+40], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+41], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+42], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+43], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+44], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+45], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+46], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+47], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+48], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+49], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+50], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+51], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+52], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+53], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+54], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+55], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+56], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+57], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+58], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+59], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+60], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+61], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+62], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+63], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+64], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+65], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+66], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+67], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+68], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+69], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+70], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+71], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+72], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+73], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+74], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+75], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+76], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+77], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+78], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+79], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+80], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+81], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+82], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+83], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+84], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+85], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+86], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+87], acc252         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+88], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+89], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+90], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+91], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+92], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+93], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+94], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+95], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+96], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+97], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+98], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+99], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+100], acc49         // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+101], acc53         // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+102], acc57         // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+103], acc61         // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+104], acc65         // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+105], acc69         // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+106], acc73         // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+107], acc77         // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+108], acc81         // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+109], acc85         // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+110], acc89         // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+111], acc93         // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+112], acc97         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+113], acc101        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+114], acc105        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+115], acc109        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+116], acc113        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+117], acc117        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+118], acc121        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+119], acc125        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+120], acc129        // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+121], acc133        // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+122], acc137        // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+123], acc141        // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+124], acc145        // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+125], acc149        // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+126], acc153        // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+127], acc157        // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+136], acc161        // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+137], acc165        // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+138], acc169        // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+139], acc173        // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+140], acc177        // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+141], acc181        // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+142], acc185        // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+143], acc189        // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+144], acc193        // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+145], acc197        // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+146], acc201        // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+147], acc205        // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+148], acc209        // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+149], acc213        // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+150], acc217        // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+151], acc221        // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+152], acc225        // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+153], acc229        // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+154], acc233        // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+155], acc237        // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+156], acc241        // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+157], acc245        // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+158], acc249        // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+159], acc253        // copy acc to vreg[127]
v_accvgpr_read_b32 v[vgprValuC+160], acc2          // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+161], acc6          // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+162], acc10         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+163], acc14         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+164], acc18         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+165], acc22         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+166], acc26         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+167], acc30         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+168], acc34         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+169], acc38         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+170], acc42         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+171], acc46         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+172], acc50         // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+173], acc54         // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+174], acc58         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+175], acc62         // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+176], acc66         // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+177], acc70         // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+178], acc74         // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+179], acc78         // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+180], acc82         // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+181], acc86         // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+182], acc90         // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+183], acc94         // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+184], acc98         // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+185], acc102        // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+186], acc106        // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+187], acc110        // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+188], acc114        // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+189], acc118        // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+190], acc122        // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+191], acc126        // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+192], acc130        // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+193], acc134        // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+194], acc138        // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+195], acc142        // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+196], acc146        // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+197], acc150        // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+198], acc154        // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+199], acc158        // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+200], acc162        // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+201], acc166        // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+202], acc170        // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+203], acc174        // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+204], acc178        // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+205], acc182        // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+206], acc186        // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+207], acc190        // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+208], acc194        // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+209], acc198        // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+210], acc202        // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+211], acc206        // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+212], acc210        // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+213], acc214        // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+214], acc218        // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+215], acc222        // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+216], acc226        // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+217], acc230        // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+218], acc234        // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+219], acc238        // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+220], acc242        // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+221], acc246        // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+222], acc250        // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+223], acc254        // copy acc to vreg[191]
v_accvgpr_read_b32 v[vgprValuC+224], acc3          // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+225], acc7          // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+226], acc11         // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+227], acc15         // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+228], acc19         // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+229], acc23         // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+230], acc27         // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+231], acc31         // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+232], acc35         // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+233], acc39         // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+234], acc43         // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+235], acc47         // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+236], acc51         // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+237], acc55         // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+238], acc59         // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+239], acc63         // copy acc to vreg[207]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0)] */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+176:vgprValuC+176+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+176:vgprValuC+176+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+178:vgprValuC+178+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+178:vgprValuC+178+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+180:vgprValuC+180+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+180:vgprValuC+180+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+182:vgprValuC+182+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+182:vgprValuC+182+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+184:vgprValuC+184+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+184:vgprValuC+184+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+186:vgprValuC+186+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+186:vgprValuC+186+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+188:vgprValuC+188+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+188:vgprValuC+188+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+190:vgprValuC+190+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+190:vgprValuC+190+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+192:vgprValuC+192+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+192:vgprValuC+192+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+194:vgprValuC+194+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+194:vgprValuC+194+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+196:vgprValuC+196+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+196:vgprValuC+196+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+198:vgprValuC+198+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+198:vgprValuC+198+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+200:vgprValuC+200+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+200:vgprValuC+200+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+202:vgprValuC+202+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+202:vgprValuC+202+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+204:vgprValuC+204+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+204:vgprValuC+204+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+206:vgprValuC+206+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+206:vgprValuC+206+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+208:vgprValuC+208+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+208:vgprValuC+208+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+210:vgprValuC+210+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+210:vgprValuC+210+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+212:vgprValuC+212+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+212:vgprValuC+212+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+214:vgprValuC+214+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+214:vgprValuC+214+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+216:vgprValuC+216+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+216:vgprValuC+216+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+218:vgprValuC+218+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+218:vgprValuC+218+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+220:vgprValuC+220+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+220:vgprValuC+220+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+222:vgprValuC+222+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+222:vgprValuC+222+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+224:vgprValuC+224+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+224:vgprValuC+224+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+226:vgprValuC+226+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+226:vgprValuC+226+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+228:vgprValuC+228+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+228:vgprValuC+228+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+230:vgprValuC+230+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+230:vgprValuC+230+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+232:vgprValuC+232+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+232:vgprValuC+232+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+234:vgprValuC+234+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+234:vgprValuC+234+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+236:vgprValuC+236+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+236:vgprValuC+236+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+238:vgprValuC+238+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+238:vgprValuC+238+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
v_mov_b32 v12, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v13, 0x7fff0000                          // fp32 Nan
v_mov_b32 v14, 0x7fff                              // rounding bias for bfloat16
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[80:83], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[96:99], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[104:107], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[112:115], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[120:123], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[136:139], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[144:147], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[152:155], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], v[vgprValuC+161] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[160:163], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[168:171], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v176, v[vgprValuC+176], v[vgprValuC+177] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v177, v[vgprValuC+178], v[vgprValuC+179] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v178, v[vgprValuC+180], v[vgprValuC+181] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v179, v[vgprValuC+182], v[vgprValuC+183] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[176:179], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v184, v[vgprValuC+184], v[vgprValuC+185] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v185, v[vgprValuC+186], v[vgprValuC+187] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v186, v[vgprValuC+188], v[vgprValuC+189] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v187, v[vgprValuC+190], v[vgprValuC+191] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[184:187], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v192, v[vgprValuC+192], v[vgprValuC+193] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v193, v[vgprValuC+194], v[vgprValuC+195] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v194, v[vgprValuC+196], v[vgprValuC+197] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v195, v[vgprValuC+198], v[vgprValuC+199] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[192:195], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v200, v[vgprValuC+200], v[vgprValuC+201] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v201, v[vgprValuC+202], v[vgprValuC+203] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v202, v[vgprValuC+204], v[vgprValuC+205] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v203, v[vgprValuC+206], v[vgprValuC+207] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[200:203], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v208, v[vgprValuC+208], v[vgprValuC+209] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v209, v[vgprValuC+210], v[vgprValuC+211] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v210, v[vgprValuC+212], v[vgprValuC+213] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v211, v[vgprValuC+214], v[vgprValuC+215] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[208:211], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v216, v[vgprValuC+216], v[vgprValuC+217] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v217, v[vgprValuC+218], v[vgprValuC+219] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v218, v[vgprValuC+220], v[vgprValuC+221] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v219, v[vgprValuC+222], v[vgprValuC+223] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[216:219], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v224, v[vgprValuC+224], v[vgprValuC+225] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v225, v[vgprValuC+226], v[vgprValuC+227] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v226, v[vgprValuC+228], v[vgprValuC+229] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v227, v[vgprValuC+230], v[vgprValuC+231] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[224:227], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v232, v[vgprValuC+232], v[vgprValuC+233] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v233, v[vgprValuC+234], v[vgprValuC+235] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v234, v[vgprValuC+236], v[vgprValuC+237] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v235, v[vgprValuC+238], v[vgprValuC+239] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[232:235], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */
/******************************************/
/* Summary: this batch handles the six elements listed above (vc1 = 26..31). */
/* Each element is tagged vw8, so 6 x 8 = 48 accumulator values are read below. */

/* calc coords, apply mask, and issue loads (if necessary) */
/* NOTE: no coordinate, mask or load instructions are emitted in this block; */
/* only the per-element tags below appear before the accumulator reads. */
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
/* Copy 48 accumulator registers into the contiguous range vgprValuC+24 .. vgprValuC+71. */
/* The source index starts at acc67 and advances by 4 per read (acc67, acc71, ..., acc255). */
/* NOTE(review): the vreg[208..255] numbers in the trailing comments do not match the */
/* destination offsets (+24..+71); presumably they are logical tile indices emitted by */
/* the generator rather than physical registers - confirm before relying on them. */
v_accvgpr_read_b32 v[vgprValuC+24], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+25], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+26], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+27], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+28], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+29], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+30], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+31], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+32], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+33], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+34], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+35], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+36], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+37], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+38], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+39], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+40], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+41], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+42], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+43], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+44], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+45], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+46], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+47], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+48], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+49], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+50], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+51], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+52], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+53], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+54], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+55], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+56], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+57], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+58], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+59], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+60], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+61], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+62], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+63], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+64], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+65], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+66], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+67], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+68], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+69], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+70], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+71], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */
/* Scale the 48 values held in vgprValuC+24 .. vgprValuC+71 by alpha, in place. */
/* Each v_pk_mul_f32 processes one adjacent register pair, so 24 instructions cover the range. */
/* Source 0 is the SGPR pair s[sgprAlpha:sgprAlpha+1]; source 1 and the destination are the same VGPR pair. */
/* NOTE(review): op_sel_hi:[0,1,1] clears the high-lane select for source 0 only; presumably this */
/* makes both lanes multiply by the low SGPR of the pair (s[sgprAlpha]) - confirm against the ISA reference. */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/* Purpose: for each of the six rows in this batch, pack eight fp32 values into four dwords */
/* with v_cvt_pk_bf16_f32, advance the D resource descriptor base by one row, and store them. */
/* Per-row pattern (repeated six times, for source groups +24, +32, +40, +48, +56, +64): */
/*   1. four v_cvt_pk_bf16_f32, each packing two adjacent fp32 registers into one dword */
/*   2. s12 = StrideD1J << 1 (the generator comment calls this the scale by BPE) */
/*   3. 64-bit add of s12 into s[sgprSrdD+0..1]: s_add_u32 on the low word, s_addc_u32 carries into the high word */
/*   4. buffer_store_dwordx4 of the four packed dwords, using v15 as the offen offset */
/* v15 is not written anywhere in this section, so every row uses the same offset and only */
/* the descriptor base moves between rows. */
/* NOTE(review): s12 is recomputed identically before every store and nothing else in this */
/* section writes it; the repetition looks like generator output - confirm before hoisting. */
/* NOTE(review): the destinations and store operands use absolute register numbers (v24..v27, */
/* v32..v35, ...) while the sources are vgprValuC-relative; the packed dwords only land in the */
/* first half of their own source group if vgprValuC is 0 - confirm against the symbol definition. */
/* NOTE(review): v12, v13 and v14 are written below but not read by any instruction in this */
/* section; presumably they are leftovers of a software bf16 rounding sequence that */
/* v_cvt_pk_bf16_f32 replaces - confirm they are dead at the branch target before removing. */
v_mov_b32 v12, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v13, 0x7fff0000                          // fp32 Nan
v_mov_b32 v14, 0x7fff                              // rounding bias for bfloat16
/* row 1 of 6: sources vgprValuC+24 .. vgprValuC+31 */
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row 2 of 6: sources vgprValuC+32 .. vgprValuC+39 */
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row 3 of 6: sources vgprValuC+40 .. vgprValuC+47 */
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row 4 of 6: sources vgprValuC+48 .. vgprValuC+55 */
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row 5 of 6: sources vgprValuC+56 .. vgprValuC+63 */
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row 6 of 6: sources vgprValuC+64 .. vgprValuC+71 */
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Hazard padding after the last store (see trailing comment), then leave this write path. */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
s_branch label_GW_End_2                            // jump to end
label_GW_B0_E1_N_1:

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=24 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */
/******************************************/

/* calc coords and apply mask (no memory loads are issued in this section) */
/*
 * For each of the 24 batch elements (vc1 = 0..23) this section produces one
 * per-lane offset VGPR:
 *     offset = (v7 + v4) << 1        when coord0 < SizeI and coord1 < SizeJ
 *     offset = BufferOOB             otherwise
 * Register roles, as used by the instructions below:
 *   v4        coord0 (compared against sgprSizeI; not modified here)
 *   v5        coord1 (compared against sgprSizeJ; +1 per element)
 *   v6        cinRowPtr  (+= sgprStrideC1J per element; not read in this section)
 *   v7        coutRowPtrD (+= sgprStrideD1J per element)
 *   v10       BufferOOB constant selected for out-of-range lanes
 *   s[30:31]  lane mask: coord0 in range
 *   s[34:35]  lane mask: coord1 in range, then (in0 && in1)
 *   vcc       carry-out of the coord1 increment (not consumed in this section)
 * Result offsets land in v15, v128-v131, v135 and v216-v233 (element order).
 * NOTE(review): the coord0 compare is repeated for every element although v4
 * and sgprSizeI are not changed in between; kept as emitted by the generator.
 */
v_mov_b32 v10, BufferOOB
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v15, v7, v4, 0x1                    // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v15, v10, v15, s[34:35]              // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v128, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v128, v10, v128, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v129, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v129, v10, v129, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v130, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v130, v10, v130, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v131, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v131, v10, v131, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v135, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v135, v10, v135, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v216, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v216, v10, v216, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v217, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v217, v10, v217, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v218, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v218, v10, v218, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v219, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v219, v10, v219, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v220, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v220, v10, v220, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v221, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v221, v10, v221, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v222, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v222, v10, v222, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v223, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v223, v10, v223, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v224, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v224, v10, v224, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v225, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v225, v10, v225, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v226, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v226, v10, v226, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v227, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v227, v10, v227, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v228, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v228, v10, v228, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v229, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v229, v10, v229, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v230, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v230, v10, v230, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v231, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v231, v10, v231, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v232, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v232, v10, v232, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v233, v7, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 -> offset for D
v_cndmask_b32 v233, v10, v233, s[34:35]            // keep offset for in-range lanes, else BufferOOB
/*
 * Copy 192 accumulator registers into vgprValuC slots (vreg[0]..vreg[191]).
 * The source index advances by 4 per read and is walked in three passes:
 *   vreg[0..63]    <- acc0, acc4, ..., acc252
 *   vreg[64..127]  <- acc1, acc5, ..., acc253
 *   vreg[128..191] <- acc2, acc6, ..., acc254
 * Destinations are vgprValuC+16 .. vgprValuC+215, except that slots
 * vgprValuC+128 .. vgprValuC+135 are skipped (the sequence jumps from +127
 * to +136).
 * NOTE(review): presumably the gap exists because v128-v131 and v135 already
 * hold element offsets written earlier in this batch; that only lines up if
 * vgprValuC == 0 -- confirm against the symbol definition.
 * NOTE(review): 192 values = 24 batch elements x 8 (vw8 per the batch header),
 * so each run of 8 consecutive reads presumably feeds one element -- confirm.
 */
/* pass 1: acc0, acc4, ..., acc252 */
v_accvgpr_read_b32 v[vgprValuC+16], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+17], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+18], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+19], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+20], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+21], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+22], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+23], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+24], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+25], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+26], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+27], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+28], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+29], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+30], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+31], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+32], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+33], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+34], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+35], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+36], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+37], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+38], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+39], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+40], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+41], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+42], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+43], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+44], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+45], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+46], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+47], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+48], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+49], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+50], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+51], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+52], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+53], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+54], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+55], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+56], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+57], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+58], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+59], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+60], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+61], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+62], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+63], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+64], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+65], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+66], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+67], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+68], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+69], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+70], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+71], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+72], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+73], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+74], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+75], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+76], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+77], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+78], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+79], acc252         // copy acc to vreg[63]
/* pass 2: acc1, acc5, ..., acc253 */
v_accvgpr_read_b32 v[vgprValuC+80], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+81], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+82], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+83], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+84], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+85], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+86], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+87], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+88], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+89], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+90], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+91], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+92], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+93], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+94], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+95], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+96], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+97], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+98], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+99], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+100], acc81         // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+101], acc85         // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+102], acc89         // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+103], acc93         // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+104], acc97         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+105], acc101        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+106], acc105        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+107], acc109        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+108], acc113        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+109], acc117        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+110], acc121        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+111], acc125        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+112], acc129        // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+113], acc133        // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+114], acc137        // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+115], acc141        // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+116], acc145        // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+117], acc149        // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+118], acc153        // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+119], acc157        // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+120], acc161        // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+121], acc165        // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+122], acc169        // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+123], acc173        // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+124], acc177        // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+125], acc181        // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+126], acc185        // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+127], acc189        // copy acc to vreg[111]
/* destination slots +128..+135 are skipped here; numbering resumes at +136 */
v_accvgpr_read_b32 v[vgprValuC+136], acc193        // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+137], acc197        // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+138], acc201        // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+139], acc205        // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+140], acc209        // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+141], acc213        // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+142], acc217        // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+143], acc221        // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+144], acc225        // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+145], acc229        // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+146], acc233        // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+147], acc237        // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+148], acc241        // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+149], acc245        // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+150], acc249        // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+151], acc253        // copy acc to vreg[127]
/* pass 3: acc2, acc6, ..., acc254 */
v_accvgpr_read_b32 v[vgprValuC+152], acc2          // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+153], acc6          // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+154], acc10         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+155], acc14         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+156], acc18         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+157], acc22         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+158], acc26         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+159], acc30         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+160], acc34         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+161], acc38         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+162], acc42         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+163], acc46         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+164], acc50         // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+165], acc54         // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+166], acc58         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+167], acc62         // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+168], acc66         // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+169], acc70         // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+170], acc74         // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+171], acc78         // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+172], acc82         // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+173], acc86         // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+174], acc90         // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+175], acc94         // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+176], acc98         // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+177], acc102        // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+178], acc106        // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+179], acc110        // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+180], acc114        // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+181], acc118        // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+182], acc122        // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+183], acc126        // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+184], acc130        // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+185], acc134        // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+186], acc138        // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+187], acc142        // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+188], acc146        // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+189], acc150        // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+190], acc154        // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+191], acc158        // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+192], acc162        // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+193], acc166        // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+194], acc170        // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+195], acc174        // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+196], acc178        // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+197], acc182        // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+198], acc186        // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+199], acc190        // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+200], acc194        // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+201], acc198        // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+202], acc202        // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+203], acc206        // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+204], acc210        // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+205], acc214        // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+206], acc218        // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+207], acc222        // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+208], acc226        // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+209], acc230        // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+210], acc234        // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+211], acc238        // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+212], acc242        // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+213], acc246        // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+214], acc250        // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+215], acc254        // copy acc to vreg[191]

/* rC *= alpha for batchElements (0,0,0,0) .. (0,0,23,0): 24 elements, 8 fp32 values each */
/* Every v_pk_mul_f32 scales one adjacent VGPR pair of ValuC in place.                  */
/* op_sel_hi:[0,1,1] makes both packed lanes take the low dword of the alpha SGPR pair  */
/* while the VGPR operand supplies its low and high dwords to lanes 0 and 1.            */
/* The .irp loops expand to one instruction per listed pair offset, in list order.      */

/* ValuC pairs at offsets 16..126 */
.irp off, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126
v_pk_mul_f32 v[vgprValuC+\off:vgprValuC+\off+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+\off:vgprValuC+\off+1] op_sel_hi:[0,1,1] // pair *= alpha
.endr

/* ValuC pairs at offsets 136..214 (offsets 128..135 are not scaled in this batch) */
.irp off, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206, 208, 210, 212, 214
v_pk_mul_f32 v[vgprValuC+\off:vgprValuC+\off+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+\off:vgprValuC+\off+1] op_sel_hi:[0,1,1] // pair *= alpha
.endr

/* apply mask, calc new C and issue writes */
/* Layout of this section: 24 groups, each made of four v_cvt_pk_bf16_f32 followed by   */
/* one buffer_store_dwordx4. A group packs 8 consecutive fp32 values of ValuC into      */
/* 4 dwords (two bf16 per dword) and stores those 16 bytes to D through sgprSrdD.       */
/* The per-store byte offset comes from a dedicated VGPR (v15, v128-v131, v135,         */
/* v216-v233) that is set up before this section.                                       */
/* Destination registers are written as literal vN while sources use vgprValuC+N with   */
/* the same N; NOTE(review): this is an in-place conversion only if vgprValuC == 0 --   */
/* confirm against the .set definitions earlier in the file.                            */
/* v12, v13 and v14 are loaded with constants below but no instruction in this section  */
/* reads them. NOTE(review): they look like leftovers from a software bf16 rounding     */
/* path -- confirm they are unused further down before removing.                        */
v_mov_b32 v12, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v13, 0x7fff0000                          // fp32 Nan
v_mov_b32 v14, 0x7fff                              // rounding bias for bfloat16
/* group 0: ValuC+16..23 -> v[16:19], offset v15 */
v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 1: ValuC+24..31 -> v[24:27], offset v128 */
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 2: ValuC+32..39 -> v[32:35], offset v129 */
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[32:35], v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 3: ValuC+40..47 -> v[40:43], offset v130 */
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[40:43], v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 4: ValuC+48..55 -> v[48:51], offset v131 */
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[48:51], v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 5: ValuC+56..63 -> v[56:59], offset v135 */
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[56:59], v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 6: ValuC+64..71 -> v[64:67], offset v216 */
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[64:67], v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 7: ValuC+72..79 -> v[72:75], offset v217 */
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[72:75], v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 8: ValuC+80..87 -> v[80:83], offset v218 */
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[80:83], v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 9: ValuC+88..95 -> v[88:91], offset v219 */
v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[88:91], v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 10: ValuC+96..103 -> v[96:99], offset v220 */
v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+97] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v97, v[vgprValuC+98], v[vgprValuC+99] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v98, v[vgprValuC+100], v[vgprValuC+101] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v99, v[vgprValuC+102], v[vgprValuC+103] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[96:99], v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 11: ValuC+104..111 -> v[104:107], offset v221 */
v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+105] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v105, v[vgprValuC+106], v[vgprValuC+107] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v106, v[vgprValuC+108], v[vgprValuC+109] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v107, v[vgprValuC+110], v[vgprValuC+111] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[104:107], v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 12: ValuC+112..119 -> v[112:115], offset v222 */
v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+113] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v113, v[vgprValuC+114], v[vgprValuC+115] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v114, v[vgprValuC+116], v[vgprValuC+117] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v115, v[vgprValuC+118], v[vgprValuC+119] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[112:115], v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 13: ValuC+120..127 -> v[120:123], offset v223 */
v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+121] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v121, v[vgprValuC+122], v[vgprValuC+123] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v122, v[vgprValuC+124], v[vgprValuC+125] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v123, v[vgprValuC+126], v[vgprValuC+127] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[120:123], v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 14: ValuC+136..143 -> v[136:139], offset v224 (value offsets jump from 127 to 136) */
v_cvt_pk_bf16_f32 v136, v[vgprValuC+136], v[vgprValuC+137] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v137, v[vgprValuC+138], v[vgprValuC+139] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v138, v[vgprValuC+140], v[vgprValuC+141] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v139, v[vgprValuC+142], v[vgprValuC+143] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[136:139], v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 15: ValuC+144..151 -> v[144:147], offset v225 */
v_cvt_pk_bf16_f32 v144, v[vgprValuC+144], v[vgprValuC+145] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v145, v[vgprValuC+146], v[vgprValuC+147] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v146, v[vgprValuC+148], v[vgprValuC+149] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v147, v[vgprValuC+150], v[vgprValuC+151] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[144:147], v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 16: ValuC+152..159 -> v[152:155], offset v226 */
v_cvt_pk_bf16_f32 v152, v[vgprValuC+152], v[vgprValuC+153] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v153, v[vgprValuC+154], v[vgprValuC+155] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v154, v[vgprValuC+156], v[vgprValuC+157] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v155, v[vgprValuC+158], v[vgprValuC+159] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[152:155], v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 17: ValuC+160..167 -> v[160:163], offset v227 */
v_cvt_pk_bf16_f32 v160, v[vgprValuC+160], v[vgprValuC+161] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v161, v[vgprValuC+162], v[vgprValuC+163] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v162, v[vgprValuC+164], v[vgprValuC+165] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v163, v[vgprValuC+166], v[vgprValuC+167] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[160:163], v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 18: ValuC+168..175 -> v[168:171], offset v228 */
v_cvt_pk_bf16_f32 v168, v[vgprValuC+168], v[vgprValuC+169] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v169, v[vgprValuC+170], v[vgprValuC+171] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v170, v[vgprValuC+172], v[vgprValuC+173] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v171, v[vgprValuC+174], v[vgprValuC+175] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[168:171], v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 19: ValuC+176..183 -> v[176:179], offset v229 */
v_cvt_pk_bf16_f32 v176, v[vgprValuC+176], v[vgprValuC+177] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v177, v[vgprValuC+178], v[vgprValuC+179] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v178, v[vgprValuC+180], v[vgprValuC+181] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v179, v[vgprValuC+182], v[vgprValuC+183] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[176:179], v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 20: ValuC+184..191 -> v[184:187], offset v230 */
v_cvt_pk_bf16_f32 v184, v[vgprValuC+184], v[vgprValuC+185] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v185, v[vgprValuC+186], v[vgprValuC+187] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v186, v[vgprValuC+188], v[vgprValuC+189] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v187, v[vgprValuC+190], v[vgprValuC+191] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[184:187], v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 21: ValuC+192..199 -> v[192:195], offset v231 */
v_cvt_pk_bf16_f32 v192, v[vgprValuC+192], v[vgprValuC+193] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v193, v[vgprValuC+194], v[vgprValuC+195] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v194, v[vgprValuC+196], v[vgprValuC+197] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v195, v[vgprValuC+198], v[vgprValuC+199] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[192:195], v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 22: ValuC+200..207 -> v[200:203], offset v232 */
v_cvt_pk_bf16_f32 v200, v[vgprValuC+200], v[vgprValuC+201] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v201, v[vgprValuC+202], v[vgprValuC+203] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v202, v[vgprValuC+204], v[vgprValuC+205] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v203, v[vgprValuC+206], v[vgprValuC+207] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[200:203], v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* group 23: ValuC+208..215 -> v[208:211], offset v233 */
v_cvt_pk_bf16_f32 v208, v[vgprValuC+208], v[vgprValuC+209] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v209, v[vgprValuC+210], v[vgprValuC+211] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v210, v[vgprValuC+212], v[vgprValuC+213] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v211, v[vgprValuC+214], v[vgprValuC+215] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[208:211], v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Keep this s_nop directly after the last store: the code that follows overwrites VGPRs */
/* (e.g. v15, v80-v83) that the stores above use as address or data operands.            */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Each element below repeats the same eight-instruction pattern:                        */
/*   1. v5 (coord1) += 1; the carry written to vcc is not read in the visible code       */
/*   2. v6 += StrideC1J and v7 += StrideD1J (row pointers advance one row);              */
/*      v6 is updated but not read by the address math shown here                        */
/*   3. s[34:35] = (v4 < SizeI) && (v5 < SizeJ), the per-lane in-bounds mask             */
/*   4. offset = (v7 + v4) << 1, i.e. scaled by 2 bytes per element                      */
/*   5. lanes outside the mask get v10 (BufferOOB) instead of the computed offset        */
/* The resulting per-element offsets land in v15, v80, v81, v82, v83, v84, v85.          */
v_mov_b32 v10, BufferOOB                           // v10 = replacement offset for out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v15, v7, v4, 0x1                    // scaleToBpe: (D row ptr v7 + coord0 v4) << 1 -> byte offset
v_cndmask_b32 v15, v10, v15, s[34:35]              // keep offset where in-bounds, else BufferOOB (v10)
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v80, v7, v4, 0x1                    // scaleToBpe: (D row ptr v7 + coord0 v4) << 1 -> byte offset
v_cndmask_b32 v80, v10, v80, s[34:35]              // keep offset where in-bounds, else BufferOOB (v10)
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v81, v7, v4, 0x1                    // scaleToBpe: (D row ptr v7 + coord0 v4) << 1 -> byte offset
v_cndmask_b32 v81, v10, v81, s[34:35]              // keep offset where in-bounds, else BufferOOB (v10)
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v82, v7, v4, 0x1                    // scaleToBpe: (D row ptr v7 + coord0 v4) << 1 -> byte offset
v_cndmask_b32 v82, v10, v82, s[34:35]              // keep offset where in-bounds, else BufferOOB (v10)
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v83, v7, v4, 0x1                    // scaleToBpe: (D row ptr v7 + coord0 v4) << 1 -> byte offset
v_cndmask_b32 v83, v10, v83, s[34:35]              // keep offset where in-bounds, else BufferOOB (v10)
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v84, v7, v4, 0x1                    // scaleToBpe: (D row ptr v7 + coord0 v4) << 1 -> byte offset
v_cndmask_b32 v84, v10, v84, s[34:35]              // keep offset where in-bounds, else BufferOOB (v10)
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v85, v7, v4, 0x1                    // scaleToBpe: (D row ptr v7 + coord0 v4) << 1 -> byte offset
v_cndmask_b32 v85, v10, v85, s[34:35]              // keep offset where in-bounds, else BufferOOB (v10)
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v86, v7, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v86, v10, v86, s[34:35]              // LDD clip if OOB. offset
/*
 * Copy 64 accumulator registers into v[vgprValuC+16 .. vgprValuC+79].
 * The sources are acc3, acc7, ..., acc255: every 4th accumulator, starting at 3.
 * These 64 fp32 values are the ones the following alpha multiply, bf16 packing
 * and eight dwordx4 stores operate on (8 values per store).
 * NOTE(review): the "vreg[N]" numbers in the trailing comments run up to 255 while
 * the kernel header declares 248 VGPRs, so they are presumably generator-side
 * element indices rather than VGPR numbers -- TODO confirm.
 */
v_accvgpr_read_b32 v[vgprValuC+16], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+17], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+18], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+19], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+20], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+21], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+22], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+23], acc31          // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+24], acc35          // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+25], acc39          // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+26], acc43          // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+27], acc47          // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+28], acc51          // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+29], acc55          // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+30], acc59          // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+31], acc63          // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+32], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+33], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+34], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+35], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+36], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+37], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+38], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+39], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+40], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+41], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+42], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+43], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+44], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+45], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+46], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+47], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+48], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+49], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+50], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+51], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+52], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+53], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+54], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+55], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+56], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+57], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+58], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+59], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+60], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+61], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+62], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+63], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+64], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+65], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+66], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+67], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+68], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+69], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+70], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+71], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+72], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+73], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+74], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+75], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+76], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+77], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+78], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+79], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */
/*
 * Scale the 64 fp32 results in v[vgprValuC+16 .. vgprValuC+79] by alpha, in place.
 * Each v_pk_mul_f32 handles one pair of consecutive VGPRs, so 32 instructions
 * cover the whole range (8 rows x 8 values).
 * op_sel_hi:[0,1,1]: for source 0 (the alpha SGPR pair) the high lane is told to
 * read the low dword, so both lanes are multiplied by s[sgprAlpha]; the second
 * SGPR is named only to form the 64-bit operand. (Per the packed-math op_sel_hi
 * encoding in the ISA guide -- verify against the gfx950 documentation.)
 */
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/*
 * Convert the scaled fp32 results to bf16 and write the 8 rows of this batch.
 * Per row: four v_cvt_pk_bf16_f32 pack 8 fp32 values into 4 dwords, then one
 * buffer_store_dwordx4 writes those 4 dwords through SrdD at the per-lane offset
 * held in v15 (first row, computed before this excerpt) or v80..v86 (rows 25..31).
 * Lanes whose offset was replaced by v10 during offset setup are presumably
 * rejected by the buffer bounds check -- TODO confirm against the SRD setup.
 * The pack destinations are literal registers (v16.., v24.., ...) while the sources
 * are vgprValuC-relative. If vgprValuC is 0 the packing is in place, and the order
 * inside each group matters: each cvt overwrites a register that an earlier cvt of
 * the same group has already consumed -- TODO confirm the value of vgprValuC.
 * NOTE(review): v12, v13 and v14 are loaded below but not read before the s_branch
 * on this path; presumably left over from a software bf16 rounding sequence.
 */
v_mov_b32 v12, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v13, 0x7fff0000                          // fp32 Nan
v_mov_b32 v14, 0x7fff                              // rounding bias for bfloat16
v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+17] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v17, v[vgprValuC+18], v[vgprValuC+19] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v18, v[vgprValuC+20], v[vgprValuC+21] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v19, v[vgprValuC+22], v[vgprValuC+23] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[16:19], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[32:35], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[40:43], v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[48:51], v83, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[56:59], v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[64:67], v85, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[72:75], v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* This write path is finished: skip the edge-path code that follows and continue at label_GW_End_2. */
s_branch label_GW_End_2                            // jump to end
label_GW_B0_E1_M_1:

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=114 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1); (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Edge batch #0, row vc1=0: build one clipped offset VGPR per element, vc0 = 0..7.
 * v10 is loaded with BufferOOB first; it is the value substituted for any lane
 * whose element falls outside the problem.
 * Per element:
 *   - vc0 == 0 uses v4 directly; for vc0 > 0 the column is formed in scratch
 *     v8 = v4 + vc0 (the carry written to vcc is not consumed here).
 *   - s[34:35] = (column < SizeI) && (v5 < SizeJ), unsigned compares.
 *   - offset = (v7 + column) << 1, kept only for lanes whose mask bit is set.
 * Results land in v129, v130, v131, v135, v136, v137, v138, v139
 * (v132..v134 are not written by this sequence).
 * NOTE(review): the generated comments say "Cin addr", but the base is v7, which the
 * row-advance code below steps by StrideD1J ("coutRowPtrD"). No loads are issued in
 * the visible part of this batch despite the section title.
 */
v_mov_b32 v10, BufferOOB
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v129, v7, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v129, v10, v129, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v130, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v130, v10, v130, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v131, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v131, v10, v131, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v135, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v135, v10, v135, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v136, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v136, v10, v136, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v137, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v137, v10, v137, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v138, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v138, v10, v138, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,0,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v139, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v139, v10, v139, s[34:35]            // LDD clip if OOB. offset
/*
 * Edge batch #0, row vc1=1: step to the next row, then build the clipped offsets
 * for vc0 = 0..7 into v140..v147.
 * Row step: v5 (coord1) += 1, v6 += StrideC1J, v7 += StrideD1J. v6 is advanced but
 * not read anywhere in the visible range.
 * Per element: column = v4 (vc0 == 0) or v8 = v4 + vc0; the in-bounds mask is
 * (column < SizeI) && (v5 < SizeJ); offset = (v7 + column) << 1 for in-bounds
 * lanes, v10 (loaded with BufferOOB at the top of this edge batch) otherwise.
 */
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v140, v7, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v140, v10, v140, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v141, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v141, v10, v141, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v142, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v142, v10, v142, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v143, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v143, v10, v143, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v144, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v144, v10, v144, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v145, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v145, v10, v145, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v146, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v146, v10, v146, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v147, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v147, v10, v147, s[34:35]            // LDD clip if OOB. offset
/*
 * Edge batch #0, row vc1=2: step to the next row, then build the clipped offsets
 * for vc0 = 0..7 into v148..v155.
 * Row step: v5 (coord1) += 1, v6 += StrideC1J, v7 += StrideD1J. v6 is advanced but
 * not read anywhere in the visible range.
 * Per element: column = v4 (vc0 == 0) or v8 = v4 + vc0; the in-bounds mask is
 * (column < SizeI) && (v5 < SizeJ); offset = (v7 + column) << 1 for in-bounds
 * lanes, v10 (loaded with BufferOOB at the top of this edge batch) otherwise.
 */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v148, v7, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v148, v10, v148, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v149, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v149, v10, v149, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v150, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v150, v10, v150, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v151, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v151, v10, v151, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v152, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v152, v10, v152, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v153, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v153, v10, v153, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v154, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v154, v10, v154, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v155, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v155, v10, v155, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
/* Row vc1=3: step coord1 (v5) and both row pointers (v6, v7) to the next row, then    */
/* fill v156..v163 with one offset per vc0=0..7: (v7 + v4 + vc0) << 1 where            */
/* (v4 + vc0) < SizeI && v5 < SizeJ, and v10 otherwise.                                */
/* NOTE(review): v10 is written outside this chunk; presumably the OOB offset - confirm. */
/* NOTE(review): the coord1 compare is row-invariant but is re-issued per element      */
/* because s[34:35] is overwritten by each s_and_b64.                                  */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to row vc1=3; vcc carry-out is not read in this row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v6 is not read in this row)
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v156, v7, v4, 0x1                   // v156 = (coutRowPtrD v7 + coord0 v4) << 1: scaleToBpe
v_cndmask_b32 v156, v10, v156, s[34:35]            // LDD clip: keep v156 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,3,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1 for vc0=1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v157, v7, v8, 0x1                   // v157 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v157, v10, v157, s[34:35]            // LDD clip: keep v157 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,3,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2 for vc0=2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v158, v7, v8, 0x1                   // v158 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v158, v10, v158, s[34:35]            // LDD clip: keep v158 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,3,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3 for vc0=3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v159, v7, v8, 0x1                   // v159 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v159, v10, v159, s[34:35]            // LDD clip: keep v159 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,3,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4 for vc0=4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v160, v7, v8, 0x1                   // v160 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v160, v10, v160, s[34:35]            // LDD clip: keep v160 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,3,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5 for vc0=5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v161, v7, v8, 0x1                   // v161 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v161, v10, v161, s[34:35]            // LDD clip: keep v161 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,3,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6 for vc0=6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v162, v7, v8, 0x1                   // v162 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v162, v10, v162, s[34:35]            // LDD clip: keep v162 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,3,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7 for vc0=7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v163, v7, v8, 0x1                   // v163 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v163, v10, v163, s[34:35]            // LDD clip: keep v163 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
/* Row vc1=4: step coord1 (v5) and both row pointers (v6, v7) to the next row, then    */
/* fill v164..v171 with one offset per vc0=0..7: (v7 + v4 + vc0) << 1 where            */
/* (v4 + vc0) < SizeI && v5 < SizeJ, and v10 otherwise.                                */
/* NOTE(review): v10 is written outside this chunk; presumably the OOB offset - confirm. */
/* NOTE(review): the coord1 compare is row-invariant but is re-issued per element      */
/* because s[34:35] is overwritten by each s_and_b64.                                  */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to row vc1=4; vcc carry-out is not read in this row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v6 is not read in this row)
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v164, v7, v4, 0x1                   // v164 = (coutRowPtrD v7 + coord0 v4) << 1: scaleToBpe
v_cndmask_b32 v164, v10, v164, s[34:35]            // LDD clip: keep v164 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,4,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1 for vc0=1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v165, v7, v8, 0x1                   // v165 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v165, v10, v165, s[34:35]            // LDD clip: keep v165 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,4,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2 for vc0=2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v166, v7, v8, 0x1                   // v166 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v166, v10, v166, s[34:35]            // LDD clip: keep v166 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,4,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3 for vc0=3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v167, v7, v8, 0x1                   // v167 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v167, v10, v167, s[34:35]            // LDD clip: keep v167 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,4,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4 for vc0=4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v168, v7, v8, 0x1                   // v168 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v168, v10, v168, s[34:35]            // LDD clip: keep v168 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,4,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5 for vc0=5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v169, v7, v8, 0x1                   // v169 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v169, v10, v169, s[34:35]            // LDD clip: keep v169 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,4,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6 for vc0=6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v170, v7, v8, 0x1                   // v170 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v170, v10, v170, s[34:35]            // LDD clip: keep v170 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,4,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7 for vc0=7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v171, v7, v8, 0x1                   // v171 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v171, v10, v171, s[34:35]            // LDD clip: keep v171 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
/* Row vc1=5: step coord1 (v5) and both row pointers (v6, v7) to the next row, then    */
/* fill v172..v179 with one offset per vc0=0..7: (v7 + v4 + vc0) << 1 where            */
/* (v4 + vc0) < SizeI && v5 < SizeJ, and v10 otherwise.                                */
/* NOTE(review): v10 is written outside this chunk; presumably the OOB offset - confirm. */
/* NOTE(review): the coord1 compare is row-invariant but is re-issued per element      */
/* because s[34:35] is overwritten by each s_and_b64.                                  */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to row vc1=5; vcc carry-out is not read in this row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v6 is not read in this row)
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v172, v7, v4, 0x1                   // v172 = (coutRowPtrD v7 + coord0 v4) << 1: scaleToBpe
v_cndmask_b32 v172, v10, v172, s[34:35]            // LDD clip: keep v172 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,5,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1 for vc0=1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v173, v7, v8, 0x1                   // v173 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v173, v10, v173, s[34:35]            // LDD clip: keep v173 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,5,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2 for vc0=2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v174, v7, v8, 0x1                   // v174 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v174, v10, v174, s[34:35]            // LDD clip: keep v174 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,5,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3 for vc0=3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v175, v7, v8, 0x1                   // v175 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v175, v10, v175, s[34:35]            // LDD clip: keep v175 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,5,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4 for vc0=4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v176, v7, v8, 0x1                   // v176 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v176, v10, v176, s[34:35]            // LDD clip: keep v176 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,5,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5 for vc0=5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v177, v7, v8, 0x1                   // v177 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v177, v10, v177, s[34:35]            // LDD clip: keep v177 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,5,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6 for vc0=6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v178, v7, v8, 0x1                   // v178 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v178, v10, v178, s[34:35]            // LDD clip: keep v178 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,5,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7 for vc0=7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v179, v7, v8, 0x1                   // v179 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v179, v10, v179, s[34:35]            // LDD clip: keep v179 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
/* Row vc1=6: step coord1 (v5) and both row pointers (v6, v7) to the next row, then    */
/* fill v180..v187 with one offset per vc0=0..7: (v7 + v4 + vc0) << 1 where            */
/* (v4 + vc0) < SizeI && v5 < SizeJ, and v10 otherwise.                                */
/* NOTE(review): v10 is written outside this chunk; presumably the OOB offset - confirm. */
/* NOTE(review): the coord1 compare is row-invariant but is re-issued per element      */
/* because s[34:35] is overwritten by each s_and_b64.                                  */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to row vc1=6; vcc carry-out is not read in this row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v6 is not read in this row)
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v180, v7, v4, 0x1                   // v180 = (coutRowPtrD v7 + coord0 v4) << 1: scaleToBpe
v_cndmask_b32 v180, v10, v180, s[34:35]            // LDD clip: keep v180 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,6,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1 for vc0=1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v181, v7, v8, 0x1                   // v181 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v181, v10, v181, s[34:35]            // LDD clip: keep v181 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,6,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2 for vc0=2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v182, v7, v8, 0x1                   // v182 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v182, v10, v182, s[34:35]            // LDD clip: keep v182 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,6,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3 for vc0=3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v183, v7, v8, 0x1                   // v183 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v183, v10, v183, s[34:35]            // LDD clip: keep v183 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,6,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4 for vc0=4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v184, v7, v8, 0x1                   // v184 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v184, v10, v184, s[34:35]            // LDD clip: keep v184 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,6,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5 for vc0=5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v185, v7, v8, 0x1                   // v185 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v185, v10, v185, s[34:35]            // LDD clip: keep v185 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,6,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6 for vc0=6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v186, v7, v8, 0x1                   // v186 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v186, v10, v186, s[34:35]            // LDD clip: keep v186 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,6,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7 for vc0=7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v187, v7, v8, 0x1                   // v187 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v187, v10, v187, s[34:35]            // LDD clip: keep v187 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
/* Row vc1=7: step coord1 (v5) and both row pointers (v6, v7) to the next row, then    */
/* fill v188..v195 with one offset per vc0=0..7: (v7 + v4 + vc0) << 1 where            */
/* (v4 + vc0) < SizeI && v5 < SizeJ, and v10 otherwise.                                */
/* NOTE(review): v10 is written outside this chunk; presumably the OOB offset - confirm. */
/* NOTE(review): the coord1 compare is row-invariant but is re-issued per element      */
/* because s[34:35] is overwritten by each s_and_b64.                                  */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to row vc1=7; vcc carry-out is not read in this row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v6 is not read in this row)
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v188, v7, v4, 0x1                   // v188 = (coutRowPtrD v7 + coord0 v4) << 1: scaleToBpe
v_cndmask_b32 v188, v10, v188, s[34:35]            // LDD clip: keep v188 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,7,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1 for vc0=1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v189, v7, v8, 0x1                   // v189 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v189, v10, v189, s[34:35]            // LDD clip: keep v189 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,7,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2 for vc0=2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v190, v7, v8, 0x1                   // v190 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v190, v10, v190, s[34:35]            // LDD clip: keep v190 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,7,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3 for vc0=3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v191, v7, v8, 0x1                   // v191 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v191, v10, v191, s[34:35]            // LDD clip: keep v191 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,7,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4 for vc0=4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v192, v7, v8, 0x1                   // v192 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v192, v10, v192, s[34:35]            // LDD clip: keep v192 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,7,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5 for vc0=5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v193, v7, v8, 0x1                   // v193 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v193, v10, v193, s[34:35]            // LDD clip: keep v193 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,7,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6 for vc0=6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v194, v7, v8, 0x1                   // v194 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v194, v10, v194, s[34:35]            // LDD clip: keep v194 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,7,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7 for vc0=7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v195, v7, v8, 0x1                   // v195 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v195, v10, v195, s[34:35]            // LDD clip: keep v195 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
/* Row vc1=8: step coord1 (v5) and both row pointers (v6, v7) to the next row, then    */
/* fill v196..v203 with one offset per vc0=0..7: (v7 + v4 + vc0) << 1 where            */
/* (v4 + vc0) < SizeI && v5 < SizeJ, and v10 otherwise.                                */
/* NOTE(review): v10 is written outside this chunk; presumably the OOB offset - confirm. */
/* NOTE(review): the coord1 compare is row-invariant but is re-issued per element      */
/* because s[34:35] is overwritten by each s_and_b64.                                  */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to row vc1=8; vcc carry-out is not read in this row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v6 is not read in this row)
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v196, v7, v4, 0x1                   // v196 = (coutRowPtrD v7 + coord0 v4) << 1: scaleToBpe
v_cndmask_b32 v196, v10, v196, s[34:35]            // LDD clip: keep v196 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,8,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1 for vc0=1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v197, v7, v8, 0x1                   // v197 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v197, v10, v197, s[34:35]            // LDD clip: keep v197 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,8,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2 for vc0=2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v198, v7, v8, 0x1                   // v198 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v198, v10, v198, s[34:35]            // LDD clip: keep v198 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,8,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3 for vc0=3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v199, v7, v8, 0x1                   // v199 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v199, v10, v199, s[34:35]            // LDD clip: keep v199 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,8,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4 for vc0=4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v200, v7, v8, 0x1                   // v200 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v200, v10, v200, s[34:35]            // LDD clip: keep v200 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,8,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5 for vc0=5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v201, v7, v8, 0x1                   // v201 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v201, v10, v201, s[34:35]            // LDD clip: keep v201 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,8,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6 for vc0=6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v202, v7, v8, 0x1                   // v202 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v202, v10, v202, s[34:35]            // LDD clip: keep v202 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,8,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7 for vc0=7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v203, v7, v8, 0x1                   // v203 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v203, v10, v203, s[34:35]            // LDD clip: keep v203 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
/* Row vc1=9: step coord1 (v5) and both row pointers (v6, v7) to the next row, then    */
/* fill v204..v211 with one offset per vc0=0..7: (v7 + v4 + vc0) << 1 where            */
/* (v4 + vc0) < SizeI && v5 < SizeJ, and v10 otherwise.                                */
/* NOTE(review): v10 is written outside this chunk; presumably the OOB offset - confirm. */
/* NOTE(review): the coord1 compare is row-invariant but is re-issued per element      */
/* because s[34:35] is overwritten by each s_and_b64.                                  */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to row vc1=9; vcc carry-out is not read in this row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v6 is not read in this row)
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v204, v7, v4, 0x1                   // v204 = (coutRowPtrD v7 + coord0 v4) << 1: scaleToBpe
v_cndmask_b32 v204, v10, v204, s[34:35]            // LDD clip: keep v204 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,9,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1 for vc0=1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v205, v7, v8, 0x1                   // v205 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v205, v10, v205, s[34:35]            // LDD clip: keep v205 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,9,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2 for vc0=2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v206, v7, v8, 0x1                   // v206 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v206, v10, v206, s[34:35]            // LDD clip: keep v206 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,9,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3 for vc0=3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v207, v7, v8, 0x1                   // v207 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v207, v10, v207, s[34:35]            // LDD clip: keep v207 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,9,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4 for vc0=4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v208, v7, v8, 0x1                   // v208 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v208, v10, v208, s[34:35]            // LDD clip: keep v208 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,9,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5 for vc0=5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v209, v7, v8, 0x1                   // v209 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v209, v10, v209, s[34:35]            // LDD clip: keep v209 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,9,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6 for vc0=6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v210, v7, v8, 0x1                   // v210 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v210, v10, v210, s[34:35]            // LDD clip: keep v210 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,9,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7 for vc0=7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v211, v7, v8, 0x1                   // v211 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v211, v10, v211, s[34:35]            // LDD clip: keep v211 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
/* Row vc1=10: step coord1 (v5) and both row pointers (v6, v7) to the next row, then   */
/* fill v212..v219 with one offset per vc0=0..7: (v7 + v4 + vc0) << 1 where            */
/* (v4 + vc0) < SizeI && v5 < SizeJ, and v10 otherwise.                                */
/* NOTE(review): v10 is written outside this chunk; presumably the OOB offset - confirm. */
/* NOTE(review): the coord1 compare is row-invariant but is re-issued per element      */
/* because s[34:35] is overwritten by each s_and_b64.                                  */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to row vc1=10; vcc carry-out is not read in this row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v6 is not read in this row)
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v212, v7, v4, 0x1                   // v212 = (coutRowPtrD v7 + coord0 v4) << 1: scaleToBpe
v_cndmask_b32 v212, v10, v212, s[34:35]            // LDD clip: keep v212 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,10,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1 for vc0=1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v213, v7, v8, 0x1                   // v213 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v213, v10, v213, s[34:35]            // LDD clip: keep v213 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,10,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2 for vc0=2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v214, v7, v8, 0x1                   // v214 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v214, v10, v214, s[34:35]            // LDD clip: keep v214 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,10,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3 for vc0=3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v215, v7, v8, 0x1                   // v215 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v215, v10, v215, s[34:35]            // LDD clip: keep v215 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,10,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4 for vc0=4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v216, v7, v8, 0x1                   // v216 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v216, v10, v216, s[34:35]            // LDD clip: keep v216 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,10,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5 for vc0=5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v217, v7, v8, 0x1                   // v217 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v217, v10, v217, s[34:35]            // LDD clip: keep v217 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,10,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6 for vc0=6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v218, v7, v8, 0x1                   // v218 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v218, v10, v218, s[34:35]            // LDD clip: keep v218 where in0 && in1, else take v10
/* (d1,vc1,d0,vc0)=(0,10,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7 for vc0=7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v219, v7, v8, 0x1                   // v219 = (coutRowPtrD v7 + v8) << 1: scaleToBpe
v_cndmask_b32 v219, v10, v219, s[34:35]            // LDD clip: keep v219 where in0 && in1, else take v10
/*
 * Output-offset setup for row vc1=11, elements vc0=0..7 (results in v220..v227).
 * Registers as used in this block:
 *   v4  - coord0 base (read only; compared against SizeI)
 *   v5  - coord1; incremented by 1 on entry to the row (compared against SizeJ)
 *   v6  - cinRowPtr; advanced by StrideC1J but not read by the offset math below
 *   v7  - coutRowPtrD; advanced by StrideD1J, base of every offset computed here
 *   v8  - scratch: coord0 + vc0
 *   v10 - value written to lanes that fail the bounds test
 *         (NOTE(review): set up outside this excerpt; presumably an OOB sentinel - confirm)
 *   s[30:31], s[34:35] - scratch masks for the bounds test; vcc receives the
 *         carry-out of each v_add_co_u32 and is not read in this block
 * The original generator comment said "Cin addr"; the base actually used is v7 (D row pointer).
 */
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1 (v5) += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v220, v7, v4, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v220, v10, v220, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,11,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v221, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v221, v10, v221, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,11,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v222, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v222, v10, v222, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,11,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v223, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v223, v10, v223, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,11,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v224, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v224, v10, v224, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,11,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v225, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v225, v10, v225, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,11,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v226, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v226, v10, v226, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,11,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v227, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v227, v10, v227, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/*
 * Output-offset setup for row vc1=12, elements vc0=0..7 (results in v228..v235).
 * Registers as used in this block:
 *   v4  - coord0 base (read only; compared against SizeI)
 *   v5  - coord1; incremented by 1 on entry to the row (compared against SizeJ)
 *   v6  - cinRowPtr; advanced by StrideC1J but not read by the offset math below
 *   v7  - coutRowPtrD; advanced by StrideD1J, base of every offset computed here
 *   v8  - scratch: coord0 + vc0
 *   v10 - value written to lanes that fail the bounds test
 *         (NOTE(review): set up outside this excerpt; presumably an OOB sentinel - confirm)
 *   s[30:31], s[34:35] - scratch masks for the bounds test; vcc receives the
 *         carry-out of each v_add_co_u32 and is not read in this block
 * The original generator comment said "Cin addr"; the base actually used is v7 (D row pointer).
 */
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1 (v5) += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v228, v7, v4, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v228, v10, v228, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,12,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v229, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v229, v10, v229, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,12,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v230, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v230, v10, v230, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,12,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v231, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v231, v10, v231, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,12,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v232, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v232, v10, v232, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,12,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v233, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v233, v10, v233, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,12,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v234, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v234, v10, v234, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,12,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v235, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v235, v10, v235, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/*
 * Output-offset setup for row vc1=13, elements vc0=0..7 (results in v236..v243).
 * Registers as used in this block:
 *   v4  - coord0 base (read only; compared against SizeI)
 *   v5  - coord1; incremented by 1 on entry to the row (compared against SizeJ)
 *   v6  - cinRowPtr; advanced by StrideC1J but not read by the offset math below
 *   v7  - coutRowPtrD; advanced by StrideD1J, base of every offset computed here
 *   v8  - scratch: coord0 + vc0
 *   v10 - value written to lanes that fail the bounds test
 *         (NOTE(review): set up outside this excerpt; presumably an OOB sentinel - confirm)
 *   s[30:31], s[34:35] - scratch masks for the bounds test; vcc receives the
 *         carry-out of each v_add_co_u32 and is not read in this block
 * The original generator comment said "Cin addr"; the base actually used is v7 (D row pointer).
 */
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1 (v5) += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v236, v7, v4, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v236, v10, v236, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,13,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v237, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v237, v10, v237, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,13,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v238, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v238, v10, v238, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,13,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v239, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v239, v10, v239, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,13,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v240, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v240, v10, v240, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,13,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v241, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v241, v10, v241, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,13,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v242, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v242, v10, v242, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,13,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v243, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v243, v10, v243, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/*
 * Output-offset setup for row vc1=14, elements vc0=0 and vc0=1 only (results in v244, v245);
 * the accumulator reads follow immediately after this block.
 * v5 (coord1) is incremented by 1 and the row pointers v6/v7 are advanced by
 * StrideC1J/StrideD1J; each offset is (v7 + coord0) << 1, replaced by v10 on
 * lanes that fail the bounds test against SizeI/SizeJ.
 * NOTE(review): v10 is loaded outside this excerpt; presumably an out-of-bounds
 * sentinel offset - confirm against the earlier setup code.
 * The original generator comment said "Cin addr"; the base actually used is v7 (D row pointer).
 */
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1 (v5) += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v244, v7, v4, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v244, v10, v244, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/* (d1,vc1,d0,vc0)=(0,14,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: v8 = coord0 (v4) + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in-bounds = in0 && in1
v_add_lshl_u32 v245, v7, v8, 0x1                   // scaleToBpe: offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v245, v10, v245, s[34:35]            // LDD clip if OOB: in-bounds lanes keep offset, others get v10
/*
 * Copy 114 accumulator registers into v[vgprValuC+15 .. vgprValuC+128].
 * Source order: acc0, acc4, ..., acc252 (64 reads), then acc1, acc5, ..., acc197 (50 reads),
 * i.e. the accumulator index advances by 4 per destination register and, after acc252,
 * restarts at acc1. Destination registers are consecutive.
 * The "vreg[i]" in each trailing comment is the 0-based position of the value within this batch.
 */
v_accvgpr_read_b32 v[vgprValuC+15], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+16], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+17], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+18], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+19], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+20], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+21], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+22], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+23], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+24], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+25], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+26], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+27], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+28], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+29], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+30], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+31], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+32], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+33], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+34], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+35], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+36], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+37], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+38], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+39], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+40], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+41], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+42], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+43], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+44], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+45], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+46], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+47], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+48], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+49], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+50], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+51], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+52], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+53], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+54], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+55], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+56], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+57], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+58], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+59], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+60], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+61], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+62], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+63], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+64], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+65], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+66], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+67], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+68], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+69], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+70], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+71], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+72], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+73], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+74], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+75], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+76], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+77], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+78], acc252         // copy acc to vreg[63]
/* second pass over the accumulators: indices congruent to 1 mod 4 */
v_accvgpr_read_b32 v[vgprValuC+79], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+80], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+81], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+82], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+83], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+84], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+85], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+86], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+87], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+88], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+89], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+90], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+91], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+92], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+93], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+94], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+95], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+96], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+97], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+98], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+99], acc81          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+100], acc85         // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+101], acc89         // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+102], acc93         // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+103], acc97         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+104], acc101        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+105], acc105        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+106], acc109        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+107], acc113        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+108], acc117        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+109], acc121        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+110], acc125        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+111], acc129        // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+112], acc133        // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+113], acc137        // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+114], acc141        // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+115], acc145        // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+116], acc149        // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+117], acc153        // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+118], acc157        // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+119], acc161        // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+120], acc165        // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+121], acc169        // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+122], acc173        // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+123], acc177        // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+124], acc181        // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+125], acc185        // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+126], acc189        // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+127], acc193        // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+128], acc197        // copy acc to vreg[113]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), (0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4), (0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1)] */
v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
/* Alpha scaling, visible tail (elements ValuC+66 .. ValuC+128).
 * The run starts above this point; only the lines below are described here.
 * - Each v_pk_mul_f32 multiplies one pair of consecutive ValuC registers in
 *   place (destination == third operand).
 * - op_sel_hi:[0,1,1]: for the first source (the SGPR pair) the upper result
 *   lane reads the LOW dword, so s[sgprAlpha] scales both lanes.
 *   NOTE(review): stated from the ISA's op_sel_hi semantics - confirm against
 *   the ISA guide.
 * - ValuC+128 has no partner register in this run and is scaled with a plain
 *   v_mul_f32.
 */
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_mul_f32 v[vgprValuC+128], s[sgprAlpha], v[vgprValuC+128] // *= alpha (single leftover element)

/* apply mask, calc new C and issue writes
 *
 * One batch of 114 results (elements 15..128), one element per store
 * (gwvw == 1).  For element N:
 *   1. v_cvt_pk_bf16_f32 packs ValuC+N (given as both sources) into vN;
 *   2. buffer_store_short writes the low 16 bits of vN through the D buffer
 *      resource at the per-lane byte offset held in an offset VGPR.
 * Offset VGPR numbering: v(N+114) for N = 15..17, v(N+117) for N = 18..128
 * (v132..v134 are skipped).  The offsets are expected to have been prepared
 * before this point - that code is outside this block.
 * The .irp loops below expand to the same instruction sequence, in the same
 * order, as a fully unrolled listing.
 */
v_mov_b32 v12, 0xffff0000                          // mask for packing two bfloat16 elements into 32 bits
v_mov_b32 v13, 0x7fff0000                          // fp32 NaN
v_mov_b32 v14, 0x7fff                              // rounding bias for bfloat16
/* NOTE(review): v12..v14 are not read inside this block; they are still loaded in case later code depends on them. */

/* elements 15..17 -> offset VGPRs v129..v131 */
.irp elem,15,16,17
v_cvt_pk_bf16_f32 v[\elem], v[vgprValuC+\elem], v[vgprValuC+\elem] // fp32 -> bf16 (gwvw==1: same value in both sources)
buffer_store_short v[\elem], v[\elem+114], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store one bf16 to D
.endr

/* elements 18..128 -> offset VGPRs v135..v245 */
.irp elem,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128
v_cvt_pk_bf16_f32 v[\elem], v[vgprValuC+\elem], v[vgprValuC+\elem] // fp32 -> bf16 (gwvw==1: same value in both sources)
buffer_store_short v[\elem], v[\elem+117], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store one bf16 to D
.endr

s_nop 0                                            // one wait state before following code may overwrite VGPRs still held by the stores above
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1); (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v10, BufferOOB
/* (d1,vc1,d0,vc0)=(0,14,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v129, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v129, v10, v129, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v130, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v130, v10, v130, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v131, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v131, v10, v131, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v135, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v135, v10, v135, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v136, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v136, v10, v136, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v137, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v137, v10, v137, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v138, v7, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v138, v10, v138, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v139, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v139, v10, v139, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v140, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v140, v10, v140, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v141, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v141, v10, v141, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,4) */
// Row vc1=15, elements vc0=4..7: fill v142..v145 with one offset per element.
// Each offset is (v7 + coord0 + vc0) << 1; the shift doubles the element index
// (presumably a 2-byte element size -- confirm against the kernel data type).
// Lanes failing coord0 < SizeI or coord1 < SizeJ receive v10 instead; v10 is
// not written in this section (presumably an out-of-bounds offset -- confirm).
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v142, v7, v8, 0x1                   // v142 = (v7 + v8) << 1
v_cndmask_b32 v142, v10, v142, s[34:35]            // LDD clip: in-bounds lanes keep v142, others take v10
/* (d1,vc1,d0,vc0)=(0,15,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v143, v7, v8, 0x1                   // v143 = (v7 + v8) << 1
v_cndmask_b32 v143, v10, v143, s[34:35]            // LDD clip: in-bounds lanes keep v143, others take v10
/* (d1,vc1,d0,vc0)=(0,15,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v144, v7, v8, 0x1                   // v144 = (v7 + v8) << 1
v_cndmask_b32 v144, v10, v144, s[34:35]            // LDD clip: in-bounds lanes keep v144, others take v10
/* (d1,vc1,d0,vc0)=(0,15,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v145, v7, v8, 0x1                   // v145 = (v7 + v8) << 1
v_cndmask_b32 v145, v10, v145, s[34:35]            // LDD clip: in-bounds lanes keep v145, others take v10
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
// Row vc1=16: advance coord1 (v5) by one and both row bases (v6 += StrideC1J,
// v7 += StrideD1J), then fill v146..v153 with one offset per vc0=0..7.
// Each offset is (v7 + coord0 + vc0) << 1; the shift doubles the element index
// (presumably a 2-byte element size -- confirm against the kernel data type).
// Lanes failing coord0 < SizeI or coord1 < SizeJ receive v10 instead; v10 is
// not written in this section (presumably an out-of-bounds offset -- confirm).
// NOTE(review): v6 is advanced here but not read by any instruction in this row.
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v6) += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // s[30:31] = lanes where coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v146, v7, v4, 0x1                   // v146 = (v7 + v4) << 1
v_cndmask_b32 v146, v10, v146, s[34:35]            // LDD clip: in-bounds lanes keep v146, others take v10
/* (d1,vc1,d0,vc0)=(0,16,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v147, v7, v8, 0x1                   // v147 = (v7 + v8) << 1
v_cndmask_b32 v147, v10, v147, s[34:35]            // LDD clip: in-bounds lanes keep v147, others take v10
/* (d1,vc1,d0,vc0)=(0,16,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v148, v7, v8, 0x1                   // v148 = (v7 + v8) << 1
v_cndmask_b32 v148, v10, v148, s[34:35]            // LDD clip: in-bounds lanes keep v148, others take v10
/* (d1,vc1,d0,vc0)=(0,16,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v149, v7, v8, 0x1                   // v149 = (v7 + v8) << 1
v_cndmask_b32 v149, v10, v149, s[34:35]            // LDD clip: in-bounds lanes keep v149, others take v10
/* (d1,vc1,d0,vc0)=(0,16,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v150, v7, v8, 0x1                   // v150 = (v7 + v8) << 1
v_cndmask_b32 v150, v10, v150, s[34:35]            // LDD clip: in-bounds lanes keep v150, others take v10
/* (d1,vc1,d0,vc0)=(0,16,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v151, v7, v8, 0x1                   // v151 = (v7 + v8) << 1
v_cndmask_b32 v151, v10, v151, s[34:35]            // LDD clip: in-bounds lanes keep v151, others take v10
/* (d1,vc1,d0,vc0)=(0,16,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v152, v7, v8, 0x1                   // v152 = (v7 + v8) << 1
v_cndmask_b32 v152, v10, v152, s[34:35]            // LDD clip: in-bounds lanes keep v152, others take v10
/* (d1,vc1,d0,vc0)=(0,16,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v153, v7, v8, 0x1                   // v153 = (v7 + v8) << 1
v_cndmask_b32 v153, v10, v153, s[34:35]            // LDD clip: in-bounds lanes keep v153, others take v10
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
// Row vc1=17: advance coord1 (v5) by one and both row bases (v6 += StrideC1J,
// v7 += StrideD1J), then fill v154..v161 with one offset per vc0=0..7.
// Each offset is (v7 + coord0 + vc0) << 1; the shift doubles the element index
// (presumably a 2-byte element size -- confirm against the kernel data type).
// Lanes failing coord0 < SizeI or coord1 < SizeJ receive v10 instead; v10 is
// not written in this section (presumably an out-of-bounds offset -- confirm).
// NOTE(review): v6 is advanced here but not read by any instruction in this row.
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v6) += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // s[30:31] = lanes where coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v154, v7, v4, 0x1                   // v154 = (v7 + v4) << 1
v_cndmask_b32 v154, v10, v154, s[34:35]            // LDD clip: in-bounds lanes keep v154, others take v10
/* (d1,vc1,d0,vc0)=(0,17,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v155, v7, v8, 0x1                   // v155 = (v7 + v8) << 1
v_cndmask_b32 v155, v10, v155, s[34:35]            // LDD clip: in-bounds lanes keep v155, others take v10
/* (d1,vc1,d0,vc0)=(0,17,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v156, v7, v8, 0x1                   // v156 = (v7 + v8) << 1
v_cndmask_b32 v156, v10, v156, s[34:35]            // LDD clip: in-bounds lanes keep v156, others take v10
/* (d1,vc1,d0,vc0)=(0,17,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v157, v7, v8, 0x1                   // v157 = (v7 + v8) << 1
v_cndmask_b32 v157, v10, v157, s[34:35]            // LDD clip: in-bounds lanes keep v157, others take v10
/* (d1,vc1,d0,vc0)=(0,17,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v158, v7, v8, 0x1                   // v158 = (v7 + v8) << 1
v_cndmask_b32 v158, v10, v158, s[34:35]            // LDD clip: in-bounds lanes keep v158, others take v10
/* (d1,vc1,d0,vc0)=(0,17,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v159, v7, v8, 0x1                   // v159 = (v7 + v8) << 1
v_cndmask_b32 v159, v10, v159, s[34:35]            // LDD clip: in-bounds lanes keep v159, others take v10
/* (d1,vc1,d0,vc0)=(0,17,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v160, v7, v8, 0x1                   // v160 = (v7 + v8) << 1
v_cndmask_b32 v160, v10, v160, s[34:35]            // LDD clip: in-bounds lanes keep v160, others take v10
/* (d1,vc1,d0,vc0)=(0,17,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v161, v7, v8, 0x1                   // v161 = (v7 + v8) << 1
v_cndmask_b32 v161, v10, v161, s[34:35]            // LDD clip: in-bounds lanes keep v161, others take v10
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
// Row vc1=18: advance coord1 (v5) by one and both row bases (v6 += StrideC1J,
// v7 += StrideD1J), then fill v162..v169 with one offset per vc0=0..7.
// Each offset is (v7 + coord0 + vc0) << 1; the shift doubles the element index
// (presumably a 2-byte element size -- confirm against the kernel data type).
// Lanes failing coord0 < SizeI or coord1 < SizeJ receive v10 instead; v10 is
// not written in this section (presumably an out-of-bounds offset -- confirm).
// NOTE(review): v6 is advanced here but not read by any instruction in this row.
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v6) += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // s[30:31] = lanes where coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v162, v7, v4, 0x1                   // v162 = (v7 + v4) << 1
v_cndmask_b32 v162, v10, v162, s[34:35]            // LDD clip: in-bounds lanes keep v162, others take v10
/* (d1,vc1,d0,vc0)=(0,18,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v163, v7, v8, 0x1                   // v163 = (v7 + v8) << 1
v_cndmask_b32 v163, v10, v163, s[34:35]            // LDD clip: in-bounds lanes keep v163, others take v10
/* (d1,vc1,d0,vc0)=(0,18,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v164, v7, v8, 0x1                   // v164 = (v7 + v8) << 1
v_cndmask_b32 v164, v10, v164, s[34:35]            // LDD clip: in-bounds lanes keep v164, others take v10
/* (d1,vc1,d0,vc0)=(0,18,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v165, v7, v8, 0x1                   // v165 = (v7 + v8) << 1
v_cndmask_b32 v165, v10, v165, s[34:35]            // LDD clip: in-bounds lanes keep v165, others take v10
/* (d1,vc1,d0,vc0)=(0,18,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v166, v7, v8, 0x1                   // v166 = (v7 + v8) << 1
v_cndmask_b32 v166, v10, v166, s[34:35]            // LDD clip: in-bounds lanes keep v166, others take v10
/* (d1,vc1,d0,vc0)=(0,18,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v167, v7, v8, 0x1                   // v167 = (v7 + v8) << 1
v_cndmask_b32 v167, v10, v167, s[34:35]            // LDD clip: in-bounds lanes keep v167, others take v10
/* (d1,vc1,d0,vc0)=(0,18,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v168, v7, v8, 0x1                   // v168 = (v7 + v8) << 1
v_cndmask_b32 v168, v10, v168, s[34:35]            // LDD clip: in-bounds lanes keep v168, others take v10
/* (d1,vc1,d0,vc0)=(0,18,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v169, v7, v8, 0x1                   // v169 = (v7 + v8) << 1
v_cndmask_b32 v169, v10, v169, s[34:35]            // LDD clip: in-bounds lanes keep v169, others take v10
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
// Row vc1=19: advance coord1 (v5) by one and both row bases (v6 += StrideC1J,
// v7 += StrideD1J), then fill v170..v177 with one offset per vc0=0..7.
// Each offset is (v7 + coord0 + vc0) << 1; the shift doubles the element index
// (presumably a 2-byte element size -- confirm against the kernel data type).
// Lanes failing coord0 < SizeI or coord1 < SizeJ receive v10 instead; v10 is
// not written in this section (presumably an out-of-bounds offset -- confirm).
// NOTE(review): v6 is advanced here but not read by any instruction in this row.
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v6) += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // s[30:31] = lanes where coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v170, v7, v4, 0x1                   // v170 = (v7 + v4) << 1
v_cndmask_b32 v170, v10, v170, s[34:35]            // LDD clip: in-bounds lanes keep v170, others take v10
/* (d1,vc1,d0,vc0)=(0,19,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v171, v7, v8, 0x1                   // v171 = (v7 + v8) << 1
v_cndmask_b32 v171, v10, v171, s[34:35]            // LDD clip: in-bounds lanes keep v171, others take v10
/* (d1,vc1,d0,vc0)=(0,19,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v172, v7, v8, 0x1                   // v172 = (v7 + v8) << 1
v_cndmask_b32 v172, v10, v172, s[34:35]            // LDD clip: in-bounds lanes keep v172, others take v10
/* (d1,vc1,d0,vc0)=(0,19,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v173, v7, v8, 0x1                   // v173 = (v7 + v8) << 1
v_cndmask_b32 v173, v10, v173, s[34:35]            // LDD clip: in-bounds lanes keep v173, others take v10
/* (d1,vc1,d0,vc0)=(0,19,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v174, v7, v8, 0x1                   // v174 = (v7 + v8) << 1
v_cndmask_b32 v174, v10, v174, s[34:35]            // LDD clip: in-bounds lanes keep v174, others take v10
/* (d1,vc1,d0,vc0)=(0,19,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v175, v7, v8, 0x1                   // v175 = (v7 + v8) << 1
v_cndmask_b32 v175, v10, v175, s[34:35]            // LDD clip: in-bounds lanes keep v175, others take v10
/* (d1,vc1,d0,vc0)=(0,19,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v176, v7, v8, 0x1                   // v176 = (v7 + v8) << 1
v_cndmask_b32 v176, v10, v176, s[34:35]            // LDD clip: in-bounds lanes keep v176, others take v10
/* (d1,vc1,d0,vc0)=(0,19,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v177, v7, v8, 0x1                   // v177 = (v7 + v8) << 1
v_cndmask_b32 v177, v10, v177, s[34:35]            // LDD clip: in-bounds lanes keep v177, others take v10
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
// Row vc1=20: advance coord1 (v5) by one and both row bases (v6 += StrideC1J,
// v7 += StrideD1J), then fill v178..v185 with one offset per vc0=0..7.
// Each offset is (v7 + coord0 + vc0) << 1; the shift doubles the element index
// (presumably a 2-byte element size -- confirm against the kernel data type).
// Lanes failing coord0 < SizeI or coord1 < SizeJ receive v10 instead; v10 is
// not written in this section (presumably an out-of-bounds offset -- confirm).
// NOTE(review): v6 is advanced here but not read by any instruction in this row.
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v6) += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // s[30:31] = lanes where coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v178, v7, v4, 0x1                   // v178 = (v7 + v4) << 1
v_cndmask_b32 v178, v10, v178, s[34:35]            // LDD clip: in-bounds lanes keep v178, others take v10
/* (d1,vc1,d0,vc0)=(0,20,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v179, v7, v8, 0x1                   // v179 = (v7 + v8) << 1
v_cndmask_b32 v179, v10, v179, s[34:35]            // LDD clip: in-bounds lanes keep v179, others take v10
/* (d1,vc1,d0,vc0)=(0,20,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v180, v7, v8, 0x1                   // v180 = (v7 + v8) << 1
v_cndmask_b32 v180, v10, v180, s[34:35]            // LDD clip: in-bounds lanes keep v180, others take v10
/* (d1,vc1,d0,vc0)=(0,20,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v181, v7, v8, 0x1                   // v181 = (v7 + v8) << 1
v_cndmask_b32 v181, v10, v181, s[34:35]            // LDD clip: in-bounds lanes keep v181, others take v10
/* (d1,vc1,d0,vc0)=(0,20,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v182, v7, v8, 0x1                   // v182 = (v7 + v8) << 1
v_cndmask_b32 v182, v10, v182, s[34:35]            // LDD clip: in-bounds lanes keep v182, others take v10
/* (d1,vc1,d0,vc0)=(0,20,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v183, v7, v8, 0x1                   // v183 = (v7 + v8) << 1
v_cndmask_b32 v183, v10, v183, s[34:35]            // LDD clip: in-bounds lanes keep v183, others take v10
/* (d1,vc1,d0,vc0)=(0,20,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v184, v7, v8, 0x1                   // v184 = (v7 + v8) << 1
v_cndmask_b32 v184, v10, v184, s[34:35]            // LDD clip: in-bounds lanes keep v184, others take v10
/* (d1,vc1,d0,vc0)=(0,20,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v185, v7, v8, 0x1                   // v185 = (v7 + v8) << 1
v_cndmask_b32 v185, v10, v185, s[34:35]            // LDD clip: in-bounds lanes keep v185, others take v10
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
// Row vc1=21: advance coord1 (v5) by one and both row bases (v6 += StrideC1J,
// v7 += StrideD1J), then fill v186..v193 with one offset per vc0=0..7.
// Each offset is (v7 + coord0 + vc0) << 1; the shift doubles the element index
// (presumably a 2-byte element size -- confirm against the kernel data type).
// Lanes failing coord0 < SizeI or coord1 < SizeJ receive v10 instead; v10 is
// not written in this section (presumably an out-of-bounds offset -- confirm).
// NOTE(review): v6 is advanced here but not read by any instruction in this row.
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v6) += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // s[30:31] = lanes where coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v186, v7, v4, 0x1                   // v186 = (v7 + v4) << 1
v_cndmask_b32 v186, v10, v186, s[34:35]            // LDD clip: in-bounds lanes keep v186, others take v10
/* (d1,vc1,d0,vc0)=(0,21,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v187, v7, v8, 0x1                   // v187 = (v7 + v8) << 1
v_cndmask_b32 v187, v10, v187, s[34:35]            // LDD clip: in-bounds lanes keep v187, others take v10
/* (d1,vc1,d0,vc0)=(0,21,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v188, v7, v8, 0x1                   // v188 = (v7 + v8) << 1
v_cndmask_b32 v188, v10, v188, s[34:35]            // LDD clip: in-bounds lanes keep v188, others take v10
/* (d1,vc1,d0,vc0)=(0,21,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v189, v7, v8, 0x1                   // v189 = (v7 + v8) << 1
v_cndmask_b32 v189, v10, v189, s[34:35]            // LDD clip: in-bounds lanes keep v189, others take v10
/* (d1,vc1,d0,vc0)=(0,21,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v190, v7, v8, 0x1                   // v190 = (v7 + v8) << 1
v_cndmask_b32 v190, v10, v190, s[34:35]            // LDD clip: in-bounds lanes keep v190, others take v10
/* (d1,vc1,d0,vc0)=(0,21,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v191, v7, v8, 0x1                   // v191 = (v7 + v8) << 1
v_cndmask_b32 v191, v10, v191, s[34:35]            // LDD clip: in-bounds lanes keep v191, others take v10
/* (d1,vc1,d0,vc0)=(0,21,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v192, v7, v8, 0x1                   // v192 = (v7 + v8) << 1
v_cndmask_b32 v192, v10, v192, s[34:35]            // LDD clip: in-bounds lanes keep v192, others take v10
/* (d1,vc1,d0,vc0)=(0,21,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v193, v7, v8, 0x1                   // v193 = (v7 + v8) << 1
v_cndmask_b32 v193, v10, v193, s[34:35]            // LDD clip: in-bounds lanes keep v193, others take v10
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
// Row vc1=22: advance coord1 (v5) by one and both row bases (v6 += StrideC1J,
// v7 += StrideD1J), then fill v194..v201 with one offset per vc0=0..7.
// Each offset is (v7 + coord0 + vc0) << 1; the shift doubles the element index
// (presumably a 2-byte element size -- confirm against the kernel data type).
// Lanes failing coord0 < SizeI or coord1 < SizeJ receive v10 instead; v10 is
// not written in this section (presumably an out-of-bounds offset -- confirm).
// NOTE(review): v6 is advanced here but not read by any instruction in this row.
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v6) += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // s[30:31] = lanes where coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v194, v7, v4, 0x1                   // v194 = (v7 + v4) << 1
v_cndmask_b32 v194, v10, v194, s[34:35]            // LDD clip: in-bounds lanes keep v194, others take v10
/* (d1,vc1,d0,vc0)=(0,22,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v195, v7, v8, 0x1                   // v195 = (v7 + v8) << 1
v_cndmask_b32 v195, v10, v195, s[34:35]            // LDD clip: in-bounds lanes keep v195, others take v10
/* (d1,vc1,d0,vc0)=(0,22,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v196, v7, v8, 0x1                   // v196 = (v7 + v8) << 1
v_cndmask_b32 v196, v10, v196, s[34:35]            // LDD clip: in-bounds lanes keep v196, others take v10
/* (d1,vc1,d0,vc0)=(0,22,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v197, v7, v8, 0x1                   // v197 = (v7 + v8) << 1
v_cndmask_b32 v197, v10, v197, s[34:35]            // LDD clip: in-bounds lanes keep v197, others take v10
/* (d1,vc1,d0,vc0)=(0,22,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v198, v7, v8, 0x1                   // v198 = (v7 + v8) << 1
v_cndmask_b32 v198, v10, v198, s[34:35]            // LDD clip: in-bounds lanes keep v198, others take v10
/* (d1,vc1,d0,vc0)=(0,22,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v199, v7, v8, 0x1                   // v199 = (v7 + v8) << 1
v_cndmask_b32 v199, v10, v199, s[34:35]            // LDD clip: in-bounds lanes keep v199, others take v10
/* (d1,vc1,d0,vc0)=(0,22,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v200, v7, v8, 0x1                   // v200 = (v7 + v8) << 1
v_cndmask_b32 v200, v10, v200, s[34:35]            // LDD clip: in-bounds lanes keep v200, others take v10
/* (d1,vc1,d0,vc0)=(0,22,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v201, v7, v8, 0x1                   // v201 = (v7 + v8) << 1
v_cndmask_b32 v201, v10, v201, s[34:35]            // LDD clip: in-bounds lanes keep v201, others take v10
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
// Row vc1=23, elements vc0=0..5: advance coord1 (v5) by one and both row bases
// (v6 += StrideC1J, v7 += StrideD1J), then fill v202..v207 with one offset per
// element; the remaining vc0 values of this row follow after this group.
// Each offset is (v7 + coord0 + vc0) << 1; the shift doubles the element index
// (presumably a 2-byte element size -- confirm against the kernel data type).
// Lanes failing coord0 < SizeI or coord1 < SizeJ receive v10 instead; v10 is
// not written in this section (presumably an out-of-bounds offset -- confirm).
// NOTE(review): v6 is advanced here but not read by any instruction in this group.
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v6) += StrideC1J
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // s[30:31] = lanes where coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v202, v7, v4, 0x1                   // v202 = (v7 + v4) << 1
v_cndmask_b32 v202, v10, v202, s[34:35]            // LDD clip: in-bounds lanes keep v202, others take v10
/* (d1,vc1,d0,vc0)=(0,23,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v203, v7, v8, 0x1                   // v203 = (v7 + v8) << 1
v_cndmask_b32 v203, v10, v203, s[34:35]            // LDD clip: in-bounds lanes keep v203, others take v10
/* (d1,vc1,d0,vc0)=(0,23,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v204, v7, v8, 0x1                   // v204 = (v7 + v8) << 1
v_cndmask_b32 v204, v10, v204, s[34:35]            // LDD clip: in-bounds lanes keep v204, others take v10
/* (d1,vc1,d0,vc0)=(0,23,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v205, v7, v8, 0x1                   // v205 = (v7 + v8) << 1
v_cndmask_b32 v205, v10, v205, s[34:35]            // LDD clip: in-bounds lanes keep v205, others take v10
/* (d1,vc1,d0,vc0)=(0,23,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v206, v7, v8, 0x1                   // v206 = (v7 + v8) << 1
v_cndmask_b32 v206, v10, v206, s[34:35]            // LDD clip: in-bounds lanes keep v206, others take v10
/* (d1,vc1,d0,vc0)=(0,23,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5; carry-out goes to vcc
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // s[30:31] = lanes where v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // s[34:35] = lanes where coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = lanes in bounds on both axes
v_add_lshl_u32 v207, v7, v8, 0x1                   // v207 = (v7 + v8) << 1
v_cndmask_b32 v207, v10, v207, s[34:35]            // LDD clip: in-bounds lanes keep v207, others take v10
/* (d1,vc1,d0,vc0)=(0,23,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v208, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v208, v10, v208, s[34:35]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v209, v7, v8, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v209, v10, v209, s[34:35]            // LDD clip if OOB. offset
/* Row vc1=24: step coord1 (v5) and both row pointers (v6, v7) forward one row, then build eight clipped offsets v210..v217 for vc0=0..7. */
/* Each element reuses v8, s[30:31], s[34:35] and vcc as scratch, so the six-instruction groups must stay in order. */
/* NOTE(review): v10 is set up outside this excerpt; presumably an out-of-range sentinel offset - confirm. */
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // cinRowPtr (v6) += StrideC1J: next row of C
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J: next row of D
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v210, v7, v4, 0x1                   // v210 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v210, v10, v210, s[34:35]            // in bounds: keep v210; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,24,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v211, v7, v8, 0x1                   // v211 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v211, v10, v211, s[34:35]            // in bounds: keep v211; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,24,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v212, v7, v8, 0x1                   // v212 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v212, v10, v212, s[34:35]            // in bounds: keep v212; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,24,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v213, v7, v8, 0x1                   // v213 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v213, v10, v213, s[34:35]            // in bounds: keep v213; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,24,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v214, v7, v8, 0x1                   // v214 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v214, v10, v214, s[34:35]            // in bounds: keep v214; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,24,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v215, v7, v8, 0x1                   // v215 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v215, v10, v215, s[34:35]            // in bounds: keep v215; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,24,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v216, v7, v8, 0x1                   // v216 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v216, v10, v216, s[34:35]            // in bounds: keep v216; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,24,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v217, v7, v8, 0x1                   // v217 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v217, v10, v217, s[34:35]            // in bounds: keep v217; out of bounds: take v10
/* Row vc1=25: step coord1 (v5) and both row pointers (v6, v7) forward one row, then build eight clipped offsets v218..v225 for vc0=0..7. */
/* Each element reuses v8, s[30:31], s[34:35] and vcc as scratch, so the six-instruction groups must stay in order. */
/* NOTE(review): v10 is set up outside this excerpt; presumably an out-of-range sentinel offset - confirm. */
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // cinRowPtr (v6) += StrideC1J: next row of C
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J: next row of D
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v218, v7, v4, 0x1                   // v218 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v218, v10, v218, s[34:35]            // in bounds: keep v218; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,25,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v219, v7, v8, 0x1                   // v219 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v219, v10, v219, s[34:35]            // in bounds: keep v219; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,25,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v220, v7, v8, 0x1                   // v220 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v220, v10, v220, s[34:35]            // in bounds: keep v220; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,25,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v221, v7, v8, 0x1                   // v221 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v221, v10, v221, s[34:35]            // in bounds: keep v221; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,25,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v222, v7, v8, 0x1                   // v222 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v222, v10, v222, s[34:35]            // in bounds: keep v222; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,25,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v223, v7, v8, 0x1                   // v223 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v223, v10, v223, s[34:35]            // in bounds: keep v223; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,25,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v224, v7, v8, 0x1                   // v224 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v224, v10, v224, s[34:35]            // in bounds: keep v224; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,25,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v225, v7, v8, 0x1                   // v225 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v225, v10, v225, s[34:35]            // in bounds: keep v225; out of bounds: take v10
/* Row vc1=26: step coord1 (v5) and both row pointers (v6, v7) forward one row, then build eight clipped offsets v226..v233 for vc0=0..7. */
/* Each element reuses v8, s[30:31], s[34:35] and vcc as scratch, so the six-instruction groups must stay in order. */
/* NOTE(review): v10 is set up outside this excerpt; presumably an out-of-range sentinel offset - confirm. */
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // cinRowPtr (v6) += StrideC1J: next row of C
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J: next row of D
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v226, v7, v4, 0x1                   // v226 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v226, v10, v226, s[34:35]            // in bounds: keep v226; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,26,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v227, v7, v8, 0x1                   // v227 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v227, v10, v227, s[34:35]            // in bounds: keep v227; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,26,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v228, v7, v8, 0x1                   // v228 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v228, v10, v228, s[34:35]            // in bounds: keep v228; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,26,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v229, v7, v8, 0x1                   // v229 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v229, v10, v229, s[34:35]            // in bounds: keep v229; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,26,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v230, v7, v8, 0x1                   // v230 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v230, v10, v230, s[34:35]            // in bounds: keep v230; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,26,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v231, v7, v8, 0x1                   // v231 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v231, v10, v231, s[34:35]            // in bounds: keep v231; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,26,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v232, v7, v8, 0x1                   // v232 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v232, v10, v232, s[34:35]            // in bounds: keep v232; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,26,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v233, v7, v8, 0x1                   // v233 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v233, v10, v233, s[34:35]            // in bounds: keep v233; out of bounds: take v10
/* Row vc1=27: step coord1 (v5) and both row pointers (v6, v7) forward one row, then build eight clipped offsets v234..v241 for vc0=0..7. */
/* Each element reuses v8, s[30:31], s[34:35] and vcc as scratch, so the six-instruction groups must stay in order. */
/* NOTE(review): v10 is set up outside this excerpt; presumably an out-of-range sentinel offset - confirm. */
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // cinRowPtr (v6) += StrideC1J: next row of C
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J: next row of D
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v234, v7, v4, 0x1                   // v234 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v234, v10, v234, s[34:35]            // in bounds: keep v234; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,27,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v235, v7, v8, 0x1                   // v235 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v235, v10, v235, s[34:35]            // in bounds: keep v235; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,27,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v236, v7, v8, 0x1                   // v236 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v236, v10, v236, s[34:35]            // in bounds: keep v236; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,27,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v237, v7, v8, 0x1                   // v237 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v237, v10, v237, s[34:35]            // in bounds: keep v237; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,27,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 (v4) + 4
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v238, v7, v8, 0x1                   // v238 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v238, v10, v238, s[34:35]            // in bounds: keep v238; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,27,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 (v4) + 5
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v239, v7, v8, 0x1                   // v239 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v239, v10, v239, s[34:35]            // in bounds: keep v239; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,27,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 (v4) + 6
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v240, v7, v8, 0x1                   // v240 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v240, v10, v240, s[34:35]            // in bounds: keep v240; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,27,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 (v4) + 7
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v241, v7, v8, 0x1                   // v241 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v241, v10, v241, s[34:35]            // in bounds: keep v241; out of bounds: take v10
/* Row vc1=28 (first half): step coord1 (v5) and both row pointers (v6, v7) forward one row, then build four clipped offsets v242..v245 for vc0=0..3. */
/* Only vc0=0..3 are handled here; address setup in this excerpt stops after v245. */
/* NOTE(review): v10 is set up outside this excerpt; presumably an out-of-range sentinel offset - confirm. */
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 (v5) += 1: move to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // cinRowPtr (v6) += StrideC1J: next row of C
v_add_u32 v7, v7, s[sgprStrideD1J]                 // coutRowPtrD (v7) += StrideD1J: next row of D
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 (v4) < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v242, v7, v4, 0x1                   // v242 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v242, v10, v242, s[34:35]            // in bounds: keep v242; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,28,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 (v4) + 1
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v243, v7, v8, 0x1                   // v243 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v243, v10, v243, s[34:35]            // in bounds: keep v243; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,28,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 (v4) + 2
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v244, v7, v8, 0x1                   // v244 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v244, v10, v244, s[34:35]            // in bounds: keep v244; out of bounds: take v10
/* (d1,vc1,d0,vc0)=(0,28,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 (v4) + 3
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 (v5) < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // s[34:35] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v245, v7, v8, 0x1                   // v245 = (coutRowPtrD (v7) + coord0) << 1: scale by 2 bytes per element
v_cndmask_b32 v245, v10, v245, s[34:35]            // in bounds: keep v245; out of bounds: take v10
/* Copy 114 accumulator registers into v[vgprValuC+15] .. v[vgprValuC+128], one v_accvgpr_read_b32 per value. */
/* The source acc index advances by 4 per destination register: acc201..acc253, then acc2..acc254, then acc3..acc143. */
/* The vreg[N] number in each trailing comment runs 114..227, in step with the destination register. */
v_accvgpr_read_b32 v[vgprValuC+15], acc201         // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+16], acc205         // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+17], acc209         // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+18], acc213         // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+19], acc217         // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+20], acc221         // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+21], acc225         // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+22], acc229         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+23], acc233         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+24], acc237         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+25], acc241         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+26], acc245         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+27], acc249         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+28], acc253         // copy acc to vreg[127]
v_accvgpr_read_b32 v[vgprValuC+29], acc2           // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+30], acc6           // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+31], acc10          // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+32], acc14          // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+33], acc18          // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+34], acc22          // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+35], acc26          // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+36], acc30          // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+37], acc34          // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+38], acc38          // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+39], acc42          // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+40], acc46          // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+41], acc50          // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+42], acc54          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+43], acc58          // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+44], acc62          // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+45], acc66          // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+46], acc70          // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+47], acc74          // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+48], acc78          // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+49], acc82          // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+50], acc86          // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+51], acc90          // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+52], acc94          // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+53], acc98          // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+54], acc102         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+55], acc106         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+56], acc110         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+57], acc114         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+58], acc118         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+59], acc122         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+60], acc126         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+61], acc130         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+62], acc134         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+63], acc138         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+64], acc142         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+65], acc146         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+66], acc150         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+67], acc154         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+68], acc158         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+69], acc162         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+70], acc166         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+71], acc170         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+72], acc174         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+73], acc178         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+74], acc182         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+75], acc186         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+76], acc190         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+77], acc194         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+78], acc198         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+79], acc202         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+80], acc206         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+81], acc210         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+82], acc214         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+83], acc218         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+84], acc222         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+85], acc226         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+86], acc230         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+87], acc234         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+88], acc238         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+89], acc242         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+90], acc246         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+91], acc250         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+92], acc254         // copy acc to vreg[191]
v_accvgpr_read_b32 v[vgprValuC+93], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+94], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+95], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+96], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+97], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+98], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+99], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+100], acc31         // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+101], acc35         // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+102], acc39         // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+103], acc43         // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+104], acc47         // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+105], acc51         // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+106], acc55         // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+107], acc59         // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+108], acc63         // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+109], acc67         // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+110], acc71         // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+111], acc75         // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+112], acc79         // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+113], acc83         // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+114], acc87         // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+115], acc91         // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+116], acc95         // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+117], acc99         // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+118], acc103        // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+119], acc107        // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+120], acc111        // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+121], acc115        // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+122], acc119        // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+123], acc123        // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+124], acc127        // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+125], acc131        // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+126], acc135        // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+127], acc139        // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+128], acc143        // copy acc to vreg[227]

/* rC *= alpha batchElements=[(0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6), (0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3)] */
/*
 * Alpha scaling for this write batch: multiply every fp32 value held in
 * v[vgprValuC+15] .. v[vgprValuC+128] (114 registers) by s[sgprAlpha], in place.
 *
 *   - v[vgprValuC+15] and v[vgprValuC+128] are handled with scalar v_mul_f32.
 *   - The 56 register pairs (+16,+17) .. (+126,+127) are handled with
 *     v_pk_mul_f32, two fp32 values per instruction.
 *
 * op_sel_hi:[0,1,1] makes the high lane of src0 read the LOW dword of the
 * s[sgprAlpha:sgprAlpha+1] pair, so both lanes are multiplied by s[sgprAlpha];
 * s[sgprAlpha+1] is named only to form the 64-bit operand.
 *
 * NOTE(review): the packed ops start at +16 rather than +15, presumably because
 * 64-bit VGPR operands must be even-aligned (this assumes vgprValuC is even) --
 * confirm against the vgprValuC definition, which is outside this excerpt.
 */
v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_mul_f32 v[vgprValuC+128], s[sgprAlpha], v[vgprValuC+128] // *= alpha

/* apply mask, calc new C and issue writes */
/*
 * Convert-and-store phase for this batch. For each of the 114 values
 * (N = 15 .. 128) the code emits one pair of instructions:
 *
 *   v_cvt_pk_bf16_f32 vN, v[vgprValuC+N], v[vgprValuC+N]
 *       fp32 -> bf16; the same source feeds both packed halves, and only the
 *       low 16 bits are consumed by the store that follows.
 *   buffer_store_short vN, vADDR, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt
 *       16-bit store to D through the sgprSrdD buffer descriptor, using vADDR
 *       as the per-element byte offset (offen), with the nt modifier.
 *
 * Address VGPRs used here are v129..v131 for N = 15..17 and v135..v245 for
 * N = 18..128; v132..v134 are not referenced in this batch.
 *
 * NOTE(review): no masking instruction appears in this block; the address
 *   VGPRs are set before this excerpt, so any out-of-bounds clipping is
 *   presumably already encoded in them -- confirm.
 * NOTE(review): the conversion destination is written as a literal vN while the
 *   source is v[vgprValuC+N]; the convert is in place only if vgprValuC == 0
 *   -- confirm against the symbol definition.
 * NOTE(review): v12, v13 and v14 are loaded below but never read in this
 *   block; they look like constants for a software bf16 rounding path that
 *   the v_cvt_pk_bf16_f32 conversion does not need -- confirm before removing.
 */
v_mov_b32 v12, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v13, 0x7fff0000                          // fp32 Nan
v_mov_b32 v14, 0x7fff                              // rounding bias for bfloat16
v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1
buffer_store_short v15, v129, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1
buffer_store_short v16, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1
buffer_store_short v17, v131, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1
buffer_store_short v18, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1
buffer_store_short v19, v136, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1
buffer_store_short v20, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1
buffer_store_short v21, v138, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1
buffer_store_short v22, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1
buffer_store_short v23, v140, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1
buffer_store_short v24, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1
buffer_store_short v25, v142, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1
buffer_store_short v26, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1
buffer_store_short v27, v144, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1
buffer_store_short v28, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1
buffer_store_short v29, v146, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1
buffer_store_short v30, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1
buffer_store_short v31, v148, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1
buffer_store_short v32, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1
buffer_store_short v33, v150, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1
buffer_store_short v34, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1
buffer_store_short v35, v152, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1
buffer_store_short v36, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1
buffer_store_short v37, v154, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1
buffer_store_short v38, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1
buffer_store_short v39, v156, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1
buffer_store_short v40, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1
buffer_store_short v41, v158, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1
buffer_store_short v42, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1
buffer_store_short v43, v160, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1
buffer_store_short v44, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1
buffer_store_short v45, v162, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1
buffer_store_short v46, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1
buffer_store_short v47, v164, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1
buffer_store_short v48, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1
buffer_store_short v49, v166, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1
buffer_store_short v50, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1
buffer_store_short v51, v168, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1
buffer_store_short v52, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1
buffer_store_short v53, v170, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1
buffer_store_short v54, v171, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1
buffer_store_short v55, v172, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1
buffer_store_short v56, v173, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1
buffer_store_short v57, v174, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1
buffer_store_short v58, v175, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1
buffer_store_short v59, v176, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1
buffer_store_short v60, v177, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1
buffer_store_short v61, v178, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v62, v[vgprValuC+62], v[vgprValuC+62] // convert C to bf16 in gwvw==1
buffer_store_short v62, v179, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v63, v[vgprValuC+63], v[vgprValuC+63] // convert C to bf16 in gwvw==1
buffer_store_short v63, v180, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+64] // convert C to bf16 in gwvw==1
buffer_store_short v64, v181, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v65, v[vgprValuC+65], v[vgprValuC+65] // convert C to bf16 in gwvw==1
buffer_store_short v65, v182, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v66, v[vgprValuC+66], v[vgprValuC+66] // convert C to bf16 in gwvw==1
buffer_store_short v66, v183, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v67, v[vgprValuC+67], v[vgprValuC+67] // convert C to bf16 in gwvw==1
buffer_store_short v67, v184, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v68, v[vgprValuC+68], v[vgprValuC+68] // convert C to bf16 in gwvw==1
buffer_store_short v68, v185, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v69, v[vgprValuC+69], v[vgprValuC+69] // convert C to bf16 in gwvw==1
buffer_store_short v69, v186, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v70, v[vgprValuC+70], v[vgprValuC+70] // convert C to bf16 in gwvw==1
buffer_store_short v70, v187, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v71, v[vgprValuC+71], v[vgprValuC+71] // convert C to bf16 in gwvw==1
buffer_store_short v71, v188, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+72] // convert C to bf16 in gwvw==1
buffer_store_short v72, v189, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v73, v[vgprValuC+73], v[vgprValuC+73] // convert C to bf16 in gwvw==1
buffer_store_short v73, v190, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v74, v[vgprValuC+74], v[vgprValuC+74] // convert C to bf16 in gwvw==1
buffer_store_short v74, v191, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v75, v[vgprValuC+75], v[vgprValuC+75] // convert C to bf16 in gwvw==1
buffer_store_short v75, v192, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v76, v[vgprValuC+76], v[vgprValuC+76] // convert C to bf16 in gwvw==1
buffer_store_short v76, v193, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v77, v[vgprValuC+77], v[vgprValuC+77] // convert C to bf16 in gwvw==1
buffer_store_short v77, v194, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v78, v[vgprValuC+78], v[vgprValuC+78] // convert C to bf16 in gwvw==1
buffer_store_short v78, v195, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v79, v[vgprValuC+79], v[vgprValuC+79] // convert C to bf16 in gwvw==1
buffer_store_short v79, v196, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+80] // convert C to bf16 in gwvw==1
buffer_store_short v80, v197, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v81, v[vgprValuC+81], v[vgprValuC+81] // convert C to bf16 in gwvw==1
buffer_store_short v81, v198, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v82, v[vgprValuC+82], v[vgprValuC+82] // convert C to bf16 in gwvw==1
buffer_store_short v82, v199, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v83, v[vgprValuC+83], v[vgprValuC+83] // convert C to bf16 in gwvw==1
buffer_store_short v83, v200, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v84, v[vgprValuC+84], v[vgprValuC+84] // convert C to bf16 in gwvw==1
buffer_store_short v84, v201, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v85, v[vgprValuC+85], v[vgprValuC+85] // convert C to bf16 in gwvw==1
buffer_store_short v85, v202, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v86, v[vgprValuC+86], v[vgprValuC+86] // convert C to bf16 in gwvw==1
buffer_store_short v86, v203, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v87, v[vgprValuC+87], v[vgprValuC+87] // convert C to bf16 in gwvw==1
buffer_store_short v87, v204, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+88] // convert C to bf16 in gwvw==1
buffer_store_short v88, v205, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v89, v[vgprValuC+89], v[vgprValuC+89] // convert C to bf16 in gwvw==1
buffer_store_short v89, v206, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v90, v[vgprValuC+90], v[vgprValuC+90] // convert C to bf16 in gwvw==1
buffer_store_short v90, v207, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v91, v[vgprValuC+91], v[vgprValuC+91] // convert C to bf16 in gwvw==1
buffer_store_short v91, v208, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v92, v[vgprValuC+92], v[vgprValuC+92] // convert C to bf16 in gwvw==1
buffer_store_short v92, v209, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v93, v[vgprValuC+93], v[vgprValuC+93] // convert C to bf16 in gwvw==1
buffer_store_short v93, v210, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v94, v[vgprValuC+94], v[vgprValuC+94] // convert C to bf16 in gwvw==1
buffer_store_short v94, v211, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v95, v[vgprValuC+95], v[vgprValuC+95] // convert C to bf16 in gwvw==1
buffer_store_short v95, v212, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v96, v[vgprValuC+96], v[vgprValuC+96] // convert C to bf16 in gwvw==1
buffer_store_short v96, v213, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v97, v[vgprValuC+97], v[vgprValuC+97] // convert C to bf16 in gwvw==1
buffer_store_short v97, v214, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v98, v[vgprValuC+98], v[vgprValuC+98] // convert C to bf16 in gwvw==1
buffer_store_short v98, v215, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v99, v[vgprValuC+99], v[vgprValuC+99] // convert C to bf16 in gwvw==1
buffer_store_short v99, v216, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v100, v[vgprValuC+100], v[vgprValuC+100] // convert C to bf16 in gwvw==1
buffer_store_short v100, v217, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v101, v[vgprValuC+101], v[vgprValuC+101] // convert C to bf16 in gwvw==1
buffer_store_short v101, v218, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v102, v[vgprValuC+102], v[vgprValuC+102] // convert C to bf16 in gwvw==1
buffer_store_short v102, v219, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v103, v[vgprValuC+103], v[vgprValuC+103] // convert C to bf16 in gwvw==1
buffer_store_short v103, v220, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v104, v[vgprValuC+104], v[vgprValuC+104] // convert C to bf16 in gwvw==1
buffer_store_short v104, v221, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v105, v[vgprValuC+105], v[vgprValuC+105] // convert C to bf16 in gwvw==1
buffer_store_short v105, v222, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v106, v[vgprValuC+106], v[vgprValuC+106] // convert C to bf16 in gwvw==1
buffer_store_short v106, v223, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v107, v[vgprValuC+107], v[vgprValuC+107] // convert C to bf16 in gwvw==1
buffer_store_short v107, v224, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v108, v[vgprValuC+108], v[vgprValuC+108] // convert C to bf16 in gwvw==1
buffer_store_short v108, v225, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v109, v[vgprValuC+109], v[vgprValuC+109] // convert C to bf16 in gwvw==1
buffer_store_short v109, v226, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v110, v[vgprValuC+110], v[vgprValuC+110] // convert C to bf16 in gwvw==1
buffer_store_short v110, v227, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v111, v[vgprValuC+111], v[vgprValuC+111] // convert C to bf16 in gwvw==1
buffer_store_short v111, v228, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v112, v[vgprValuC+112], v[vgprValuC+112] // convert C to bf16 in gwvw==1
buffer_store_short v112, v229, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v113, v[vgprValuC+113], v[vgprValuC+113] // convert C to bf16 in gwvw==1
buffer_store_short v113, v230, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v114, v[vgprValuC+114], v[vgprValuC+114] // convert C to bf16 in gwvw==1
buffer_store_short v114, v231, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v115, v[vgprValuC+115], v[vgprValuC+115] // convert C to bf16 in gwvw==1
buffer_store_short v115, v232, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v116, v[vgprValuC+116], v[vgprValuC+116] // convert C to bf16 in gwvw==1
buffer_store_short v116, v233, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v117, v[vgprValuC+117], v[vgprValuC+117] // convert C to bf16 in gwvw==1
buffer_store_short v117, v234, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v118, v[vgprValuC+118], v[vgprValuC+118] // convert C to bf16 in gwvw==1
buffer_store_short v118, v235, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v119, v[vgprValuC+119], v[vgprValuC+119] // convert C to bf16 in gwvw==1
buffer_store_short v119, v236, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v120, v[vgprValuC+120], v[vgprValuC+120] // convert C to bf16 in gwvw==1
buffer_store_short v120, v237, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v121, v[vgprValuC+121], v[vgprValuC+121] // convert C to bf16 in gwvw==1
buffer_store_short v121, v238, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v122, v[vgprValuC+122], v[vgprValuC+122] // convert C to bf16 in gwvw==1
buffer_store_short v122, v239, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v123, v[vgprValuC+123], v[vgprValuC+123] // convert C to bf16 in gwvw==1
buffer_store_short v123, v240, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v124, v[vgprValuC+124], v[vgprValuC+124] // convert C to bf16 in gwvw==1
buffer_store_short v124, v241, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v125, v[vgprValuC+125], v[vgprValuC+125] // convert C to bf16 in gwvw==1
buffer_store_short v125, v242, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v126, v[vgprValuC+126], v[vgprValuC+126] // convert C to bf16 in gwvw==1
buffer_store_short v126, v243, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v127, v[vgprValuC+127], v[vgprValuC+127] // convert C to bf16 in gwvw==1
buffer_store_short v127, v244, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v128, v[vgprValuC+128], v[vgprValuC+128] // convert C to bf16 in gwvw==1
buffer_store_short v128, v245, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* NOTE(review): the comment on the s_nop below refers to a dwordx4 store, but every store in this batch is buffer_store_short; the nop is presumably emitted unconditionally by the generator -- confirm. */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Per-element address setup for the remaining elements of row vc1=28
 * (vc0 = 4..7). No load is issued in these lines. Each element runs the same
 * six-instruction sequence:
 *
 *   v8        = v4 + vc0                      column coordinate of this element
 *   s[30:31]  = v8 < SizeI                    per-lane column-in-range mask
 *   s[34:35]  = v5 < SizeJ                    per-lane row-in-range mask
 *   s[34:35] &= s[30:31]                      element is inside the matrix
 *   vADDR     = (v7 + v8) << 1                element offset scaled by 2
 *   vADDR     = in-range ? vADDR : v10        v10 holds BufferOOB
 *
 * Results land in v43..v46, one address VGPR per element. v8, vcc, s[30:31]
 * and s[34:35] are scratch and are overwritten by every element.
 *
 * NOTE(review): the row test on v5 is recomputed for every element although
 *   v5 does not change within these lines; this is left as generated.
 * NOTE(review): the "into Cin addr" wording in the inline comments is generator
 *   boilerplate; the base register used is v7, which the row-advance code
 *   elsewhere in this file describes as the D row pointer -- confirm.
 */
v_mov_b32 v10, BufferOOB
/* (d1,vc1,d0,vc0)=(0,28,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v43, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v43, v10, v43, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v44, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v44, v10, v44, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v45, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v45, v10, v45, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v46, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v46, v10, v46, s[34:35]              // LDD clip if OOB. offset
/*
 * Address setup for row vc1=29, elements vc0 = 0..4 (results in v47..v51).
 *
 * The first element advances to the next row before computing its address:
 * v5 (row coordinate) is incremented by 1, v6 is advanced by StrideC1J and
 * v7 by StrideD1J. Because vc0 == 0 it uses v4 directly as the column
 * coordinate instead of going through the v8 temporary.
 *
 * Every element then follows the same pattern: compare the column coordinate
 * with SizeI and the row coordinate v5 with SizeJ, AND the two lane masks into
 * s[34:35], form (v7 + column) << 1, and replace that offset with v10 in lanes
 * where the mask is clear. v10 is read here but not written; it must already
 * hold the out-of-bounds sentinel on entry.
 *
 * NOTE(review): v6 is updated but not read anywhere in these lines; it is
 *   presumably the C-input row pointer kept in step for code outside this
 *   excerpt -- confirm.
 */
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v47, v7, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v47, v10, v47, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v48, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v48, v10, v48, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v49, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v49, v10, v49, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v50, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v50, v10, v50, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v51, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v51, v10, v51, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v52, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v52, v10, v52, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v53, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v53, v10, v53, s[34:35]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v54, v7, v8, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v54, v10, v54, s[34:35]              // LDD clip if OOB. offset
/* Edge-path address setup for row vc1=30 (eight elements, vc0=0..7).          */
/* Steps coord1 (v5) and both row pointers (v6, v7) forward by one row, then    */
/* produces one byte offset per element in v55..v62. Those registers are the    */
/* offen offsets of the buffer_store_short instructions further down, so they   */
/* address D. v6 is advanced here but not read by any instruction in this row.  */
/* v10 (the out-of-bounds replacement) is set outside this view; presumably an  */
/* offset the buffer SRD rejects -- TODO confirm.                               */
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v55, v7, v4, 0x1                    // D byte offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v55, v10, v55, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,30,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v56, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v56, v10, v56, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,30,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v57, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v57, v10, v57, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,30,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v58, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v58, v10, v58, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,30,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v59, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v59, v10, v59, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,30,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v60, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v60, v10, v60, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,30,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v61, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v61, v10, v61, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,30,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v62, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v62, v10, v62, s[34:35]              // keep offset when in bounds, else take v10
/* Edge-path address setup for row vc1=31 (eight elements, vc0=0..7).          */
/* Steps coord1 (v5) and both row pointers (v6, v7) forward by one row, then    */
/* produces one byte offset per element in v63..v70. Those registers are the    */
/* offen offsets of the buffer_store_short instructions further down, so they   */
/* address D. v6 is advanced here but not read by any instruction in this row.  */
/* v10 (the out-of-bounds replacement) is set outside this view; presumably an  */
/* offset the buffer SRD rejects -- TODO confirm.                               */
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
v_add_co_u32 v5, vcc, v5, 1                        // coord1 += 1: step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v6, v6, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v7, v7, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[30:31], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v63, v7, v4, 0x1                    // D byte offset = (coutRowPtrD + coord0) << 1
v_cndmask_b32 v63, v10, v63, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,31,0,1) */
v_add_co_u32 v8, vcc, v4, 1                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v64, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v64, v10, v64, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,31,0,2) */
v_add_co_u32 v8, vcc, v4, 2                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v65, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v65, v10, v65, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,31,0,3) */
v_add_co_u32 v8, vcc, v4, 3                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v66, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v66, v10, v66, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,31,0,4) */
v_add_co_u32 v8, vcc, v4, 4                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v67, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v67, v10, v67, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,31,0,5) */
v_add_co_u32 v8, vcc, v4, 5                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v68, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v68, v10, v68, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,31,0,6) */
v_add_co_u32 v8, vcc, v4, 6                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v69, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v69, v10, v69, s[34:35]              // keep offset when in bounds, else take v10
/* (d1,vc1,d0,vc0)=(0,31,0,7) */
v_add_co_u32 v8, vcc, v4, 7                        // v8 = coord0 + vc0
v_cmp_lt_u32 s[30:31], v8, s[sgprSizeI]            // in0: v8 < SizeI
v_cmp_lt_u32 s[34:35], v5, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[34:35], s[30:31], s[34:35]             // in0 && in1
v_add_lshl_u32 v70, v7, v8, 0x1                    // D byte offset = (coutRowPtrD + v8) << 1
v_cndmask_b32 v70, v10, v70, s[34:35]              // keep offset when in bounds, else take v10
/* Copy the 28 accumulator values of this batch from AccVGPRs to VGPRs.        */
/* Sources are acc147, acc151, ... acc255 (every 4th register); destinations    */
/* are the contiguous range v[vgprValuC+15 .. vgprValuC+42].                    */
/* The vreg[n] indices in the trailing comments are the generator's own         */
/* numbering and are kept as emitted.                                           */
v_accvgpr_read_b32 v[vgprValuC+15], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+16], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+17], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+18], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+19], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+20], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+21], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+22], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+23], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+24], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+25], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+26], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+27], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+28], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+29], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+30], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+31], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+32], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+33], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+34], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+35], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+36], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+37], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+38], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+39], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+40], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+41], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+42], acc255         // copy acc to vreg[255]

/* Scale the 28 fp32 results in v[vgprValuC+15 .. vgprValuC+42] by alpha, in place. */
/* Elements +16..+41 are handled two at a time with v_pk_mul_f32;                 */
/* op_sel_hi:[0,1,1] makes the high lane read the LOW dword of the scalar pair,   */
/* so both lanes are multiplied by s[sgprAlpha] and s[sgprAlpha+1] is not used    */
/* as a multiplier. The first (+15) and last (+42) elements use a plain           */
/* v_mul_f32; presumably so that the packed pairs start on an even register       */
/* -- TODO confirm vgprValuC is even.                                             */
/* rC *= alpha batchElements=[(0, 0, 28, 4), (0, 0, 28, 5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */
v_mul_f32 v[vgprValuC+15], s[sgprAlpha], v[vgprValuC+15] // *= alpha (single leading element)
v_pk_mul_f32 v[vgprValuC+16:vgprValuC+16+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_mul_f32 v[vgprValuC+42], s[sgprAlpha], v[vgprValuC+42] // *= alpha (single trailing element)

/* Convert each scaled fp32 result to bf16 and store it to D, one 16-bit        */
/* element per store (gwvw==1). No C load is issued in this sequence.           */
/*   - Data register k (v15..v42) is paired with offset register v(k+28)        */
/*     (v43..v70); out-of-bounds lanes carry the clip offset selected by the    */
/*     v_cndmask_b32 instructions earlier in this batch.                        */
/*   - Each convert reads v[vgprValuC+k] and writes literal vk, so the sequence */
/*     is only in-place if vgprValuC == 0 -- TODO confirm against the macro     */
/*     definitions.                                                             */
/*   - v12, v13 and v14 are loaded below but are not read by any instruction    */
/*     in this sequence (conversion is done by v_cvt_pk_bf16_f32).              */
/*     NOTE(review): possibly dead here; check for readers after                */
/*     label_GW_End_2 before removing.                                          */
/* apply mask, calc new C and issue writes */
v_mov_b32 v12, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v13, 0x7fff0000                          // fp32 Nan
v_mov_b32 v14, 0x7fff                              // rounding bias for bfloat16
/* row vc1=28, vc0=4..7: offsets v43..v46 (v43/v44 are computed before this view) */
v_cvt_pk_bf16_f32 v15, v[vgprValuC+15], v[vgprValuC+15] // convert C to bf16 in gwvw==1
buffer_store_short v15, v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v16, v[vgprValuC+16], v[vgprValuC+16] // convert C to bf16 in gwvw==1
buffer_store_short v16, v44, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1
buffer_store_short v17, v45, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1
buffer_store_short v18, v46, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=29, vc0=0..7: offsets v47..v54 */
v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1
buffer_store_short v19, v47, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1
buffer_store_short v20, v48, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1
buffer_store_short v21, v49, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1
buffer_store_short v22, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1
buffer_store_short v23, v51, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1
buffer_store_short v24, v52, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1
buffer_store_short v25, v53, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1
buffer_store_short v26, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=30, vc0=0..7: offsets v55..v62 */
v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1
buffer_store_short v27, v55, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1
buffer_store_short v28, v56, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1
buffer_store_short v29, v57, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1
buffer_store_short v30, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1
buffer_store_short v31, v59, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1
buffer_store_short v32, v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1
buffer_store_short v33, v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1
buffer_store_short v34, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=31, vc0=0..7: offsets v63..v70 */
v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1
buffer_store_short v35, v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1
buffer_store_short v36, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1
buffer_store_short v37, v65, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1
buffer_store_short v38, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1
buffer_store_short v39, v67, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1
buffer_store_short v40, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1
buffer_store_short v41, v69, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1
buffer_store_short v42, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state before a later instruction may overwrite VGPRs still held by a preceding store (generator text says dwordx4; the stores above are buffer_store_short)
s_branch label_GW_End_2                            // this write path is done: jump to end
/* Dispatch for the beta write path: decide whether this workgroup needs edge   */
/* (bounds-checked) handling.                                                   */
/*   1. If SizeI is not a multiple of 256 AND wg0 >= NumWorkGroups0-1,          */
/*      branch to label_GW_B1_E1_M.                                             */
/*   2. Otherwise the same test on SizeJ / wg1 / NumWorkGroups1 branches to     */
/*      label_GW_B1_E1_N.                                                       */
/*   3. Otherwise fall through into label_GW_B1_E0 (no edge checks).            */
/* Clobbers s30, s31 and SCC.                                                   */
label_GW_Beta_2:
s_and_b32 s30, 255, s[sgprSizeI]                   // s30 = s[sgprSizeI] % 256
s_add_u32 s31, -0x1, s[sgprNumWorkGroups0]         // s31 = NumWorkGroups0 - 1
s_cmp_ge_u32 s[sgprWorkGroup0], s31                // wg0 >= nwg0-1 ?
s_cselect_b32 s30, s30, 0                          // set rMT0: keep the remainder only if the compare held, else 0
s_cmpk_gt_u32 s30, 0                               // rMT0 > 0
s_cbranch_scc1 label_GW_B1_E1_M                    // jump if edges required
s_and_b32 s30, 255, s[sgprSizeJ]                   // s30 = s[sgprSizeJ] % 256
s_add_u32 s31, -0x1, s[sgprNumWorkGroups1]         // s31 = NumWorkGroups1 - 1
s_cmp_ge_u32 s[sgprWorkGroup1], s31                // wg1 >= nwg1-1
s_cselect_b32 s30, s30, 0                          // set rMT1: keep the remainder only if the compare held, else 0
s_cmpk_gt_u32 s30, 0                               // rMT1 > 0
s_cbranch_scc1 label_GW_B1_E1_N                    // jump if edges required
/* Beta path, no edge handling: load-issue phase of batch #0.                    */
/* Issues 18 buffer_load_dwordx4 reads of C, one per row (vc1=0..17).           */
/*   - v16 = (v6 + v4) << 1 is computed once and reused as the offen offset of  */
/*     every load.                                                              */
/*   - Between loads the 64-bit base in s[sgprSrdC+0 : sgprSrdC+1] is advanced  */
/*     by StrideC1J << 1 bytes, so when this phase ends the C SRD base is 17    */
/*     rows past its value on entry.                                            */
/*   - Destinations: v[20:23], v[128:131], then v[176:179] .. v[236:239].       */
/*   - No s_waitcnt appears in this phase.                                      */
/* Clobbers s12 and SCC.                                                        */
/* NOTE(review): s12 is recomputed from s[sgprStrideC1J] before every row with  */
/* the same inputs; it looks loop-invariant, provided s12 does not alias        */
/* sgprSrdC or sgprStrideC1J -- confirm before hoisting.                        */
label_GW_B1_E0:

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=18 */
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Beta Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_add_lshl_u32 v16, v6, v4, 0x1                    // optSingleColVgpr scaleToBpe: v16 = (cinRowPtr + coord0) << 1, shared C offset for all 18 loads
buffer_load_dwordx4 v[20:23], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[128:131], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[176:179], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[180:183], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[184:187], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[188:191], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[192:195], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[196:199], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[200:203], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[204:207], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[208:211], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[212:215], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[216:219], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[220:223], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[224:227], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[228:231], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[232:235], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
s_lshl_b32 s12, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[236:239], v16, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
v_add_lshl_u32 v15, v7, v4, 0x1                    // optSingleColVgpr scaleToBpe: v15 = (v7 + coord0) << 1. Reads v7, not v6 (the C row pointer used for v16); elsewhere in this file v7 is commented as coutRowPtrD, so this is presumably the shared D offset -- the generator text said cinRowPtr. BSHERE:coord0=4, coord0Vgpr=4
/* Copy 144 accumulator values (vreg[0..143]) from AccVGPRs into arch VGPRs.
 * The acc index advances by 4 per value: acc0,4,...,252, then acc1,5,...,253,
 * then acc2,6,...,62.
 * Destinations are v[vgprValuC+24..+127] and v[vgprValuC+136..+175]; offsets
 * +128..+135 are skipped.
 * NOTE(review): presumably the gap exists because v128..v131 hold loaded C data
 * (they are read as the C source further down) - this assumes vgprValuC == 0,
 * confirm against the register map. */
v_accvgpr_read_b32 v[vgprValuC+24], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+25], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+26], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+27], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+28], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+29], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+30], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+31], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+32], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+33], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+34], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+35], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+36], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+37], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+38], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+39], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+40], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+41], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+42], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+43], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+44], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+45], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+46], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+47], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+48], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+49], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+50], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+51], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+52], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+53], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+54], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+55], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+56], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+57], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+58], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+59], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+60], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+61], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+62], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+63], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+64], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+65], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+66], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+67], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+68], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+69], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+70], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+71], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+72], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+73], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+74], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+75], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+76], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+77], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+78], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+79], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+80], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+81], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+82], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+83], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+84], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+85], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+86], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+87], acc252         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+88], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+89], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+90], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+91], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+92], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+93], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+94], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+95], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+96], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+97], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+98], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+99], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+100], acc49         // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+101], acc53         // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+102], acc57         // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+103], acc61         // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+104], acc65         // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+105], acc69         // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+106], acc73         // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+107], acc77         // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+108], acc81         // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+109], acc85         // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+110], acc89         // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+111], acc93         // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+112], acc97         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+113], acc101        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+114], acc105        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+115], acc109        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+116], acc113        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+117], acc117        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+118], acc121        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+119], acc125        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+120], acc129        // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+121], acc133        // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+122], acc137        // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+123], acc141        // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+124], acc145        // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+125], acc149        // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+126], acc153        // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+127], acc157        // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+136], acc161        // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+137], acc165        // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+138], acc169        // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+139], acc173        // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+140], acc177        // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+141], acc181        // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+142], acc185        // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+143], acc189        // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+144], acc193        // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+145], acc197        // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+146], acc201        // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+147], acc205        // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+148], acc209        // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+149], acc213        // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+150], acc217        // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+151], acc221        // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+152], acc225        // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+153], acc229        // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+154], acc233        // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+155], acc237        // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+156], acc241        // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+157], acc245        // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+158], acc249        // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+159], acc253        // copy acc to vreg[127]
v_accvgpr_read_b32 v[vgprValuC+160], acc2          // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+161], acc6          // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+162], acc10         // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+163], acc14         // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+164], acc18         // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+165], acc22         // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+166], acc26         // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+167], acc30         // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+168], acc34         // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+169], acc38         // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+170], acc42         // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+171], acc46         // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+172], acc50         // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+173], acc54         // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+174], acc58         // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+175], acc62         // copy acc to vreg[143]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0), (0, 0, 16, 0), (0, 0, 17, 0)] */
/* Multiply every copied fp32 value by alpha in place, two values per packed
 * instruction: 72 multiplies cover v[vgprValuC+24..+127] and
 * v[vgprValuC+136..+175] (144 values).
 * op_sel_hi:[0,1,1]: the 0 for src0 makes the upper lane also read the low
 * dword of the scalar pair, so s[sgprAlpha] is applied to both lanes. */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+88:vgprValuC+88+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+90:vgprValuC+90+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+92:vgprValuC+92+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+94:vgprValuC+94+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+96:vgprValuC+96+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+98:vgprValuC+98+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+100:vgprValuC+100+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+102:vgprValuC+102+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+104:vgprValuC+104+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+106:vgprValuC+106+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+108:vgprValuC+108+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+110:vgprValuC+110+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+112:vgprValuC+112+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+114:vgprValuC+114+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+116:vgprValuC+116+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+118:vgprValuC+118+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+120:vgprValuC+120+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+122:vgprValuC+122+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+124:vgprValuC+124+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+126:vgprValuC+126+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+136:vgprValuC+136+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+138:vgprValuC+138+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+140:vgprValuC+140+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+142:vgprValuC+142+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+144:vgprValuC+144+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+144:vgprValuC+144+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+146:vgprValuC+146+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+146:vgprValuC+146+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+148:vgprValuC+148+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+148:vgprValuC+148+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+150:vgprValuC+150+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+150:vgprValuC+150+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+152:vgprValuC+152+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+152:vgprValuC+152+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+154:vgprValuC+154+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+154:vgprValuC+154+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+156:vgprValuC+156+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+156:vgprValuC+156+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+158:vgprValuC+158+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+158:vgprValuC+158+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+160:vgprValuC+160+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+160:vgprValuC+160+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+162:vgprValuC+162+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+162:vgprValuC+162+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+164:vgprValuC+164+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+164:vgprValuC+164+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+166:vgprValuC+166+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+166:vgprValuC+166+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+168:vgprValuC+168+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+168:vgprValuC+168+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+170:vgprValuC+170+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+170:vgprValuC+170+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+172:vgprValuC+172+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+172:vgprValuC+172+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+174:vgprValuC+174+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+174:vgprValuC+174+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/* Load three bf16 helper constants into v12, v13 and v14.
 * NOTE(review): none of v12/v13/v14 is read by the element sequences that
 * follow within view; the fp32->bf16 conversion there is done directly by
 * v_cvt_pk_bf16_f32. Confirm whether later code still depends on them. */
v_mov_b32 v12, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v13, 0x7fff0000                          // fp32 Nan
v_mov_b32 v14, 0x7fff                              // rounding bias for bfloat16

/* Batch element 0: add beta*C to 8 alpha-scaled sums, pack to bf16, store to D.
 *   C source : v20..v23, two bf16 per dword (WORD_0 then WORD_1); v8 is scratch.
 *   Sums     : v[vgprValuC+24..+31] (fp32), packed in pairs into v24..v27.
 *   Store    : 16 bytes at offset v15 through the D descriptor (no row advance
 *              for this first element). */
s_waitcnt vmcnt(17)                                // vmcnt(17) = 18 - 1 (beta) (interleaved)
v_cvt_f32_bf16 v8, v20 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+24], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v20 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+25], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v21 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+26], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v21 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+27], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v22 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+28], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v22 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+29], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v23 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+30], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v23 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+31], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Batch element 1: add beta*C to 8 alpha-scaled sums, pack to bf16, store to D.
 *   C source : v128..v131, two bf16 per dword (WORD_0 then WORD_1); v8 is scratch.
 *   Sums     : v[vgprValuC+32..+39] (fp32), packed in pairs into v32..v35.
 *   Store    : D descriptor base is advanced by StrideD1J << 1 (with carry into
 *              the upper dword) before the 16-byte store at offset v15. */
/* NOTE(review): the generator formula gives 16 (= 18 - 2) but the emitted wait is
 * vmcnt(17); presumably +1 for the D store already issued before this point
 * (interleaved stores) - confirm against the ISA vmcnt rules. */
s_waitcnt vmcnt(17)                                // emitted 17; formula: 16 = 18 - 2 (beta), before interleaved-store adjustment
v_cvt_f32_bf16 v8, v128 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+32], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v128 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+33], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v129 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+34], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v129 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+35], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v130 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+36], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v130 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+37], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v131 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+38], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v131 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+39], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Batch element 2: add beta*C to 8 alpha-scaled sums, pack to bf16, store to D.
 *   C source : v176..v179, two bf16 per dword (WORD_0 then WORD_1); v8 is scratch.
 *   Sums     : v[vgprValuC+40..+47] (fp32), packed in pairs into v40..v43.
 *   Store    : D descriptor base is advanced by StrideD1J << 1 (with carry into
 *              the upper dword) before the 16-byte store at offset v15. */
/* NOTE(review): the generator formula gives 15 (= 18 - 3) but the emitted wait is
 * vmcnt(17); presumably +2 for the D stores already issued before this point
 * (interleaved stores) - confirm against the ISA vmcnt rules. */
s_waitcnt vmcnt(17)                                // emitted 17; formula: 15 = 18 - 3 (beta), before interleaved-store adjustment
v_cvt_f32_bf16 v8, v176 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+40], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v176 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+41], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v177 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+42], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v177 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+43], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v178 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+44], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v178 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+45], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v179 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+46], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v179 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+47], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Batch element 3: add beta*C to 8 alpha-scaled sums, pack to bf16, store to D.
 *   C source : v180..v183, two bf16 per dword (WORD_0 then WORD_1); v8 is scratch.
 *   Sums     : v[vgprValuC+48..+55] (fp32), packed in pairs into v48..v51.
 *   Store    : D descriptor base is advanced by StrideD1J << 1 (with carry into
 *              the upper dword) before the 16-byte store at offset v15. */
/* NOTE(review): the generator formula gives 14 (= 18 - 4) but the emitted wait is
 * vmcnt(17); presumably +3 for the D stores already issued before this point
 * (interleaved stores) - confirm against the ISA vmcnt rules. */
s_waitcnt vmcnt(17)                                // emitted 17; formula: 14 = 18 - 4 (beta), before interleaved-store adjustment
v_cvt_f32_bf16 v8, v180 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+48], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v180 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+49], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v181 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+50], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v181 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+51], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v182 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+52], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v182 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+53], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v183 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+54], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v183 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+55], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Batch element 4: add beta*C to 8 alpha-scaled sums, pack to bf16, store to D.
 *   C source : v184..v187, two bf16 per dword (WORD_0 then WORD_1); v8 is scratch.
 *   Sums     : v[vgprValuC+56..+63] (fp32), packed in pairs into v56..v59.
 *   Store    : D descriptor base is advanced by StrideD1J << 1 (with carry into
 *              the upper dword) before the 16-byte store at offset v15. */
/* NOTE(review): the generator formula gives 13 (= 18 - 5) but the emitted wait is
 * vmcnt(17); presumably +4 for the D stores already issued before this point
 * (interleaved stores) - confirm against the ISA vmcnt rules. */
s_waitcnt vmcnt(17)                                // emitted 17; formula: 13 = 18 - 5 (beta), before interleaved-store adjustment
v_cvt_f32_bf16 v8, v184 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+56], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v184 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+57], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v185 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+58], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v185 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+59], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v186 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+60], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v186 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+61], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v187 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+62], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v187 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+63], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Batch element 5: add beta*C to 8 alpha-scaled sums, pack to bf16, store to D.
 *   C source : v188..v191, two bf16 per dword (WORD_0 then WORD_1); v8 is scratch.
 *   Sums     : v[vgprValuC+64..+71] (fp32), packed in pairs into v64..v67.
 *   Store    : D descriptor base is advanced by StrideD1J << 1 (with carry into
 *              the upper dword) before the 16-byte store at offset v15. */
/* NOTE(review): the generator formula gives 12 (= 18 - 6) but the emitted wait is
 * vmcnt(17); presumably +5 for the D stores already issued before this point
 * (interleaved stores) - confirm against the ISA vmcnt rules. */
s_waitcnt vmcnt(17)                                // emitted 17; formula: 12 = 18 - 6 (beta), before interleaved-store adjustment
v_cvt_f32_bf16 v8, v188 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+64], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v188 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+65], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v189 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+66], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v189 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+67], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v190 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+68], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v190 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+69], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v191 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+70], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v191 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+71], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Batch element 6: add beta*C to 8 alpha-scaled sums, pack to bf16, store to D.
 *   C source : v192..v195, two bf16 per dword (WORD_0 then WORD_1); v8 is scratch.
 *   Sums     : v[vgprValuC+72..+79] (fp32), packed in pairs into v72..v75.
 *   Store    : D descriptor base is advanced by StrideD1J << 1 (with carry into
 *              the upper dword) before the 16-byte store at offset v15. */
/* NOTE(review): the generator formula gives 11 (= 18 - 7) but the emitted wait is
 * vmcnt(17); presumably +6 for the D stores already issued before this point
 * (interleaved stores) - confirm against the ISA vmcnt rules. */
s_waitcnt vmcnt(17)                                // emitted 17; formula: 11 = 18 - 7 (beta), before interleaved-store adjustment
v_cvt_f32_bf16 v8, v192 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+72], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v192 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+73], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v193 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+74], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v193 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+75], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v194 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+76], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v194 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+77], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v195 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+78], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v195 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+79], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Beta update + store for accumulators ValuC+80..87:                                   */
/*   1. wait on outstanding vector-memory ops,                                          */
/*   2. unpack 8 bf16 values from v196..v199 and do ValuC[i] += value * Beta in f32,    */
/*   3. convert the 8 f32 results back to bf16, packed two per register into v80..v83,  */
/*   4. add (StrideD1J << 1) bytes to the 64-bit address in SrdD[0:1],                  */
/*   5. store v80..v83 through SrdD at per-lane offset v15.                             */
/* v196..v199 are presumably the C values fetched by loads outside this excerpt, and    */
/* ValuC presumably already holds sum*alpha -- TODO confirm against the preceding code. */
/* NOTE(review): the previous note on the wait read "vmcnt(10) = 18 - 8 (beta)          */
/* (interleaved)", which disagrees with the encoded immediate 17. Presumably the        */
/* interleaved buffer_store ops also count toward vmcnt, keeping the threshold          */
/* constant from group to group -- confirm before changing the immediate.               */
s_waitcnt vmcnt(17)                                // continue once at most 17 vector-memory ops are still outstanding
v_cvt_f32_bf16 v8, v196 src0_sel:WORD_0            // v8 = f32(bf16 in low word of v196)
v_fmac_f32 v[vgprValuC+80], v8, s[sgprBeta]        // ValuC+80 += v8 * Beta
v_cvt_f32_bf16 v8, v196 src0_sel:WORD_1            // v8 = f32(bf16 in high word of v196)
v_fmac_f32 v[vgprValuC+81], v8, s[sgprBeta]        // ValuC+81 += v8 * Beta
v_cvt_f32_bf16 v8, v197 src0_sel:WORD_0            // v8 = f32(bf16 in low word of v197)
v_fmac_f32 v[vgprValuC+82], v8, s[sgprBeta]        // ValuC+82 += v8 * Beta
v_cvt_f32_bf16 v8, v197 src0_sel:WORD_1            // v8 = f32(bf16 in high word of v197)
v_fmac_f32 v[vgprValuC+83], v8, s[sgprBeta]        // ValuC+83 += v8 * Beta
v_cvt_f32_bf16 v8, v198 src0_sel:WORD_0            // v8 = f32(bf16 in low word of v198)
v_fmac_f32 v[vgprValuC+84], v8, s[sgprBeta]        // ValuC+84 += v8 * Beta
v_cvt_f32_bf16 v8, v198 src0_sel:WORD_1            // v8 = f32(bf16 in high word of v198)
v_fmac_f32 v[vgprValuC+85], v8, s[sgprBeta]        // ValuC+85 += v8 * Beta
v_cvt_f32_bf16 v8, v199 src0_sel:WORD_0            // v8 = f32(bf16 in low word of v199)
v_fmac_f32 v[vgprValuC+86], v8, s[sgprBeta]        // ValuC+86 += v8 * Beta
v_cvt_f32_bf16 v8, v199 src0_sel:WORD_1            // v8 = f32(bf16 in high word of v199)
v_fmac_f32 v[vgprValuC+87], v8, s[sgprBeta]        // ValuC+87 += v8 * Beta
/* Destinations below are literal registers while sources are vgprValuC-relative. If    */
/* vgprValuC == 0 (not visible here -- TODO confirm) the packing is in place, and each  */
/* pack only overwrites a register whose f32 value has already been consumed, so the    */
/* order of these four instructions matters.                                            */
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // v80 = packed bf16 of ValuC+80 and ValuC+81
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // v81 = packed bf16 of ValuC+82 and ValuC+83
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // v82 = packed bf16 of ValuC+84 and ValuC+85
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // v83 = packed bf16 of ValuC+86 and ValuC+87
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: s12 = StrideD1J * 2 (scale by bytes per element)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: SrdD low dword += s12, carry-out goes to SCC
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: SrdD high dword += carry; must directly follow the s_add_u32 so SCC is intact
buffer_store_dwordx4 v[80:83], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: 4 dwords from v80..v83 via SrdD, per-lane offset in v15, nt modifier set

/* Beta update + store for accumulators ValuC+88..95:                                   */
/*   1. wait on outstanding vector-memory ops,                                          */
/*   2. unpack 8 bf16 values from v200..v203 and do ValuC[i] += value * Beta in f32,    */
/*   3. convert the 8 f32 results back to bf16, packed two per register into v88..v91,  */
/*   4. add (StrideD1J << 1) bytes to the 64-bit address in SrdD[0:1],                  */
/*   5. store v88..v91 through SrdD at per-lane offset v15.                             */
/* v200..v203 are presumably the C values fetched by loads outside this excerpt, and    */
/* ValuC presumably already holds sum*alpha -- TODO confirm against the preceding code. */
/* NOTE(review): the previous note on the wait read "vmcnt(9) = 18 - 9 (beta)           */
/* (interleaved)", which disagrees with the encoded immediate 17. Presumably the        */
/* interleaved buffer_store ops also count toward vmcnt, keeping the threshold          */
/* constant from group to group -- confirm before changing the immediate.               */
s_waitcnt vmcnt(17)                                // continue once at most 17 vector-memory ops are still outstanding
v_cvt_f32_bf16 v8, v200 src0_sel:WORD_0            // v8 = f32(bf16 in low word of v200)
v_fmac_f32 v[vgprValuC+88], v8, s[sgprBeta]        // ValuC+88 += v8 * Beta
v_cvt_f32_bf16 v8, v200 src0_sel:WORD_1            // v8 = f32(bf16 in high word of v200)
v_fmac_f32 v[vgprValuC+89], v8, s[sgprBeta]        // ValuC+89 += v8 * Beta
v_cvt_f32_bf16 v8, v201 src0_sel:WORD_0            // v8 = f32(bf16 in low word of v201)
v_fmac_f32 v[vgprValuC+90], v8, s[sgprBeta]        // ValuC+90 += v8 * Beta
v_cvt_f32_bf16 v8, v201 src0_sel:WORD_1            // v8 = f32(bf16 in high word of v201)
v_fmac_f32 v[vgprValuC+91], v8, s[sgprBeta]        // ValuC+91 += v8 * Beta
v_cvt_f32_bf16 v8, v202 src0_sel:WORD_0            // v8 = f32(bf16 in low word of v202)
v_fmac_f32 v[vgprValuC+92], v8, s[sgprBeta]        // ValuC+92 += v8 * Beta
v_cvt_f32_bf16 v8, v202 src0_sel:WORD_1            // v8 = f32(bf16 in high word of v202)
v_fmac_f32 v[vgprValuC+93], v8, s[sgprBeta]        // ValuC+93 += v8 * Beta
v_cvt_f32_bf16 v8, v203 src0_sel:WORD_0            // v8 = f32(bf16 in low word of v203)
v_fmac_f32 v[vgprValuC+94], v8, s[sgprBeta]        // ValuC+94 += v8 * Beta
v_cvt_f32_bf16 v8, v203 src0_sel:WORD_1            // v8 = f32(bf16 in high word of v203)
v_fmac_f32 v[vgprValuC+95], v8, s[sgprBeta]        // ValuC+95 += v8 * Beta
/* Destinations below are literal registers while sources are vgprValuC-relative. If    */
/* vgprValuC == 0 (not visible here -- TODO confirm) the packing is in place, and each  */
/* pack only overwrites a register whose f32 value has already been consumed, so the    */
/* order of these four instructions matters.                                            */
v_cvt_pk_bf16_f32 v88, v[vgprValuC+88], v[vgprValuC+89] // v88 = packed bf16 of ValuC+88 and ValuC+89
v_cvt_pk_bf16_f32 v89, v[vgprValuC+90], v[vgprValuC+91] // v89 = packed bf16 of ValuC+90 and ValuC+91
v_cvt_pk_bf16_f32 v90, v[vgprValuC+92], v[vgprValuC+93] // v90 = packed bf16 of ValuC+92 and ValuC+93
v_cvt_pk_bf16_f32 v91, v[vgprValuC+94], v[vgprValuC+95] // v91 = packed bf16 of ValuC+94 and ValuC+95
s_lshl_b32 s12, s[sgprStrideD1J], 1                // incToNextRow: s12 = StrideD1J * 2 (scale by bytes per element)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: SrdD low dword += s12, carry-out goes to SCC
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: SrdD high dword += carry; must directly follow the s_add_u32 so SCC is intact
buffer_store_dwordx4 v[88:91], v15, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D: 4 dwords from v88..v91 via SrdD, per-lane offset in v15, nt modifier set

s_waitcnt vmcnt(17)                                // vmcnt(8) = 18 - 10 (beta) (interleaved)
v_cvt_f32_bf16 v8, v204 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+96], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v204 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+97], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v205 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+98], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, v205 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+99], v8, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v8, 