/*
 *     Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 */

/* Basically following the order in mma.hpp */
/* Load fragment, m16n16k16 */
/* Loads the A fragment (m16n16k16) through __hmma_m16n16k16_ld_a with
 * layout selector 0 (the rmjr variant). frag and p are reinterpreted as
 * int storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m16n16k16_rmjr(signed char* frag, signed char* p, int lda)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m16n16k16_ld_a(dst, src, lda, 0);
}

/* Loads the A fragment (m16n16k16) through __hmma_m16n16k16_ld_a with
 * layout selector 1 (the cmjr variant). frag and p are reinterpreted as
 * int storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m16n16k16_cmjr(signed char* frag, signed char* p, int lda)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m16n16k16_ld_a(dst, src, lda, 1);
}

/* Loads the B fragment (m16n16k16) through __hmma_m16n16k16_ld_b with
 * layout selector 0 (the rmjr variant). frag and p are reinterpreted as
 * int storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m16n16k16_rmjr(signed char* frag, signed char* p, int ldb)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m16n16k16_ld_b(dst, src, ldb, 0);
}

/* Loads the B fragment (m16n16k16) through __hmma_m16n16k16_ld_b with
 * layout selector 1 (the cmjr variant). frag and p are reinterpreted as
 * int storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m16n16k16_cmjr(signed char* frag, signed char* p, int ldb)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m16n16k16_ld_b(dst, src, ldb, 1);
}

/* Loads the C fragment (m16n16k16, r2 variant) through
 * __hmma_m16n16k16_ld_c_f16. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout performs no load.
 * frag and p are reinterpreted as int storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m16n16k16_r2(signed char* frag, signed char* p, int ldc, int layout)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  switch (layout) {
  case 0:
    __hmma_m16n16k16_ld_c_f16(dst, src, ldc, 0);
    break;
  case 1:
    __hmma_m16n16k16_ld_c_f16(dst, src, ldc, 1);
    break;
  default:
    break;
  }
}

/* Loads the C fragment (m16n16k16, r2 variant) through
 * __hmma_m16n16k16_ld_c_f16 with layout selector 1 (the cmjr variant).
 * frag and p are reinterpreted as int storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m16n16k16_r2_cmjr(signed char* frag, signed char* p, int ldc)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m16n16k16_ld_c_f16(dst, src, ldc, 1);
}

/* Loads the C fragment (m16n16k16, r4 variant) through
 * __hmma_m16n16k16_ld_c_f32. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout performs no load.
 * frag and p are reinterpreted as float storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m16n16k16_r4(signed char* frag, signed char* p, int ldc, int layout)
{
  float* dst = (float*)frag;
  const float* src = (const float*)p;
  switch (layout) {
  case 0:
    __hmma_m16n16k16_ld_c_f32(dst, src, ldc, 0);
    break;
  case 1:
    __hmma_m16n16k16_ld_c_f32(dst, src, ldc, 1);
    break;
  default:
    break;
  }
}

/* Loads the C fragment (m16n16k16, r4 variant) through
 * __hmma_m16n16k16_ld_c_f32 with layout selector 1 (the cmjr variant).
 * frag and p are reinterpreted as float storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m16n16k16_r4_cmjr(signed char* frag, signed char* p, int ldc)
{
  float* dst = (float*)frag;
  const float* src = (const float*)p;
  __hmma_m16n16k16_ld_c_f32(dst, src, ldc, 1);
}

/* Load fragment, m32n8k16 */
/* Loads the A fragment (m32n8k16) through __hmma_m32n8k16_ld_a with
 * layout selector 0 (the rmjr variant). frag and p are reinterpreted as
 * int storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m32n8k16_rmjr(signed char* frag, signed char* p, int lda)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m32n8k16_ld_a(dst, src, lda, 0);
}

/* Loads the A fragment (m32n8k16) through __hmma_m32n8k16_ld_a with
 * layout selector 1 (the cmjr variant). frag and p are reinterpreted as
 * int storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m32n8k16_cmjr(signed char* frag, signed char* p, int lda)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m32n8k16_ld_a(dst, src, lda, 1);
}

/* Loads the B fragment (m32n8k16) through __hmma_m32n8k16_ld_b with
 * layout selector 0 (the rmjr variant). frag and p are reinterpreted as
 * int storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m32n8k16_rmjr(signed char* frag, signed char* p, int ldb)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m32n8k16_ld_b(dst, src, ldb, 0);
}

/* Loads the B fragment (m32n8k16) through __hmma_m32n8k16_ld_b with
 * layout selector 1 (the cmjr variant). frag and p are reinterpreted as
 * int storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m32n8k16_cmjr(signed char* frag, signed char* p, int ldb)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m32n8k16_ld_b(dst, src, ldb, 1);
}

/* Loads the C fragment (m32n8k16, r2 variant) through
 * __hmma_m32n8k16_ld_c_f16. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout performs no load.
 * frag and p are reinterpreted as int storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m32n8k16_r2(signed char* frag, signed char* p, int ldc, int layout)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  switch (layout) {
  case 0:
    __hmma_m32n8k16_ld_c_f16(dst, src, ldc, 0);
    break;
  case 1:
    __hmma_m32n8k16_ld_c_f16(dst, src, ldc, 1);
    break;
  default:
    break;
  }
}

/* Loads the C fragment (m32n8k16, r2 variant) through
 * __hmma_m32n8k16_ld_c_f16 with layout selector 1 (the cmjr variant).
 * frag and p are reinterpreted as int storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m32n8k16_r2_cmjr(signed char* frag, signed char* p, int ldc)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m32n8k16_ld_c_f16(dst, src, ldc, 1);
}

/* Loads the C fragment (m32n8k16, r4 variant) through
 * __hmma_m32n8k16_ld_c_f32. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout performs no load.
 * frag and p are reinterpreted as float storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m32n8k16_r4(signed char* frag, signed char* p, int ldc, int layout)
{
  float* dst = (float*)frag;
  const float* src = (const float*)p;
  switch (layout) {
  case 0:
    __hmma_m32n8k16_ld_c_f32(dst, src, ldc, 0);
    break;
  case 1:
    __hmma_m32n8k16_ld_c_f32(dst, src, ldc, 1);
    break;
  default:
    break;
  }
}

/* Loads the C fragment (m32n8k16, r4 variant) through
 * __hmma_m32n8k16_ld_c_f32 with layout selector 1 (the cmjr variant).
 * frag and p are reinterpreted as float storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m32n8k16_r4_cmjr(signed char* frag, signed char* p, int ldc)
{
  float* dst = (float*)frag;
  const float* src = (const float*)p;
  __hmma_m32n8k16_ld_c_f32(dst, src, ldc, 1);
}

/* Load fragment, m8n32k16 */
/* Loads the A fragment (m8n32k16) through __hmma_m8n32k16_ld_a with
 * layout selector 0 (the rmjr variant). frag and p are reinterpreted as
 * int storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m8n32k16_rmjr(signed char* frag, signed char* p, int lda)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m8n32k16_ld_a(dst, src, lda, 0);
}

/* Loads the A fragment (m8n32k16) through __hmma_m8n32k16_ld_a with
 * layout selector 1 (the cmjr variant). frag and p are reinterpreted as
 * int storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m8n32k16_cmjr(signed char* frag, signed char* p, int lda)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m8n32k16_ld_a(dst, src, lda, 1);
}

/* Loads the B fragment (m8n32k16) through __hmma_m8n32k16_ld_b with
 * layout selector 0 (the rmjr variant). frag and p are reinterpreted as
 * int storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m8n32k16_rmjr(signed char* frag, signed char* p, int ldb)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m8n32k16_ld_b(dst, src, ldb, 0);
}

/* Loads the B fragment (m8n32k16) through __hmma_m8n32k16_ld_b with
 * layout selector 1 (the cmjr variant). frag and p are reinterpreted as
 * int storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m8n32k16_cmjr(signed char* frag, signed char* p, int ldb)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m8n32k16_ld_b(dst, src, ldb, 1);
}

/* Loads the C fragment (m8n32k16, r2 variant) through
 * __hmma_m8n32k16_ld_c_f16. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout performs no load.
 * frag and p are reinterpreted as int storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m8n32k16_r2(signed char* frag, signed char* p, int ldc, int layout)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  switch (layout) {
  case 0:
    __hmma_m8n32k16_ld_c_f16(dst, src, ldc, 0);
    break;
  case 1:
    __hmma_m8n32k16_ld_c_f16(dst, src, ldc, 1);
    break;
  default:
    break;
  }
}

/* Loads the C fragment (m8n32k16, r2 variant) through
 * __hmma_m8n32k16_ld_c_f16 with layout selector 1 (the cmjr variant).
 * frag and p are reinterpreted as int storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m8n32k16_r2_cmjr(signed char* frag, signed char* p, int ldc)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __hmma_m8n32k16_ld_c_f16(dst, src, ldc, 1);
}

/* Loads the C fragment (m8n32k16, r4 variant) through
 * __hmma_m8n32k16_ld_c_f32. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout performs no load.
 * frag and p are reinterpreted as float storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m8n32k16_r4(signed char* frag, signed char* p, int ldc, int layout)
{
  float* dst = (float*)frag;
  const float* src = (const float*)p;
  switch (layout) {
  case 0:
    __hmma_m8n32k16_ld_c_f32(dst, src, ldc, 0);
    break;
  case 1:
    __hmma_m8n32k16_ld_c_f32(dst, src, ldc, 1);
    break;
  default:
    break;
  }
}

/* Loads the C fragment (m8n32k16, r4 variant) through
 * __hmma_m8n32k16_ld_c_f32 with layout selector 1 (the cmjr variant).
 * frag and p are reinterpreted as float storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m8n32k16_r4_cmjr(signed char* frag, signed char* p, int ldc)
{
  float* dst = (float*)frag;
  const float* src = (const float*)p;
  __hmma_m8n32k16_ld_c_f32(dst, src, ldc, 1);
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
/* Load fragment, m16n16k8 */
/* Loads the A fragment (m16n16k8) through __mma_tf32_m16n16k8_ld_a with
 * layout selector 0 (the rmjr variant). frag and p are reinterpreted as
 * int storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m16n16k8_rmjr(signed char* frag, signed char* p, int lda)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __mma_tf32_m16n16k8_ld_a(dst, src, lda, 0);
}

/* Loads the A fragment (m16n16k8) through __mma_tf32_m16n16k8_ld_a with
 * layout selector 1 (the cmjr variant). frag and p are reinterpreted as
 * int storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m16n16k8_cmjr(signed char* frag, signed char* p, int lda)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __mma_tf32_m16n16k8_ld_a(dst, src, lda, 1);
}

/* Loads the B fragment (m16n16k8) through __mma_tf32_m16n16k8_ld_b with
 * layout selector 0 (the rmjr variant). frag and p are reinterpreted as
 * int storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m16n16k8_rmjr(signed char* frag, signed char* p, int ldb)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __mma_tf32_m16n16k8_ld_b(dst, src, ldb, 0);
}

/* Loads the B fragment (m16n16k8) through __mma_tf32_m16n16k8_ld_b with
 * layout selector 1 (the cmjr variant). frag and p are reinterpreted as
 * int storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m16n16k8_cmjr(signed char* frag, signed char* p, int ldb)
{
  int* dst = (int*)frag;
  const int* src = (const int*)p;
  __mma_tf32_m16n16k8_ld_b(dst, src, ldb, 1);
}

/* Loads the C fragment (m16n16k8) through __mma_tf32_m16n16k8_ld_c.
 * layout 0 and 1 each dispatch to a call with that value written as a
 * literal; any other layout performs no load. frag and p are
 * reinterpreted as float storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m16n16k8(signed char* frag, signed char* p, int ldc, int layout)
{
  float* dst = (float*)frag;
  const float* src = (const float*)p;
  switch (layout) {
  case 0:
    __mma_tf32_m16n16k8_ld_c(dst, src, ldc, 0);
    break;
  case 1:
    __mma_tf32_m16n16k8_ld_c(dst, src, ldc, 1);
    break;
  default:
    break;
  }
}

/* Loads the C fragment (m16n16k8) through __mma_tf32_m16n16k8_ld_c with
 * layout selector 1 (the cmjr variant). frag and p are reinterpreted as
 * float storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m16n16k8_cmjr(signed char* frag, signed char* p, int ldc)
{
  float* dst = (float*)frag;
  const float* src = (const float*)p;
  __mma_tf32_m16n16k8_ld_c(dst, src, ldc, 1);
}

/* Load fragment, m8n8k4 */
/* Loads the A fragment (m8n8k4) through __dmma_m8n8k4_ld_a with layout
 * selector 0 (the rmjr variant). frag and p are reinterpreted as double
 * storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m8n8k4_rmjr(signed char* frag, signed char* p, int lda)
{
  double* dst = (double*)frag;
  const double* src = (const double*)p;
  __dmma_m8n8k4_ld_a(dst, src, lda, 0);
}

/* Loads the A fragment (m8n8k4) through __dmma_m8n8k4_ld_a with layout
 * selector 1 (the cmjr variant). frag and p are reinterpreted as double
 * storage; lda is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_a_m8n8k4_cmjr(signed char* frag, signed char* p, int lda)
{
  double* dst = (double*)frag;
  const double* src = (const double*)p;
  __dmma_m8n8k4_ld_a(dst, src, lda, 1);
}

/* Loads the B fragment (m8n8k4) through __dmma_m8n8k4_ld_b with layout
 * selector 0 (the rmjr variant). frag and p are reinterpreted as double
 * storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m8n8k4_rmjr(signed char* frag, signed char* p, int ldb)
{
  double* dst = (double*)frag;
  const double* src = (const double*)p;
  __dmma_m8n8k4_ld_b(dst, src, ldb, 0);
}

/* Loads the B fragment (m8n8k4) through __dmma_m8n8k4_ld_b with layout
 * selector 1 (the cmjr variant). frag and p are reinterpreted as double
 * storage; ldb is forwarded unchanged. */
__device__ static __inline__ void
__pgi_wmma_ld_b_m8n8k4_cmjr(signed char* frag, signed char* p, int ldb)
{
  double* dst = (double*)frag;
  const double* src = (const double*)p;
  __dmma_m8n8k4_ld_b(dst, src, ldb, 1);
}

/* Loads the C fragment (m8n8k4) through __dmma_m8n8k4_ld_c. layout 0 and
 * 1 each dispatch to a call with that value written as a literal; any
 * other layout performs no load. frag and p are reinterpreted as double
 * storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m8n8k4(signed char* frag, signed char* p, int ldc, int layout)
{
  double* dst = (double*)frag;
  const double* src = (const double*)p;
  switch (layout) {
  case 0:
    __dmma_m8n8k4_ld_c(dst, src, ldc, 0);
    break;
  case 1:
    __dmma_m8n8k4_ld_c(dst, src, ldc, 1);
    break;
  default:
    break;
  }
}

/* Loads the C fragment (m8n8k4) through __dmma_m8n8k4_ld_c with layout
 * selector 1 (the cmjr variant). frag and p are reinterpreted as double
 * storage. */
__device__ static __inline__ void
__pgi_wmma_ld_c_m8n8k4_cmjr(signed char* frag, signed char* p, int ldc)
{
  double* dst = (double*)frag;
  const double* src = (const double*)p;
  __dmma_m8n8k4_ld_c(dst, src, ldc, 1);
}
#endif

/* -------------------------------------------------------------------------- */

/* Store fragment, m16n16k16 */
/* Stores the C fragment (m16n16k16, r2 variant) through
 * __hmma_m16n16k16_st_c_f16. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout stores nothing.
 * p and frag are reinterpreted as int storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m16n16k16_r2(signed char* p, signed char* frag, int ldc, int layout)
{
  int* dst = (int*)p;
  int* src = (int*)frag;
  unsigned int ld = (unsigned int) ldc;
  switch (layout) {
  case 0:
    __hmma_m16n16k16_st_c_f16(dst, src, ld, 0);
    break;
  case 1:
    __hmma_m16n16k16_st_c_f16(dst, src, ld, 1);
    break;
  default:
    break;
  }
}

/* Stores the C fragment (m16n16k16, r2 variant) through
 * __hmma_m16n16k16_st_c_f16 with layout selector 1 (the cmjr variant).
 * p and frag are reinterpreted as int storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m16n16k16_r2_cmjr(signed char* p, signed char* frag, int ldc)
{
  int* dst = (int*)p;
  int* src = (int*)frag;
  unsigned int ld = (unsigned int) ldc;
  __hmma_m16n16k16_st_c_f16(dst, src, ld, 1);
}

/* Stores the C fragment (m16n16k16, r4 variant) through
 * __hmma_m16n16k16_st_c_f32. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout stores nothing.
 * p and frag are reinterpreted as float storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m16n16k16_r4(signed char* p, signed char* frag, int ldc, int layout)
{
  float* dst = (float*)p;
  float* src = (float*)frag;
  unsigned int ld = (unsigned int) ldc;
  switch (layout) {
  case 0:
    __hmma_m16n16k16_st_c_f32(dst, src, ld, 0);
    break;
  case 1:
    __hmma_m16n16k16_st_c_f32(dst, src, ld, 1);
    break;
  default:
    break;
  }
}

/* Stores the C fragment (m16n16k16, r4 variant) through
 * __hmma_m16n16k16_st_c_f32 with layout selector 1 (the cmjr variant).
 * p and frag are reinterpreted as float storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m16n16k16_r4_cmjr(signed char* p, signed char* frag, int ldc)
{
  float* dst = (float*)p;
  float* src = (float*)frag;
  unsigned int ld = (unsigned int) ldc;
  __hmma_m16n16k16_st_c_f32(dst, src, ld, 1);
}

/* Store fragment, m32n8k16 */
/* Stores the C fragment (m32n8k16, r2 variant) through
 * __hmma_m32n8k16_st_c_f16. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout stores nothing.
 * p and frag are reinterpreted as int storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m32n8k16_r2(signed char* p, signed char* frag, int ldc, int layout)
{
  int* dst = (int*)p;
  int* src = (int*)frag;
  unsigned int ld = (unsigned int) ldc;
  switch (layout) {
  case 0:
    __hmma_m32n8k16_st_c_f16(dst, src, ld, 0);
    break;
  case 1:
    __hmma_m32n8k16_st_c_f16(dst, src, ld, 1);
    break;
  default:
    break;
  }
}

/* Stores the C fragment (m32n8k16, r2 variant) through
 * __hmma_m32n8k16_st_c_f16 with layout selector 1 (the cmjr variant).
 * p and frag are reinterpreted as int storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m32n8k16_r2_cmjr(signed char* p, signed char* frag, int ldc)
{
  int* dst = (int*)p;
  int* src = (int*)frag;
  unsigned int ld = (unsigned int) ldc;
  __hmma_m32n8k16_st_c_f16(dst, src, ld, 1);
}

/* Stores the C fragment (m32n8k16, r4 variant) through
 * __hmma_m32n8k16_st_c_f32. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout stores nothing.
 * p and frag are reinterpreted as float storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m32n8k16_r4(signed char* p, signed char* frag, int ldc, int layout)
{
  float* dst = (float*)p;
  float* src = (float*)frag;
  unsigned int ld = (unsigned int) ldc;
  switch (layout) {
  case 0:
    __hmma_m32n8k16_st_c_f32(dst, src, ld, 0);
    break;
  case 1:
    __hmma_m32n8k16_st_c_f32(dst, src, ld, 1);
    break;
  default:
    break;
  }
}

/* Stores the C fragment (m32n8k16, r4 variant) through
 * __hmma_m32n8k16_st_c_f32 with layout selector 1 (the cmjr variant).
 * p and frag are reinterpreted as float storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m32n8k16_r4_cmjr(signed char* p, signed char* frag, int ldc)
{
  float* dst = (float*)p;
  float* src = (float*)frag;
  unsigned int ld = (unsigned int) ldc;
  __hmma_m32n8k16_st_c_f32(dst, src, ld, 1);
}

/* Store fragment, m8n32k16 */
/* Stores the C fragment (m8n32k16, r2 variant) through
 * __hmma_m8n32k16_st_c_f16. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout stores nothing.
 * p and frag are reinterpreted as int storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m8n32k16_r2(signed char* p, signed char* frag, int ldc, int layout)
{
  int* dst = (int*)p;
  int* src = (int*)frag;
  unsigned int ld = (unsigned int) ldc;
  switch (layout) {
  case 0:
    __hmma_m8n32k16_st_c_f16(dst, src, ld, 0);
    break;
  case 1:
    __hmma_m8n32k16_st_c_f16(dst, src, ld, 1);
    break;
  default:
    break;
  }
}

/* Stores the C fragment (m8n32k16, r2 variant) through
 * __hmma_m8n32k16_st_c_f16 with layout selector 1 (the cmjr variant).
 * p and frag are reinterpreted as int storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m8n32k16_r2_cmjr(signed char* p, signed char* frag, int ldc)
{
  int* dst = (int*)p;
  int* src = (int*)frag;
  unsigned int ld = (unsigned int) ldc;
  __hmma_m8n32k16_st_c_f16(dst, src, ld, 1);
}

/* Stores the C fragment (m8n32k16, r4 variant) through
 * __hmma_m8n32k16_st_c_f32. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout stores nothing.
 * p and frag are reinterpreted as float storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m8n32k16_r4(signed char* p, signed char* frag, int ldc, int layout)
{
  float* dst = (float*)p;
  float* src = (float*)frag;
  unsigned int ld = (unsigned int) ldc;
  switch (layout) {
  case 0:
    __hmma_m8n32k16_st_c_f32(dst, src, ld, 0);
    break;
  case 1:
    __hmma_m8n32k16_st_c_f32(dst, src, ld, 1);
    break;
  default:
    break;
  }
}

/* Stores the C fragment (m8n32k16, r4 variant) through
 * __hmma_m8n32k16_st_c_f32 with layout selector 1 (the cmjr variant).
 * p and frag are reinterpreted as float storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m8n32k16_r4_cmjr(signed char* p, signed char* frag, int ldc)
{
  float* dst = (float*)p;
  float* src = (float*)frag;
  unsigned int ld = (unsigned int) ldc;
  __hmma_m8n32k16_st_c_f32(dst, src, ld, 1);
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
/* Store fragment, m16n16k8 */
/* Stores the C fragment (m16n16k8, r4 variant) through
 * __mma_m16n16k8_st_c_f32. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout stores nothing.
 * p and frag are reinterpreted as float storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m16n16k8_r4(signed char* p, signed char* frag, int ldc, int layout)
{
  float* dst = (float*)p;
  float* src = (float*)frag;
  unsigned int ld = (unsigned int) ldc;
  switch (layout) {
  case 0:
    __mma_m16n16k8_st_c_f32(dst, src, ld, 0);
    break;
  case 1:
    __mma_m16n16k8_st_c_f32(dst, src, ld, 1);
    break;
  default:
    break;
  }
}

/* Stores the C fragment (m16n16k8, r4 variant) through
 * __mma_m16n16k8_st_c_f32 with layout selector 1 (the cmjr variant).
 * p and frag are reinterpreted as float storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m16n16k8_r4_cmjr(signed char* p, signed char* frag, int ldc)
{
  float* dst = (float*)p;
  float* src = (float*)frag;
  unsigned int ld = (unsigned int) ldc;
  __mma_m16n16k8_st_c_f32(dst, src, ld, 1);
}

/* Store fragment, m8n8k4 */
/* Stores the C fragment (m8n8k4, r8 variant) through
 * __dmma_m8n8k4_st_c_f64. layout 0 and 1 each dispatch to a call with
 * that value written as a literal; any other layout stores nothing.
 * p and frag are reinterpreted as double storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m8n8k4_r8(signed char* p, signed char* frag, int ldc, int layout)
{
  double* dst = (double*)p;
  double* src = (double*)frag;
  unsigned int ld = (unsigned int) ldc;
  switch (layout) {
  case 0:
    __dmma_m8n8k4_st_c_f64(dst, src, ld, 0);
    break;
  case 1:
    __dmma_m8n8k4_st_c_f64(dst, src, ld, 1);
    break;
  default:
    break;
  }
}

/* Stores the C fragment (m8n8k4, r8 variant) through
 * __dmma_m8n8k4_st_c_f64 with layout selector 1 (the cmjr variant).
 * p and frag are reinterpreted as double storage; ldc is cast to unsigned. */
__device__ static __inline__ void
__pgi_wmma_st_c_m8n8k4_r8_cmjr(signed char* p, signed char* frag, int ldc)
{
  double* dst = (double*)p;
  double* src = (double*)frag;
  unsigned int ld = (unsigned int) ldc;
  __dmma_m8n8k4_st_c_f64(dst, src, ld, 1);
}
#endif

/* MMA functions for shape m16n16k16, no saturation */
/* -------------------------------------------------------------------------- */
/* D fp16 C fp16 */
/* m16n16k16 MMA via __hmma_m16n16k16_mma_f16f16, layout code 0
 * (rmjr A, rmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_rmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m16n16k16_mma_f16f16(out, lhs, rhs, acc, 0, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f16f16, layout code 1
 * (rmjr A, cmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_rmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m16n16k16_mma_f16f16(out, lhs, rhs, acc, 1, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f16f16, layout code 2
 * (cmjr A, rmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_cmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m16n16k16_mma_f16f16(out, lhs, rhs, acc, 2, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f16f16, layout code 3
 * (cmjr A, cmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_cmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m16n16k16_mma_f16f16(out, lhs, rhs, acc, 3, 0);
}

/* 3 operand */
/* Three-operand m16n16k16 MMA: clears the first four int words of d, then
 * calls __hmma_m16n16k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 0 (rmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_rmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m16n16k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 0, 0);
}

/* Three-operand m16n16k16 MMA: clears the first four int words of d, then
 * calls __hmma_m16n16k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 1 (rmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_rmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m16n16k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 1, 0);
}

/* Three-operand m16n16k16 MMA: clears the first four int words of d, then
 * calls __hmma_m16n16k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 2 (cmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_cmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m16n16k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 2, 0);
}

/* Three-operand m16n16k16 MMA: clears the first four int words of d, then
 * calls __hmma_m16n16k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 3 (cmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_cmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m16n16k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 3, 0);
}

/* D fp32 C fp16 */
/* m16n16k16 MMA via __hmma_m16n16k16_mma_f32f16, layout code 0
 * (rmjr A, rmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_rmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m16n16k16_mma_f32f16(out, lhs, rhs, acc, 0, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f32f16, layout code 1
 * (rmjr A, cmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_rmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m16n16k16_mma_f32f16(out, lhs, rhs, acc, 1, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f32f16, layout code 2
 * (cmjr A, rmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_cmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m16n16k16_mma_f32f16(out, lhs, rhs, acc, 2, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f32f16, layout code 3
 * (cmjr A, cmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_cmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m16n16k16_mma_f32f16(out, lhs, rhs, acc, 3, 0);
}

/* D fp32 C fp32 */
/* m16n16k16 MMA via __hmma_m16n16k16_mma_f32f32, layout code 0
 * (rmjr A, rmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_rmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m16n16k16_mma_f32f32(out, lhs, rhs, acc, 0, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f32f32, layout code 1
 * (rmjr A, cmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_rmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m16n16k16_mma_f32f32(out, lhs, rhs, acc, 1, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f32f32, layout code 2
 * (cmjr A, rmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_cmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m16n16k16_mma_f32f32(out, lhs, rhs, acc, 2, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f32f32, layout code 3
 * (cmjr A, cmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_cmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m16n16k16_mma_f32f32(out, lhs, rhs, acc, 3, 0);
}

/* 3 operand */
/* Three-operand m16n16k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m16n16k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 0 (rmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_rmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m16n16k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 0, 0);
}

/* Three-operand m16n16k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m16n16k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 1 (rmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_rmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m16n16k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 1, 0);
}

/* Three-operand m16n16k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m16n16k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 2 (cmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_cmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m16n16k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 2, 0);
}

/* Three-operand m16n16k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m16n16k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 3 (cmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r4_cmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m16n16k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 3, 0);
}

/* D fp16 C fp32 */
/* m16n16k16 MMA via __hmma_m16n16k16_mma_f16f32, layout code 0
 * (rmjr A, rmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_rmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m16n16k16_mma_f16f32(out, lhs, rhs, acc, 0, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f16f32, layout code 1
 * (rmjr A, cmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_rmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m16n16k16_mma_f16f32(out, lhs, rhs, acc, 1, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f16f32, layout code 2
 * (cmjr A, rmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_cmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m16n16k16_mma_f16f32(out, lhs, rhs, acc, 2, 0);
}

/* m16n16k16 MMA via __hmma_m16n16k16_mma_f16f32, layout code 3
 * (cmjr A, cmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m16n16k16_r2_cmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m16n16k16_mma_f16f32(out, lhs, rhs, acc, 3, 0);
}

/* MMA functions for shape m32n8k16, no saturation */
/* -------------------------------------------------------------------------- */
/* D fp16 C fp16 */
/* m32n8k16 MMA via __hmma_m32n8k16_mma_f16f16, layout code 0
 * (rmjr A, rmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_rmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m32n8k16_mma_f16f16(out, lhs, rhs, acc, 0, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f16f16, layout code 1
 * (rmjr A, cmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_rmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m32n8k16_mma_f16f16(out, lhs, rhs, acc, 1, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f16f16, layout code 2
 * (cmjr A, rmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_cmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m32n8k16_mma_f16f16(out, lhs, rhs, acc, 2, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f16f16, layout code 3
 * (cmjr A, cmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_cmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m32n8k16_mma_f16f16(out, lhs, rhs, acc, 3, 0);
}
/* 3 operand */
/* Three-operand m32n8k16 MMA: clears the first four int words of d, then
 * calls __hmma_m32n8k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 0 (rmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_rmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m32n8k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 0, 0);
}

/* Three-operand m32n8k16 MMA: clears the first four int words of d, then
 * calls __hmma_m32n8k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 1 (rmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_rmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m32n8k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 1, 0);
}

/* Three-operand m32n8k16 MMA: clears the first four int words of d, then
 * calls __hmma_m32n8k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 2 (cmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_cmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m32n8k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 2, 0);
}

/* Three-operand m32n8k16 MMA: clears the first four int words of d, then
 * calls __hmma_m32n8k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 3 (cmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_cmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m32n8k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 3, 0);
}

/* D fp32 C fp16 */
/* m32n8k16 MMA via __hmma_m32n8k16_mma_f32f16, layout code 0
 * (rmjr A, rmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_rmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m32n8k16_mma_f32f16(out, lhs, rhs, acc, 0, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f32f16, layout code 1
 * (rmjr A, cmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_rmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m32n8k16_mma_f32f16(out, lhs, rhs, acc, 1, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f32f16, layout code 2
 * (cmjr A, rmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_cmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m32n8k16_mma_f32f16(out, lhs, rhs, acc, 2, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f32f16, layout code 3
 * (cmjr A, cmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_cmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m32n8k16_mma_f32f16(out, lhs, rhs, acc, 3, 0);
}

/* D fp32 C fp32 */
/* m32n8k16 MMA via __hmma_m32n8k16_mma_f32f32, layout code 0
 * (rmjr A, rmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_rmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m32n8k16_mma_f32f32(out, lhs, rhs, acc, 0, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f32f32, layout code 1
 * (rmjr A, cmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_rmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m32n8k16_mma_f32f32(out, lhs, rhs, acc, 1, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f32f32, layout code 2
 * (cmjr A, rmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_cmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m32n8k16_mma_f32f32(out, lhs, rhs, acc, 2, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f32f32, layout code 3
 * (cmjr A, cmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_cmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m32n8k16_mma_f32f32(out, lhs, rhs, acc, 3, 0);
}
/* 3 operand */
/* Three-operand m32n8k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m32n8k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 0 (rmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_rmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m32n8k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 0, 0);
}

/* Three-operand m32n8k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m32n8k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 1 (rmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_rmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m32n8k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 1, 0);
}

/* Three-operand m32n8k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m32n8k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 2 (cmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_cmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m32n8k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 2, 0);
}

/* Three-operand m32n8k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m32n8k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 3 (cmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r4_cmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m32n8k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 3, 0);
}

/* D fp16 C fp32 */
/* m32n8k16 MMA via __hmma_m32n8k16_mma_f16f32, layout code 0
 * (rmjr A, rmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_rmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m32n8k16_mma_f16f32(out, lhs, rhs, acc, 0, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f16f32, layout code 1
 * (rmjr A, cmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_rmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m32n8k16_mma_f16f32(out, lhs, rhs, acc, 1, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f16f32, layout code 2
 * (cmjr A, rmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_cmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m32n8k16_mma_f16f32(out, lhs, rhs, acc, 2, 0);
}

/* m32n8k16 MMA via __hmma_m32n8k16_mma_f16f32, layout code 3
 * (cmjr A, cmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m32n8k16_r2_cmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m32n8k16_mma_f16f32(out, lhs, rhs, acc, 3, 0);
}

/* MMA functions for shape m8n32k16, no saturation */
/* -------------------------------------------------------------------------- */
/* D fp16 C fp16 */
/* m8n32k16 MMA via __hmma_m8n32k16_mma_f16f16, layout code 0
 * (rmjr A, rmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_rmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m8n32k16_mma_f16f16(out, lhs, rhs, acc, 0, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f16f16, layout code 1
 * (rmjr A, cmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_rmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m8n32k16_mma_f16f16(out, lhs, rhs, acc, 1, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f16f16, layout code 2
 * (cmjr A, rmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_cmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m8n32k16_mma_f16f16(out, lhs, rhs, acc, 2, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f16f16, layout code 3
 * (cmjr A, cmjr B), last argument 0. d, a, b and c are all reinterpreted
 * as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_cmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m8n32k16_mma_f16f16(out, lhs, rhs, acc, 3, 0);
}
/* 3 operand */
/* Three-operand m8n32k16 MMA: clears the first four int words of d, then
 * calls __hmma_m8n32k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 0 (rmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_rmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m8n32k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 0, 0);
}

/* Three-operand m8n32k16 MMA: clears the first four int words of d, then
 * calls __hmma_m8n32k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 1 (rmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_rmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m8n32k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 1, 0);
}

/* Three-operand m8n32k16 MMA: clears the first four int words of d, then
 * calls __hmma_m8n32k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 2 (cmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_cmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m8n32k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 2, 0);
}

/* Three-operand m8n32k16 MMA: clears the first four int words of d, then
 * calls __hmma_m8n32k16_mma_f16f16 with d as both the output and the
 * fourth operand. Layout code 3 (cmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_cmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  int* acc = (int*)d;
  for (int w = 0; w < 4; ++w)
    acc[w] = 0;
  __hmma_m8n32k16_mma_f16f16(acc, (int*)a, (int*)b, acc, 3, 0);
}

/* D fp32 C fp16 */
/* m8n32k16 MMA via __hmma_m8n32k16_mma_f32f16, layout code 0
 * (rmjr A, rmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_rmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m8n32k16_mma_f32f16(out, lhs, rhs, acc, 0, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f32f16, layout code 1
 * (rmjr A, cmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_rmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m8n32k16_mma_f32f16(out, lhs, rhs, acc, 1, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f32f16, layout code 2
 * (cmjr A, rmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_cmjr_rmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m8n32k16_mma_f32f16(out, lhs, rhs, acc, 2, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f32f16, layout code 3
 * (cmjr A, cmjr B), last argument 0. d is viewed as float storage;
 * a, b and c as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_cmjr_cmjr_r2(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  int* acc = (int*)c;
  __hmma_m8n32k16_mma_f32f16(out, lhs, rhs, acc, 3, 0);
}

/* D fp32 C fp32 */
/* m8n32k16 MMA via __hmma_m8n32k16_mma_f32f32, layout code 0
 * (rmjr A, rmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_rmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m8n32k16_mma_f32f32(out, lhs, rhs, acc, 0, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f32f32, layout code 1
 * (rmjr A, cmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_rmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m8n32k16_mma_f32f32(out, lhs, rhs, acc, 1, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f32f32, layout code 2
 * (cmjr A, rmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_cmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m8n32k16_mma_f32f32(out, lhs, rhs, acc, 2, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f32f32, layout code 3
 * (cmjr A, cmjr B), last argument 0. d and c are viewed as float storage;
 * a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_cmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* out = (float*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m8n32k16_mma_f32f32(out, lhs, rhs, acc, 3, 0);
}
/* 3 operand */
/* Three-operand m8n32k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m8n32k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 0 (rmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_rmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m8n32k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 0, 0);
}

/* Three-operand m8n32k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m8n32k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 1 (rmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_rmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m8n32k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 1, 0);
}

/* Three-operand m8n32k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m8n32k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 2 (cmjr A, rmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_cmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m8n32k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 2, 0);
}

/* Three-operand m8n32k16 MMA: clears the first eight floats of d, then
 * calls __hmma_m8n32k16_mma_f32f32 with d as both the output and the
 * fourth operand. Layout code 3 (cmjr A, cmjr B), last argument 0. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r4_cmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int i = 0; i < 8; i++)
    acc[i] = 0.0f;  /* float literal: no double constant in a float store */
  __hmma_m8n32k16_mma_f32f32(acc, (int*)a, (int*)b, acc, 3, 0);
}

/* D fp16 C fp32 */
/* m8n32k16 MMA via __hmma_m8n32k16_mma_f16f32, layout code 0
 * (rmjr A, rmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_rmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m8n32k16_mma_f16f32(out, lhs, rhs, acc, 0, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f16f32, layout code 1
 * (rmjr A, cmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_rmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m8n32k16_mma_f16f32(out, lhs, rhs, acc, 1, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f16f32, layout code 2
 * (cmjr A, rmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_cmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m8n32k16_mma_f16f32(out, lhs, rhs, acc, 2, 0);
}

/* m8n32k16 MMA via __hmma_m8n32k16_mma_f16f32, layout code 3
 * (cmjr A, cmjr B), last argument 0. c is viewed as float storage;
 * d, a and b as int storage. */
__device__ static __inline__ void
__pgi_wmma_m8n32k16_r2_cmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  int* out = (int*)d;
  int* lhs = (int*)a;
  int* rhs = (int*)b;
  float* acc = (float*)c;
  __hmma_m8n32k16_mma_f16f32(out, lhs, rhs, acc, 3, 0);
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
/* MMA functions for shape m16n16k8, tf32 */
/* -------------------------------------------------------------------------- */
/* A, B, C, and D fp32 */
/* Four-operand m16n16k8 tf32 MMA, rmjr/rmjr variant: d and c are passed as
 * floats, a and b as int words, to the tf32 builtin (selector 0, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m16n16k8_r4_rmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* dst    = (float*)d;
  int*   lhs    = (int*)a;
  int*   rhs    = (int*)b;
  float* addend = (float*)c;
  __mma_tf32_m16n16k8_mma_f32(dst, lhs, rhs, addend, 0, 0);
}

/* Four-operand m16n16k8 tf32 MMA, rmjr/cmjr variant: d and c are passed as
 * floats, a and b as int words, to the tf32 builtin (selector 1, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m16n16k8_r4_rmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* dst    = (float*)d;
  int*   lhs    = (int*)a;
  int*   rhs    = (int*)b;
  float* addend = (float*)c;
  __mma_tf32_m16n16k8_mma_f32(dst, lhs, rhs, addend, 1, 0);
}

/* Four-operand m16n16k8 tf32 MMA, cmjr/rmjr variant: d and c are passed as
 * floats, a and b as int words, to the tf32 builtin (selector 2, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m16n16k8_r4_cmjr_rmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* dst    = (float*)d;
  int*   lhs    = (int*)a;
  int*   rhs    = (int*)b;
  float* addend = (float*)c;
  __mma_tf32_m16n16k8_mma_f32(dst, lhs, rhs, addend, 2, 0);
}

/* Four-operand m16n16k8 tf32 MMA, cmjr/cmjr variant: d and c are passed as
 * floats, a and b as int words, to the tf32 builtin (selector 3, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m16n16k8_r4_cmjr_cmjr_r4(signed char* d, signed char* a, signed char* b, signed char* c)
{
  float* dst    = (float*)d;
  int*   lhs    = (int*)a;
  int*   rhs    = (int*)b;
  float* addend = (float*)c;
  __mma_tf32_m16n16k8_mma_f32(dst, lhs, rhs, addend, 3, 0);
}

/* 3 operand */
/* Three-operand m16n16k8 tf32 MMA, rmjr/rmjr variant.  Clears the eight
 * float slots of d and then uses d as both destination and addend of the
 * tf32 builtin (selector 0, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m16n16k8_r4_rmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int k = 0; k < 8; ++k)
    acc[k] = 0.0f;
  __mma_tf32_m16n16k8_mma_f32(acc, (int*)a, (int*)b, acc, 0, 0);
}

/* Three-operand m16n16k8 tf32 MMA, rmjr/cmjr variant.  Clears the eight
 * float slots of d and then uses d as both destination and addend of the
 * tf32 builtin (selector 1, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m16n16k8_r4_rmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int k = 0; k < 8; ++k)
    acc[k] = 0.0f;
  __mma_tf32_m16n16k8_mma_f32(acc, (int*)a, (int*)b, acc, 1, 0);
}

/* Three-operand m16n16k8 tf32 MMA, cmjr/rmjr variant.  Clears the eight
 * float slots of d and then uses d as both destination and addend of the
 * tf32 builtin (selector 2, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m16n16k8_r4_cmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int k = 0; k < 8; ++k)
    acc[k] = 0.0f;
  __mma_tf32_m16n16k8_mma_f32(acc, (int*)a, (int*)b, acc, 2, 0);
}

/* Three-operand m16n16k8 tf32 MMA, cmjr/cmjr variant.  Clears the eight
 * float slots of d and then uses d as both destination and addend of the
 * tf32 builtin (selector 3, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m16n16k8_r4_cmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  float* acc = (float*)d;
  for (int k = 0; k < 8; ++k)
    acc[k] = 0.0f;
  __mma_tf32_m16n16k8_mma_f32(acc, (int*)a, (int*)b, acc, 3, 0);
}

/* MMA functions for shape m8n8k4, double */
/* -------------------------------------------------------------------------- */
/* A, B, C, and D fp64 */
/* Four-operand m8n8k4 double MMA, rmjr/rmjr variant: all four operands are
 * passed as double pointers to the f64 builtin (selector 0, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m8n8k4_r8_rmjr_rmjr_r8(signed char* d, signed char* a, signed char* b, signed char* c)
{
  double* dst    = (double*)d;
  double* lhs    = (double*)a;
  double* rhs    = (double*)b;
  double* addend = (double*)c;
  __dmma_m8n8k4_mma_f64(dst, lhs, rhs, addend, 0, 0);
}

/* Four-operand m8n8k4 double MMA, rmjr/cmjr variant: all four operands are
 * passed as double pointers to the f64 builtin (selector 1, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m8n8k4_r8_rmjr_cmjr_r8(signed char* d, signed char* a, signed char* b, signed char* c)
{
  double* dst    = (double*)d;
  double* lhs    = (double*)a;
  double* rhs    = (double*)b;
  double* addend = (double*)c;
  __dmma_m8n8k4_mma_f64(dst, lhs, rhs, addend, 1, 0);
}

/* Four-operand m8n8k4 double MMA, cmjr/rmjr variant: all four operands are
 * passed as double pointers to the f64 builtin (selector 2, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m8n8k4_r8_cmjr_rmjr_r8(signed char* d, signed char* a, signed char* b, signed char* c)
{
  double* dst    = (double*)d;
  double* lhs    = (double*)a;
  double* rhs    = (double*)b;
  double* addend = (double*)c;
  __dmma_m8n8k4_mma_f64(dst, lhs, rhs, addend, 2, 0);
}

/* Four-operand m8n8k4 double MMA, cmjr/cmjr variant: all four operands are
 * passed as double pointers to the f64 builtin (selector 3, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m8n8k4_r8_cmjr_cmjr_r8(signed char* d, signed char* a, signed char* b, signed char* c)
{
  double* dst    = (double*)d;
  double* lhs    = (double*)a;
  double* rhs    = (double*)b;
  double* addend = (double*)c;
  __dmma_m8n8k4_mma_f64(dst, lhs, rhs, addend, 3, 0);
}

/* 3 operand */
/* Three-operand m8n8k4 double MMA, rmjr/rmjr variant.  Clears the two
 * double slots of d and then uses d as both destination and addend of the
 * f64 builtin (selector 0, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m8n8k4_r8_rmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  double* acc = (double*)d;
  acc[0] = 0.0;
  acc[1] = 0.0;
  __dmma_m8n8k4_mma_f64(acc, (double*)a, (double*)b, acc, 0, 0);
}

/* Three-operand m8n8k4 double MMA, rmjr/cmjr variant.  Clears the two
 * double slots of d and then uses d as both destination and addend of the
 * f64 builtin (selector 1, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m8n8k4_r8_rmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  double* acc = (double*)d;
  acc[0] = 0.0;
  acc[1] = 0.0;
  __dmma_m8n8k4_mma_f64(acc, (double*)a, (double*)b, acc, 1, 0);
}

/* Three-operand m8n8k4 double MMA, cmjr/rmjr variant.  Clears the two
 * double slots of d and then uses d as both destination and addend of the
 * f64 builtin (selector 2, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m8n8k4_r8_cmjr_rmjr(signed char* d, signed char* a, signed char* b)
{
  double* acc = (double*)d;
  acc[0] = 0.0;
  acc[1] = 0.0;
  __dmma_m8n8k4_mma_f64(acc, (double*)a, (double*)b, acc, 2, 0);
}

/* Three-operand m8n8k4 double MMA, cmjr/cmjr variant.  Clears the two
 * double slots of d and then uses d as both destination and addend of the
 * f64 builtin (selector 3, trailing 0). */
__device__ static __inline__ void
__pgi_wmma_m8n8k4_r8_cmjr_cmjr(signed char* d, signed char* a, signed char* b)
{
  double* acc = (double*)d;
  acc[0] = 0.0;
  acc[1] = 0.0;
  __dmma_m8n8k4_mma_f64(acc, (double*)a, (double*)b, acc, 3, 0);
}
#endif

/* That's it for load, store, compute.  The rest are helpers */

/* These were guarded with cplusplus starting in CUDA 9.0 */

/* Assignment and conversion */
/* ------------------------- */
/* These might need pgi definitions because we are not using "half" */
/* Widen a half-precision value, carried as raw bits in an unsigned short,
 * to float with the PTX cvt.f32.f16 instruction. */
__device__ static __inline__ float
__pgi_half2float(unsigned short h)
{
  float widened;
  asm volatile("cvt.f32.f16 %0, %1;"
               : "=f"(widened)  : "h"(h));
  return widened;
}
/* Narrow a float to half precision with the PTX cvt.rn.f16.f32 instruction
 * and return the raw 16-bit pattern. */
__device__ static __inline__ unsigned short
__pgi_float2half_rn(float x)
{
  unsigned short bits;
  asm volatile("cvt.rn.f16.f32 %0, %1;"
               : "=h"(bits)  : "f"(x));
  return bits;
}
/* Convert a float to half (cvt.rn.f16.f32) and return a 32-bit word holding
 * that same half in both its low and high 16 bits. */
__device__ static __inline__ unsigned int
__pgi_float2half2_rn(float x)
{
  unsigned int pair;
  asm volatile("{.reg .f16 low;\n"
               "cvt.rn.f16.f32 low, %1;\n"
               "mov.b32 %0, {low,low};}\n" : "=r"(pair)  : "f"(x));
  return pair;
}
/* y (float) = x (half): load 16 bits from x, widen them with
 * __pgi_half2float, and store the float at y. */
__device__ static __inline__ void
__pgi_real2_assign2real4(signed char *y, signed char *x)
{
  const unsigned short half_bits = ((unsigned short *)x)[0];
  *((float *)y) = __pgi_half2float(half_bits);
}
/* y (half) = x (float): load the float at x, round it to half with
 * __pgi_float2half_rn, and store the 16-bit pattern at y. */
__device__ static __inline__ void
__pgi_real4_assign2real2(signed char *y, signed char *x)
{
  const float wide = ((float *)x)[0];
  *((unsigned short *)y) = __pgi_float2half_rn(wide);
}
/* Pack two 16-bit values into one 32-bit word stored at z.  The value read
 * from x is the first element of the mov.b32 vector operand and the value
 * read from y is the second. */
__device__ static __inline__ void
__pgi_halves2half2(signed char *z, signed char *y, signed char *x)
{
  const unsigned short first  = ((unsigned short *)x)[0];
  const unsigned short second = ((unsigned short *)y)[0];
  unsigned int pair;
  asm volatile("mov.b32 %0, {%1, %2};"
               : "=r"(pair)  : "h"(first), "h"(second));
  *((unsigned int *)z) = pair;
}
/* Return a 32-bit word built by mov.b32 from the 16-bit value at x (first
 * vector element) and the 16-bit value at y (second vector element). */
__device__ static __inline__ unsigned int
__pgi_make_v2real2(signed char *x, signed char *y)
{
  const unsigned short first  = ((unsigned short *)x)[0];
  const unsigned short second = ((unsigned short *)y)[0];
  unsigned int pair;
  asm volatile("mov.b32 %0, {%1, %2};"
               : "=r"(pair)  : "h"(first), "h"(second));
  return pair;
}
/* Return a 32-bit word in which the 16-bit value x occupies both elements
 * of the mov.b32 vector operand. */
__device__ static __inline__ unsigned int
__pgi_make_v2real2from1(unsigned short x)
{
  unsigned int pair;
  asm volatile("mov.b32 %0, {%1, %2};"
               : "=r"(pair)  : "h"(x), "h"(x));
  return pair;
}

/* Fill routines called either through API or assignment statements */
/* ------------------------------------------------------------------------- */
/* Fill a fragment of eight 16-bit elements with the 16-bit value at val.
 * The value is duplicated into a 32-bit pair once and then written to the
 * four 32-bit words of frag. */
__device__ static __inline__ void
__pgi_wmma_fill_frag_accr2(signed char* frag, signed char* val)
{
  const unsigned int pair = __pgi_make_v2real2from1(((unsigned short *)val)[0]);
  unsigned int* words = (unsigned int*)frag;
  for (int w = 0; w < 4; ++w)
    words[w] = pair;
} /* wmma_fill_frag_accr2 */

/* Store val into each of the eight float slots of frag. */
__device__ static __inline__ void
__pgi_wmma_fill_frag_accr4(signed char* frag, float val)
{
  float* slots = (float*)frag;
  for (int k = 0; k < 8; ++k)
    slots[k] = val;
} /* wmma_fill_frag_accr4 */

/* Store val into both double slots of frag (two elements, not eight). */
__device__ static __inline__ void
__pgi_wmma_fill_frag_accr8(signed char* frag, double val)
{
  double* slots = (double*)frag;
  for (int k = 0; k < 2; ++k)
    slots[k] = val;
} /* wmma_fill_frag_accr8 */

/* Fill a fragment of sixteen 16-bit elements with the 16-bit value at val.
 * The value is duplicated into a 32-bit pair once and then written to the
 * eight 32-bit words of frag. */
__device__ static __inline__ void
__pgi_wmma_fill_frag_ab(signed char* frag, signed char* val)
{
  const unsigned int pair = __pgi_make_v2real2from1(((unsigned short *)val)[0]);
  unsigned int* words = (unsigned int*)frag;
  for (int w = 0; w < 8; ++w)
    words[w] = pair;
} /* wmma_fill_frag_ab */

/* Store val into each of the four float slots of frag. */
__device__ static __inline__ void
__pgi_wmma_fill_frag_ab_r4(signed char* frag, float val)
{
  float* slots = (float*)frag;
  for (int k = 0; k < 4; ++k)
    slots[k] = val;
} /* wmma_fill_frag_ab_r4 */

/* Store val into the single double slot of frag. */
__device__ static __inline__ void
__pgi_wmma_fill_frag_ab_r8(signed char* frag, double val)
{
  double* slot = (double*)frag;
  *slot = val;
} /* wmma_fill_frag_ab_r8 */

/* Copy a fragment of eight 16-bit elements from x to y, moving it as four
 * 32-bit words in ascending order. */
__device__ static __inline__ void
__pgi_wmma_copy_frag_accr2(signed char* y, signed char* x)
{
  unsigned int* dst = (unsigned int*)y;
  unsigned int* src = (unsigned int*)x;
  for (int w = 0; w < 4; ++w)
    dst[w] = src[w];
} /* wmma_copy_frag_accr2 */

/* Copy the eight float slots of x into y, in ascending order. */
__device__ static __inline__ void
__pgi_wmma_copy_frag_accr4(signed char* y, signed char* x)
{
  float* dst = (float*)y;
  float* src = (float*)x;
  for (int k = 0; k < 8; ++k)
    dst[k] = src[k];
} /* wmma_copy_frag_accr4 */

/* Copy the two double slots of x into y, in ascending order. */
__device__ static __inline__ void
__pgi_wmma_copy_frag_accr8(signed char* y, signed char* x)
{
  double* dst = (double*)y;
  double* src = (double*)x;
  for (int k = 0; k < 2; ++k)
    dst[k] = src[k];
} /* wmma_copy_frag_accr8 */

/* wmma comparison functions */
/* ------------------------- */
/* Element-wise "==" of a real(2) fragment against a float scalar.
 * bval is converted to a duplicated half pair by __pgi_float2half2_rn and
 * each of the four 32-bit words of sa is compared against it with
 * set.eq.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmpeq_cm16n16k16r2s(signed char* sa, float bval)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *pairs = (const unsigned int *)sa;
  const unsigned int rhs = __pgi_float2half2_rn(bval);
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = pairs[w];
    unsigned int hit;
    asm volatile("set.eq.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmpeq_cm16n16k16r2s */

/* Element-wise "==" of a real(2) fragment against a real(2) scalar.
 * sb points at one 16-bit half value; it is duplicated into a 32-bit pair
 * and each of the four 32-bit words of sa is compared against it with
 * set.eq.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmpeq_cm16n16k16r2sr2(signed char* sa, signed char* sb)
{
  long long llres;
  signed char *lres = (signed char *)&llres;
  unsigned int a, val;
  /* Read the whole 16-bit scalar through an unsigned short pointer.
   * Dereferencing sb directly would fetch only one sign-extended byte. */
  unsigned int b = __pgi_make_v2real2from1(*((unsigned short *)sb));
  for (int i = 0; i < 4; i++) {
    a = ((unsigned int*)(sa))[i];
    asm volatile("set.eq.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(val)  : "r"(a), "r"(b));
    lres[2*i]   = (val & 0x0000ffff) ? 1 : 0;
    lres[2*i+1] = (val & 0xffff0000) ? 1 : 0;
  }
  return llres;
}

/* Element-wise "==" of the eight floats at sa against the scalar val.
 * Byte k of the returned long long is 1 when element k == val, else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpeq_cm16n16k16r4s(signed char* sa, float val)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *elems = (const float *)sa;
  int k = 0;
  while (k < 8) {
    flags[k] = (elems[k] == val) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmpeq_cm16n16k16r4s */

/* Element-wise "==" of two real(2) fragments.  The four 32-bit words of sa
 * and sb are compared pairwise with set.eq.f16x2.  Result byte 2*i is 1
 * when the low 16 bits of the i-th comparison are non-zero, byte 2*i+1
 * likewise for the high 16 bits; otherwise 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpeq_cm16n16k16r2(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *lhs_words = (const unsigned int *)sa;
  const unsigned int *rhs_words = (const unsigned int *)sb;
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = lhs_words[w];
    const unsigned int rhs = rhs_words[w];
    unsigned int hit;
    asm volatile("set.eq.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmpeq_cm16n16k16r2 */

/* Element-wise "==" of the eight floats at sa against the eight at sb.
 * Byte k of the returned long long is 1 when sa[k] == sb[k], else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpeq_cm16n16k16r4(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *lhs = (const float *)sa;
  const float *rhs = (const float *)sb;
  int k = 0;
  while (k < 8) {
    flags[k] = (lhs[k] == rhs[k]) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmpeq_cm16n16k16r4 */

/* Element-wise "==" of the two doubles at sa against the scalar val.
 * Byte k (k = 0, 1) of the returned long long is 1 when element k == val,
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmpeq_cm8n8k4r8s(signed char* sa, double val)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] == val ? 1 : 0;
  }
  return llres;
} /* wmma_cmpeq_cm8n8k4r8s */

/* Element-wise "==" of the two doubles at sa against the two at sb.
 * Byte k (k = 0, 1) of the returned long long is 1 when sa[k] == sb[k],
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmpeq_cm8n8k4r8(signed char* sa, signed char* sb)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] == ((double*)(sb))[i] ? 1 : 0;
  }
  return llres;
} /* wmma_cmpeq_cm8n8k4r8 */

/* ----- */

/* Element-wise "!=" of a real(2) fragment against a float scalar.
 * bval is converted to a duplicated half pair by __pgi_float2half2_rn and
 * each of the four 32-bit words of sa is compared against it with
 * set.ne.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmpne_cm16n16k16r2s(signed char* sa, float bval)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *pairs = (const unsigned int *)sa;
  const unsigned int rhs = __pgi_float2half2_rn(bval);
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = pairs[w];
    unsigned int hit;
    asm volatile("set.ne.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmpne_cm16n16k16r2s */

/* Element-wise "!=" of a real(2) fragment against a real(2) scalar.
 * sb points at one 16-bit half value; it is duplicated into a 32-bit pair
 * and each of the four 32-bit words of sa is compared against it with
 * set.ne.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmpne_cm16n16k16r2sr2(signed char* sa, signed char* sb)
{
  long long llres;
  signed char *lres = (signed char *)&llres;
  unsigned int a, val;
  /* Read the whole 16-bit scalar through an unsigned short pointer.
   * Dereferencing sb directly would fetch only one sign-extended byte. */
  unsigned int b = __pgi_make_v2real2from1(*((unsigned short *)sb));
  for (int i = 0; i < 4; i++) {
    a = ((unsigned int*)(sa))[i];
    asm volatile("set.ne.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(val)  : "r"(a), "r"(b));
    lres[2*i]   = (val & 0x0000ffff) ? 1 : 0;
    lres[2*i+1] = (val & 0xffff0000) ? 1 : 0;
  }
  return llres;
}

/* Element-wise "!=" of the eight floats at sa against the scalar val.
 * Byte k of the returned long long is 1 when element k != val, else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpne_cm16n16k16r4s(signed char* sa, float val)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *elems = (const float *)sa;
  int k = 0;
  while (k < 8) {
    flags[k] = (elems[k] != val) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmpne_cm16n16k16r4s */

/* Element-wise "!=" of two real(2) fragments.  The four 32-bit words of sa
 * and sb are compared pairwise with set.ne.f16x2.  Result byte 2*i is 1
 * when the low 16 bits of the i-th comparison are non-zero, byte 2*i+1
 * likewise for the high 16 bits; otherwise 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpne_cm16n16k16r2(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *lhs_words = (const unsigned int *)sa;
  const unsigned int *rhs_words = (const unsigned int *)sb;
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = lhs_words[w];
    const unsigned int rhs = rhs_words[w];
    unsigned int hit;
    asm volatile("set.ne.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmpne_cm16n16k16r2 */

/* Element-wise "!=" of the eight floats at sa against the eight at sb.
 * Byte k of the returned long long is 1 when sa[k] != sb[k], else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpne_cm16n16k16r4(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *lhs = (const float *)sa;
  const float *rhs = (const float *)sb;
  int k = 0;
  while (k < 8) {
    flags[k] = (lhs[k] != rhs[k]) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmpne_cm16n16k16r4 */

/* Element-wise "!=" of the two doubles at sa against the scalar val.
 * Byte k (k = 0, 1) of the returned long long is 1 when element k != val,
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmpne_cm8n8k4r8s(signed char* sa, double val)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] != val ? 1 : 0;
  }
  return llres;
} /* wmma_cmpne_cm8n8k4r8s */

/* Element-wise "!=" of the two doubles at sa against the two at sb.
 * Byte k (k = 0, 1) of the returned long long is 1 when sa[k] != sb[k],
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmpne_cm8n8k4r8(signed char* sa, signed char* sb)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] != ((double*)(sb))[i] ? 1 : 0;
  }
  return llres;
} /* wmma_cmpne_cm8n8k4r8 */

/* ----- */

/* Element-wise "<" of a real(2) fragment against a float scalar.
 * bval is converted to a duplicated half pair by __pgi_float2half2_rn and
 * each of the four 32-bit words of sa is compared against it with
 * set.lt.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmplt_cm16n16k16r2s(signed char* sa, float bval)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *pairs = (const unsigned int *)sa;
  const unsigned int rhs = __pgi_float2half2_rn(bval);
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = pairs[w];
    unsigned int hit;
    asm volatile("set.lt.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmplt_cm16n16k16r2s */

/* Element-wise "<" of a real(2) fragment against a real(2) scalar.
 * sb points at one 16-bit half value; it is duplicated into a 32-bit pair
 * and each of the four 32-bit words of sa is compared against it with
 * set.lt.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmplt_cm16n16k16r2sr2(signed char* sa, signed char* sb)
{
  long long llres;
  signed char *lres = (signed char *)&llres;
  unsigned int a, val;
  /* Read the whole 16-bit scalar through an unsigned short pointer.
   * Dereferencing sb directly would fetch only one sign-extended byte. */
  unsigned int b = __pgi_make_v2real2from1(*((unsigned short *)sb));
  for (int i = 0; i < 4; i++) {
    a = ((unsigned int*)(sa))[i];
    asm volatile("set.lt.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(val)  : "r"(a), "r"(b));
    lres[2*i]   = (val & 0x0000ffff) ? 1 : 0;
    lres[2*i+1] = (val & 0xffff0000) ? 1 : 0;
  }
  return llres;
}

/* Element-wise "<" of the eight floats at sa against the scalar val.
 * Byte k of the returned long long is 1 when element k < val, else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmplt_cm16n16k16r4s(signed char* sa, float val)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *elems = (const float *)sa;
  int k = 0;
  while (k < 8) {
    flags[k] = (elems[k] < val) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmplt_cm16n16k16r4s */

/* Element-wise "<" of two real(2) fragments.  The four 32-bit words of sa
 * and sb are compared pairwise with set.lt.f16x2.  Result byte 2*i is 1
 * when the low 16 bits of the i-th comparison are non-zero, byte 2*i+1
 * likewise for the high 16 bits; otherwise 0. */
__device__ static __inline__ long long
__pgi_wmma_cmplt_cm16n16k16r2(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *lhs_words = (const unsigned int *)sa;
  const unsigned int *rhs_words = (const unsigned int *)sb;
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = lhs_words[w];
    const unsigned int rhs = rhs_words[w];
    unsigned int hit;
    asm volatile("set.lt.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmplt_cm16n16k16r2 */

/* Element-wise "<" of the eight floats at sa against the eight at sb.
 * Byte k of the returned long long is 1 when sa[k] < sb[k], else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmplt_cm16n16k16r4(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *lhs = (const float *)sa;
  const float *rhs = (const float *)sb;
  int k = 0;
  while (k < 8) {
    flags[k] = (lhs[k] < rhs[k]) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmplt_cm16n16k16r4 */

/* Element-wise "<" of the two doubles at sa against the scalar val.
 * Byte k (k = 0, 1) of the returned long long is 1 when element k < val,
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmplt_cm8n8k4r8s(signed char* sa, double val)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] < val ? 1 : 0;
  }
  return llres;
} /* wmma_cmplt_cm8n8k4r8s */

/* Element-wise "<" of the two doubles at sa against the two at sb.
 * Byte k (k = 0, 1) of the returned long long is 1 when sa[k] < sb[k],
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmplt_cm8n8k4r8(signed char* sa, signed char* sb)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] < ((double*)(sb))[i] ? 1 : 0;
  }
  return llres;
} /* wmma_cmplt_cm8n8k4r8 */

/* ----- */

/* Element-wise ">" of a real(2) fragment against a float scalar.
 * bval is converted to a duplicated half pair by __pgi_float2half2_rn and
 * each of the four 32-bit words of sa is compared against it with
 * set.gt.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmpgt_cm16n16k16r2s(signed char* sa, float bval)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *pairs = (const unsigned int *)sa;
  const unsigned int rhs = __pgi_float2half2_rn(bval);
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = pairs[w];
    unsigned int hit;
    asm volatile("set.gt.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmpgt_cm16n16k16r2s */

/* Element-wise ">" of a real(2) fragment against a real(2) scalar.
 * sb points at one 16-bit half value; it is duplicated into a 32-bit pair
 * and each of the four 32-bit words of sa is compared against it with
 * set.gt.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmpgt_cm16n16k16r2sr2(signed char* sa, signed char* sb)
{
  long long llres;
  signed char *lres = (signed char *)&llres;
  unsigned int a, val;
  /* Read the whole 16-bit scalar through an unsigned short pointer.
   * Dereferencing sb directly would fetch only one sign-extended byte. */
  unsigned int b = __pgi_make_v2real2from1(*((unsigned short *)sb));
  for (int i = 0; i < 4; i++) {
    a = ((unsigned int*)(sa))[i];
    asm volatile("set.gt.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(val)  : "r"(a), "r"(b));
    lres[2*i]   = (val & 0x0000ffff) ? 1 : 0;
    lres[2*i+1] = (val & 0xffff0000) ? 1 : 0;
  }
  return llres;
}

/* Element-wise ">" of the eight floats at sa against the scalar val.
 * Byte k of the returned long long is 1 when element k > val, else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpgt_cm16n16k16r4s(signed char* sa, float val)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *elems = (const float *)sa;
  int k = 0;
  while (k < 8) {
    flags[k] = (elems[k] > val) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmpgt_cm16n16k16r4s */

/* Element-wise ">" of two real(2) fragments.  The four 32-bit words of sa
 * and sb are compared pairwise with set.gt.f16x2.  Result byte 2*i is 1
 * when the low 16 bits of the i-th comparison are non-zero, byte 2*i+1
 * likewise for the high 16 bits; otherwise 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpgt_cm16n16k16r2(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *lhs_words = (const unsigned int *)sa;
  const unsigned int *rhs_words = (const unsigned int *)sb;
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = lhs_words[w];
    const unsigned int rhs = rhs_words[w];
    unsigned int hit;
    asm volatile("set.gt.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmpgt_cm16n16k16r2 */

/* Element-wise ">" of the eight floats at sa against the eight at sb.
 * Byte k of the returned long long is 1 when sa[k] > sb[k], else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpgt_cm16n16k16r4(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *lhs = (const float *)sa;
  const float *rhs = (const float *)sb;
  int k = 0;
  while (k < 8) {
    flags[k] = (lhs[k] > rhs[k]) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmpgt_cm16n16k16r4 */

/* Element-wise ">" of the two doubles at sa against the scalar val.
 * Byte k (k = 0, 1) of the returned long long is 1 when element k > val,
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmpgt_cm8n8k4r8s(signed char* sa, double val)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] > val ? 1 : 0;
  }
  return llres;
} /* wmma_cmpgt_cm8n8k4r8s */

/* Element-wise ">" of the two doubles at sa against the two at sb.
 * Byte k (k = 0, 1) of the returned long long is 1 when sa[k] > sb[k],
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmpgt_cm8n8k4r8(signed char* sa, signed char* sb)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] > ((double*)(sb))[i] ? 1 : 0;
  }
  return llres;
} /* wmma_cmpgt_cm8n8k4r8 */

/* ----- */

/* Element-wise "<=" of a real(2) fragment against a float scalar.
 * bval is converted to a duplicated half pair by __pgi_float2half2_rn and
 * each of the four 32-bit words of sa is compared against it with
 * set.le.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmple_cm16n16k16r2s(signed char* sa, float bval)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *pairs = (const unsigned int *)sa;
  const unsigned int rhs = __pgi_float2half2_rn(bval);
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = pairs[w];
    unsigned int hit;
    asm volatile("set.le.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmple_cm16n16k16r2s */

/* Element-wise "<=" of a real(2) fragment against a real(2) scalar.
 * sb points at one 16-bit half value; it is duplicated into a 32-bit pair
 * and each of the four 32-bit words of sa is compared against it with
 * set.le.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmple_cm16n16k16r2sr2(signed char* sa, signed char* sb)
{
  long long llres;
  signed char *lres = (signed char *)&llres;
  unsigned int a, val;
  /* Read the whole 16-bit scalar through an unsigned short pointer.
   * Dereferencing sb directly would fetch only one sign-extended byte. */
  unsigned int b = __pgi_make_v2real2from1(*((unsigned short *)sb));
  for (int i = 0; i < 4; i++) {
    a = ((unsigned int*)(sa))[i];
    asm volatile("set.le.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(val)  : "r"(a), "r"(b));
    lres[2*i]   = (val & 0x0000ffff) ? 1 : 0;
    lres[2*i+1] = (val & 0xffff0000) ? 1 : 0;
  }
  return llres;
}

/* Element-wise "<=" of the eight floats at sa against the scalar val.
 * Byte k of the returned long long is 1 when element k <= val, else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmple_cm16n16k16r4s(signed char* sa, float val)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *elems = (const float *)sa;
  int k = 0;
  while (k < 8) {
    flags[k] = (elems[k] <= val) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmple_cm16n16k16r4s */

/* Element-wise "<=" of two real(2) fragments.  The four 32-bit words of sa
 * and sb are compared pairwise with set.le.f16x2.  Result byte 2*i is 1
 * when the low 16 bits of the i-th comparison are non-zero, byte 2*i+1
 * likewise for the high 16 bits; otherwise 0. */
__device__ static __inline__ long long
__pgi_wmma_cmple_cm16n16k16r2(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *lhs_words = (const unsigned int *)sa;
  const unsigned int *rhs_words = (const unsigned int *)sb;
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = lhs_words[w];
    const unsigned int rhs = rhs_words[w];
    unsigned int hit;
    asm volatile("set.le.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmple_cm16n16k16r2 */

/* Element-wise "<=" of the eight floats at sa against the eight at sb.
 * Byte k of the returned long long is 1 when sa[k] <= sb[k], else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmple_cm16n16k16r4(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *lhs = (const float *)sa;
  const float *rhs = (const float *)sb;
  int k = 0;
  while (k < 8) {
    flags[k] = (lhs[k] <= rhs[k]) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmple_cm16n16k16r4 */

/* Element-wise "<=" of the two doubles at sa against the scalar val.
 * Byte k (k = 0, 1) of the returned long long is 1 when element k <= val,
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmple_cm8n8k4r8s(signed char* sa, double val)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] <= val ? 1 : 0;
  }
  return llres;
} /* wmma_cmple_cm8n8k4r8s */

/* Element-wise "<=" of the two doubles at sa against the two at sb.
 * Byte k (k = 0, 1) of the returned long long is 1 when sa[k] <= sb[k],
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmple_cm8n8k4r8(signed char* sa, signed char* sb)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] <= ((double*)(sb))[i] ? 1 : 0;
  }
  return llres;
} /* wmma_cmple_cm8n8k4r8 */

/* ----- */

/* Element-wise ">=" of a real(2) fragment against a float scalar.
 * bval is converted to a duplicated half pair by __pgi_float2half2_rn and
 * each of the four 32-bit words of sa is compared against it with
 * set.ge.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmpge_cm16n16k16r2s(signed char* sa, float bval)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *pairs = (const unsigned int *)sa;
  const unsigned int rhs = __pgi_float2half2_rn(bval);
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = pairs[w];
    unsigned int hit;
    asm volatile("set.ge.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmpge_cm16n16k16r2s */

/* Element-wise ">=" of a real(2) fragment against a real(2) scalar.
 * sb points at one 16-bit half value; it is duplicated into a 32-bit pair
 * and each of the four 32-bit words of sa is compared against it with
 * set.ge.f16x2.  Result byte 2*i is 1 when the low 16 bits of the i-th
 * comparison are non-zero, byte 2*i+1 likewise for the high 16 bits;
 * otherwise 0.  The eight bytes are returned packed in a long long. */
__device__ static __inline__ long long
__pgi_wmma_cmpge_cm16n16k16r2sr2(signed char* sa, signed char* sb)
{
  long long llres;
  signed char *lres = (signed char *)&llres;
  unsigned int a, val;
  /* Read the whole 16-bit scalar through an unsigned short pointer.
   * Dereferencing sb directly would fetch only one sign-extended byte. */
  unsigned int b = __pgi_make_v2real2from1(*((unsigned short *)sb));
  for (int i = 0; i < 4; i++) {
    a = ((unsigned int*)(sa))[i];
    asm volatile("set.ge.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(val)  : "r"(a), "r"(b));
    lres[2*i]   = (val & 0x0000ffff) ? 1 : 0;
    lres[2*i+1] = (val & 0xffff0000) ? 1 : 0;
  }
  return llres;
}

/* Element-wise ">=" of the eight floats at sa against the scalar val.
 * Byte k of the returned long long is 1 when element k >= val, else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpge_cm16n16k16r4s(signed char* sa, float val)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *elems = (const float *)sa;
  int k = 0;
  while (k < 8) {
    flags[k] = (elems[k] >= val) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmpge_cm16n16k16r4s */

/* Element-wise ">=" of two real(2) fragments.  The four 32-bit words of sa
 * and sb are compared pairwise with set.ge.f16x2.  Result byte 2*i is 1
 * when the low 16 bits of the i-th comparison are non-zero, byte 2*i+1
 * likewise for the high 16 bits; otherwise 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpge_cm16n16k16r2(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const unsigned int *lhs_words = (const unsigned int *)sa;
  const unsigned int *rhs_words = (const unsigned int *)sb;
  for (int w = 0; w < 4; ++w) {
    const unsigned int lhs = lhs_words[w];
    const unsigned int rhs = rhs_words[w];
    unsigned int hit;
    asm volatile("set.ge.f16x2.f16x2 %0, %1, %2;"
                 : "=r"(hit)  : "r"(lhs), "r"(rhs));
    flags[2*w]   = ((hit & 0x0000ffff) != 0) ? 1 : 0;
    flags[2*w+1] = ((hit & 0xffff0000) != 0) ? 1 : 0;
  }
  return packed;
} /* wmma_cmpge_cm16n16k16r2 */

/* Element-wise ">=" of the eight floats at sa against the eight at sb.
 * Byte k of the returned long long is 1 when sa[k] >= sb[k], else 0. */
__device__ static __inline__ long long
__pgi_wmma_cmpge_cm16n16k16r4(signed char* sa, signed char* sb)
{
  long long packed;
  signed char *flags = (signed char *)&packed;
  const float *lhs = (const float *)sa;
  const float *rhs = (const float *)sb;
  int k = 0;
  while (k < 8) {
    flags[k] = (lhs[k] >= rhs[k]) ? 1 : 0;
    ++k;
  }
  return packed;
} /* wmma_cmpge_cm16n16k16r4 */

/* Element-wise ">=" of the two doubles at sa against the scalar val.
 * Byte k (k = 0, 1) of the returned long long is 1 when element k >= val,
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmpge_cm8n8k4r8s(signed char* sa, double val)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] >= val ? 1 : 0;
  }
  return llres;
} /* wmma_cmpge_cm8n8k4r8s */

/* Element-wise ">=" of the two doubles at sa against the two at sb.
 * Byte k (k = 0, 1) of the returned long long is 1 when sa[k] >= sb[k],
 * else 0.  Only two flag bytes are produced, so the result is
 * zero-initialised to keep the remaining six bytes defined. */
__device__ static __inline__ long long
__pgi_wmma_cmpge_cm8n8k4r8(signed char* sa, signed char* sb)
{
  long long llres = 0;
  signed char *lres = (signed char *)&llres;
  for (int i = 0; i < 2; i++) {
    lres[i] = ((double*)(sa))[i] >= ((double*)(sb))[i] ? 1 : 0;
  }
  return llres;
} /* wmma_cmpge_cm8n8k4r8 */

/* Lane-exchange helpers, declared here and defined outside this part of the
 * file.  Each takes a value and a lane mask and returns a value of the same
 * type; the int variant is used by __pgi_wmma_shfl_wmmalogical.
 * NOTE(review): the names suggest an xor-pattern warp shuffle -- confirm
 * against the definitions. */
extern __device__ int   __pgi_shfl_xori2(int var, int lanemask);
extern __device__ float __pgi_shfl_xorf2(float var, int lanemask);
extern __device__ double __pgi_shfl_xord2(double var, int lanemask);

/* Logical reductions        */
/* ------------------------- */
/* Accumulate lcount with the values obtained from __pgi_shfl_xori2 for the
 * lane masks 1, 2, 4, 8 and 16, in that order, and return the total.
 * NOTE(review): presumably this sums lcount over all 32 lanes of a warp --
 * confirm against the definition of __pgi_shfl_xori2. */
__device__ static __inline__ int
__pgi_wmma_shfl_wmmalogical(int lcount)
{
  for (int lane = 1; lane <= 16; lane <<= 1) {
    const int partner = __pgi_shfl_xori2(lcount, lane);
    lcount += partner;
  }
  return lcount;
}

/* Count the non-zero bytes among the eight flags at lvals, combine that
 * count through __pgi_wmma_shfl_wmmalogical, and return 1 when the
 * combined count is positive, else 0. */
__device__ static __inline__ int
__pgi_wmma_any_wmmalogical1(signed char *lvals)
{
  int set_flags = 0;
  for (int k = 0; k < 8; ++k)
    set_flags += (lvals[k] != 0) ? 1 : 0;
  const int total = __pgi_wmma_shfl_wmmalogical(set_flags);
  return (total > 0);
}

/* Count the non-zero bytes among the eight flags at lvals, combine that
 * count through __pgi_wmma_shfl_wmmalogical, and return 1 when the
 * combined count equals 256, else 0. */
__device__ static __inline__ int
__pgi_wmma_all_wmmalogical1(signed char *lvals)
{
  int set_flags = 0;
  for (int k = 0; k < 8; ++k)
    set_flags += (lvals[k] != 0) ? 1 : 0;
  const int total = __pgi_wmma_shfl_wmmalogical(set_flags);
  return (total == 256);
}

/* Count the non-zero bytes among the eight flags at lvals and return that
 * count after combining it through __pgi_wmma_shfl_wmmalogical. */
__device__ static __inline__ int
__pgi_wmma_count_wmmalogical1(signed char *lvals)
{
  int set_flags = 0;
  for (int k = 0; k < 8; ++k)
    set_flags += (lvals[k] != 0) ? 1 : 0;
  return __pgi_wmma_shfl_wmmalogical(set_flags);
}

/* Count the non-zero bytes among the two flags at lvals, combine that
 * count through __pgi_wmma_shfl_wmmalogical, and return 1 when the
 * combined count is positive, else 0. */
__device__ static __inline__ int
__pgi_wmma_any_wmmalogical5(signed char *lvals)
{
  int set_flags = 0;
  for (int k = 0; k < 2; ++k)
    set_flags += (lvals[k] != 0) ? 1 : 0;
  const int total = __pgi_wmma_shfl_wmmalogical(set_flags);
  return (total > 0);
}

/* Count the non-zero bytes among the two flags at lvals, combine that
 * count through __pgi_wmma_shfl_wmmalogical, and return 1 when the
 * combined count equals 64, else 0. */
__device__ static __inline__ int
__pgi_wmma_all_wmmalogical5(signed char *lvals)
{
  int set_flags = 0;
  for (int k = 0; k < 2; ++k)
    set_flags += (lvals[k] != 0) ? 1 : 0;
  const int total = __pgi_wmma_shfl_wmmalogical(set_flags);
  return (total == 64);
}

/* Count the non-zero bytes among the two flags at lvals and return that
 * count after combining it through __pgi_wmma_shfl_wmmalogical. */
__device__ static __inline__ int
__pgi_wmma_count_wmmalogical5(signed char *lvals)
{
  int set_flags = 0;
  for (int k = 0; k < 2; ++k)
    set_flags += (lvals[k] != 0) ? 1 : 0;
  return __pgi_wmma_shfl_wmmalogical(set_flags);
}

/* wmma merge functions      */
/* ------------------------- */
/* Masked select over eight 16-bit elements: d[k] takes t[k] where the flag
 * byte l[k] is non-zero and f[k] otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm16n16k16r2r2r2m(signed char* d, signed char* t, signed char* f, signed char* l)
{
  unsigned short* out        = (unsigned short*)d;
  unsigned short* when_true  = (unsigned short*)t;
  unsigned short* when_false = (unsigned short*)f;
  for (int k = 0; k < 8; ++k) {
    if (l[k])
      out[k] = when_true[k];
    else
      out[k] = when_false[k];
  }
}

/* Masked select over eight 16-bit elements with a scalar "true" value:
 * the 16-bit value at t is read once, then d[k] takes it where the flag
 * byte l[k] is non-zero and f[k] otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm16n16k16r2sr2m(signed char* d, signed char* t, signed char* f, signed char* l)
{
  const unsigned short true_val = *((unsigned short*)t);
  unsigned short* out        = (unsigned short*)d;
  unsigned short* when_false = (unsigned short*)f;
  for (int k = 0; k < 8; ++k) {
    if (l[k])
      out[k] = true_val;
    else
      out[k] = when_false[k];
  }
}

/* Masked select over eight 16-bit elements with a scalar "false" value:
 * the 16-bit value at f is read once, then d[k] takes t[k] where the flag
 * byte l[k] is non-zero and that scalar otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm16n16k16r2r2sm(signed char* d, signed char* t, signed char* f, signed char* l)
{
  const unsigned short false_val = *((unsigned short*)f);
  unsigned short* out       = (unsigned short*)d;
  unsigned short* when_true = (unsigned short*)t;
  for (int k = 0; k < 8; ++k) {
    if (l[k])
      out[k] = when_true[k];
    else
      out[k] = false_val;
  }
}

/* Masked select over eight 16-bit elements with two scalars: the 16-bit
 * values at t and f are read once, then d[k] takes the t value where the
 * flag byte l[k] is non-zero and the f value otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm16n16k16r2ssm(signed char* d, signed char* t, signed char* f, signed char* l)
{
  const unsigned short true_val  = *((unsigned short*)t);
  const unsigned short false_val = *((unsigned short*)f);
  unsigned short* out = (unsigned short*)d;
  for (int k = 0; k < 8; ++k) {
    if (l[k])
      out[k] = true_val;
    else
      out[k] = false_val;
  }
}

/* Masked select over eight floats: d[k] takes t[k] where the flag byte
 * l[k] is non-zero and f[k] otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm16n16k16r4r4r4m(signed char* d, signed char* t, signed char* f, signed char* l)
{
  float* out        = (float*)d;
  float* when_true  = (float*)t;
  float* when_false = (float*)f;
  for (int k = 0; k < 8; ++k) {
    if (l[k])
      out[k] = when_true[k];
    else
      out[k] = when_false[k];
  }
}

/* Masked select over eight floats with a scalar "true" value: d[k] takes t
 * where the flag byte l[k] is non-zero and f[k] otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm16n16k16r4sr4m(signed char* d, float t, signed char* f, signed char* l)
{
  float* out        = (float*)d;
  float* when_false = (float*)f;
  for (int k = 0; k < 8; ++k) {
    if (l[k])
      out[k] = t;
    else
      out[k] = when_false[k];
  }
}

/* Masked select over eight floats with a scalar "false" value: d[k] takes
 * t[k] where the flag byte l[k] is non-zero and f otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm16n16k16r4r4sm(signed char* d, signed char* t, float f, signed char* l)
{
  float* out       = (float*)d;
  float* when_true = (float*)t;
  for (int k = 0; k < 8; ++k) {
    if (l[k])
      out[k] = when_true[k];
    else
      out[k] = f;
  }
}

/* Masked select over eight floats with two scalars: d[k] takes t where the
 * flag byte l[k] is non-zero and f otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm16n16k16r4ssm(signed char* d, float t, float f, signed char* l)
{
  float* out = (float*)d;
  for (int k = 0; k < 8; ++k) {
    if (l[k])
      out[k] = t;
    else
      out[k] = f;
  }
}

/* Masked select over two doubles: d[k] takes t[k] where the flag byte l[k]
 * is non-zero and f[k] otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm8n8k4r8r8r8m(signed char* d, signed char* t, signed char* f, signed char* l)
{
  double* out        = (double*)d;
  double* when_true  = (double*)t;
  double* when_false = (double*)f;
  for (int k = 0; k < 2; ++k) {
    if (l[k])
      out[k] = when_true[k];
    else
      out[k] = when_false[k];
  }
}

/* Masked select over two doubles with a scalar "true" value: d[k] takes t
 * where the flag byte l[k] is non-zero and f[k] otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm8n8k4r8sr8m(signed char* d, double t, signed char* f, signed char* l)
{
  double* out        = (double*)d;
  double* when_false = (double*)f;
  for (int k = 0; k < 2; ++k) {
    if (l[k])
      out[k] = t;
    else
      out[k] = when_false[k];
  }
}

/* Masked select over two doubles with a scalar "false" value: d[k] takes
 * t[k] where the flag byte l[k] is non-zero and f otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm8n8k4r8r8sm(signed char* d, signed char* t, double f, signed char* l)
{
  double* out       = (double*)d;
  double* when_true = (double*)t;
  for (int k = 0; k < 2; ++k) {
    if (l[k])
      out[k] = when_true[k];
    else
      out[k] = f;
  }
}

/* Masked select over two doubles with two scalars: d[k] takes t where the
 * flag byte l[k] is non-zero and f otherwise. */
__device__ static __inline__ void
__pgi_wmma_merge_cm8n8k4r8ssm(signed char* d, double t, double f, signed char* l)
{
  double* out = (double*)d;
  for (int k = 0; k < 2; ++k) {
    if (l[k])
      out[k] = t;
    else
      out[k] = f;
  }
}

/* arithmetic on real(2) */
/* -------------------------------------------------------------------------*/
/* Return the half-precision sum (PTX add.f16) of the 16-bit values at x
 * and y, as a raw 16-bit pattern. */
__device__ static __inline__ unsigned short
__pgi_wmma_add_real2(signed char *x, signed char *y)
{
  const unsigned short lhs = ((unsigned short *)x)[0];
  const unsigned short rhs = ((unsigned short *)y)[0];
  unsigned short sum;
  asm volatile("add.f16 %0, %1, %2;"
               : "=h"(sum)  : "h"(lhs), "h"(rhs));
  return sum;
}

/* Half-precision subtract (x - y).  x and y each point at one 16-bit half
 * value; the difference is returned as its raw 16-bit pattern (PTX sub.f16). */
__device__ static __inline__ unsigned short
__pgi_wmma_sub_real2(signed char *x, signed char *y)
{
  const unsigned short lhs = ((unsigned short *)x)[0];
  const unsigned short rhs = ((unsigned short *)y)[0];
  unsigned short diff;
  asm volatile("sub.f16 %0, %1, %2;" : "=h"(diff) : "h"(lhs), "h"(rhs));
  return diff;
}
/* Half-precision multiply.  x and y each point at one 16-bit half value;
 * the product is returned as its raw 16-bit pattern (PTX mul.f16). */
__device__ static __inline__ unsigned short
__pgi_wmma_mul_real2(signed char *x, signed char *y)
{
  const unsigned short lhs = ((unsigned short *)x)[0];
  const unsigned short rhs = ((unsigned short *)y)[0];
  unsigned short prod;
  asm volatile("mul.f16 %0, %1, %2;" : "=h"(prod) : "h"(lhs), "h"(rhs));
  return prod;
}
/* Half-precision divide (x / y), computed in single precision.
 * Both operands are widened with __pgi_half2float, the quotient is formed as
 * numerator * rcp.approx(denominator) and rounded back to half.  When the
 * rounded result is non-zero but its magnitude bits are below 0x008f, one
 * fused-multiply-add correction step is applied before rounding again.
 * Returns the raw 16-bit half pattern. */
__device__ static __inline__ unsigned short
__pgi_wmma_div_real2(signed char *x, signed char *y)
{
  const unsigned short tinyLimit = 0x008f;
  const unsigned short numBits = *((unsigned short *)x);
  const unsigned short denBits = *((unsigned short *)y);
  const float num = __pgi_half2float(numBits);
  const float den = __pgi_half2float(denBits);
  float recip;

  asm volatile("rcp.approx.f32 %0, %1;" : "=f"(recip)  : "f"(den));

  float quot = recip * num;
  unsigned short result = __pgi_float2half_rn(quot);
  /* Magnitude of the result with the sign bit cleared. */
  const unsigned short mag = result & 0x7fff;
  if ((mag != 0) && (mag < tinyLimit)) {
    /* Residual num - den*quot, then refine the quotient with it. */
    const float resid = __fmaf_rn(-den, quot, num);
    quot = __fmaf_rn(recip, resid, quot);
    result = __pgi_float2half_rn(quot);
  }
  return result;
}
/* Half-precision fused multiply-add: x * y + z with round-to-nearest
 * (PTX fma.rn.f16).  Each pointer addresses one 16-bit half value; the
 * result is returned as its raw 16-bit pattern. */
__device__ static __inline__ unsigned short
__pgi_wmma_fma_real2(signed char *x, signed char *y, signed char *z)
{
  const unsigned short mulA = ((unsigned short *)x)[0];
  const unsigned short mulB = ((unsigned short *)y)[0];
  const unsigned short addend = ((unsigned short *)z)[0];
  unsigned short res;
  asm volatile("fma.rn.f16 %0, %1, %2, %3;" : "=h"(res) : "h"(mulA), "h"(mulB), "h"(addend));
  return res;
}

/* Convert a 32-bit signed integer to half precision (round-to-nearest,
 * PTX cvt.rn.f16.s32) and return the raw 16-bit pattern.
 * Only kind == 2 is handled; any other kind returns 0. */
__device__ static __inline__ unsigned short
__pgi_wmma_realint_real2(int x, int kind)
{
  if (kind != 2)
    return 0;
  unsigned short bits;
  asm volatile("cvt.rn.f16.s32 %0, %1;" : "=h"(bits) : "r"(x));
  return bits;
}

/* Convert a float to half precision via __pgi_float2half_rn and return the
 * raw 16-bit pattern.  The kind argument is not used. */
__device__ static __inline__ unsigned short
__pgi_wmma_realreal_real2(float x, int kind)
{
  const unsigned short bits = __pgi_float2half_rn(x);
  return bits;
}

/* Read the 16-bit half value at x and widen it to float via
 * __pgi_half2float.  The kind argument is not used. */
__device__ static __inline__ float
__pgi_wmma_realreal2_real(signed char *x, int kind)
{
  const unsigned short halfBits = ((unsigned short *)x)[0];
  const float widened = __pgi_half2float(halfBits);
  return widened;
}

/* vector(2) of real(2) */
/* -------------------------------------------------------------------------*/
/* Negate both half-precision values packed in the 32-bit word at x and
 * return the packed result.
 *
 * Negation is done by flipping the sign bit of each 16-bit half.  The
 * previous implementation computed 0 - x with sub.f16x2, which yields +0
 * for an input of +0 (round-to-nearest gives +0 - +0 = +0) instead of the
 * correctly negated -0.  Flipping the sign bit handles zeros, infinities
 * and subnormals uniformly. */
__device__ static __inline__ unsigned int
__pgi_negate_v2real2(signed char *x)
{
  const unsigned int packed = *((unsigned int *)x);
  return packed ^ 0x80008000u;
}
/* Packed half-precision add (PTX add.f16x2).  x and y each point at a
 * 32-bit word holding two halves; the packed sum is returned. */
__device__ static __inline__ unsigned int
__pgi_add_v2real2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  unsigned int sum;
  asm volatile("add.f16x2 %0, %1, %2;" : "=r"(sum) : "r"(lhs), "r"(rhs));
  return sum;
}
/* Packed half-precision subtract, x - y (PTX sub.f16x2).  x and y each point
 * at a 32-bit word holding two halves; the packed difference is returned. */
__device__ static __inline__ unsigned int
__pgi_sub_v2real2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  unsigned int diff;
  asm volatile("sub.f16x2 %0, %1, %2;" : "=r"(diff) : "r"(lhs), "r"(rhs));
  return diff;
}
/* Packed half-precision multiply (PTX mul.f16x2).  x and y each point at a
 * 32-bit word holding two halves; the packed product is returned. */
__device__ static __inline__ unsigned int
__pgi_mul_v2real2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  unsigned int prod;
  asm volatile("mul.f16x2 %0, %1, %2;" : "=r"(prod) : "r"(lhs), "r"(rhs));
  return prod;
}
/* Packed half-precision divide.  Each of the two halves at x is divided by
 * the matching half at y using __pgi_wmma_div_real2; the two 16-bit
 * quotients are then packed into one 32-bit word (first half in the low
 * 16 bits, second half in the high 16 bits). */
__device__ static __inline__ unsigned int
__pgi_div_v2real2(signed char *x, signed char *y)
{
  const unsigned short firstQuot = __pgi_wmma_div_real2(x, y);
  const unsigned short secondQuot = __pgi_wmma_div_real2(x+2, y+2);
  unsigned int packed;
  asm volatile("mov.b32 %0, {%1, %2};" : "=r"(packed) : "h"(firstQuot), "h"(secondQuot));
  return packed;
}
/* Packed half-precision fused multiply-add, x * y + z with round-to-nearest
 * (PTX fma.rn.f16x2).  Each pointer addresses a 32-bit word holding two
 * halves; the packed result is returned. */
__device__ static __inline__ unsigned int
__pgi_fma_v2real2(signed char *x, signed char *y, signed char*z)
{
  const unsigned int mulA = ((unsigned int *)x)[0];
  const unsigned int mulB = ((unsigned int *)y)[0];
  const unsigned int addend = ((unsigned int *)z)[0];
  unsigned int res;
  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;" : "=r"(res) : "r"(mulA), "r"(mulB), "r"(addend));
  return res;
}

/* comparison of vector(2) of real(2), returning v2logical2 */
/* -------------------------------------------------------------------------*/
/* Per-half equality test of two packed half pairs (PTX set.eq.f16x2.f16x2).
 * The result is itself a packed half pair; per the PTX ISA each half is
 * 1.0 where the comparison holds and 0.0 where it does not. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmpeq_v2real2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  unsigned int flags;
  asm volatile("set.eq.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* Per-half inequality test of two packed half pairs (PTX set.ne.f16x2.f16x2).
 * The result is itself a packed half pair; per the PTX ISA each half is
 * 1.0 where the comparison holds and 0.0 where it does not. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmpne_v2real2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  unsigned int flags;
  asm volatile("set.ne.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* Per-half "a > b" test on two packed half pairs passed by value
 * (PTX set.gt.f16x2.f16x2).  The result is a packed half pair; per the
 * PTX ISA each half is 1.0 where the comparison holds and 0.0 otherwise. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmpgt_v2real2_byvalue(unsigned int a, unsigned int b)
{
  unsigned int flags;
  asm volatile("set.gt.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(a), "r"(b));
  return flags;
}

/* Per-half "x > y" test on the packed half pairs stored at x and y.
 * Loads both 32-bit words and forwards them to
 * __pgi_wmma_cmpgt_v2real2_byvalue, returning its packed result. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmpgt_v2real2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  const unsigned int flags = __pgi_wmma_cmpgt_v2real2_byvalue(lhs, rhs);
  return flags;
}

/* Per-half "a < b" test on two packed half pairs passed by value
 * (PTX set.lt.f16x2.f16x2).  The result is a packed half pair; per the
 * PTX ISA each half is 1.0 where the comparison holds and 0.0 otherwise. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmplt_v2real2_byvalue(unsigned int a, unsigned int b)
{
  unsigned int flags;
  asm volatile("set.lt.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(a), "r"(b));
  return flags;
}

/* Per-half "x < y" test on the packed half pairs stored at x and y.
 * Loads both 32-bit words and forwards them to
 * __pgi_wmma_cmplt_v2real2_byvalue, returning its packed result. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmplt_v2real2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  const unsigned int flags = __pgi_wmma_cmplt_v2real2_byvalue(lhs, rhs);
  return flags;
}

/* Per-half "x >= y" test of two packed half pairs (PTX set.ge.f16x2.f16x2).
 * The result is itself a packed half pair; per the PTX ISA each half is
 * 1.0 where the comparison holds and 0.0 where it does not. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmpge_v2real2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  unsigned int flags;
  asm volatile("set.ge.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* Per-half "x <= y" test of two packed half pairs (PTX set.le.f16x2.f16x2).
 * The result is itself a packed half pair; per the PTX ISA each half is
 * 1.0 where the comparison holds and 0.0 where it does not. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmple_v2real2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  unsigned int flags;
  asm volatile("set.le.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* comparison of vector(2) of real(2) and real(4), returning v2logical2 */
/* -------------------------------------------------------------------------*/
/* Per-half equality test between the packed half pair at x and a float.
 * The float is first converted to a packed half pair with
 * __pgi_float2half2_rn (defined elsewhere), then compared using
 * set.eq.f16x2.f16x2.  Returns the packed per-half result. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmpeq_v2realr(signed char *x, float y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = __pgi_float2half2_rn(y);
  unsigned int flags;
  asm volatile("set.eq.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* Per-half inequality test between the packed half pair at x and a float.
 * The float is first converted to a packed half pair with
 * __pgi_float2half2_rn (defined elsewhere), then compared using
 * set.ne.f16x2.f16x2.  Returns the packed per-half result. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmpne_v2realr(signed char *x, float y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = __pgi_float2half2_rn(y);
  unsigned int flags;
  asm volatile("set.ne.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* Per-half "x > y" test between the packed half pair at x and a float.
 * The float is first converted to a packed half pair with
 * __pgi_float2half2_rn (defined elsewhere), then compared using
 * set.gt.f16x2.f16x2.  Returns the packed per-half result. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmpgt_v2realr(signed char *x, float y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = __pgi_float2half2_rn(y);
  unsigned int flags;
  asm volatile("set.gt.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* Per-half "x < y" test between the packed half pair at x and a float.
 * The float is first converted to a packed half pair with
 * __pgi_float2half2_rn (defined elsewhere), then compared using
 * set.lt.f16x2.f16x2.  Returns the packed per-half result. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmplt_v2realr(signed char *x, float y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = __pgi_float2half2_rn(y);
  unsigned int flags;
  asm volatile("set.lt.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* Per-half "x >= y" test between the packed half pair at x and a float.
 * The float is first converted to a packed half pair with
 * __pgi_float2half2_rn (defined elsewhere), then compared using
 * set.ge.f16x2.f16x2.  Returns the packed per-half result. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmpge_v2realr(signed char *x, float y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = __pgi_float2half2_rn(y);
  unsigned int flags;
  asm volatile("set.ge.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* Per-half "x <= y" test between the packed half pair at x and a float.
 * The float is first converted to a packed half pair with
 * __pgi_float2half2_rn (defined elsewhere), then compared using
 * set.le.f16x2.f16x2.  Returns the packed per-half result. */
__device__ static __inline__ unsigned int
__pgi_wmma_cmple_v2realr(signed char *x, float y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = __pgi_float2half2_rn(y);
  unsigned int flags;
  asm volatile("set.le.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* logical expressions of v2logical2 type */
/* -------------------------------------------------------------------------*/
/* Logical NOT of a packed pair of 16-bit logicals.
 * XORs the word at x with 0x3c003c00, i.e. toggles the 0x3c00 pattern in
 * each half (0x3c00 is the half-precision encoding of 1.0, so a half
 * holding 0x0000 becomes 0x3c00 and vice versa). */
__device__ static __inline__ unsigned int
__pgi_lognot_v2logical2(signed char *x)
{
  const unsigned int toggle = 0x3c003c00;
  const unsigned int packed = ((unsigned int *)x)[0];
  return packed ^ toggle;
}

/* Logical AND of two packed pairs of 16-bit logicals, computed as a
 * bitwise AND of the 32-bit words at x and y. */
__device__ static __inline__ unsigned int
__pgi_logand_v2logical2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  return lhs & rhs;
}

/* Logical OR of two packed pairs of 16-bit logicals, computed as a
 * bitwise OR of the 32-bit words at x and y. */
__device__ static __inline__ unsigned int
__pgi_logor_v2logical2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  return lhs | rhs;
}

/* Logical equivalence (.eqv.) of two packed pairs of 16-bit logicals.
 * The halves are compared as half-precision values with
 * set.eq.f16x2.f16x2, so each result half is 1.0 (per the PTX ISA) where
 * the operands compare equal and 0.0 otherwise. */
__device__ static __inline__ unsigned int
__pgi_logeqv_v2logical2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  unsigned int flags;
  asm volatile("set.eq.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* Logical non-equivalence (.neqv.) of two packed pairs of 16-bit logicals.
 * The halves are compared as half-precision values with
 * set.ne.f16x2.f16x2, so each result half is 1.0 (per the PTX ISA) where
 * the operands differ and 0.0 otherwise. */
__device__ static __inline__ unsigned int
__pgi_logneqv_v2logical2(signed char *x, signed char *y)
{
  const unsigned int lhs = ((unsigned int *)x)[0];
  const unsigned int rhs = ((unsigned int *)y)[0];
  unsigned int flags;
  asm volatile("set.ne.f16x2.f16x2 %0, %1, %2;" : "=r"(flags) : "r"(lhs), "r"(rhs));
  return flags;
}

/* compression of v2logical2 using any, all, and count */
/* -------------------------------------------------------------------------*/
/* ANY reduction over a packed pair of 16-bit logicals: returns 1 when at
 * least one of the two halves has any bit set, 0 otherwise. */
__device__ static __inline__ unsigned int
__pgi_any_v2logical2(signed char *x)
{
  const unsigned int packed = ((unsigned int *)x)[0];
  const unsigned int upper = packed & 0xffff0000;
  const unsigned int lower = packed & 0x0000ffff;
  if (upper != 0)
    return 1;
  return (lower != 0) ? 1 : 0;
}

/* ALL reduction over a packed pair of 16-bit logicals: returns 1 when both
 * halves have at least one bit set, 0 otherwise. */
__device__ static __inline__ unsigned int
__pgi_all_v2logical2(signed char *x)
{
  const unsigned int packed = ((unsigned int *)x)[0];
  const unsigned int upper = packed & 0xffff0000;
  const unsigned int lower = packed & 0x0000ffff;
  if (upper == 0)
    return 0;
  return (lower != 0) ? 1 : 0;
}

/* COUNT reduction over a packed pair of 16-bit logicals: returns how many
 * of the two halves (0, 1 or 2) have at least one bit set. */
__device__ static __inline__ unsigned int
__pgi_count_v2logical2(signed char *x)
{
  const unsigned int packed = ((unsigned int *)x)[0];
  const unsigned int upperSet = ((packed & 0xffff0000) != 0) ? 1 : 0;
  const unsigned int lowerSet = ((packed & 0x0000ffff) != 0) ? 1 : 0;
  return upperSet + lowerSet;
}

/* Maxval */
/* Maximum over half-precision data held as 4 packed half pairs at x.
 *
 * Step 1: fold the 4 local words into one, keeping the larger value
 *         independently in the low and in the high half.
 * Step 2: repeat the same per-half selection against values obtained from
 *         __pgi_shfl_xori2 with xor distances 1, 2, 4, 8, 16.
 * Step 3: compare the low half with the high half and return the larger
 *         one as a raw 16-bit half pattern.
 *
 * The compare helper returns a non-zero pattern per half when true; it is
 * widened to an all-ones 16-bit mask so it can be used for bit selection. */
__device__ static __inline__ unsigned short
__pgi_maxval_cm16n16k16r2(signed char* x)
{
  const unsigned int *pairs = (const unsigned int *)x;
  unsigned int best = pairs[0];
  unsigned int other, keep;
  int k;

  for (k = 1; k < 4; ++k) {
    other = pairs[k];
    keep = __pgi_wmma_cmpgt_v2real2_byvalue(best, other);
    if (keep & 0x0000ffff) keep |= 0x0000ffff;
    if (keep & 0xffff0000) keep |= 0xffff0000;
    best = (best & keep) | (other & ~keep);
  }

  for (k = 1; k < 32; k <<= 1) {
    other = __pgi_shfl_xori2(best, (unsigned int)k);
    keep = __pgi_wmma_cmpgt_v2real2_byvalue(best, other);
    if (keep & 0x0000ffff) keep |= 0x0000ffff;
    if (keep & 0xffff0000) keep |= 0xffff0000;
    best = (best & keep) | (other & ~keep);
  }

  /* Swap the two halves so low can be compared against high. */
  other = (best >> 16) | (best << 16);
  keep = __pgi_wmma_cmpgt_v2real2_byvalue(best, other);
  if ((keep & 0x0000ffff) == 0)
    return ((unsigned short) (other & 0x0000ffff));
  return ((unsigned short) (best & 0x0000ffff));
}

/* Maximum over 8 floats at frag, then combined with values obtained from
 * __pgi_shfl_xorf2 at xor distances 1, 2, 4, 8, 16 (in that order).
 * A candidate replaces the running value only when it compares strictly
 * greater. */
__device__ static __inline__ float
__pgi_maxval_cm16n16k16r4(signed char* frag)
{
  const float *vals = (const float *)frag;
  float best = vals[0];
  for (int k = 1; k < 8; ++k) {
    const float cand = vals[k];
    if (cand > best) best = cand;
  }
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const float peer = __pgi_shfl_xorf2(best, dist);
    if (peer > best) best = peer;
  }
  return best;
}

/* Masked maximum over half-precision data held as 4 packed half pairs at x.
 *
 * mask holds one byte per half element (8 bytes): mask[2*i] guards the low
 * half of word i and mask[2*i + 1] guards the high half.  A zero mask byte
 * excludes that element.  The running value starts at -Inf in both halves.
 *
 * After the local fold, the per-half selection is repeated against values
 * obtained from __pgi_shfl_xori2 with xor distances 1, 2, 4, 8, 16, and
 * finally the low half is compared with the high half.  Returns the larger
 * one as a raw 16-bit half pattern.
 *
 * Fix: the mask used to be consumed with `*mask++` on the right-hand side of
 * `||`.  Because `||` short-circuits, the pointer was not advanced whenever
 * the running value already compared greater, so every later element was
 * tested against the wrong mask byte.  The mask is now indexed explicitly. */
__device__ static __inline__ unsigned short
__pgi_maxval_cm16n16k16r2_wm(signed char* x, signed char* mask)
{
  unsigned int val = 0xfc00fc00;  /* -Inf */
  unsigned int y, cres;
  for (int i = 0; i < 4; i++) {
    y = ((unsigned int *)(x))[i];
    cres = __pgi_wmma_cmpgt_v2real2_byvalue(val, y);
    /* Keep the running value when it is larger or the element is masked out. */
    if (((cres & 0x0000ffff)!=0) || (mask[2*i] == 0))     cres = cres | 0x0000ffff;
    if (((cres & 0xffff0000)!=0) || (mask[2*i + 1] == 0)) cres = cres | 0xffff0000;
    val = (val & cres) | (y & ~cres);
  }
  for (int i = 1; i < 32; i*=2) {
    y = __pgi_shfl_xori2(val, (unsigned int)i);
    cres = __pgi_wmma_cmpgt_v2real2_byvalue(val, y);
    /* Widen each true half to an all-ones 16-bit selection mask. */
    if ((cres & 0x0000ffff)!=0) cres = cres | 0x0000ffff;
    if ((cres & 0xffff0000)!=0) cres = cres | 0xffff0000;
    val = (val & cres) | (y & ~cres);
  }
  /* Swap the two halves so low can be compared against high. */
  y = ((val & 0xffff0000) >> 16) | ((val & 0x0000ffff) << 16);
  cres = __pgi_wmma_cmpgt_v2real2_byvalue(val, y);
  if ((cres & 0x0000ffff)!=0)
    return ((unsigned short) (val & 0x0000ffff));
  else
    return ((unsigned short) (  y & 0x0000ffff));
}

/* Masked maximum over 8 floats at frag; mask holds one byte per element and
 * a zero byte excludes that element.  If element 0 is excluded the running
 * value starts from int_as_float(0xff7fffff) (the most negative finite
 * float bit pattern).  The local result is then combined with values from
 * __pgi_shfl_xorf2 at xor distances 1, 2, 4, 8, 16 (in that order), a
 * candidate winning only when strictly greater. */
__device__ static __inline__ float
__pgi_maxval_cm16n16k16r4_wm(signed char* frag, signed char* mask)
{
  const float *vals = (const float *)frag;
  float best = mask[0] ? vals[0] : int_as_float(0xff7fffff);
  for (int k = 1; k < 8; ++k) {
    if (mask[k] && vals[k] > best) best = vals[k];
  }
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const float peer = __pgi_shfl_xorf2(best, dist);
    if (peer > best) best = peer;
  }
  return best;
}

/* Maximum over 2 doubles at frag, then combined with values obtained from
 * __pgi_shfl_xord2 at xor distances 1, 2, 4, 8, 16 (in that order).
 * A candidate replaces the running value only when strictly greater. */
__device__ static __inline__ double
__pgi_maxval_cm8n8k4r8(signed char* frag)
{
  const double *vals = (const double *)frag;
  double best = vals[0];
  if (vals[1] > best) best = vals[1];
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const double peer = __pgi_shfl_xord2(best, dist);
    if (peer > best) best = peer;
  }
  return best;
}

/* Masked maximum over 2 doubles at frag; mask holds one byte per element
 * and a zero byte excludes that element.  If element 0 is excluded the
 * running value starts from the bit pattern 0xffefffffffffffff (the most
 * negative finite double).  The local result is then combined with values
 * from __pgi_shfl_xord2 at xor distances 1, 2, 4, 8, 16 (in that order),
 * a candidate winning only when strictly greater. */
__device__ static __inline__ double
__pgi_maxval_cm8n8k4r8_wm(signed char* frag, signed char *mask)
{
  const double *vals = (const double *)frag;
  double best = mask[0] ? vals[0] : __longlong_as_double(0xffefffffffffffffll);
  if (mask[1] && vals[1] > best) best = vals[1];
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const double peer = __pgi_shfl_xord2(best, dist);
    if (peer > best) best = peer;
  }
  return best;
}

/* Maximum absolute value over 8 floats at frag.  Each element is negated
 * when it compares below zero, the largest such magnitude is kept, and the
 * result is combined with values from __pgi_shfl_xorf2 at xor distances
 * 1, 2, 4, 8, 16 (in that order).  A candidate wins only when strictly
 * greater. */
__device__ static __inline__ float
__pgi_maxabsval_cm16n16k16r4(signed char* frag)
{
  const float *vals = (const float *)frag;
  float best = vals[0];
  if (best < 0.0f) best = -best;
  for (int k = 1; k < 8; ++k) {
    float mag = vals[k];
    if (mag < 0.0f) mag = -mag;
    if (mag > best) best = mag;
  }
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const float peer = __pgi_shfl_xorf2(best, dist);
    if (peer > best) best = peer;
  }
  return best;
}

/* ---- */

/* Minimum over half-precision data held as 4 packed half pairs at x.
 *
 * Step 1: fold the 4 local words into one, keeping the smaller value
 *         independently in the low and in the high half.
 * Step 2: repeat the same per-half selection against values obtained from
 *         __pgi_shfl_xori2 with xor distances 1, 2, 4, 8, 16.
 * Step 3: compare the low half with the high half and return the smaller
 *         one as a raw 16-bit half pattern.
 *
 * The compare helper returns a non-zero pattern per half when true; it is
 * widened to an all-ones 16-bit mask so it can be used for bit selection. */
__device__ static __inline__ unsigned short
__pgi_minval_cm16n16k16r2(signed char* x)
{
  const unsigned int *pairs = (const unsigned int *)x;
  unsigned int best = pairs[0];
  unsigned int other, keep;
  int k;

  for (k = 1; k < 4; ++k) {
    other = pairs[k];
    keep = __pgi_wmma_cmplt_v2real2_byvalue(best, other);
    if (keep & 0x0000ffff) keep |= 0x0000ffff;
    if (keep & 0xffff0000) keep |= 0xffff0000;
    best = (best & keep) | (other & ~keep);
  }

  for (k = 1; k < 32; k <<= 1) {
    other = __pgi_shfl_xori2(best, (unsigned int)k);
    keep = __pgi_wmma_cmplt_v2real2_byvalue(best, other);
    if (keep & 0x0000ffff) keep |= 0x0000ffff;
    if (keep & 0xffff0000) keep |= 0xffff0000;
    best = (best & keep) | (other & ~keep);
  }

  /* Swap the two halves so low can be compared against high. */
  other = (best >> 16) | (best << 16);
  keep = __pgi_wmma_cmplt_v2real2_byvalue(best, other);
  if ((keep & 0x0000ffff) == 0)
    return ((unsigned short) (other & 0x0000ffff));
  return ((unsigned short) (best & 0x0000ffff));
}

/* Minimum over 8 floats at frag, then combined with values obtained from
 * __pgi_shfl_xorf2 at xor distances 1, 2, 4, 8, 16 (in that order).
 * A candidate replaces the running value only when it compares strictly
 * smaller. */
__device__ static __inline__ float
__pgi_minval_cm16n16k16r4(signed char* frag)
{
  const float *vals = (const float *)frag;
  float best = vals[0];
  for (int k = 1; k < 8; ++k) {
    const float cand = vals[k];
    if (cand < best) best = cand;
  }
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const float peer = __pgi_shfl_xorf2(best, dist);
    if (peer < best) best = peer;
  }
  return best;
}

/* Masked minimum over 8 floats at frag; mask holds one byte per element and
 * a zero byte excludes that element.  If element 0 is excluded the running
 * value starts from int_as_float(0x7f7fffff) (the largest finite float bit
 * pattern).  The local result is then combined with values from
 * __pgi_shfl_xorf2 at xor distances 1, 2, 4, 8, 16 (in that order), a
 * candidate winning only when strictly smaller. */
__device__ static __inline__ float
__pgi_minval_cm16n16k16r4_wm(signed char* frag, signed char* mask)
{
  const float *vals = (const float *)frag;
  float best = mask[0] ? vals[0] : int_as_float(0x7f7fffff);
  for (int k = 1; k < 8; ++k) {
    if (mask[k] && vals[k] < best) best = vals[k];
  }
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const float peer = __pgi_shfl_xorf2(best, dist);
    if (peer < best) best = peer;
  }
  return best;
}

/* Minimum over 2 doubles at frag, then combined with values obtained from
 * __pgi_shfl_xord2 at xor distances 1, 2, 4, 8, 16 (in that order).
 * A candidate replaces the running value only when strictly smaller. */
__device__ static __inline__ double
__pgi_minval_cm8n8k4r8(signed char* frag)
{
  const double *vals = (const double *)frag;
  double best = vals[0];
  if (vals[1] < best) best = vals[1];
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const double peer = __pgi_shfl_xord2(best, dist);
    if (peer < best) best = peer;
  }
  return best;
}

/* Masked minimum over 2 doubles at frag; mask holds one byte per element
 * and a zero byte excludes that element.  If element 0 is excluded the
 * running value starts from the bit pattern 0x7fefffffffffffff (the largest
 * finite double).  The local result is then combined with values from
 * __pgi_shfl_xord2 at xor distances 1, 2, 4, 8, 16 (in that order),
 * a candidate winning only when strictly smaller. */
__device__ static __inline__ double
__pgi_minval_cm8n8k4r8_wm(signed char* frag, signed char *mask)
{
  const double *vals = (const double *)frag;
  double best = mask[0] ? vals[0] : __longlong_as_double(0x7fefffffffffffffll);
  if (mask[1] && vals[1] < best) best = vals[1];
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const double peer = __pgi_shfl_xord2(best, dist);
    if (peer < best) best = peer;
  }
  return best;
}

/* Minimum absolute value over 8 floats at frag.  Each element is negated
 * when it compares below zero, the smallest such magnitude is kept, and the
 * result is combined with values from __pgi_shfl_xorf2 at xor distances
 * 1, 2, 4, 8, 16 (in that order).  A candidate wins only when strictly
 * smaller. */
__device__ static __inline__ float
__pgi_minabsval_cm16n16k16r4(signed char* frag)
{
  const float *vals = (const float *)frag;
  float best = vals[0];
  if (best < 0.0f) best = -best;
  for (int k = 1; k < 8; ++k) {
    float mag = vals[k];
    if (mag < 0.0f) mag = -mag;
    if (mag < best) best = mag;
  }
  for (unsigned int dist = 1; dist <= 16; dist <<= 1) {
    const float peer = __pgi_shfl_xorf2(best, dist);
    if (peer < best) best = peer;
  }
  return best;
}

/* Memory operations on vectors */
/* Report whether ptr lies in the global address space and, if so, how well
 * it is aligned.
 *
 * Returns 0 when PTX isspacep.global says the address is not global.
 * Otherwise returns __ffs(lowBits | 0x100), where lowBits is the low 16 bits
 * of the address: the 1-based position of the lowest set address bit,
 * capped at 9 (e.g. 1 = odd address, 3 = 4-byte aligned, 9 = at least
 * 256-byte aligned). */
__device__ static __inline__ int
__isGlobal(signed char *ptr)
{
  const unsigned int lowBits = ((unsigned long long)((void *)ptr)) & 0xffff;
  unsigned int inGlobal;
  asm volatile ("{ \n\t"
                "    .reg .pred p; \n\t"
                "    isspacep.global p, %1; \n\t"
                "    selp.u32 %0, 1, 0, p;  \n\t"
                "} \n\t" : "=r"(inGlobal) : "l"(ptr));
  if (inGlobal == 0)
    return 0;
  return __ffs(lowBits | 0x100);
}
/* Assign a real(2), dimension(2) source at x to a type(v2real2) at y.
 * When the source address is 4-byte aligned (its two low address bits are
 * zero) the pair is copied as a single 32-bit word; otherwise the two
 * halves at x and x+2 are combined through __pgi_halves2half2. */
__device__ static __inline__ void
__pgi_assgn1_v2real2(signed char *y, signed char *x)
{
  const unsigned long long srcAddr = (unsigned long long)((void *)x);
  const int wordAligned = ((srcAddr & 0x3) == 0);
  if (!wordAligned) {
    __pgi_halves2half2(y, x, x+2);
    return;
  }
  *((unsigned int *)y) = *((unsigned int *)x);
}
/* Assign a type(v2real2) at x to a real(2), dimension(2) destination at y.
 * When the destination address is 4-byte aligned (its two low address bits
 * are zero) the pair is stored as a single 32-bit word; otherwise it is
 * stored as two separate 16-bit halves. */
__device__ static __inline__ void
__pgi_assgn2_v2real2(signed char *y, signed char *x)
{
  const unsigned long long dstAddr = (unsigned long long)((void *)y);
  const int wordAligned = ((dstAddr & 0x3) == 0);
  if (wordAligned) {
    *((unsigned int *)y) = *((unsigned int *)x);
    return;
  }
  unsigned short *dstHalves = (unsigned short *)y;
  const unsigned short *srcHalves = (const unsigned short *)x;
  dstHalves[0] = srcHalves[0];
  dstHalves[1] = srcHalves[1];
}
/* Assign a float to a type(v2real2) at y: the value is converted with
 * __pgi_float2half2_rn and the resulting 32-bit word is stored at y
 * (float assigned to both halves of the pair). */
__device__ static __inline__ void
__pgi_assgn3_v2real2(signed char *y, float x)
{
  unsigned int *dst = (unsigned int *)y;
  *dst = __pgi_float2half2_rn(x);
}

/* Copy 16 bytes (eight 16-bit halves) from x to y as two 64-bit words. */
__device__ static __inline__ void
__pgi_assgn_v8real2(signed char *y, signed char *x)
{
  unsigned long long *dst = (unsigned long long *)y;
  const unsigned long long *src = (const unsigned long long *)x;
  for (int w = 0; w < 2; ++w)
    dst[w] = src[w];
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
/* Round a float to TF32 precision with PTX cvt.rna.tf32.f32 and return the
 * resulting 32-bit pattern reinterpreted as a float.
 * Only compiled for host passes or __CUDA_ARCH__ >= 800. */
__device__ static __inline__ float
__pgi_float_to_tf32(float in)
{
  float rounded;
  asm("{\n  .reg .b32 __$1;"
      "\n   cvt.rna.tf32.f32 __$1, %1;"
      "\n   mov.b32 %0, __$1;\n}\n" : "=f"(rounded) : "f"(in) );
  return rounded;
}
#endif
