/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_L #define TRMM_KERNEL_N TRMM_KERNEL_LR #define TRMM_KERNEL_T TRMM_KERNEL_LC #else #define GEMM_KERNEL GEMM_KERNEL_N #define TRMM_KERNEL_N TRMM_KERNEL_LN #define TRMM_KERNEL_T TRMM_KERNEL_LT #endif #undef TIMING #ifdef TIMING #define START_RPCC() rpcc_counter = rpcc() #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter #else #define START_RPCC() #define STOP_RPCC(COUNTER) #endif int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { BLASLONG m, n, lda, ldb; FLOAT *beta, *a, *b; BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; #ifdef TIMING unsigned long long rpcc_counter; unsigned long long innercost = 0; unsigned long long outercost = 0; unsigned long long gemmcost = 0; unsigned long long trmmcost = 0; double total; #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; b = (FLOAT *)args -> b; lda = args -> lda; ldb = args -> ldb; beta = (FLOAT *)args -> beta; if (range_n) { BLASLONG n_from = *(((BLASLONG *)range_n) + 0); BLASLONG n_to = *(((BLASLONG *)range_n) + 1); n = n_to - n_from; b += n_from * ldb * COMPSIZE; } if (beta) { #ifndef COMPLEX if (beta[0] != ONE) GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); if (beta[0] == ZERO) return 0; #else if ((beta[0] != ONE) || (beta[1] != ZERO)) GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; #endif } for(js = 0; js < n; js += GEMM_R){ min_j = n - js; if (min_j > GEMM_R) min_j = GEMM_R; #if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA)) min_l = m; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; if( min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA TRMM_IUTCOPY(min_l, min_i, a, lda, 0, 0, sa); #else TRMM_ILNCOPY(min_l, min_i, a, lda, 0, 0, sa); #endif STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); TRMM_KERNEL_N(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0); STOP_RPCC(trmmcost); } for(is = min_i; is < min_l; is += min_i){ min_i = min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; if( min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA TRMM_IUTCOPY(min_l, min_i, a, lda, 0, is, sa); #else TRMM_ILNCOPY(min_l, min_i, a, lda, 0, is, sa); #endif STOP_RPCC(innercost); START_RPCC(); TRMM_KERNEL_N(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is); STOP_RPCC(trmmcost); } for(ls = min_l; ls < m; ls += GEMM_Q){ min_l = m - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = ls; if (min_i > GEMM_P) min_i = GEMM_P; if( min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (ls * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + (ls ) * COMPSIZE, lda, sa); #endif STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(gemmcost); START_RPCC(); GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb); STOP_RPCC(gemmcost); } for(is = min_i; is < ls; is += min_i){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; if( min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); #endif STOP_RPCC(innercost); START_RPCC(); GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); STOP_RPCC(gemmcost); } for(is = ls; is < ls + min_l; is += min_i){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; if( min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA TRMM_IUTCOPY(min_l, min_i, a, lda, ls, is, sa); #else TRMM_ILNCOPY(min_l, min_i, a, lda, ls, is, sa); #endif STOP_RPCC(innercost); START_RPCC(); TRMM_KERNEL_N(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); STOP_RPCC(trmmcost); } } #else min_l = m; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; if (min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa); #else TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa); #endif STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0); STOP_RPCC(trmmcost); } for(is = m - min_l + min_i; is < m; is += min_i){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; if (min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, is, sa); #else TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, is, sa); #endif STOP_RPCC(innercost); START_RPCC(); TRMM_KERNEL_T(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l); STOP_RPCC(trmmcost); } for(ls = m - min_l; ls > 0; ls -= GEMM_Q){ min_l = ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; if (min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa); #else TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa); #endif STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0); STOP_RPCC(trmmcost); } for(is = ls - min_l + min_i; is < ls; is += min_i){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; if (min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, is, sa); #else TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, is, sa); #endif STOP_RPCC(innercost); START_RPCC(); TRMM_KERNEL_T(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l); STOP_RPCC(trmmcost); } for(is = ls; is < m; is += min_i){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; if (min_i > GEMM_UNROLL_M){ min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); #endif STOP_RPCC(innercost); START_RPCC(); GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); STOP_RPCC(gemmcost); } } #endif } #ifdef TIMING total = (double)outercost + (double)innercost + (double)gemmcost + (double)trmmcost; printf( "Copy A : %5.2f Copy B: %5.2f GEMM Kernel : %5.2f TRMM Kerlnel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., gemmcost / total * 100., trmmcost / total * 100., (double)n * (double)n * (double)n / (double)(trmmcost + gemmcost) * 100. * (double)COMPSIZE / 2., (double)n * (double)n * (double)n / total * 100. * (double)COMPSIZE / 2.); #endif return 0; }