Skip to content

(Updated)Detailled Benchmark of fgemv for Givaro::Integer in the field of RNS on a single desktop

ZHG2017 edited this page Jun 19, 2019 · 2 revisions

Note p = (0 for sequential, 1 for <RNSModulus, grain>, 2 for <RNSModulus, threads>, 3 for ParSeqHelper::Compose<ParSeqHelper::Parallel<FFLAS::CuttingStrategy::RNSModulus, grain>, ParSeqHelper::Parallel<rec, StrategyParameter::TwoDAdaptive>>)

(Updated with Givaro::absCompare)Benchmark using OpenMP

OMP_NUM_THREADS=1

100 bits

Time method m(dimension m of the matrix) k(dimension k of the matrix) i(number of repetitions) s(seed)
1.94056 0 4000 4000 3 -303297624
-------------------------------
FGEMM_MP: compute bit size of feasible prime for FFLAS 0ms
Thread(0) InfNorm: compute bound on the output: 77ms
Thread(0) InfNorm: compute bound on the output: 0ms
-------------------------------
FGEMM_MP: compute bound on the output 77ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: allocate data for RNS representation: 0ms
-------------------------------
FGEMM_MP: nb prime: 12
FGEMM_MP:     init: 77ms
-------------------------------
rns_double::init  FOR1D loop: 287ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 597ms
-------------------------------
rns_double::init: 198ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
FGEMM_MP:   to RNS: 1082ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: compute alpha and beta in RNS: 0ms
==========================================
Pointwise fgemm : 0.709293 (12) moduli 
==========================================
FGEMM_MP:  RNS Mul: 709ms
FGEMM_MP: from RNS: 41ms
-------------------------------

(Updated with Givaro::absCompare)Benchmark using OpenMP

OMP_NUM_THREADS=4

100 bits

Time method m(dimension m of the matrix) k(dimension k of the matrix) i(number of repetitions) s(seed)
1.31869 1 4000 4000 3 -303297624
-------------------------------
FGEMM_MP: compute bit size of feasible prime for FFLAS 0ms
Thread(2) InfNorm: compute bound on the output: 75ms
Thread(2) InfNorm: compute bound on the output: 0ms
-------------------------------
FGEMM_MP: compute bound on the output 75ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: allocate data for RNS representation: 0ms
-------------------------------
FGEMM_MP: nb prime: 12
FGEMM_MP:     init: 76ms
-------------------------------
rns_double::init  FOR1D loop: 125ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 321ms
-------------------------------
rns_double::init: 195ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
FGEMM_MP:   to RNS: 643ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: compute alpha and beta in RNS: 0ms
==========================================
Pointwise fgemm : 0.71181 (12) moduli 
==========================================
FGEMM_MP:  RNS Mul: 711ms
FGEMM_MP: from RNS: 43ms
-------------------------------

Benchmark using OpenMP

OMP_NUM_THREADS=1

100 bits

Time method m(dimension m of the matrix) k(dimension k of the matrix) i(number of repetitions) s(seed)
2.21434 0 4000 4000 3 -303297624
-------------------------------
FGEMM_MP: compute bit size of feasible prime for FFLAS 0ms
Thread(0) InfNorm: compute bound on the output: 636ms
Thread(0) InfNorm: compute bound on the output: 0ms
-------------------------------
FGEMM_MP: compute bound on the output 637ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: allocate data for RNS representation: 0ms
-------------------------------
FGEMM_MP: nb prime: 12
FGEMM_MP:     init: 637ms
-------------------------------
rns_double::init  FOR1D loop: 204ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 437ms
-------------------------------
rns_double::init: 181ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
FGEMM_MP:   to RNS: 823ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: compute alpha and beta in RNS: 0ms
==========================================
Pointwise fgemm : 0.748757 (12) moduli 
==========================================
FGEMM_MP:  RNS Mul: 748ms
FGEMM_MP: from RNS: 4ms
-------------------------------

Benchmark using OpenMP

OMP_NUM_THREADS=4

100 bits

Time method m(dimension m of the matrix) k(dimension k of the matrix) i(number of repetitions) s(seed)
2.05828 1 4000 4000 3 -303297624
-------------------------------
FGEMM_MP: compute bit size of feasible prime for FFLAS 0ms
Thread(1) InfNorm: compute bound on the output: 700ms
Thread(1) InfNorm: compute bound on the output: 0ms
-------------------------------
FGEMM_MP: compute bound on the output 700ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: allocate data for RNS representation: 0ms
-------------------------------
FGEMM_MP: nb prime: 12
FGEMM_MP:     init: 701ms
-------------------------------
rns_double::init  FOR1D loop: 164ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 402ms
-------------------------------
rns_double::init: 181ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
FGEMM_MP:   to RNS: 748ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init  FOR1D loop: 0ms
-------------------------------
rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: compute alpha and beta in RNS: 0ms
==========================================
Pointwise fgemm : 0.731949 (12) moduli 
==========================================
FGEMM_MP:  RNS Mul: 731ms
FGEMM_MP: from RNS: 4ms
-------------------------------

The extraction of code segment for the rns_double::init FOR1D loop in the file rns-double.inl

    inline void rns_double::init(size_t m, size_t n, double* Arns, size_t rda, const integer* A, size_t lda, size_t k, bool RNS_MAJOR) const
    {
#ifdef PROFILE_FGEMM_MP
        FFLAS::Timer chrono;
        chrono.start();
#endif
        if (k>_ldm){
            FFPACK::failure()(__func__,__FILE__,__LINE__,"rns_double [init] -> rns basis is too small to handle integers with 2^(16*k) values ");
            std::cerr<<"with k="<<k<<" _ldm="<<_ldm<<std::endl;
        }
        const size_t mn=m*n;
        if (mn) {
        double *A_beta = FFLAS::fflas_new<double >(mn*k);
        const integer* Aiter=A;
        // split A into A_beta according to a Kronecker transform in base 2^16
        auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Column,FFLAS::StrategyParameter::Threads);

        Givaro::Timer tkr; tkr.start();
        // #ifndef __FFLASFFPACK_SEQUENTIAL
        // 			auto sp=SPLITTER(MAX_THREADS);
        // #else
        // 			auto sp=SPLITTER(1);
        // #endif
        // FOR2D(i,j,m,n,sp,
        //       TASK(MODE(READ(Aiter[0]) READWRITE(A_beta[0])),
        //for(size_t i=0;i<m;i++)
        //PAR_BLOCK{

        FOR1D(i,m,sp,

//	    PARFOR1D(i,m,SPLITTER(NUM_THREADS),
                 for(size_t j=0;j<n;j++){
                 size_t idx=j+i*n;
                 const mpz_t*    m0     = reinterpret_cast<const mpz_t*>(Aiter+j+i*lda);
                 const uint16_t* m0_ptr = reinterpret_cast<const uint16_t*>(m0[0]->_mp_d);
                 size_t l=0;
                 //size_t maxs=std::min(k,(Aiter[j+i*lda].size())<<2);
                 size_t maxs=std::min(k,(Aiter[j+i*lda].size())*sizeof(mp_limb_t)/2);// to ensure 32 bits portability
//std::cout<<"Thread("<<omp_get_thread_num()<<")  ==rns_double::init== j:="<<j<<std::endl;
#ifdef __FFLASFFPACK_HAVE_LITTLE_ENDIAN
                 if (m0[0]->_mp_size >= 0)
                 for (;l<maxs;l++)
                 A_beta[l+idx*k]=  m0_ptr[l];
                 else
                 for (;l<maxs;l++)
                 A_beta[l+idx*k]= - double(m0_ptr[l]);
#else
                 if (m0[0]->_mp_size >= 0)
                 for (;l<maxs;l++) {
                 size_t l2 = l ^ ((sizeof(mp_limb_t)/2U) - 1U);
                 A_beta[l+idx*k]=  m0_ptr[l2];
                 }
                 else
                     for (;l<maxs;l++) {
                         size_t l2 = l ^ ((sizeof(mp_limb_t)/2U) - 1U);
                         A_beta[l+idx*k]= - double(m0_ptr[l2]);
                     }
#endif
                 for (;l<k;l++)
                     A_beta[l+idx*k]=  0.;

                 // 	   );
                 }
        )


#ifdef PROFILE_FGEMM_MP
        chrono.stop();
        std::cout<<"-------------------------------"<<std::endl;
        std::cout<<"rns_double::init  FOR1D loop: "<<uint64_t(chrono.realtime()*1000)<<"ms"<<std::endl;
        chrono.start();
#endif

        tkr.stop();
        //if(m>1 && n>1) std::cerr<<"Kronecker : "<<tkr.realtime()<<std::endl;
        if (RNS_MAJOR==false) {
            // Arns = _crt_in x A_beta^T
            Givaro::Timer tfgemm; tfgemm.start();
#ifndef ENABLE_CHECKER_fgemm
            FFLAS::fgemm (Givaro::ZRing<double>(), FFLAS::FflasNoTrans,FFLAS::FflasTrans,_size,mn,k,1.0,_crt_in.data(),_ldm,A_beta,k,0.,Arns,rda,
                          FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Recursive,FFLAS::StrategyParameter::TwoDAdaptive>());
#else
            cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasTrans,(int)_size,(int)mn,(int)k,1.0,_crt_in.data(),(int)_ldm,A_beta,(int)k,0.,Arns,(int)rda);
#endif
            tfgemm.stop();
            //if(m>1 && n>1) 	std::cerr<<"fgemm : "<<tfgemm.realtime()<<std::endl;
        }
        else {
            // Arns =  A_beta x _crt_in^T
#ifndef ENABLE_CHECKER_fgemm
            FFLAS::fgemm (Givaro::ZRing<double>(), FFLAS::FflasNoTrans,FFLAS::FflasTrans,mn,_size,k,1.0,A_beta, k, _crt_in.data(),_ldm,0.,Arns,_size,
                          FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Recursive,FFLAS::StrategyParameter::TwoDAdaptive>());
#else
            cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasTrans,(int)mn,(int)_size,(int)k,1.0,A_beta,(int)k,_crt_in.data(),(int)_ldm,0.,Arns,(int)_size);
#endif
        }

#ifdef PROFILE_FGEMM_MP
        chrono.stop();
        std::cout<<"-------------------------------"<<std::endl;
        std::cout<<"rns_double::init  Arns = _crt_in x A_beta^T or Arns =  A_beta x _crt_in^T : "<<uint64_t(chrono.realtime()*1000)<<"ms"<<std::endl;
        chrono.start();
#endif

        Givaro::Timer tred; tred.start();

        reduce(mn,Arns,rda,RNS_MAJOR);
        tred.stop();
        //if(m>1 && n>1) 			std::cerr<<"Reduce : "<<tred.realtime()<<std::endl;

        FFLAS::fflas_delete( A_beta);

#ifdef CHECK_RNS
        bool ok=true;
        for (size_t i=0;i<m;i++)
            for(size_t j=0;j<n;j++)
                for(size_t k=0;k<_size;k++){
                    ok&= (((A[i*lda+j] % (int64_t) _basis[k])+(A[i*lda+j]<0?(int64_t)_basis[k]:0)) == (int64_t) Arns[i*n+j+k*rda]);
                    if (((A[i*lda+j] % (int64_t) _basis[k])+(A[i*lda+j]<0?(int64_t)_basis[k]:0))
                        != (int64_t) Arns[i*n+j+k*rda])
                    {
                        std::cout<<((A[i*lda+j] % (int64_t) _basis[k])+(A[i*lda+j]<0?(int64_t)_basis[k]:0))
                        <<" != "
                        <<(int64_t) Arns[i*n+j+k*rda]
                        <<std::endl;
                    }
                }
        std::cout<<"RNS freduce ... "<<(ok?"OK":"ERROR")<<std::endl;
#endif
        }
#ifdef PROFILE_FGEMM_MP
        chrono.stop();
        std::cout<<"-------------------------------"<<std::endl;
        std::cout<<"rns_double::init: "<<uint64_t(chrono.realtime()*1000)<<"ms"<<std::endl;
#endif
    }

The extraction of code segment for the rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T in the file fflas_bounds.inl

    inline Givaro::Integer
    InfNorm (const size_t M, const size_t N, const Givaro::Integer* A, const size_t lda){
        Givaro::Integer max = 0;
        size_t log=0;
#ifdef PROFILE_FGEMM_MP
        Timer chrono;
        chrono.start();
#endif

#if 1  //Sequential //////////////////////////////////////////////////
        for (size_t i=0; i<M; ++i){
             for (size_t j=0; j<N; ++j) {
                  const Givaro::Integer & x(A[i*lda+j]);
                 if ((x.bitsize() >= log) && (abs(x) > max)){
                     max = abs(x);
                     //                                        max = x;
                     log = x.bitsize();
                 }
 
                 if (Givaro::absCompare(x,max)>0) max = x;
              }
         }
#else  //Parallel /////////////////////////////////////////////////////

        std::vector<Givaro::Integer> vmax(M,0);
        auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads);
        SYNCH_GROUP({

              FORBLOCK1D(iter, M, sp,
                          TASK(MODE(CONSTREFERENCE(A,max,vmax) ),
                                {
                                for(auto i=iter.begin(); i!=iter.end(); ++i)
                                {
                                    for (size_t j=0; j<N; ++j) {
                                        const Givaro::Integer & x(A[i*lda+j]);
                                        if (Givaro::absCompare(x,vmax[i])>0){ vmax[i] = x;}
                                    }

                                }
                                })
                          );

        });
    max=vmax[0];
    for (size_t i=0; i<M; ++i){ 
        if (Givaro::absCompare(vmax[i],max)>0){ max = vmax[i];}
    }
#endif ///////////////////////////////////////////////////////////////

#ifdef PROFILE_FGEMM_MP
        chrono.stop();
        std::cout<<"Thread("<<omp_get_thread_num()<<") InfNorm: compute bound on the output: "<<uint64_t(chrono.realtime()*1000)<<"ms"<<std::endl;
#endif
        return max;
    }