-
Notifications
You must be signed in to change notification settings - Fork 23
(Updated)Detailled Benchmark of fgemv for Givaro::Integer in the field of RNS on a single desktop
ZHG2017 edited this page Jun 19, 2019
·
2 revisions
Note p = (0 for sequential, 1 for <RNSModulus, grain>, 2 for <RNSModulus, threads>, 3 for ParSeqHelper::Compose<ParSeqHelper::Parallel<FFLAS::CuttingStrategy::RNSModulus, grain>, ParSeqHelper::Parallel<rec, StrategyParameter::TwoDAdaptive>>)
Time | method | m(dimension m of the matrix) | k(dimension k of the matrix) | i(number of repetitions) | s(seed) |
---|---|---|---|---|---|
1.94056 | 0 | 4000 | 4000 | 3 | -303297624 |
-------------------------------
FGEMM_MP: compute bit size of feasible prime for FFLAS 0ms
Thread(0) InfNorm: compute bound on the output: 77ms
Thread(0) InfNorm: compute bound on the output: 0ms
-------------------------------
FGEMM_MP: compute bound on the output 77ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: allocate data for RNS representation: 0ms
-------------------------------
FGEMM_MP: nb prime: 12
FGEMM_MP: init: 77ms
-------------------------------
rns_double::init FOR1D loop: 287ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 597ms
-------------------------------
rns_double::init: 198ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
FGEMM_MP: to RNS: 1082ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: compute alpha and beta in RNS: 0ms
==========================================
Pointwise fgemm : 0.709293 (12) moduli
==========================================
FGEMM_MP: RNS Mul: 709ms
FGEMM_MP: from RNS: 41ms
-------------------------------
Time | method | m(dimension m of the matrix) | k(dimension k of the matrix) | i(number of repetitions) | s(seed) |
---|---|---|---|---|---|
1.31869 | 1 | 4000 | 4000 | 3 | -303297624 |
-------------------------------
FGEMM_MP: compute bit size of feasible prime for FFLAS 0ms
Thread(2) InfNorm: compute bound on the output: 75ms
Thread(2) InfNorm: compute bound on the output: 0ms
-------------------------------
FGEMM_MP: compute bound on the output 75ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: allocate data for RNS representation: 0ms
-------------------------------
FGEMM_MP: nb prime: 12
FGEMM_MP: init: 76ms
-------------------------------
rns_double::init FOR1D loop: 125ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 321ms
-------------------------------
rns_double::init: 195ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
FGEMM_MP: to RNS: 643ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: compute alpha and beta in RNS: 0ms
==========================================
Pointwise fgemm : 0.71181 (12) moduli
==========================================
FGEMM_MP: RNS Mul: 711ms
FGEMM_MP: from RNS: 43ms
-------------------------------
Time | method | m(dimension m of the matrix) | k(dimension k of the matrix) | i(number of repetitions) | s(seed) |
---|---|---|---|---|---|
2.21434 | 0 | 4000 | 4000 | 3 | -303297624 |
-------------------------------
FGEMM_MP: compute bit size of feasible prime for FFLAS 0ms
Thread(0) InfNorm: compute bound on the output: 636ms
Thread(0) InfNorm: compute bound on the output: 0ms
-------------------------------
FGEMM_MP: compute bound on the output 637ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: allocate data for RNS representation: 0ms
-------------------------------
FGEMM_MP: nb prime: 12
FGEMM_MP: init: 637ms
-------------------------------
rns_double::init FOR1D loop: 204ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 437ms
-------------------------------
rns_double::init: 181ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
FGEMM_MP: to RNS: 823ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: compute alpha and beta in RNS: 0ms
==========================================
Pointwise fgemm : 0.748757 (12) moduli
==========================================
FGEMM_MP: RNS Mul: 748ms
FGEMM_MP: from RNS: 4ms
-------------------------------
Time | method | m(dimension m of the matrix) | k(dimension k of the matrix) | i(number of repetitions) | s(seed) |
---|---|---|---|---|---|
2.05828 | 1 | 4000 | 4000 | 3 | -303297624 |
-------------------------------
FGEMM_MP: compute bit size of feasible prime for FFLAS 0ms
Thread(1) InfNorm: compute bound on the output: 700ms
Thread(1) InfNorm: compute bound on the output: 0ms
-------------------------------
FGEMM_MP: compute bound on the output 700ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: allocate data for RNS representation: 0ms
-------------------------------
FGEMM_MP: nb prime: 12
FGEMM_MP: init: 701ms
-------------------------------
rns_double::init FOR1D loop: 164ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 402ms
-------------------------------
rns_double::init: 181ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
FGEMM_MP: to RNS: 748ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
rns_double::init FOR1D loop: 0ms
-------------------------------
rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : 0ms
-------------------------------
rns_double::init: 0ms
-------------------------------
FGEMM_MP: compute alpha and beta in RNS: 0ms
==========================================
Pointwise fgemm : 0.731949 (12) moduli
==========================================
FGEMM_MP: RNS Mul: 731ms
FGEMM_MP: from RNS: 4ms
-------------------------------
The extraction of code segment for the rns_double::init FOR1D loop in the file rns-double.inl
inline void rns_double::init(size_t m, size_t n, double* Arns, size_t rda, const integer* A, size_t lda, size_t k, bool RNS_MAJOR) const
{
#ifdef PROFILE_FGEMM_MP
FFLAS::Timer chrono;
chrono.start();
#endif
if (k>_ldm){
FFPACK::failure()(__func__,__FILE__,__LINE__,"rns_double [init] -> rns basis is too small to handle integers with 2^(16*k) values ");
std::cerr<<"with k="<<k<<" _ldm="<<_ldm<<std::endl;
}
const size_t mn=m*n;
if (mn) {
double *A_beta = FFLAS::fflas_new<double >(mn*k);
const integer* Aiter=A;
// split A into A_beta according to a Kronecker transform in base 2^16
auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Column,FFLAS::StrategyParameter::Threads);
Givaro::Timer tkr; tkr.start();
// #ifndef __FFLASFFPACK_SEQUENTIAL
// auto sp=SPLITTER(MAX_THREADS);
// #else
// auto sp=SPLITTER(1);
// #endif
// FOR2D(i,j,m,n,sp,
// TASK(MODE(READ(Aiter[0]) READWRITE(A_beta[0])),
//for(size_t i=0;i<m;i++)
//PAR_BLOCK{
FOR1D(i,m,sp,
// PARFOR1D(i,m,SPLITTER(NUM_THREADS),
for(size_t j=0;j<n;j++){
size_t idx=j+i*n;
const mpz_t* m0 = reinterpret_cast<const mpz_t*>(Aiter+j+i*lda);
const uint16_t* m0_ptr = reinterpret_cast<const uint16_t*>(m0[0]->_mp_d);
size_t l=0;
//size_t maxs=std::min(k,(Aiter[j+i*lda].size())<<2);
size_t maxs=std::min(k,(Aiter[j+i*lda].size())*sizeof(mp_limb_t)/2);// to ensure 32 bits portability
//std::cout<<"Thread("<<omp_get_thread_num()<<") ==rns_double::init== j:="<<j<<std::endl;
#ifdef __FFLASFFPACK_HAVE_LITTLE_ENDIAN
if (m0[0]->_mp_size >= 0)
for (;l<maxs;l++)
A_beta[l+idx*k]= m0_ptr[l];
else
for (;l<maxs;l++)
A_beta[l+idx*k]= - double(m0_ptr[l]);
#else
if (m0[0]->_mp_size >= 0)
for (;l<maxs;l++) {
size_t l2 = l ^ ((sizeof(mp_limb_t)/2U) - 1U);
A_beta[l+idx*k]= m0_ptr[l2];
}
else
for (;l<maxs;l++) {
size_t l2 = l ^ ((sizeof(mp_limb_t)/2U) - 1U);
A_beta[l+idx*k]= - double(m0_ptr[l2]);
}
#endif
for (;l<k;l++)
A_beta[l+idx*k]= 0.;
// );
}
)
#ifdef PROFILE_FGEMM_MP
chrono.stop();
std::cout<<"-------------------------------"<<std::endl;
std::cout<<"rns_double::init FOR1D loop: "<<uint64_t(chrono.realtime()*1000)<<"ms"<<std::endl;
chrono.start();
#endif
tkr.stop();
//if(m>1 && n>1) std::cerr<<"Kronecker : "<<tkr.realtime()<<std::endl;
if (RNS_MAJOR==false) {
// Arns = _crt_in x A_beta^T
Givaro::Timer tfgemm; tfgemm.start();
#ifndef ENABLE_CHECKER_fgemm
FFLAS::fgemm (Givaro::ZRing<double>(), FFLAS::FflasNoTrans,FFLAS::FflasTrans,_size,mn,k,1.0,_crt_in.data(),_ldm,A_beta,k,0.,Arns,rda,
FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Recursive,FFLAS::StrategyParameter::TwoDAdaptive>());
#else
cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasTrans,(int)_size,(int)mn,(int)k,1.0,_crt_in.data(),(int)_ldm,A_beta,(int)k,0.,Arns,(int)rda);
#endif
tfgemm.stop();
//if(m>1 && n>1) std::cerr<<"fgemm : "<<tfgemm.realtime()<<std::endl;
}
else {
// Arns = A_beta x _crt_in^T
#ifndef ENABLE_CHECKER_fgemm
FFLAS::fgemm (Givaro::ZRing<double>(), FFLAS::FflasNoTrans,FFLAS::FflasTrans,mn,_size,k,1.0,A_beta, k, _crt_in.data(),_ldm,0.,Arns,_size,
FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Recursive,FFLAS::StrategyParameter::TwoDAdaptive>());
#else
cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasTrans,(int)mn,(int)_size,(int)k,1.0,A_beta,(int)k,_crt_in.data(),(int)_ldm,0.,Arns,(int)_size);
#endif
}
#ifdef PROFILE_FGEMM_MP
chrono.stop();
std::cout<<"-------------------------------"<<std::endl;
std::cout<<"rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T : "<<uint64_t(chrono.realtime()*1000)<<"ms"<<std::endl;
chrono.start();
#endif
Givaro::Timer tred; tred.start();
reduce(mn,Arns,rda,RNS_MAJOR);
tred.stop();
//if(m>1 && n>1) std::cerr<<"Reduce : "<<tred.realtime()<<std::endl;
FFLAS::fflas_delete( A_beta);
#ifdef CHECK_RNS
bool ok=true;
for (size_t i=0;i<m;i++)
for(size_t j=0;j<n;j++)
for(size_t k=0;k<_size;k++){
ok&= (((A[i*lda+j] % (int64_t) _basis[k])+(A[i*lda+j]<0?(int64_t)_basis[k]:0)) == (int64_t) Arns[i*n+j+k*rda]);
if (((A[i*lda+j] % (int64_t) _basis[k])+(A[i*lda+j]<0?(int64_t)_basis[k]:0))
!= (int64_t) Arns[i*n+j+k*rda])
{
std::cout<<((A[i*lda+j] % (int64_t) _basis[k])+(A[i*lda+j]<0?(int64_t)_basis[k]:0))
<<" != "
<<(int64_t) Arns[i*n+j+k*rda]
<<std::endl;
}
}
std::cout<<"RNS freduce ... "<<(ok?"OK":"ERROR")<<std::endl;
#endif
}
#ifdef PROFILE_FGEMM_MP
chrono.stop();
std::cout<<"-------------------------------"<<std::endl;
std::cout<<"rns_double::init: "<<uint64_t(chrono.realtime()*1000)<<"ms"<<std::endl;
#endif
}
The extraction of code segment for the rns_double::init Arns = _crt_in x A_beta^T or Arns = A_beta x _crt_in^T in the file fflas_bounds.inl
inline Givaro::Integer
InfNorm (const size_t M, const size_t N, const Givaro::Integer* A, const size_t lda){
Givaro::Integer max = 0;
size_t log=0;
#ifdef PROFILE_FGEMM_MP
Timer chrono;
chrono.start();
#endif
#if 1 //Sequential //////////////////////////////////////////////////
for (size_t i=0; i<M; ++i){
for (size_t j=0; j<N; ++j) {
const Givaro::Integer & x(A[i*lda+j]);
if ((x.bitsize() >= log) && (abs(x) > max)){
max = abs(x);
// max = x;
log = x.bitsize();
}
if (Givaro::absCompare(x,max)>0) max = x;
}
}
#else //Parallel /////////////////////////////////////////////////////
std::vector<Givaro::Integer> vmax(M,0);
auto sp=SPLITTER(NUM_THREADS,FFLAS::CuttingStrategy::Row,FFLAS::StrategyParameter::Threads);
SYNCH_GROUP({
FORBLOCK1D(iter, M, sp,
TASK(MODE(CONSTREFERENCE(A,max,vmax) ),
{
for(auto i=iter.begin(); i!=iter.end(); ++i)
{
for (size_t j=0; j<N; ++j) {
const Givaro::Integer & x(A[i*lda+j]);
if (Givaro::absCompare(x,vmax[i])>0){ vmax[i] = x;}
}
}
})
);
});
max=vmax[0];
for (size_t i=0; i<M; ++i){
if (Givaro::absCompare(vmax[i],max)>0){ max = vmax[i];}
}
#endif ///////////////////////////////////////////////////////////////
#ifdef PROFILE_FGEMM_MP
chrono.stop();
std::cout<<"Thread("<<omp_get_thread_num()<<") InfNorm: compute bound on the output: "<<uint64_t(chrono.realtime()*1000)<<"ms"<<std::endl;
#endif
return max;
}