I am a little bit new to thrust I am trying to migrate the following code to use make use of gpus but this one seems a little difficult
#include <iostream>
#include <complex>
#include <random>
#include <omp.h>
#include <chrono>
using Complex = std::complex<double>;
void fill_random_complex(Complex *array, int n) {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<double> dis(0.0, 1.0);
for (int i = 0; i < n; ++i) {
double real_part = dis(gen);
double imag_part = dis(gen);
array[i] = Complex(real_part, imag_part);
}
}
void process_loop(
Complex *ag,
Complex *bg,
double *c_distr,
int *nbnd_loc,
int *ngk,
int l1_e, int l2_s, int l2_e,
int kpt_pool_nloc, int gstart,
int nimage, int idx, int pert_nglob,
int ag_dim1, int ag_dim2, int ag_dim3,
int bg_dim1, int bg_dim2, int bg_dim3,
int c_distr_dim1)
{
auto start_time = std::chrono::high_resolution_clock::now();
for (int il1 = 0; il1 < l1_e; ++il1) {
for (int il2 = l2_s - 1; il2 < l2_e; ++il2) {
int ig1 = nimage * il1 + idx;
double reduce = 0.0;
for (int iks = 0; iks < kpt_pool_nloc; ++iks) {
int nbndval = nbnd_loc[iks];
int npw = ngk[iks];
for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
for (int il3 = 0; il3 < npw; ++il3) {
int ag_index = il3 + nbndval * (lbnd + ag_dim3 * (iks + ag_dim4 * il1));
int bg_index = il3 + nbndval * (lbnd + bg_dim3 * (iks + bg_dim4 * il2));
reduce += 2.0 * std::real(ag[ag_index]) * std::real(bg[bg_index])
+ 2.0 * std::imag(ag[ag_index]) * std::imag(bg[bg_index]);
}
}
if (gstart == 2) {
for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
int ag_index = 0 + nbndval * (lbnd + ag_dim3 * (iks + ag_dim4 * il1));
int bg_index = 0 + nbndval * (lbnd + bg_dim3 * (iks + bg_dim4 * il2));
reduce -= std::real(ag[ag_index]) * std::real(bg[bg_index]);
}
}
}
int c_distr_index = ig1 * c_distr_dim1 + il2;
c_distr[c_distr_index] = reduce;
}
}
auto end_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_time = end_time - start_time;
std::cout << "Time for process_loop: " << elapsed_time.count() << " seconds" << std::endl;
}
int main() {
const int nimg = 1;
const int idx = 0;
const int kpt_pool_nloc = 1;
const int l1_e = 200;
const int l2_s = 1;
const int l2_e = 100;
const int pert_nglob = 200;
const int gstart = 2;
const int ag_dim1 = 16984;
const int ag_dim2 = 128;
const int ag_dim3 = kpt_pool_nloc;
const int ag_dim4 = l1_e;
const int bg_dim1 = 16984;
const int bg_dim2 = 128;
const int bg_dim3 = kpt_pool_nloc;
const int bg_dim4 = l2_e;
const int c_distr_dim1 = pert_nglob;
int *nbnd_loc = new int[kpt_pool_nloc];
int *ngk = new int[kpt_pool_nloc];
nbnd_loc[0] = 128; // Number of bands
ngk[0] = 16984; // Number of grid points
int ag_size = ag_dim1 * ag_dim2 * ag_dim3 * ag_dim4;
int bg_size = bg_dim1 * bg_dim2 * bg_dim3 * bg_dim4;
int c_distr_size = c_distr_dim1 * pert_nglob;
Complex *ag = new Complex[ag_size];
Complex *bg = new Complex[bg_size];
double *c_distr = new double[c_distr_size];
fill_random_complex(ag, ag_size);
fill_random_complex(bg, bg_size);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<double> dis(0.0, 1.0);
for (int i = 0; i < c_distr_size; ++i) {
c_distr[i] = dis(gen);
}
int nimage = nimg;
process_loop(ag, bg, c_distr, nbnd_loc, ngk, l1_e, l2_s, l2_e, kpt_pool_nloc, gstart, nimage, idx, pert_nglob, ag_dim1, ag_dim2, ag_dim3, bg_dim1, bg_dim2, bg_dim3, c_distr_dim1);
delete[] ag;
delete[] bg;
delete[] c_distr;
delete[] nbnd_loc;
delete[] ngk;
return 0;
}
What I tried is implementing a functor for the two sums, I perform loop fusion:
for (int iks = 0; iks < kpt_pool_nloc; ++iks) {
int nbndval = nbnd_loc[iks];
int npw = ngk[iks];
for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
for (int il3 = 0; il3 < npw; ++il3) {
int ag_index = il3 + nbndval * (lbnd + ag_dim3 * (iks + ag_dim4 * il1));
int bg_index = il3 + nbndval * (lbnd + bg_dim3 * (iks + bg_dim4 * il2));
reduce += 2.0 * std::real(ag[ag_index]) * std::real(bg[bg_index])
+ 2.0 * std::imag(ag[ag_index]) * std::imag(bg[bg_index]);
}
}
}
if (gstart == 2) {
for (int iks = 0; iks < kpt_pool_nloc; ++iks) {
int nbndval = nbnd_loc[iks];
for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
int ag_index = 0 + nbndval * (lbnd + ag_dim3 * (iks + ag_dim4 * il1));
int bg_index = 0 + nbndval * (lbnd + bg_dim3 * (iks + bg_dim4 * il2));
reduce -= std::real(ag[ag_index]) * std::real(bg[bg_index]);
}
}
}
Then define a functor for each of the operations:
struct calculate_ag_bg {
Complex* ag; Complex* bg;
int npw;
calculate_ag_bg(Complex* _ag, Complex* _bg, int _npw)
: ag(_ag), bg(_bg), npw(_npw) {}
__host__ __device__
double operator()(int idx) const {
int lbnd = idx / npw, il3 = idx % npw;
double a_real = ag[il3 + lbnd * npw].real() * bg[il3 + lbnd * npw].real();
double b_imag = ag[il3 + lbnd * npw].imag() * bg[il3 + lbnd * npw].imag();
double a = 2.0 * a_real;
double b = 2.0 * b_imag;
return a + b;
}
};
struct correct_ag_bg_prod_kernel {
Complex* ag; Complex* bg; int npw;
correct_ag_bg_prod_kernel(Complex* _ag, Complex* _bg, int _npw)
: ag(_ag), bg(_bg), npw(_npw) {}
__host__ __device__
double operator()(int idx) const {
return ag[idx * npw].real() * bg[idx * npw].real();
}
};
and use thrust::transform_reduce
to perform the reductions within for loop, but I got bad performance which is to be expected. While kpt_pool_nloc
is 1 in the above code, it could be more so the code would have to accommodate that and neither nbnd_loc
nor ngk
are guaranteed to uniform. Maybe there is a better way to doing this using transform_iterator
or things or of that sort.