HΦ  3.1.0
mltplyMPIBoost.c File Reference
#include "mpi.h"
#include "Common.h"
#include "mfmemory.h"
#include "wrapperMPI.h"

Go to the source code of this file.

Functions

void zgemm_ (char *TRANSA, char *TRANSB, int *M, int *N, int *K, double complex *ALPHA, double complex *matJL, int *LDA, double complex *arrayz, int *LDB, double complex *BETA, double complex *arrayx, int *LDC)
 
void child_general_int_spin_MPIBoost (struct BindStruct *X, double complex *tmp_v0, double complex *tmp_v1, double complex *tmp_v2, double complex *tmp_v3)
 

Function Documentation

◆ child_general_int_spin_MPIBoost()

void child_general_int_spin_MPIBoost ( struct BindStruct *  X,
double complex *  tmp_v0,
double complex *  tmp_v1,
double complex *  tmp_v2,
double complex *  tmp_v3 
)

Exchange term in Spin model

Author
Mitsuaki Kawamura (The University of Tokyo)
Youhei Yamaji (The University of Tokyo)
Parameters
[in,out] X
[out] tmp_v0 Result vector v0 of v0 = H v1
[in] tmp_v1 Input vector v1 of v0 = H v1
[in,out] tmp_v2 buffer
[in,out] tmp_v3 buffer

Definition at line 36 of file mltplyMPIBoost.c.

References nproc, X, and zgemm_().

Referenced by mltplySpinGCBoost().

43 {
44 #ifdef MPI
45 /* Everything below is compiled only when MPI is defined; otherwise the body is empty.
 * For each of the X->Boost.R0 passes and each pivot j of that pass, a 64x64 matrix
 * matJL is assembled from X->Boost.list_6spin_pair and X->Boost.arrayJ (plus
 * X->Boost.vecB when the pivot flag is 1). Elements of tmp_v1 and tmp_v0 are gathered
 * into 64-row buffers, zgemm_ adds matJL times the tmp_v1 buffer onto the tmp_v0
 * buffer, and the sum is scattered back into tmp_v1. The gathered tmp_v1 elements are
 * first copied to tmp_v3. Afterwards the sum is moved into tmp_v0 and tmp_v1 is
 * refilled from tmp_v3. Each pass ends with two MPI_Alltoall exchanges that receive
 * into tmp_v3 and tmp_v2. Every vector access starts at element 1; element 0 is never
 * touched here. */
46  //double complex dam_pr = 0;
47  // MPI_Status statusMPI;
48 
49  // int ierr;
50  // int INFO;
51  char TRANSA, TRANSB;
52  int M, N, K, LDA, LDB, LDC;
53  double complex ALPHA, BETA;
54  long unsigned int i_max;
55  long unsigned int j, k, ell, iloop;
56  long unsigned int i1, i2;
57  long unsigned int iomp;
58  long unsigned int ell4, ell5, ell6, m0, Ipart1;
59  long unsigned int mi, mj, mri, mrj, mrk, mrl;
60  int indj;
61  long unsigned int ellrl, ellrk, ellrj, ellri, elli1, elli2, ellj1, ellj2;
62  long unsigned int iSS1, iSS2, iSSL1, iSSL2;
63  double complex **vecJ;
64  double complex **matJ, **matJ2;
65  double complex *matJL;
66  double complex *matI;
67  double complex **matB;
68  double complex *arrayz;
69  double complex *arrayx;
70  double complex *arrayw;
71  long unsigned int ishift1, ishift2, ishift3, ishift4, ishift5, pivot_flag, num_J_star;
72  long unsigned int pow4, pow5, pow41, pow51;
73  //long unsigned int pow1, pow2, pow3, pow4, pow5, pow11, pow21, pow31, pow41, pow51;
74 /* i_max is the vector length used by every loop bound and index computation below. */
75  i_max = X->Check.idim_max;
76 
77 /*
78 //zero clear
79  #pragma omp parallel for default(none) private(j) \
80  shared(i_max,tmp_v0)
81  for(j=0;j<i_max;j++){
82  tmp_v0[j+1]=0.0;
83  }
84 */
85 /* Work storage: 3x3, 4x4, 4x4 and 2x2 matrices plus two flat 64*64 arrays. */
86  c_malloc2(vecJ, 3, 3);
87  c_malloc2(matJ, 4, 4);
88  c_malloc2(matJ2, 4, 4);
89  c_malloc2(matB, 2, 2);
90  c_malloc1(matJL, (64*64));
91  c_malloc1(matI, (64*64));
92 
93 // c_malloc1(arrayx, (64*((int)pow(2.0, 16))));
94 // c_malloc1(arrayz, (64*((int)pow(2.0, 16))));
95 // c_malloc1(arrayw, (64*((int)pow(2.0, 16))));
96 
97  //defmodelBoost(X->Boost.W0, X->Boost.R0, X->Boost.num_pivot, X->Boost.ishift_nspin, X->Boost.list_6spin_star, X->Boost.list_6spin_pair, 1, X->Boost.arrayJ, X->Boost.vecB);
98 /* One pass per iloop: it handles num_pivot consecutive pivots and ends with the MPI exchange. */
99  for(iloop=0; iloop < X->Boost.R0; iloop++){
100 
101 
102  for(j=iloop*X->Boost.num_pivot; j < (iloop+1)*X->Boost.num_pivot; j++){
103 /* Unpack entry j of list_6spin_star: number of pair terms, five shift amounts, pivot flag. */
104  num_J_star = (long unsigned int)X->Boost.list_6spin_star[j][0]; //(0,j)
105  ishift1 = (long unsigned int)X->Boost.list_6spin_star[j][1]; //(1,j)
106  ishift2 = (long unsigned int)X->Boost.list_6spin_star[j][2]; //(2,j)
107  ishift3 = (long unsigned int)X->Boost.list_6spin_star[j][3]; //(3,j)
108  ishift4 = (long unsigned int)X->Boost.list_6spin_star[j][4]; //(4,j)
109  ishift5 = (long unsigned int)X->Boost.list_6spin_star[j][5]; //(5,j)
110  pivot_flag = (long unsigned int)X->Boost.list_6spin_star[j][6]; //(6,j)
111  //pow1 = (int)pow(2.0,ishift1);
112  //pow2 = (int)pow(2.0,ishift1+ishift2);
113  //pow3 = (int)pow(2.0,ishift1+ishift2+ishift3);
114  pow4 = (int)pow(2.0,ishift1+ishift2+ishift3+ishift4);
115  pow5 = (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5);
116  //pow11= (int)pow(2.0,ishift1+1);
117  //pow21= (int)pow(2.0,ishift1+ishift2+1);
118  //pow31= (int)pow(2.0,ishift1+ishift2+ishift3+1);
119  pow41= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+1);
120  pow51= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+1);
121 /* Start from matJL = 0 and matI = 64x64 identity. */
122  for(k=0; k < (64*64); k++){
123  matJL[k] = 0.0 + 0.0*I;
124  matI[k] = 0.0 + 0.0*I;
125  }
126  for(k=0; k < 64; k++){
127  matI[k+64*k] = 1.0;
128  }
129 /* Add the contribution of every pair term ell of this pivot to matJL. */
130  for(ell=0; ell < num_J_star; ell++){
131  mi = (long unsigned int)X->Boost.list_6spin_pair[j][0][ell]; //(1,ell,j)
132  mj = (long unsigned int)X->Boost.list_6spin_pair[j][1][ell]; //(2,ell,j)
133  mri = (long unsigned int)X->Boost.list_6spin_pair[j][2][ell]; //(3,ell,j)
134  mrj = (long unsigned int)X->Boost.list_6spin_pair[j][3][ell]; //(4,ell,j)
135  mrk = (long unsigned int)X->Boost.list_6spin_pair[j][4][ell]; //(5,ell,j)
136  mrl = (long unsigned int)X->Boost.list_6spin_pair[j][5][ell]; //(6,ell,j)
137  indj = X->Boost.list_6spin_pair[j][6][ell]; //(7,ell,j)
138  for(i1 = 0; i1 < 3; i1++){
139  for(i2 = 0; i2 < 3; i2++){
140  vecJ[i1][i2] = X->Boost.arrayJ[(indj-1)][i1][i2];
141  }
142  }
143  //matJSS(1,1) = vecJ(3,3)
144  matJ[0][0] = vecJ[2][2];
145  //matJSS(1,2)= vecJ(1,1)-vecJ(2,2)-dcmplx(0.0d0,1.0d0)*vecJ(1,2)-dcmplx(0.0d0,1.0d0)*vecJ(2,1)
146  matJ[0][1] = vecJ[0][0]-vecJ[1][1]-I*vecJ[0][1]-I*vecJ[1][0];
147  //matJSS(1,3)= vecJ(3,1)-dcmplx(0.0d0,1.0d0)*vecJ(3,2)
148  matJ[0][2] = vecJ[2][0]-I*vecJ[2][1];
149  //matJSS(1,4)= vecJ(1,3)-dcmplx(0.0d0,1.0d0)*vecJ(2,3)
150  matJ[0][3] = vecJ[0][2]-I*vecJ[1][2];
151  //matJSS(2,1)= vecJ(1,1)-vecJ(2,2)+dcmplx(0.0d0,1.0d0)*vecJ(1,2)+dcmplx(0.0d0,1.0d0)*vecJ(2,1)
152  matJ[1][0] = vecJ[0][0]-vecJ[1][1]+I*vecJ[0][1]+I*vecJ[1][0];
153  //matJSS(2,2)= vecJ(3,3)
154  matJ[1][1] = vecJ[2][2];
155  //matJSS(2,3)=dcmplx(-1.0d0,0.0d0)*vecJ(1,3)-dcmplx(0.0d0,1.0d0)*vecJ(2,3)
156  matJ[1][2] =(-1.0)*vecJ[0][2]-I*vecJ[1][2];
157  //matJSS(2,4)=dcmplx(-1.0d0,0.0d0)*vecJ(3,1)-dcmplx(0.0d0,1.0d0)*vecJ(3,2)
158  matJ[1][3] =(-1.0)*vecJ[2][0]-I*vecJ[2][1];
159  //matJSS(3,1)= vecJ(3,1)+dcmplx(0.0d0,1.0d0)*vecJ(3,2)
160  matJ[2][0] = vecJ[2][0]+I*vecJ[2][1];
161  //matJSS(3,2)=dcmplx(-1.0d0,0.0d0)*vecJ(1,3)+dcmplx(0.0d0,1.0d0)*vecJ(2,3)
162  matJ[2][1] =(-1.0)*vecJ[0][2]+I*vecJ[1][2];
163  //matJSS(3,3)=dcmplx(-1.0d0,0.0d0)*vecJ(3,3)
164  matJ[2][2] =(-1.0)*vecJ[2][2];
165  //matJSS(3,4)= vecJ(1,1)+vecJ(2,2)+dcmplx(0.0d0,1.0d0)*vecJ(1,2)-dcmplx(0.0d0,1.0d0)*vecJ(2,1)
166  matJ[2][3] = vecJ[0][0]+vecJ[1][1]+I*vecJ[0][1]-I*vecJ[1][0];
167  //matJSS(4,1)= vecJ(1,3)+dcmplx(0.0d0,1.0d0)*vecJ(2,3)
168  matJ[3][0] = vecJ[0][2]+I*vecJ[1][2];
169  //matJSS(4,2)=dcmplx(-1.0d0,0.0d0)*vecJ(3,1)+dcmplx(0.0d0,1.0d0)*vecJ(3,2)
170  matJ[3][1] =(-1.0)*vecJ[2][0]+I*vecJ[2][1];
171  //matJSS(4,3)= vecJ(1,1)+vecJ(2,2)-dcmplx(0.0d0,1.0d0)*vecJ(1,2)+dcmplx(0.0d0,1.0d0)*vecJ(2,1)
172  matJ[3][2] = vecJ[0][0]+vecJ[1][1]-I*vecJ[0][1]+I*vecJ[1][0];
173  //matJSS(4,4)=dcmplx(-1.0d0,0.0d0)*vecJ(3,3)
174  matJ[3][3] =(-1.0)*vecJ[2][2];
175 /* matJ2[(r+3)%4][(c+3)%4] = matJ[r][c]: same entries with indices relabelled 0->3, 1->0, 2->1, 3->2. */
176  matJ2[3][3] = matJ[0][0];
177  matJ2[3][0] = matJ[0][1];
178  matJ2[3][1] = matJ[0][2];
179  matJ2[3][2] = matJ[0][3];
180  matJ2[0][3] = matJ[1][0];
181  matJ2[0][0] = matJ[1][1];
182  matJ2[0][1] = matJ[1][2];
183  matJ2[0][2] = matJ[1][3];
184  matJ2[1][3] = matJ[2][0];
185  matJ2[1][0] = matJ[2][1];
186  matJ2[1][1] = matJ[2][2];
187  matJ2[1][2] = matJ[2][3];
188  matJ2[2][3] = matJ[3][0];
189  matJ2[2][0] = matJ[3][1];
190  matJ2[2][1] = matJ[3][2];
191  matJ2[2][2] = matJ[3][3];
192 /* Embed matJ2 into the 64x64 matrix: bits mi,mj carry its two indices, bits mri,mrj,mrk,mrl are equal in row (iSSL1) and column (iSSL2). */
193  for(ellri=0; ellri<2; ellri++){
194  for(ellrj=0; ellrj<2; ellrj++){
195  for(ellrk=0; ellrk<2; ellrk++){
196  for(ellrl=0; ellrl<2; ellrl++){
197  for(elli1=0; elli1<2; elli1++){
198  for(ellj1=0; ellj1<2; ellj1++){
199  for(elli2=0; elli2<2; elli2++){
200  for(ellj2=0; ellj2<2; ellj2++){
201 
202  iSSL1 = elli1*(int)pow(2,mi) + ellj1*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl);
203  iSSL2 = elli2*(int)pow(2,mi) + ellj2*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl);
204  iSS1 = elli1 + 2*ellj1;
205  iSS2 = elli2 + 2*ellj2;
206  matJL[iSSL1+64*iSSL2] += matJ2[iSS1][iSS2];
207  }
208  }
209  }
210  }
211  }
212  }
213  }
214  }
215 
216 
217  }/* loop for ell */
218 /* When pivot_flag is 1, the 2x2 matrix matB built from vecB is added on bit ellj2 for ellj2 = 0 .. ishift_nspin-1; the other five bits are equal in row and column. */
219  /* external magnetic field B */
220  if(pivot_flag==1){
221  matB[0][0] = + X->Boost.vecB[2]; // -BM
222  matB[1][1] = - X->Boost.vecB[2]; // -BM
223  //matB[0][1] = - X->Boost.vecB[0] + I*X->Boost.vecB[1]; // -BM
224  //matB[1][0] = - X->Boost.vecB[0] - I*X->Boost.vecB[1]; // -BM
225  matB[0][1] = - X->Boost.vecB[0] - I*X->Boost.vecB[1]; // -BM
226  matB[1][0] = - X->Boost.vecB[0] + I*X->Boost.vecB[1]; // -BM
227  for(ellri=0; ellri<2; ellri++){
228  for(ellrj=0; ellrj<2; ellrj++){
229  for(ellrk=0; ellrk<2; ellrk++){
230  for(ellrl=0; ellrl<2; ellrl++){
231  for(ellj1=0; ellj1<2; ellj1++){
232  for(elli1=0; elli1<2; elli1++){
233  for(elli2=0; elli2<2; elli2++){
234  for(ellj2=0; ellj2<X->Boost.ishift_nspin; ellj2++){
235  iSSL1 = elli1*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6));
236  iSSL2 = elli2*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6));
237  matJL[iSSL1+64*iSSL2] += matB[elli1][elli2];
238  }
239  }
240  }
241  }
242  }
243  }
244  }
245  }
246  }
247  /* external magnetic field B */
248 /* iomp = number of chunks of 2*pow51 elements processed by the threaded loop below. */
249  iomp=i_max/(int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+2);
250 /* arrayx, arrayz and arrayw are private to each thread and are allocated and freed inside the parallel region. */
251  #pragma omp parallel default(none) private(arrayx,arrayz,arrayw,ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \
252  shared(matJL,matI,iomp,i_max,myrank,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,tmp_v0,tmp_v1,tmp_v3)
253  {
254 /* Each buffer is sized for 64 * 2^(ishift4+ishift5-1) complex values. */
255  c_malloc1(arrayx, (64*((int)pow(2.0,ishift4+ishift5-1))));
256  c_malloc1(arrayz, (64*((int)pow(2.0,ishift4+ishift5-1))));
257  c_malloc1(arrayw, (64*((int)pow(2.0,ishift4+ishift5-1))));
258 /* Chunks ell6 are shared among threads. First half of a chunk (offset Ipart1): gather tmp_v1 into arrayz and tmp_v0 into arrayx, and copy the same tmp_v1 elements to tmp_v3. */
259 #pragma omp for
260  for(ell6 = 0; ell6 < iomp; ell6++){
261  Ipart1=pow51*2*ell6;
262  for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){
263  for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){
264  for(m0 = 0; m0 < 16; m0++){
265  arrayz[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)];
266  arrayz[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)];
267  arrayz[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)];
268  arrayz[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
269  tmp_v3[(1 + m0+16*ell4 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)];
270  tmp_v3[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)];
271  tmp_v3[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)];
272  tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
273  arrayx[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4 +pow41*ell5+Ipart1)];
274  arrayx[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)];
275  arrayx[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)];
276  arrayx[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
277  }
278  }
279  }
280 /* Second half of the chunk: the same gather at the additional offset pow51, stored from column 2^(ishift4+ishift5-2) onwards. */
281 
282  for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){
283  for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){
284  for(m0 = 0; m0 < 16; m0++){
285  arrayz[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)];
286  arrayz[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)];
287  arrayz[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)];
288  arrayz[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
289  tmp_v3[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)];
290  tmp_v3[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)];
291  tmp_v3[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)];
292  tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
293  arrayx[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)];
294  arrayx[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)];
295  arrayx[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)];
296  arrayx[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
297  }
298 
299  }
300  }
301 /* BLAS ZGEMM without transposition and with ALPHA = BETA = 1: arrayx := matJL*arrayz + arrayx, all matrices with leading dimension 64. */
302  TRANSA = 'N';
303  TRANSB = 'N';
304  M = 64;
305  N = (int)pow(2.0, ishift4+ishift5-1);
306  K = 64;
307  ALPHA = 1.0;
308  LDA = 64;
309  LDB = 64;
310  BETA = 1.0;
311  LDC = 64;
312 
313  zgemm_(&TRANSA,&TRANSB,&M,&N,&K,&ALPHA,matJL,&LDA,arrayz,&LDB,&BETA,arrayx,&LDC);
314  //zgemm_(&TRANSA,&TRANSB,&M,&N,&K,&ALPHA,matI,&LDA,arrayz,&LDB,&BETA,arrayx,&LDC);
315 /*
316  for(ell5=0;ell5<(64*N);ell5++){
317  arrayw[ell5]=0.0;
318  }
319  for(ell5=0;ell5<64;ell5++){
320  for(ell4=0;ell4<64;ell4++){
321  for(m0=0;m0<N;m0++){
322  arrayw[(ell5+64*m0)] += matJL[(ell5+64*ell4)]*arrayz[(ell4+64*m0)];
323  }
324  }
325  }
326  for(ell5=0;ell5<64*N;ell5++){
327  arrayx[ell5] += arrayw[ell5];
328  }
329 */
330 /* Scatter arrayx back into tmp_v1 at the positions the data was gathered from: first half of the chunk, then second half. */
331 
332 
333  for(ell5 = 0; ell5 < (int)pow(2.0,ishift5-1); ell5++){
334  for(ell4 = 0; ell4 < (int)pow(2.0,ishift4-1); ell4++){
335  for(m0 = 0; m0 < 16; m0++){
336  tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)] = arrayx[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)))];
337  tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)] = arrayx[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)))];
338  tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)] = arrayx[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)))];
339  tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)] = arrayx[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)))];
340  }
341  }
342  }
343  for(ell5 = 0; ell5 < (int)pow(2.0,ishift5-1); ell5++){
344  for(ell4 = 0; ell4 < (int)pow(2.0,ishift4-1); ell4++){
345  for(m0 = 0; m0 < 16; m0++){
346  tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)] = arrayx[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
347  tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)] = arrayx[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
348  tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)] = arrayx[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
349  tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)] = arrayx[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
350  }
351  }
352  }
353 
354  }/* omp parallel for */
355  c_free1(arrayz, (64*((int)pow(2.0,ishift4+ishift5-1))) );
356  c_free1(arrayx, (64*((int)pow(2.0,ishift4+ishift5-1))) );
357  c_free1(arrayw, (64*((int)pow(2.0,ishift4+ishift5-1))) );
358 
359  }/* omp parallel */
360 /* pivot_flag == 1: move tmp_v1 into tmp_v0 and tmp_v3 into tmp_v1 with the two index parts (ell4, ell5) exchanging roles; otherwise copy element by element. */
361  if(pivot_flag==1){
362  iomp=i_max/(int)pow(2.0,X->Boost.ishift_nspin);
363  #pragma omp parallel for default(none) private(ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \
364  firstprivate(iomp) shared(i_max,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,X,tmp_v0,tmp_v1)
365  for(ell5 = 0; ell5 < iomp; ell5++ ){
366  for(ell4 = 0; ell4 < (int)pow(2.0,X->Boost.ishift_nspin); ell4++){
367  tmp_v0[(1 + ell5+(i_max/(int)pow(2.0,X->Boost.ishift_nspin))*ell4)] = tmp_v1[(1 + ell4+((int)pow(2.0,X->Boost.ishift_nspin))*ell5)];
368  }
369  }
370  iomp=i_max/(int)pow(2.0,X->Boost.ishift_nspin);
371  #pragma omp parallel for default(none) private(ell4,ell5) \
372  firstprivate(iomp) shared(i_max,X,tmp_v1,tmp_v3)
373  for(ell5 = 0; ell5 < iomp; ell5++ ){
374  for(ell4 = 0; ell4 < (int)pow(2.0,X->Boost.ishift_nspin); ell4++){
375  tmp_v1[(1 + ell5+(i_max/(int)pow(2.0,X->Boost.ishift_nspin))*ell4)] = tmp_v3[(1 + ell4+((int)pow(2.0,X->Boost.ishift_nspin))*ell5)];
376  }
377  }
378  }
379  else{
380  #pragma omp parallel for default(none) private(ell4) \
381  shared(i_max,tmp_v0,tmp_v1,tmp_v3)
382  for(ell4 = 0; ell4 < i_max; ell4++ ){
383  tmp_v0[1 + ell4] = tmp_v1[1 + ell4];
384  tmp_v1[1 + ell4] = tmp_v3[1 + ell4];
385  }
386  }/* if pivot_flag */
387 
388  }/* loop for j */
389 /* All-to-all exchange of i_max/nproc elements per rank: tmp_v1 is sent and received into tmp_v3, tmp_v0 is sent and received into tmp_v2. */
390  /*
391  ierr = MPI_Alltoall(&tmp_v1[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,&tmp_v3[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
392  ierr = MPI_Alltoall(&tmp_v0[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,&tmp_v2[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
393  */
394  MPI_Alltoall(&tmp_v1[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,&tmp_v3[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
395  MPI_Alltoall(&tmp_v0[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,&tmp_v2[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
396 /* Rearrange the received blocks from tmp_v3 and tmp_v2 back into tmp_v1 and tmp_v0. */
397 
398  iomp=(int)pow(2.0,X->Boost.W0)/nproc;
399  #pragma omp parallel for default(none) private(ell4,ell5,ell6) \
400  firstprivate(iomp) shared(i_max,X,nproc,tmp_v0,tmp_v1,tmp_v2,tmp_v3)
401  //for(ell4 = 0; ell4 < (int)pow(2.0,X->Boost.W0)/nproc; ell4++ ){
402  for(ell4 = 0; ell4 < iomp; ell4++ ){
403  for(ell5 = 0; ell5 < nproc; ell5++ ){
404  for(ell6 = 0; ell6 < (int)(i_max/(int)pow(2.0,X->Boost.W0)); ell6++ ){
405  tmp_v1[(1 + ell6+ell5*i_max/(int)pow(2.0,X->Boost.W0)+ell4*i_max/((int)pow(2.0,X->Boost.W0)/nproc))] = tmp_v3[(1 + ell6+ell4*i_max/(int)pow(2.0,X->Boost.W0)+ell5*i_max/nproc)];
406  tmp_v0[(1 + ell6+ell5*i_max/(int)pow(2.0,X->Boost.W0)+ell4*i_max/((int)pow(2.0,X->Boost.W0)/nproc))] = tmp_v2[(1 + ell6+ell4*i_max/(int)pow(2.0,X->Boost.W0)+ell5*i_max/nproc)];
407  }
408  }
409  }
410 
411 
412  }/* loop for iloop */
413 
414 /*
415  dam_pr= X_child_general_int_spin_MPIBoost
416  (
417  matJ, X, tmp_v0, tmp_v1);
418 
419  X->Large.prdct += dam_pr;
420 */
421 // c_free1(arrayz, (int)pow(2.0, 16));
422 // c_free1(arrayx, (int)pow(2.0, 16));
423 // c_free1(arrayw, (int)pow(2.0, 16));
424 /* Release the work storage allocated before the iloop loop. */
425  c_free2(vecJ, 3, 3);
426  c_free2(matJ, 4, 4);
427  c_free2(matJ2, 4, 4);
428  c_free2(matB, 2, 2);
429  c_free1(matJL, (64*64));
430  c_free1(matI, (64*64));
431 
432 #endif
433 
434 }/*void child_general_int_spin_MPIBoost*/
int nproc
Number of processors, defined in InitializeMPI()
Definition: global.h:161
void zgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double complex *ALPHA, double complex *matJL, int *LDA, double complex *arrayz, int *LDB, double complex *BETA, double complex *arrayx, int *LDC)
struct EDMainCalStruct X
Definition: struct.h:431

◆ zgemm_()

void zgemm_ ( char *  TRANSA,
char *  TRANSB,
int *  M,
int *  N,
int *  K,
double complex *  ALPHA,
double complex *  matJL,
int *  LDA,
double complex *  arrayz,
int *  LDB,
double complex *  BETA,
double complex *  arrayx,
int *  LDC 
)