52 int M, N, K, LDA, LDB, LDC;
53 double complex ALPHA, BETA;
54 long unsigned int i_max;
55 long unsigned int j, k, ell, iloop;
56 long unsigned int i1, i2;
57 long unsigned int iomp;
58 long unsigned int ell4, ell5, ell6, m0, Ipart1;
59 long unsigned int mi, mj, mri, mrj, mrk, mrl;
61 long unsigned int ellrl, ellrk, ellrj, ellri, elli1, elli2, ellj1, ellj2;
62 long unsigned int iSS1, iSS2, iSSL1, iSSL2;
63 double complex **vecJ;
64 double complex **matJ, **matJ2;
65 double complex *matJL;
67 double complex **matB;
68 double complex *arrayz;
69 double complex *arrayx;
70 double complex *arrayw;
71 long unsigned int ishift1, ishift2, ishift3, ishift4, ishift5, pivot_flag, num_J_star;
72 long unsigned int pow4, pow5, pow41, pow51;
75 i_max =
X->Check.idim_max;
86 c_malloc2(vecJ, 3, 3);
87 c_malloc2(matJ, 4, 4);
88 c_malloc2(matJ2, 4, 4);
89 c_malloc2(matB, 2, 2);
90 c_malloc1(matJL, (64*64));
91 c_malloc1(matI, (64*64));
99 for(iloop=0; iloop <
X->Boost.R0; iloop++){
102 for(j=iloop*
X->Boost.num_pivot; j < (iloop+1)*
X->Boost.num_pivot; j++){
104 num_J_star = (
long unsigned int)
X->Boost.list_6spin_star[j][0];
105 ishift1 = (
long unsigned int)
X->Boost.list_6spin_star[j][1];
106 ishift2 = (
long unsigned int)
X->Boost.list_6spin_star[j][2];
107 ishift3 = (
long unsigned int)
X->Boost.list_6spin_star[j][3];
108 ishift4 = (
long unsigned int)
X->Boost.list_6spin_star[j][4];
109 ishift5 = (
long unsigned int)
X->Boost.list_6spin_star[j][5];
110 pivot_flag = (
long unsigned int)
X->Boost.list_6spin_star[j][6];
114 pow4 = (
int)pow(2.0,ishift1+ishift2+ishift3+ishift4);
115 pow5 = (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5);
119 pow41= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+1);
120 pow51= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+1);
122 for(k=0; k < (64*64); k++){
123 matJL[k] = 0.0 + 0.0*I;
124 matI[k] = 0.0 + 0.0*I;
126 for(k=0; k < 64; k++){
130 for(ell=0; ell < num_J_star; ell++){
131 mi = (
long unsigned int)
X->Boost.list_6spin_pair[j][0][ell];
132 mj = (
long unsigned int)
X->Boost.list_6spin_pair[j][1][ell];
133 mri = (
long unsigned int)
X->Boost.list_6spin_pair[j][2][ell];
134 mrj = (
long unsigned int)
X->Boost.list_6spin_pair[j][3][ell];
135 mrk = (
long unsigned int)
X->Boost.list_6spin_pair[j][4][ell];
136 mrl = (
long unsigned int)
X->Boost.list_6spin_pair[j][5][ell];
137 indj =
X->Boost.list_6spin_pair[j][6][ell];
138 for(i1 = 0; i1 < 3; i1++){
139 for(i2 = 0; i2 < 3; i2++){
140 vecJ[i1][i2] =
X->Boost.arrayJ[(indj-1)][i1][i2];
144 matJ[0][0] = vecJ[2][2];
146 matJ[0][1] = vecJ[0][0]-vecJ[1][1]-I*vecJ[0][1]-I*vecJ[1][0];
148 matJ[0][2] = vecJ[2][0]-I*vecJ[2][1];
150 matJ[0][3] = vecJ[0][2]-I*vecJ[1][2];
152 matJ[1][0] = vecJ[0][0]-vecJ[1][1]+I*vecJ[0][1]+I*vecJ[1][0];
154 matJ[1][1] = vecJ[2][2];
156 matJ[1][2] =(-1.0)*vecJ[0][2]-I*vecJ[1][2];
158 matJ[1][3] =(-1.0)*vecJ[2][0]-I*vecJ[2][1];
160 matJ[2][0] = vecJ[2][0]+I*vecJ[2][1];
162 matJ[2][1] =(-1.0)*vecJ[0][2]+I*vecJ[1][2];
164 matJ[2][2] =(-1.0)*vecJ[2][2];
166 matJ[2][3] = vecJ[0][0]+vecJ[1][1]+I*vecJ[0][1]-I*vecJ[1][0];
168 matJ[3][0] = vecJ[0][2]+I*vecJ[1][2];
170 matJ[3][1] =(-1.0)*vecJ[2][0]+I*vecJ[2][1];
172 matJ[3][2] = vecJ[0][0]+vecJ[1][1]-I*vecJ[0][1]+I*vecJ[1][0];
174 matJ[3][3] =(-1.0)*vecJ[2][2];
176 matJ2[3][3] = matJ[0][0];
177 matJ2[3][0] = matJ[0][1];
178 matJ2[3][1] = matJ[0][2];
179 matJ2[3][2] = matJ[0][3];
180 matJ2[0][3] = matJ[1][0];
181 matJ2[0][0] = matJ[1][1];
182 matJ2[0][1] = matJ[1][2];
183 matJ2[0][2] = matJ[1][3];
184 matJ2[1][3] = matJ[2][0];
185 matJ2[1][0] = matJ[2][1];
186 matJ2[1][1] = matJ[2][2];
187 matJ2[1][2] = matJ[2][3];
188 matJ2[2][3] = matJ[3][0];
189 matJ2[2][0] = matJ[3][1];
190 matJ2[2][1] = matJ[3][2];
191 matJ2[2][2] = matJ[3][3];
193 for(ellri=0; ellri<2; ellri++){
194 for(ellrj=0; ellrj<2; ellrj++){
195 for(ellrk=0; ellrk<2; ellrk++){
196 for(ellrl=0; ellrl<2; ellrl++){
197 for(elli1=0; elli1<2; elli1++){
198 for(ellj1=0; ellj1<2; ellj1++){
199 for(elli2=0; elli2<2; elli2++){
200 for(ellj2=0; ellj2<2; ellj2++){
202 iSSL1 = elli1*(int)pow(2,mi) + ellj1*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl);
203 iSSL2 = elli2*(int)pow(2,mi) + ellj2*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl);
204 iSS1 = elli1 + 2*ellj1;
205 iSS2 = elli2 + 2*ellj2;
206 matJL[iSSL1+64*iSSL2] += matJ2[iSS1][iSS2];
221 matB[0][0] = +
X->Boost.vecB[2];
222 matB[1][1] = -
X->Boost.vecB[2];
225 matB[0][1] = -
X->Boost.vecB[0] - I*
X->Boost.vecB[1];
226 matB[1][0] = -
X->Boost.vecB[0] + I*
X->Boost.vecB[1];
227 for(ellri=0; ellri<2; ellri++){
228 for(ellrj=0; ellrj<2; ellrj++){
229 for(ellrk=0; ellrk<2; ellrk++){
230 for(ellrl=0; ellrl<2; ellrl++){
231 for(ellj1=0; ellj1<2; ellj1++){
232 for(elli1=0; elli1<2; elli1++){
233 for(elli2=0; elli2<2; elli2++){
234 for(ellj2=0; ellj2<
X->Boost.ishift_nspin; ellj2++){
235 iSSL1 = elli1*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6));
236 iSSL2 = elli2*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6));
237 matJL[iSSL1+64*iSSL2] += matB[elli1][elli2];
249 iomp=i_max/(int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+2);
251 #pragma omp parallel default(none) private(arrayx,arrayz,arrayw,ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \ 252 shared(matJL,matI,iomp,i_max,myrank,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,tmp_v0,tmp_v1,tmp_v3) 255 c_malloc1(arrayx, (64*((
int)pow(2.0,ishift4+ishift5-1))));
256 c_malloc1(arrayz, (64*((
int)pow(2.0,ishift4+ishift5-1))));
257 c_malloc1(arrayw, (64*((
int)pow(2.0,ishift4+ishift5-1))));
260 for(ell6 = 0; ell6 < iomp; ell6++){
262 for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){
263 for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){
264 for(m0 = 0; m0 < 16; m0++){
265 arrayz[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)];
266 arrayz[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)];
267 arrayz[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)];
268 arrayz[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
269 tmp_v3[(1 + m0+16*ell4 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)];
270 tmp_v3[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)];
271 tmp_v3[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)];
272 tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
273 arrayx[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4 +pow41*ell5+Ipart1)];
274 arrayx[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)];
275 arrayx[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)];
276 arrayx[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
282 for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){
283 for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){
284 for(m0 = 0; m0 < 16; m0++){
285 arrayz[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)];
286 arrayz[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)];
287 arrayz[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)];
288 arrayz[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
289 tmp_v3[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)];
290 tmp_v3[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)];
291 tmp_v3[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)];
292 tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
293 arrayx[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)];
294 arrayx[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)];
295 arrayx[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)];
296 arrayx[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
305 N = (int)pow(2.0, ishift4+ishift5-1);
313 zgemm_(&TRANSA,&TRANSB,&M,&N,&K,&ALPHA,matJL,&LDA,arrayz,&LDB,&BETA,arrayx,&LDC);
333 for(ell5 = 0; ell5 < (int)pow(2.0,ishift5-1); ell5++){
334 for(ell4 = 0; ell4 < (int)pow(2.0,ishift4-1); ell4++){
335 for(m0 = 0; m0 < 16; m0++){
336 tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)] = arrayx[(0 + m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
337 tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)] = arrayx[(16+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
338 tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)] = arrayx[(32+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
339 tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)] = arrayx[(48+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
343 for(ell5 = 0; ell5 < (int)pow(2.0,ishift5-1); ell5++){
344 for(ell4 = 0; ell4 < (int)pow(2.0,ishift4-1); ell4++){
345 for(m0 = 0; m0 < 16; m0++){
346 tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)] = arrayx[(0 + m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
347 tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)] = arrayx[(16+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
348 tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)] = arrayx[(32+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
349 tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)] = arrayx[(48+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
355 c_free1(arrayz, (64*((
int)pow(2.0,ishift4+ishift5-1))) );
356 c_free1(arrayx, (64*((
int)pow(2.0,ishift4+ishift5-1))) );
357 c_free1(arrayw, (64*((
int)pow(2.0,ishift4+ishift5-1))) );
362 iomp=i_max/(int)pow(2.0,
X->Boost.ishift_nspin);
363 #pragma omp parallel for default(none) private(ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \ 364 firstprivate(iomp) shared(i_max,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,X,tmp_v0,tmp_v1) 365 for(ell5 = 0; ell5 < iomp; ell5++ ){
366 for(ell4 = 0; ell4 < (int)pow(2.0,
X->Boost.ishift_nspin); ell4++){
367 tmp_v0[(1 + ell5+(i_max/(int)pow(2.0,
X->Boost.ishift_nspin))*ell4)] = tmp_v1[(1 + ell4+((int)pow(2.0,
X->Boost.ishift_nspin))*ell5)];
370 iomp=i_max/(int)pow(2.0,
X->Boost.ishift_nspin);
371 #pragma omp parallel for default(none) private(ell4,ell5) \ 372 firstprivate(iomp) shared(i_max,X,tmp_v1,tmp_v3) 373 for(ell5 = 0; ell5 < iomp; ell5++ ){
374 for(ell4 = 0; ell4 < (int)pow(2.0,
X->Boost.ishift_nspin); ell4++){
375 tmp_v1[(1 + ell5+(i_max/(int)pow(2.0,
X->Boost.ishift_nspin))*ell4)] = tmp_v3[(1 + ell4+((int)pow(2.0,
X->Boost.ishift_nspin))*ell5)];
380 #pragma omp parallel for default(none) private(ell4) \ 381 shared(i_max,tmp_v0,tmp_v1,tmp_v3) 382 for(ell4 = 0; ell4 < i_max; ell4++ ){
383 tmp_v0[1 + ell4] = tmp_v1[1 + ell4];
384 tmp_v1[1 + ell4] = tmp_v3[1 + ell4];
394 MPI_Alltoall(&tmp_v1[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,&tmp_v3[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
395 MPI_Alltoall(&tmp_v0[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,&tmp_v2[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
398 iomp=(int)pow(2.0,
X->Boost.W0)/
nproc;
399 #pragma omp parallel for default(none) private(ell4,ell5,ell6) \ 400 firstprivate(iomp) shared(i_max,X,nproc,tmp_v0,tmp_v1,tmp_v2,tmp_v3) 402 for(ell4 = 0; ell4 < iomp; ell4++ ){
403 for(ell5 = 0; ell5 <
nproc; ell5++ ){
404 for(ell6 = 0; ell6 < (int)(i_max/(
int)pow(2.0,
X->Boost.W0)); ell6++ ){
405 tmp_v1[(1 + ell6+ell5*i_max/(int)pow(2.0,
X->Boost.W0)+ell4*i_max/((int)pow(2.0,
X->Boost.W0)/
nproc))] = tmp_v3[(1 + ell6+ell4*i_max/(int)pow(2.0,
X->Boost.W0)+ell5*i_max/
nproc)];
406 tmp_v0[(1 + ell6+ell5*i_max/(int)pow(2.0,
X->Boost.W0)+ell4*i_max/((int)pow(2.0,
X->Boost.W0)/
nproc))] = tmp_v2[(1 + ell6+ell4*i_max/(int)pow(2.0,
X->Boost.W0)+ell5*i_max/
nproc)];
427 c_free2(matJ2, 4, 4);
429 c_free1(matJL, (64*64));
430 c_free1(matI, (64*64));
int nproc
Number of processors, defined in InitializeMPI()
void zgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double complex *ALPHA, double complex *matJL, int *LDA, double complex *arrayz, int *LDB, double complex *BETA, double complex *arrayx, int *LDC)