87 int mpi_stbmm(
double **V, int64_t nrow,
int m,
int m_rowwise,
Block *tpltzblocks,
int nb_blocks_local,
int nb_blocks_all, int64_t idp,
int local_V_size,
Flag flag_stgy, MPI_Comm comm)
89 #else //for sequential use only
90 int mpi_stbmm(
double **V, int64_t nrow,
int m,
int m_rowwise,
Block *tpltzblocks,
int nb_blocks_local,
int nb_blocks_all, int64_t idp,
int local_V_size,
Flag flag_stgy)
101 MPI_Comm_rank(comm, &rank);
102 MPI_Comm_size(comm, &size);
126 nnew = (
int*) calloc(nb_blocks_local,
sizeof(
int));
128 int local_V_size_new;
129 int n_rowwise=local_V_size;
131 int status_params =
get_overlapping_blocks_params( nb_blocks_local, tpltzblocks, local_V_size, nrow, idp, &idpnew, &local_V_size_new, nnew, &idv0, &idvn);
135 printf(
"status_params=%d\n", status_params);
137 if( status_params == 0) {
142 if (tpltzblocks[idv0].lambda==0 || tpltzblocks[idvn].lambda==0)
147 fprintf(file,
"new parameters caracteristics:\n");
148 fprintf(file,
"[%d] idp=%ld ; idpnew=%ld\n", rank, idp, idpnew);
149 fprintf(file,
"[%d] local_V_size=%d ; local_V_size_new=%d\n", rank, local_V_size, local_V_size_new);
150 for(i=0;i<nb_blocks_local;i++)
151 fprintf(file,
"[%d] n[%d]=%d ; nnew[%d]=%d\n", rank, i, (tpltzblocks[i].n), i, nnew[i]);
152 for(i=0;i<nb_blocks_local;i++)
153 fprintf(file,
"[%d] tpltzblocks[%d].idv=%ld\n", rank, i, tpltzblocks[i].idv);
157 int vShft=idpnew-idp;
161 int idvm0 = idpnew/nrow;
162 int idvmn = (idpnew+local_V_size_new-1)/nrow;
164 int ncol_rank = idvmn-idvm0+1;
169 nb_blocks_rank = idvn - idv0 + 1;
171 nb_blocks_rank = (ncol_rank-2)*nb_blocks_local + (nb_blocks_local-idv0) + (idvn+1);
174 fprintf(file,
"[%d] nb_blocks_rank=%d, nb_blocks_local=%d\n", rank, nb_blocks_rank, nb_blocks_local);
177 int idvp0 = idpnew%nrow-tpltzblocks[idv0].
idv;
180 idvpn = tpltzblocks[idvn].
idv+nnew[idvn]-1 - (idpnew+local_V_size_new-1)%nrow ;
184 int offset0, offsetn;
185 int distcorrmin_idv0 = (tpltzblocks[idv0].
lambda)-1;
186 int distcorrmin_idvn = (tpltzblocks[idvn].
lambda)-1;
189 offset0 = min( idvp0, distcorrmin_idv0);
191 offsetn = min(idvpn, distcorrmin_idvn);
199 toSendLeft = min( tpltzblocks[idv0].idv+nnew[idv0]-idpnew%nrow, distcorrmin_idv0);
202 toSendRight = min( (idpnew+local_V_size_new)%nrow-tpltzblocks[idvn].idv, distcorrmin_idvn);
205 int flag_optimlambda=1;
207 int lambdaOut_offset;
210 int lambdaOut_size, lambdaIn_size;
212 if (flag_optimlambda==1) {
213 LambdaOut=(
double *) calloc((toSendLeft+toSendRight)*m_rowwise,
sizeof(double));
214 lambdaOut_offset = toSendLeft*m_rowwise;
215 lambdaIn_offset = offset0*m_rowwise;
216 lambdaOut_size = (toSendLeft+toSendRight)*m_rowwise ;
217 lambdaIn_size = (offset0+offsetn)*m_rowwise;
220 LambdaOut=(
double *) calloc((tpltzblocks[idv0].lambda+tpltzblocks[idvn].lambda)*m_rowwise,
sizeof(double));
221 lambdaOut_offset = tpltzblocks[idv0].
lambda*m_rowwise;
222 lambdaIn_offset = tpltzblocks[idv0].
lambda*m_rowwise;
223 lambdaOut_size = (tpltzblocks[idv0].
lambda+tpltzblocks[idvn].
lambda)*m_rowwise;
224 lambdaIn_size = (tpltzblocks[idv0].
lambda+tpltzblocks[idvn].
lambda)*m_rowwise;
229 for (j=0;j<m_rowwise;j++)
230 for (i=0;i<toSendLeft;i++)
231 LambdaOut[i+j*toSendLeft]=(*V)[i+j*n_rowwise];
234 for (j=0;j<m_rowwise;j++)
235 for (i=0;i<toSendRight;i++)
236 LambdaOut[i+j*toSendRight+lambdaOut_offset]=(*V)[i+j*n_rowwise+local_V_size-toSendRight];
242 if(rank==0 || offset0==0)
243 left = MPI_PROC_NULL;
244 if(rank==size-1 || offsetn==0)
245 right = MPI_PROC_NULL;
247 double *LambdaIn=(
double *) calloc(lambdaIn_size,
sizeof(
double));
250 int flag_blockingcomm=0;
251 MPI_Request requestLeft_r, requestLeft_s;
252 MPI_Request requestRight_r, requestRight_s;
254 if (flag_blockingcomm==1) {
256 MPI_Sendrecv( LambdaOut, toSendLeft*m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, (LambdaIn+lambdaIn_offset), offsetn*m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, comm, &status);
257 MPI_Sendrecv( (LambdaOut+lambdaOut_offset), toSendRight*m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, LambdaIn, offset0*m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, comm, &status);
262 MPI_Irecv((LambdaIn+lambdaIn_offset), offsetn*m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, comm, &requestLeft_r);
263 MPI_Isend(LambdaOut, toSendLeft*m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, comm, &requestLeft_s);
266 MPI_Irecv(LambdaIn, offset0*m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, comm, &requestRight_r);
267 MPI_Isend((LambdaOut+lambdaOut_offset), toSendRight*m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, comm, &requestRight_s);
275 int v0rank_size, vnrank_size;
276 if (nb_blocks_rank == 1) {
277 v0rank_size = ((idpnew+local_V_size_new-1)%nrow +1) - idpnew%nrow + offset0 + offsetn;
281 v0rank_size = tpltzblocks[idv0].
idv + nnew[idv0] - idpnew%nrow + offset0;
282 vnrank_size = ((idpnew+local_V_size_new-1)%nrow +1) - tpltzblocks[idvn].
idv + offsetn;
288 if (flag_blockingcomm!=1) {
290 MPI_Wait(&requestLeft_r, &status);
291 MPI_Wait(&requestLeft_s, &status);
292 MPI_Wait(&requestRight_r, &status);
293 MPI_Wait(&requestRight_s, &status);
316 fftw_complex *V_fft, *T_fft;
318 fftw_plan plan_f, plan_b;
337 for(iblock=idv0;iblock<idv0+nb_blocks_rank;iblock++) {
338 id = iblock%nb_blocks_local;
348 fprintf(file,
"[%d] First block...\n", rank);
350 vblock_size=v0rank_size;
351 id0block=(offset_id0-offset0);
353 V1block = (
double *) calloc(vblock_size*m_rowwise,
sizeof(
double));
355 for (j=0;j<m_rowwise;j++) {
356 #pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
357 for (i=0;i<offset0;i++)
358 V1block[i+j*vblock_size] = LambdaIn[i+j*offset0];
365 int currentsize_middlepart=min(vblock_size-offset0, local_V_size_new);
367 for (j=0;j<m_rowwise;j++) {
368 #pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
369 for (i=0;i<currentsize_middlepart;i++)
370 V1block[offset0+i+j*vblock_size] = (*V)[i+vShft+j*n_rowwise];
373 if (nb_blocks_rank == 1) {
374 for (j=0;j<m_rowwise;j++) {
375 #pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
376 for (i=0;i<offsetn;i++) {
377 V1block[vblock_size-offsetn+i+j*vblock_size] = LambdaIn[i+lambdaIn_offset+j*offsetn];
383 tpltz_init(vblock_size, tpltzblocks[
id].lambda, &nfft, &blocksize, &T_fft, (tpltzblocks[
id].T_block), &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);
387 fprintf(file,
"[%d] Before stmm_main call : nfft = %d, blocksize = %d\n", rank, nfft, blocksize);
388 stmm_main(&V1block, vblock_size, m_rowwise, 0, m_rowwise*vblock_size, (tpltzblocks[
id].T_block), T_fft, tpltzblocks[
id].lambda, V_fft, V_rfft, plan_f, plan_b, blocksize, nfft, flag_stgy);
393 int currentsize=min(vblock_size-offset0, local_V_size_new);
394 for (j=0;j<m_rowwise;j++) {
395 #pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
396 for (i=0;i<currentsize;i++)
397 (*V)[vShft+i+j*n_rowwise] = V1block[offset0+i+j*vblock_size];
407 else if(iblock!=idv0 && iblock!=idv0+nb_blocks_rank-1) {
411 fprintf(file,
"[%d] generic block...\n");
413 vblock_size=nnew[id];
416 V1block = (
double *) calloc(vblock_size*m_rowwise,
sizeof(
double));
418 idv1 = (tpltzblocks[id].
idv)-idp%nrow - vShft + offset0 +nrow*( (iblock/nb_blocks_local) );
420 idv2 = (tpltzblocks[id].
idv)-(idpnew)%nrow+vShft + nrow*( (iblock/nb_blocks_local) );
422 for (j=0;j<m_rowwise;j++) {
423 #pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
424 for (i=0;i<vblock_size;i++)
425 V1block[i+j*vblock_size] = (*V)[i+idv2+j*n_rowwise];
430 tpltz_init(nnew[
id], tpltzblocks[
id].lambda, &nfft, &blocksize, &T_fft, (tpltzblocks[
id].T_block), &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);
434 fprintf(file,
"[%d] Before stmm_main call : nfft = %d, blocksize = %d\n", rank, nfft, blocksize);
435 stmm_main(&V1block, vblock_size, m_rowwise, 0, m_rowwise*vblock_size, (tpltzblocks[
id].T_block), T_fft, tpltzblocks[
id].lambda, V_fft, V_rfft, plan_f, plan_b, blocksize, nfft, flag_stgy);
441 for (j=0;j<m_rowwise;j++) {
442 #pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
443 for (i=0;i<vblock_size;i++) {
444 (*V)[i+idv2+j*n_rowwise] = V1block[i+j*vblock_size];
455 else if(iblock==idv0+nb_blocks_rank-1 && iblock!= idv0) {
457 fprintf(file,
"[%d] last block...\n");
459 vblock_size=vnrank_size;
462 V1block = (
double *) calloc(vblock_size*m_rowwise,
sizeof(
double));
464 idv1 = (tpltzblocks[id].
idv) - idp%nrow - vShft + offset0 + nrow*( (iblock/nb_blocks_local) );
465 idv2 = (tpltzblocks[id].
idv)-(idpnew)%nrow+vShft + nrow*( (iblock/nb_blocks_local) );
468 for (j=0;j<m_rowwise;j++) {
469 #pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
470 for (i=0;i<vblock_size-offsetn;i++)
471 V1block[i+j*vblock_size] = (*V)[i+idv2+j*n_rowwise];
475 for (j=0;j<m_rowwise;j++) {
476 #pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
477 for (i=0;i<offsetn;i++)
478 V1block[vblock_size-offsetn+i+j*vblock_size] = LambdaIn[i+lambdaIn_offset+j*offsetn];
483 tpltz_init(vblock_size, tpltzblocks[
id].lambda, &nfft, &blocksize, &T_fft, (tpltzblocks[
id].T_block), &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);
487 fprintf(file,
"[%d] Before stmm_main call : nfft = %d, blocksize = %d\n", rank, nfft, blocksize);
489 stmm_main(&V1block, vblock_size, m_rowwise, 0, vblock_size*m_rowwise, (tpltzblocks[
id].T_block), T_fft, tpltzblocks[
id].lambda, V_fft, V_rfft, plan_f, plan_b, blocksize, nfft, flag_stgy);
493 for (j=0;j<m_rowwise;j++) {
494 #pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
495 for (i=0;i<vnrank_size-offsetn;i++) {
496 (*V)[idv2+i+j*n_rowwise] = V1block[i+j*vblock_size];
529 int get_overlapping_blocks_params(
int nbloc,
Block *tpltzblocks,
int local_V_size, int64_t nrow, int64_t idp, int64_t *idpnew,
int *local_V_size_new,
int *nnew,
int *ifirstBlock,
int *ilastBlock)
532 int ib, nblockOK=0, nfullcol_data;
533 int64_t firstrow, lastrow;
538 nfullcol_data = max(0, (local_V_size-(nrow-idp%nrow)%nrow-(idp+local_V_size)%nrow)/nrow );
541 if( nfullcol_data > 0) {
543 for( ib=0; ib<nbloc; ib++) {
544 if( tpltzblocks[ib].idv < nrow) {
545 nnew[ib] = min( tpltzblocks[ib].n, nrow-tpltzblocks[ib].idv);
554 lastrow = (idp+local_V_size-1)%nrow;
556 if( firstrow < lastrow) {
558 for( ib=0; ib<nbloc; ib++) {
559 if( (tpltzblocks[ib].idv+tpltzblocks[ib].n > firstrow) && (tpltzblocks[ib].
idv < lastrow+1)) {
560 nnew[ib] = min( tpltzblocks[ib].n, nrow-tpltzblocks[ib].idv);
568 for( ib=0; ib<nbloc; ib++) {
569 if( (tpltzblocks[ib].idv + tpltzblocks[ib].n > firstrow) && (tpltzblocks[ib].idv < nrow)) {
570 nnew[ib] = min( tpltzblocks[ib].n, nrow-tpltzblocks[ib].idv);
574 if( (tpltzblocks[ib].idv < lastrow+1) && (tpltzblocks[ib].idv+tpltzblocks[ib].n > 0)) {
575 nnew[ib] = min( tpltzblocks[ib].n, nrow-tpltzblocks[ib].idv);
583 printf(
"nblockOK=%d\n", nblockOK);
586 if( nblockOK == 0)
return(0);
593 for( *ifirstBlock = -1; *ifirstBlock == -1; ) {
594 for(ib=0;ib<nbloc;ib++) {
595 if(nnew[ib] != 0 && idptmp%nrow < tpltzblocks[ib].idv+nnew[ib])
break;
598 if (ib<nbloc && tpltzblocks[ib].idv <= idptmp%nrow) {
602 else if (ib<nbloc && tpltzblocks[ib].idv > idptmp%nrow) {
606 int idvfirstcolumn = idptmp/nrow;
607 *idpnew = tpltzblocks[ib].
idv+idvfirstcolumn*nrow;
610 idptmp += nrow-idptmp%nrow;
616 idptmp = idp+local_V_size-1;
618 for( *ilastBlock = -1; *ilastBlock == -1; ) {
619 for(ib=nbloc-1;ib>=0;ib--) {
620 if(nnew[ib] != 0 && tpltzblocks[ib].idv <= idptmp%nrow)
break;
624 if (ib>=0 && idptmp%nrow < tpltzblocks[ib].idv+nnew[ib]) {
626 *local_V_size_new = local_V_size-(*idpnew)+idp;
628 else if (ib>=0 && tpltzblocks[ib].idv+nnew[ib] <= idptmp%nrow) {
635 int idvlastcolumn = idptmp/nrow;
636 *local_V_size_new = tpltzblocks[ib].
idv+nnew[ib]+idvlastcolumn*nrow - (*idpnew);
640 idptmp = idptmp - (idptmp%nrow)-1;