96 str_mess = (
char *) malloc(100 *
sizeof(
char));
98 sprintf (str_mess,
"Error on line %d of %s. Toeplitz band width > vector size\n", line, file);
100 sprintf (str_mess,
"Error on line %d of %s. Bad allocation.\n", line, file);
101 if(error_number == 3)
102 sprintf (str_mess,
"Error on line %d of %s. Error at fftw multithread initialization.\n", line, file);
103 if(error_number == 7)
104 sprintf (str_mess,
"Error on line %d of %s.\n", line, file);
105 fprintf(stderr,
"%s", str_mess);
106 printf(
"%s", str_mess);
144 else if (bs_flag==2) {
146 min_pow2 = (int) ceil( log(min_bs)/log(2) );
147 bs = pow(2, min_pow2);
152 else if (bs_flag==3) {
154 min_pow2 = (int) ceil( log(min_bs)/log(2) );
155 bs = pow(2, min_pow2);
160 else if (bs_flag==4 || bs_flag==0) {
162 min_pow2 = (int) ceil( log(min_bs)/log(2) );
163 bs = pow(2, min_pow2);
168 else if (bs_flag==5) {
171 while(bs < 2*(lambda-1)*log(bs+1) && bs<n) {
175 else if (bs_flag==6) {
179 min_pow2 = (int) ceil( log(min_bs)/log(2) );
181 min_pow2 = max(min_pow2, pow(2,14));
184 bs = pow(2, min_pow2);
196 printf(
"Error. Wrong value for bs_flag. Set to auto mode.\n");
198 min_pow2 = (int) ceil( log(min_bs)/log(2) );
199 bs = pow(2, min_pow2);
206 printf(
"Computed optimal blocksize is %d (with lambda = %d)\n", bs, lambda);
226 else if (flag_nfft==1)
228 else if (flag_nfft==2)
231 printf(
"Error. Wrong value for flag_nfft. Set to auto mode.\n");
257 int tpltz_init(
int n,
int lambda,
int *nfft,
int *blocksize, fftw_complex **T_fft,
double *T, fftw_complex **V_fft,
double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b,
Flag flag_stgy)
280 n_thread = omp_get_max_threads();
288 printf(
"Using %d threads\n", n_thread);
289 printf(
"nfft = %d\n", *nfft);
296 #ifdef fftw_MULTITHREADING
310 rhs_init_fftw(nfft, (*blocksize), V_fft, V_rfft, plan_f, plan_b, fftw_flag);
317 printf(
"Initialization finished successfully\n");
338 status = fftw_init_threads();
343 fftw_plan_with_nthreads(fftw_n_thread);
346 printf(
"Using multithreaded FFTW with %d threads\n", fftw_n_thread);
365 int rhs_init_fftw(
int *nfft,
int fft_size, fftw_complex **V_fft,
double **V_rfft, fftw_plan *plan_f, fftw_plan *plan_b,
int fftw_flag)
368 *V_fft = (fftw_complex*) fftw_malloc((*nfft)*(fft_size/2+1) *
sizeof(fftw_complex) );
369 *V_rfft = (
double*) fftw_malloc((*nfft)*fft_size *
sizeof(double) );
370 if (*V_fft==0 || *V_rfft==0)
373 *plan_f = fftw_plan_many_dft_r2c(1, &fft_size, (*nfft), *V_rfft, &fft_size, 1, fft_size, *V_fft, NULL, 1, fft_size/2+1, fftw_flag );
374 *plan_b = fftw_plan_many_dft_c2r(1, &fft_size, (*nfft), *V_fft, NULL, 1, fft_size/2+1, *V_rfft, &fft_size, 1, fft_size, fftw_flag );
396 int circ_fftw_flag = FFTW_ESTIMATE;
398 *T_fft = (fftw_complex*) fftw_malloc( (fft_size/2+1) *
sizeof(fftw_complex) );
401 double *T_circ = (
double*) (*T_fft);
405 plan_f_T = fftw_plan_dft_r2c_1d( fft_size, T_circ, *T_fft, circ_fftw_flag );
408 #pragma omp parallel for
409 for(i=0; i<fft_size+2;i++)
413 for(i=1;i<lambda;i++) {
415 T_circ[fft_size-i] = T[i]; }
417 fftw_execute(plan_f_T);
418 fftw_destroy_plan(plan_f_T);
435 int tpltz_cleanup(fftw_complex **T_fft, fftw_complex **V_fft,
double **V_rfft,fftw_plan *plan_f, fftw_plan *plan_b){
436 fftw_destroy_plan(*plan_f);
437 fftw_destroy_plan(*plan_b);
441 #ifdef fftw_MULTITHREADING
442 fftw_cleanup_threads();
459 int copy_block(
int ninrow,
int nincol,
double *Vin,
int noutrow,
int noutcol,
double *Vout,
int inrow,
int incol,
int nblockrow,
int nblockcol,
int outrow,
int outcol,
double norm,
int set_zero_flag)
461 int i, j, p, offsetIn, offsetOut;
464 if( (nblockcol > nincol) || (nblockrow > ninrow) || (nblockcol > noutcol) || (nblockrow > noutrow)) {
465 printf(
"Error in routine copy_block. Bad size setup.\n");
470 #pragma omp parallel for //private(i) num_threads(NB_OMPTHREADS_CPBLOCK)
471 for(i=0;i<noutrow*noutcol;i++)
475 offsetIn = ninrow*incol+inrow;
476 offsetOut = noutrow*outcol+outrow;
479 for(i=0;i<nblockcol*nblockrow;i++) {
482 Vout[offsetOut+j*noutrow+p] = Vin[offsetIn+j*ninrow+p]*norm;
509 int scmm_direct(
int fft_size,
int nfft, fftw_complex *C_fft,
int ncol,
double *V_rfft,
double **CV, fftw_complex *V_fft, fftw_plan plan_f_V, fftw_plan plan_b_CV)
512 int sizeT = fft_size/2+1;
516 fftw_execute(plan_f_V);
523 #pragma omp parallel for private(idx) //num_threads(nfft)
524 for(i=0;i<ncol*sizeT;i++) {
526 V_fft[i][0] = C_fft[idx][0]*V_fft[i][0]-C_fft[idx][1]*V_fft[i][1];
527 V_fft[i][1] = C_fft[idx][0]*V_fft[i][1]+C_fft[idx][1]*V_fft[i][0]; }
552 fftw_execute(plan_b_CV);
587 int scmm_basic(
double **V,
int blocksize,
int m, fftw_complex *C_fft,
double **CV, fftw_complex *V_fft,
double *V_rfft,
int nfft, fftw_plan plan_f_V, fftw_plan plan_b_CV)
591 int nloop = (int) ceil((1.0*m)/nfft);
594 int ncol = min(nfft, m);
598 #pragma omp parallel for //num_threads(NB_OMPTHREADS_BASIC)//schedule(dynamic,1)
599 for( i=0;i<blocksize*ncol;i++)
605 for(k=0;k<nloop;k++) {
607 ncol = m-(nloop-1)*nfft;
614 copy_block(blocksize, m, (*V), blocksize, ncol, V_rfft, 0, k*nfft, blocksize, ncol, 0, 0, 1.0, 0);
618 scmm_direct(blocksize, nfft, C_fft, ncol, V_rfft, CV, V_fft, plan_f_V, plan_b_CV);
622 copy_block(blocksize, ncol, V_rfft, blocksize, m, (*CV), 0, 0, blocksize, ncol, 0, k*nfft, 1.0/((
double) blocksize), 0);
659 int stmm_core(
double **V,
int n,
int m,
double *T, fftw_complex *T_fft,
int blocksize,
int lambda, fftw_complex *V_fft,
double *V_rfft,
int nfft, fftw_plan plan_f, fftw_plan plan_b,
int flag_offset,
int flag_nofft)
673 int distcorrmin= lambda-1;
675 int blocksize_eff = blocksize-2*distcorrmin;
679 nbloc = ceil((1.0*(n-2*distcorrmin))/blocksize_eff);
681 nbloc = ceil( (1.0*n)/blocksize_eff);
684 printf(
"nbloc=%d, n=%d, m=%d, blocksize=%d, blocksize_eff=%d\n", nbloc, n, m, blocksize, blocksize_eff);
686 double *V_bloc, *TV_bloc;
687 V_bloc = (
double *) calloc(blocksize*m,
sizeof(
double));
688 TV_bloc = (
double *) calloc(blocksize*m,
sizeof(
double));
689 if((V_bloc==0)||(TV_bloc==0))
702 currentsize = min( blocksize-distcorrmin+offset, n-iV);
705 copy_block( n, m, *V, blocksize, m, V_bloc, 0, 0, currentsize, m, distcorrmin-offset, 0, 1.0, 0);
711 status =
scmm_basic(&V_bloc, blocksize, m, T_fft, &TV_bloc, V_fft, V_rfft, nfft, plan_f, plan_b);
714 printf(
"Error in stmm_core.");
719 iV = blocksize_eff-distcorrmin+offset;
722 currentsize = min( blocksize, n-iV);
724 int flag_reset = (currentsize!=blocksize);
725 copy_block( n, m, *V, blocksize, m, V_bloc, iV, 0, currentsize, m, 0, 0, 1.0, flag_reset);
729 currentsize = min( blocksize_eff, n-iTV);
730 copy_block( blocksize, m, TV_bloc, n, m, *V, distcorrmin, 0, currentsize, m, iTV, 0, 1.0, 0);
733 iTV += blocksize_eff;
735 for(k=1;k<nbloc;k++) {
741 status =
scmm_basic(&V_bloc, blocksize, m, T_fft, &TV_bloc, V_fft, V_rfft, nfft, plan_f, plan_b);
743 if (status!=0)
break;
749 currentsize = min(blocksize, n-iV);
751 int flag_resetk = (currentsize!=blocksize);
752 copy_block( n, m, *V, blocksize, m, V_bloc, iV, 0, currentsize, m, 0, 0, 1.0, flag_resetk);
756 currentsize = min( blocksize_eff, n-iTV);
757 copy_block( blocksize, m, TV_bloc, n, m, *V, distcorrmin, 0, currentsize, m, iTV, 0, 1.0, 0);
758 iTV += blocksize_eff;
770 printf(
"time stmm_core=%f\n", t2-t1);
803 int stmm_main(
double **V,
int n,
int m,
int id0,
int l,
double *T, fftw_complex *T_fft,
int lambda, fftw_complex *V_fft,
double *V_rfft, fftw_plan plan_f, fftw_plan plan_b,
int blocksize,
int nfft,
Flag flag_stgy)
808 int distcorrmin= lambda-1;
809 int flag_prod_strategy_nofft=0;
810 int flag_shortcut_m_eff_eq_1=1;
811 int flag_shortcut_nbcol_eq_1=1;
812 int flag_nfullcol_in_middle=0;
813 int flag_optim_offset_for_nfft=0;
817 int m_eff = (id0+l-1)/n - id0/n + 1 ;
829 if (m_eff==1 && flag_shortcut_m_eff_eq_1==1 && nfft==1 || flag_no_rshp==1 && id0==0 && l==n*m) {
837 stmm_core(V, nr, m_eff, T, T_fft, blocksize, lambda, V_fft, V_rfft, nfft, plan_f, plan_b, flag_offset, flag_nofft);
848 nfullcol = max(0, (l-(n-id0%n)%n-(id0+l)%n)/n );
850 if (flag_nfullcol_in_middle==1)
851 nloop_middle = ceil(1.0*(nfullcol)/nfft);
853 nloop_middle = (nfullcol)/nfft;
855 if (flag_nfullcol_in_middle==1)
858 m_middle = nfft*nloop_middle;
861 int vmiddle_size = n*m_middle;
865 printf(
"nloop_middle=%d , m_middle=%d\n", nloop_middle, m_middle);
869 if (nloop_middle>0) {
871 int offset_middle = (n-id0%n)%n;
872 Vmiddle = (*V)+offset_middle;
875 stmm_core(&Vmiddle, n, m_middle, T, T_fft, blocksize, lambda, V_fft, V_rfft, nfft, plan_f, plan_b, flag_offset, flag_nofft);
881 int v1edge_size = min(l,(n-id0%n)%n);
882 int v2edge_size = max( l-(v1edge_size+vmiddle_size) , 0);
883 int vedge_size = v1edge_size+v2edge_size;
888 int m_v1edge, m_v2edge;
889 m_v1edge = (v1edge_size>0)*1;
890 m_v2edge = m-(m_v1edge+m_middle);
891 int nbcol = m_v1edge+m_v2edge;
893 nocol = (
int *) calloc(nbcol,
sizeof(
double));
898 for(i=(m_v1edge);i<nbcol;i++)
902 printf(
"nbcol=%d , m_v1edge=%d , m_v2edge=%d\n", nbcol, m_v1edge, m_v2edge);
905 if (nbcol==1 && nfft==1 && flag_shortcut_nbcol_eq_1==1) {
908 int offset_edge = n*nocol[0];
909 Vedge = (*V)+offset_edge;
911 stmm_core(&Vedge, vedge_size, nbcol, T, T_fft, blocksize, lambda, V_fft, V_rfft, nfft, plan_f, plan_b, flag_offset, flag_nofft);
920 int lconc = vedge_size;
921 int v1_size=lconc+(distcorrmin)*(nbcol-1);
922 int fft_size = ceil(1.0*v1_size/nfft)+2*distcorrmin;
924 int flag_format_rshp = (nfft>1)*2 + (nfft==1 && nbcol>1)*1 + (nfft==1 && nbcol==1)*0;
925 int nrshp, mrshp, lrshp;
927 define_rshp_size(flag_format_rshp, fft_size, nfft, v1_size, vedge_size, &nrshp, &mrshp, &lrshp);
931 Vrshp = (
double *) calloc(lrshp,
sizeof(
double));
936 fprintf(file,
"nrshp=%d , mrshp=%d , lrshp=%d\n", nrshp, mrshp, lrshp);
937 fprintf(file,
"flag_format_rshp=%d\n", flag_format_rshp);
940 build_reshape(Vin, nocol, nbcol, lconc, n, m, id0, l, lambda, nfft, Vrshp, nrshp, mrshp, lrshp, flag_format_rshp);
943 if (flag_format_rshp==2 && flag_optim_offset_for_nfft==1)
949 stmm_core(&Vrshp, nrshp, mrshp, T, T_fft, blocksize, lambda, V_fft, V_rfft, nfft, plan_f, plan_b, flag_offset, flag_nofft);
951 extract_result(Vout, nocol, nbcol, lconc, n, m, id0, l, lambda, nfft, Vrshp, nrshp, mrshp, lrshp, flag_format_rshp);
980 int mpi_stmm(
double **V,
int n,
int m,
int id0,
int l,
double *T,
int lambda,
Flag flag_stgy, MPI_Comm comm)
987 MPI_Comm_rank(comm, &rank);
988 MPI_Comm_size(comm, &size);
996 int clast_r = (idf-1)/n;
997 int m_eff = clast_r - cfirst + 1 ;
1002 int right = rank + 1;
1003 int left = rank - 1;
1004 int v1_size = l + 2*lambda;
1005 if (rank==0 || cfirst*n==id0){
1007 left = MPI_PROC_NULL;}
1008 if (rank==(size-1) || clast*n==idf){
1010 right = MPI_PROC_NULL;}
1013 Lambda=(
double *) malloc(2*lambda *
sizeof(
double));
1017 for(i=0;i<lambda;i++) {
1019 Lambda[i+lambda]=(*V)[i+l-lambda]; }
1022 printf(
"[rank %d] Left comm with %d | Right comm with %d\n", rank, left, right);
1025 MPI_Sendrecv_replace(Lambda, lambda, MPI_DOUBLE, left, MPI_USER_TAG, right, MPI_USER_TAG, comm, &status);
1026 MPI_Sendrecv_replace((Lambda+lambda), lambda, MPI_DOUBLE, right, MPI_USER_TAG, left, MPI_USER_TAG, comm, &status);
1033 if(left==MPI_PROC_NULL && right==MPI_PROC_NULL)
1035 else if(left==MPI_PROC_NULL) {
1036 *V = realloc(*V, v1_size *
sizeof(
double));
1041 V1 = (
double *) malloc(v1_size *
sizeof(
double));
1043 if (left!=MPI_PROC_NULL){
1044 for(i=0;i<lambda;i++)
1045 V1[i] = Lambda[i+lambda];
1047 if (right!=MPI_PROC_NULL){
1048 for(i=0;i<lambda;i++)
1049 V1[i+v1_size-lambda] = Lambda[i]; }
1053 if (left!=MPI_PROC_NULL){
1055 #pragma omp parallel for
1056 for(i=offset;i<l+offset;i++)
1057 V1[i] = (*V)[i-offset]; }
1059 fftw_complex *V_fft, *T_fft;
1061 fftw_plan plan_f, plan_b;
1065 int nfft, blocksize;
1067 tpltz_init(v1_size, lambda , &nfft, &blocksize, &T_fft, T, &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);
1070 printf(
"[rank %d] Before middle-level call : blocksize=%d, nfft=%d\n", rank, blocksize, nfft);
1072 stmm_main(&V1, n, m, id0, v1_size, T, T_fft, lambda, V_fft, V_rfft, plan_f, plan_b, blocksize, nfft, flag_stgy);
1079 if (left!=MPI_PROC_NULL)
1081 if(left==MPI_PROC_NULL && right==MPI_PROC_NULL)
1083 else if(left==MPI_PROC_NULL) {
1084 V1 = realloc(V1, l *
sizeof(
double));
1089 #pragma omp parallel for
1090 for(i=offset;i<l+offset;i++)
1091 (*V)[i-offset] = V1[i]; }
1093 if(left!=MPI_PROC_NULL)