/*
 * MIDAPACK - MIcrowave Data Analysis PACKage  1.1b
 * Parallel software tools for high performance CMB DA analysis
 *
 * toeplitz_block.c
 */

#include "toeplitz.h"

//r1.1 - Frederic Dauvergne (APC)
//These are the routines related to the block-diagonal Toeplitz matrix product.
//A sequential equivalent of this routine is provided in toeplitz_seq.c.

//todo:
//- remove the non-optimized communication
//=========================================================================
#ifdef W_MPI

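/** Multiplies a symmetric, Toeplitz block-diagonal matrix by an arbitrary
 *  matrix V distributed over the processes of an MPI communicator (brief
 *  summary reconstructed from the code below; the original documentation
 *  block is not preserved in this listing).
 *
 *  \param V               pointer to the local part of the data matrix (modified in place)
 *  \param nrow            number of rows of the global data matrix
 *  \param m               number of columns of the global data matrix
 *  \param m_rowwise       number of row-wise vectors processed simultaneously
 *  \param tpltzblocks     array of Block structures (T_block, n, lambda, idv)
 *  \param nb_blocks_local number of blocks described on this process
 *  \param nb_blocks_all   total number of blocks
 *  \param idp             global index of the first local element of V
 *  \param local_V_size    number of local elements of V
 *  \param flag_stgy       strategy flags for the Toeplitz kernels
 *  \param comm            MPI communicator (MPI version only)
 */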
int mpi_stbmm(double **V, int64_t nrow, int m, int m_rowwise, Block *tpltzblocks, int nb_blocks_local, int nb_blocks_all, int64_t idp, int local_V_size, Flag flag_stgy, MPI_Comm comm)
{
#else //for sequential use only
int mpi_stbmm(double **V, int64_t nrow, int m, int m_rowwise, Block *tpltzblocks, int nb_blocks_local, int nb_blocks_all, int64_t idp, int local_V_size, Flag flag_stgy)
{
#endif


  //MPI parameters
  int rank;  //process rank
  int size;  //number of processes

#ifdef W_MPI
  MPI_Status status;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

#else
  rank = 0;
  size = 1;
#endif

  PRINT_RANK = rank;

  FILE *file;
  file = stdout;

  int i, j, k;  //some indexes


  //identify the neighbouring MPI processes to communicate with when a block is shared
  int right = rank + 1;
  int left  = rank - 1;


  //Define the indices for each process
  int idv0, idvn;  //indices of the first and last blocks of V for each process

  int *nnew;
  nnew = (int *) calloc(nb_blocks_local, sizeof(int));
  int64_t idpnew;
  int local_V_size_new;
  int n_rowwise = local_V_size;

  int status_params = get_overlapping_blocks_params(nb_blocks_local, tpltzblocks, local_V_size, nrow, idp, &idpnew, &local_V_size_new, nnew, &idv0, &idvn);


  if (PRINT_RANK == 0 && VERBOSE > 2)
    printf("status_params=%d\n", status_params);

  if (status_params == 0) {
    free(nnew);
    return (0);  //no work to be done
  }

  if (tpltzblocks[idv0].lambda == 0 || tpltzblocks[idvn].lambda == 0) {
    free(nnew);  //avoid leaking the workspace on the error path
    return print_error_message(2, __FILE__, __LINE__);
  }


  if (PRINT_RANK == 0 && VERBOSE > 2) {  //print the new parameter definitions if VERBOSE
    fprintf(file, "new parameters characteristics:\n");
    fprintf(file, "[%d] idp=%ld ; idpnew=%ld\n", rank, idp, idpnew);
    fprintf(file, "[%d] local_V_size=%d ; local_V_size_new=%d\n", rank, local_V_size, local_V_size_new);
    for (i = 0; i < nb_blocks_local; i++)
      fprintf(file, "[%d] n[%d]=%d ; nnew[%d]=%d\n", rank, i, (tpltzblocks[i].n), i, nnew[i]);
    for (i = 0; i < nb_blocks_local; i++)
      fprintf(file, "[%d] tpltzblocks[%d].idv=%ld\n", rank, i, tpltzblocks[i].idv);
  }


  int vShft = idpnew - idp;  //new first element of relevance in V

  //Define the column indices:
  //index of the first and the last column of V for the current process
  int idvm0 = idpnew/nrow;
  int idvmn = (idpnew + local_V_size_new - 1)/nrow;
  //number of columns of V for the current process
  int ncol_rank = idvmn - idvm0 + 1;
  //number of blocks for the current process, with possible repetitions
  int nb_blocks_rank;

  if (ncol_rank == 1)  //only one column (empty processes are not allowed)
    nb_blocks_rank = idvn - idv0 + 1;
  else
    nb_blocks_rank = (ncol_rank - 2)*nb_blocks_local + (nb_blocks_local - idv0) + (idvn + 1);  //in this case nb_blocks_local = nb_blocks_all
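  //e.g. (hypothetical numbers): with nb_blocks_local=3, data spanning
  //ncol_rank=3 columns, starting inside block idv0=1 of the first column and
  //ending inside block idvn=0 of the last one, nb_blocks_rank =
  //(3-2)*3 + (3-1) + (0+1) = 6: two blocks of the first column, all three
  //blocks of the middle column, and one block of the last column.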

  if (PRINT_RANK == 0 && VERBOSE > 2)
    fprintf(file, "[%d] nb_blocks_rank=%d, nb_blocks_local=%d\n", rank, nb_blocks_rank, nb_blocks_local);

  //Define the indices of the first and last elements in each block
  int idvp0 = idpnew%nrow - tpltzblocks[idv0].idv;  //index of the first element of the process in the first block
  int idvpn;  //reverse index of the last element of the process in the last block:
  //the number of remaining elements needed to fully complete the last block
  idvpn = tpltzblocks[idvn].idv + nnew[idvn] - 1 - (idpnew + local_V_size_new - 1)%nrow;


  //Define the offsets of the first and last blocks of the process for V1
  int offset0, offsetn;
  int distcorrmin_idv0 = (tpltzblocks[idv0].lambda) - 1;
  int distcorrmin_idvn = (tpltzblocks[idvn].lambda) - 1;

  //if (idvp0 != 0)
  offset0 = min(idvp0, distcorrmin_idv0);
  //if (idvpn != 0)
  offsetn = min(idvpn, distcorrmin_idvn);


  int toSendLeft = 0;
  int toSendRight = 0;

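  //Boundary ("lambda") exchange: when a Toeplitz block is shared between two
  //processes, each process needs up to lambda-1 elements of V from its
  //neighbours to apply the band correctly. The first toSendLeft elements of V
  //are sent to the left neighbour and the last toSendRight elements to the
  //right neighbour; the symmetric contributions are received into LambdaIn.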
#ifdef W_MPI
  if (offset0 != 0) {
    toSendLeft = min(tpltzblocks[idv0].idv + nnew[idv0] - idpnew%nrow, distcorrmin_idv0);
  }
  if (offsetn != 0) {
    toSendRight = min((idpnew + local_V_size_new)%nrow - tpltzblocks[idvn].idv, distcorrmin_idvn);
  }

  int flag_optimlambda = 1;  //set to 1 to allocate only the memory actually needed

  int lambdaOut_offset;
  int lambdaIn_offset;
  double *LambdaOut;
  int lambdaOut_size, lambdaIn_size;

  if (flag_optimlambda == 1) {
    LambdaOut = (double *) calloc((toSendLeft + toSendRight)*m_rowwise, sizeof(double));
    lambdaOut_offset = toSendLeft*m_rowwise;
    lambdaIn_offset  = offset0*m_rowwise;
    lambdaOut_size   = (toSendLeft + toSendRight)*m_rowwise;
    lambdaIn_size    = (offset0 + offsetn)*m_rowwise;
  }
  else {
    LambdaOut = (double *) calloc((tpltzblocks[idv0].lambda + tpltzblocks[idvn].lambda)*m_rowwise, sizeof(double));
    lambdaOut_offset = tpltzblocks[idv0].lambda*m_rowwise;
    lambdaIn_offset  = tpltzblocks[idv0].lambda*m_rowwise;
    lambdaOut_size   = (tpltzblocks[idv0].lambda + tpltzblocks[idvn].lambda)*m_rowwise;
    lambdaIn_size    = (tpltzblocks[idv0].lambda + tpltzblocks[idvn].lambda)*m_rowwise;
  }


  if (offset0 != 0) {
    for (j = 0; j < m_rowwise; j++)
      for (i = 0; i < toSendLeft; i++)
        LambdaOut[i + j*toSendLeft] = (*V)[i + j*n_rowwise];  //valid because toSendLeft = 0
  }                                                           //if the data do not start inside the first block
  if (offsetn != 0) {
    for (j = 0; j < m_rowwise; j++)
      for (i = 0; i < toSendRight; i++)
        LambdaOut[i + j*toSendRight + lambdaOut_offset] = (*V)[i + j*n_rowwise + local_V_size - toSendRight];
  }  //valid too, by the same argument as for offset0 != 0
  //if local_V_size != local_V_size_new+vShft there are extra terms at the end, and so offsetn = 0:
  //idpnew+local_V_size_new = idp+local_V_size and vShft = idpnew-idp,
  //so local_V_size = vShft+local_V_size_new
  if (rank == 0 || offset0 == 0)
    left = MPI_PROC_NULL;
  if (rank == size-1 || offsetn == 0)
    right = MPI_PROC_NULL;

  double *LambdaIn = (double *) calloc(lambdaIn_size, sizeof(double));


  int flag_blockingcomm = 0;  //set to 1 to use blocking communication
  MPI_Request requestLeft_r, requestLeft_s;
  MPI_Request requestRight_r, requestRight_s;

  if (flag_blockingcomm == 1) {
    //send and receive the boundary data
    MPI_Sendrecv(LambdaOut, toSendLeft*m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, (LambdaIn + lambdaIn_offset), offsetn*m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, comm, &status);
    MPI_Sendrecv((LambdaOut + lambdaOut_offset), toSendRight*m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, LambdaIn, offset0*m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, comm, &status);

  }
  else {
    //to the left
    MPI_Irecv((LambdaIn + lambdaIn_offset), offsetn*m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, comm, &requestLeft_r);
    MPI_Isend(LambdaOut, toSendLeft*m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, comm, &requestLeft_s);

    //to the right
    MPI_Irecv(LambdaIn, offset0*m_rowwise, MPI_DOUBLE, left, MPI_USER_TAG, comm, &requestRight_r);
    MPI_Isend((LambdaOut + lambdaOut_offset), toSendRight*m_rowwise, MPI_DOUBLE, right, MPI_USER_TAG, comm, &requestRight_s);

  }

#endif


  //size of the first and the last blocks for the current process
  int v0rank_size, vnrank_size;
  if (nb_blocks_rank == 1) {  //only one block
    v0rank_size = ((idpnew + local_V_size_new - 1)%nrow + 1) - idpnew%nrow + offset0 + offsetn;
    vnrank_size = 0;  //just for convenience - not really needed
  }
  else {  //more than one block
    v0rank_size = tpltzblocks[idv0].idv + nnew[idv0] - idpnew%nrow + offset0;
    vnrank_size = ((idpnew + local_V_size_new - 1)%nrow + 1) - tpltzblocks[idvn].idv + offsetn;
  }


#ifdef W_MPI

  if (flag_blockingcomm != 1) {
    //wait for the lambda communications to complete
    MPI_Wait(&requestLeft_r, &status);
    MPI_Wait(&requestLeft_s, &status);
    MPI_Wait(&requestRight_r, &status);
    MPI_Wait(&requestRight_s, &status);

  }


  free(LambdaOut);

#endif


//---------------------------------------
//initialization for the loop on the blocks

  int idv1 = 0;  //old index of *V1
  int idv2 = 0;  //index


  int mid;  //local number of columns for the current block
  //index of the first element of the process inside the first block
  int offset_id0;
  offset_id0 = idvp0;

//fftw variables
  fftw_complex *V_fft, *T_fft;
  double *V_rfft;
  fftw_plan plan_f, plan_b;
//init the local block vector
  double *V1block;
//  int lambdaShft;


//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
//loop on the blocks inside the process
  int nfft, blocksize;
  int iblock;  //index for the loop on the blocks
//  int loopindex;
  int id;  //index of the current block

  int vblock_size;
  int id0block;

  int jj;

  for (iblock = idv0; iblock < idv0 + nb_blocks_rank; iblock++) {
    id = iblock%nb_blocks_local;  //index of the current block


    if (nnew[id] > 0) {  //the block is ok

#ifdef W_MPI
//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
//First case : first block of the process
      if (iblock == idv0) {
        if (PRINT_RANK == 0 && VERBOSE > 2)
          fprintf(file, "[%d] First block...\n", rank);

        vblock_size = v0rank_size;
        id0block = (offset_id0 - offset0);

        V1block = (double *) calloc(vblock_size*m_rowwise, sizeof(double));

        for (j = 0; j < m_rowwise; j++) {
#pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
          for (i = 0; i < offset0; i++)
            V1block[i + j*vblock_size] = LambdaIn[i + j*offset0];
        }
//note: check whether copyblock could be used instead.


//if (nb_blocks_rank == 1) currentsize_middlepart = vblock_size-offset0-offsetn = local_V_size_new
//else currentsize_middlepart = vblock_size-offset0
        int currentsize_middlepart = min(vblock_size - offset0, local_V_size_new);

        for (j = 0; j < m_rowwise; j++) {
#pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
          for (i = 0; i < currentsize_middlepart; i++)
            V1block[offset0 + i + j*vblock_size] = (*V)[i + vShft + j*n_rowwise];
        }

        if (nb_blocks_rank == 1) {
          for (j = 0; j < m_rowwise; j++) {
#pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
            for (i = 0; i < offsetn; i++) {
              V1block[vblock_size - offsetn + i + j*vblock_size] = LambdaIn[i + lambdaIn_offset + j*offsetn];
            }
          }
        }


        //init the Toeplitz arrays
        tpltz_init(vblock_size, tpltzblocks[id].lambda, &nfft, &blocksize, &T_fft, (tpltzblocks[id].T_block), &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);

        //Toeplitz computation
        if (PRINT_RANK == 0 && VERBOSE > 2)
          fprintf(file, "[%d] Before stmm_main call : nfft = %d, blocksize = %d\n", rank, nfft, blocksize);
        stmm_main(&V1block, vblock_size, m_rowwise, 0, m_rowwise*vblock_size, (tpltzblocks[id].T_block), T_fft, tpltzblocks[id].lambda, V_fft, V_rfft, plan_f, plan_b, blocksize, nfft, flag_stgy);

        tpltz_cleanup(&T_fft, &V_fft, &V_rfft, &plan_f, &plan_b);


        int currentsize = min(vblock_size - offset0, local_V_size_new);
        for (j = 0; j < m_rowwise; j++) {
#pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
          for (i = 0; i < currentsize; i++)
            (*V)[vShft + i + j*n_rowwise] = V1block[offset0 + i + j*vblock_size];
        }

        free(V1block);

      }  //end (First case)


//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
//Generic case : generic block of the process
      else if (iblock != idv0 && iblock != idv0 + nb_blocks_rank - 1) {
#endif

        if (PRINT_RANK == 0 && VERBOSE > 2)
          fprintf(file, "[%d] generic block...\n", rank);

        vblock_size = nnew[id];
        id0block = 0;

        V1block = (double *) calloc(vblock_size*m_rowwise, sizeof(double));

        idv1 = (tpltzblocks[id].idv) - idp%nrow - vShft + offset0 + nrow*((iblock/nb_blocks_local));  //not needed
//        idv2 = idv[id] - idp%nrow + nrow*( (iblock/nb_blocks_local) );
        idv2 = (tpltzblocks[id].idv) - (idpnew)%nrow + vShft + nrow*((iblock/nb_blocks_local));

        for (j = 0; j < m_rowwise; j++) {
#pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
          for (i = 0; i < vblock_size; i++)
            V1block[i + j*vblock_size] = (*V)[i + idv2 + j*n_rowwise];
//            V1block[i] = (*V)[i + idv1 - offset0 + vShft];
        }

        //init the Toeplitz arrays
        tpltz_init(nnew[id], tpltzblocks[id].lambda, &nfft, &blocksize, &T_fft, (tpltzblocks[id].T_block), &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);

        //Toeplitz computation
        if (PRINT_RANK == 0 && VERBOSE > 2)
          fprintf(file, "[%d] Before stmm_main call : nfft = %d, blocksize = %d\n", rank, nfft, blocksize);
        stmm_main(&V1block, vblock_size, m_rowwise, 0, m_rowwise*vblock_size, (tpltzblocks[id].T_block), T_fft, tpltzblocks[id].lambda, V_fft, V_rfft, plan_f, plan_b, blocksize, nfft, flag_stgy);


        tpltz_cleanup(&T_fft, &V_fft, &V_rfft, &plan_f, &plan_b);


        for (j = 0; j < m_rowwise; j++) {
#pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
          for (i = 0; i < vblock_size; i++) {
            (*V)[i + idv2 + j*n_rowwise] = V1block[i + j*vblock_size];
          }
        }


        free(V1block);

#ifdef W_MPI
      }  //end (Generic case)

//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
//Last case : last block of the process
      else if (iblock == idv0 + nb_blocks_rank - 1 && iblock != idv0) {
        if (PRINT_RANK == 0 && VERBOSE > 2)
          fprintf(file, "[%d] last block...\n", rank);

        vblock_size = vnrank_size;
        id0block = 0;

        V1block = (double *) calloc(vblock_size*m_rowwise, sizeof(double));

        idv1 = (tpltzblocks[id].idv) - idp%nrow - vShft + offset0 + nrow*((iblock/nb_blocks_local));
        idv2 = (tpltzblocks[id].idv) - (idpnew)%nrow + vShft + nrow*((iblock/nb_blocks_local));


        for (j = 0; j < m_rowwise; j++) {
#pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
          for (i = 0; i < vblock_size - offsetn; i++)
            V1block[i + j*vblock_size] = (*V)[i + idv2 + j*n_rowwise];
//            V1block[i] = (*V)[i + idv1 - offset0 + vShft];
        }

        for (j = 0; j < m_rowwise; j++) {
#pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
          for (i = 0; i < offsetn; i++)
            V1block[vblock_size - offsetn + i + j*vblock_size] = LambdaIn[i + lambdaIn_offset + j*offsetn];
        }


        //init the Toeplitz arrays
        tpltz_init(vblock_size, tpltzblocks[id].lambda, &nfft, &blocksize, &T_fft, (tpltzblocks[id].T_block), &V_fft, &V_rfft, &plan_f, &plan_b, flag_stgy);

        //Toeplitz computation
        if (PRINT_RANK == 0 && VERBOSE > 2)
          fprintf(file, "[%d] Before stmm_main call : nfft = %d, blocksize = %d\n", rank, nfft, blocksize);

        stmm_main(&V1block, vblock_size, m_rowwise, 0, vblock_size*m_rowwise, (tpltzblocks[id].T_block), T_fft, tpltzblocks[id].lambda, V_fft, V_rfft, plan_f, plan_b, blocksize, nfft, flag_stgy);

        tpltz_cleanup(&T_fft, &V_fft, &V_rfft, &plan_f, &plan_b);

        for (j = 0; j < m_rowwise; j++) {
#pragma omp parallel for //num_threads(NB_OMPTHREADS_STBMM)
          for (i = 0; i < vnrank_size - offsetn; i++) {
            (*V)[idv2 + i + j*n_rowwise] = V1block[i + j*vblock_size];
          }
        }


        free(V1block);

      }  //end of the last block
      else { break; }  //error  //the generic case could be put here instead of between the first and last cases
#endif
//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
    }  //end of if (nnew[id] > 0)
  }  //end of the loop over the blocks


  free(LambdaIn);
  free(nnew);  //release the block-size workspace


  return 0;
}
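
/* Example usage - a minimal sketch, not part of the library. The data layout
   (one MPI process, a single Toeplitz block of size n=100 with lambda=3
   stored coefficients, at the top of a data column of length nrow=100) is
   hypothetical, and flag_stgy_init_auto is assumed to fill in default
   strategy flags.

   #include <stdlib.h>
   #include <mpi.h>
   #include "toeplitz.h"

   int main(int argc, char *argv[]) {
     MPI_Init(&argc, &argv);

     int64_t nrow = 100;          //number of rows of the global data matrix
     int m = 1, m_rowwise = 1;    //one data column, one row-wise vector

     double T[3] = {2.0, 0.5, 0.1};  //first row of the symmetric Toeplitz block
     Block block = {.T_block = T, .lambda = 3, .n = 100, .idv = 0};

     double *V = calloc(nrow, sizeof(double));  //local part of the data
     for (int i = 0; i < nrow; i++) V[i] = 1.0;

     Flag flag_stgy;
     flag_stgy_init_auto(&flag_stgy);  //default strategy parameters

     //apply the block-diagonal Toeplitz matrix to V, in place
     mpi_stbmm(&V, nrow, m, m_rowwise, &block, 1, 1, 0, (int) nrow,
               flag_stgy, MPI_COMM_WORLD);

     free(V);
     MPI_Finalize();
     return 0;
   }
*/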

//#endif

//====================================================================

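/** Determines which Toeplitz blocks overlap the local data V and updates the
 *  process parameters accordingly (brief summary reconstructed from the code
 *  below; the original documentation block is not preserved in this listing).
 *
 *  On output, nnew[ib] is the number of rows of block ib actually used for
 *  the product, idpnew and local_V_size_new are the first relevant element
 *  and the new size of the local data, and ifirstBlock/ilastBlock index the
 *  first and last overlapping blocks. Returns 0 when no block overlaps the
 *  local data, 1 otherwise.
 */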
int get_overlapping_blocks_params(int nbloc, Block *tpltzblocks, int local_V_size, int64_t nrow, int64_t idp, int64_t *idpnew, int *local_V_size_new, int *nnew, int *ifirstBlock, int *ilastBlock)
{

  int ib, nblockOK = 0, nfullcol_data;
  int64_t firstrow, lastrow;
  int64_t idptmp;


//check how many full columns the input data contain
  nfullcol_data = max(0, (local_V_size - (nrow - idp%nrow)%nrow - (idp + local_V_size)%nrow)/nrow);
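  //e.g. (hypothetical numbers): nrow=10, idp=3, local_V_size=22 gives
  //nfullcol_data = (22 - (10-3)%10 - (3+22)%10)/10 = (22-7-5)/10 = 1:
  //a partial first column (rows 3..9), one full column, and a partial last
  //column (rows 0..4).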


  if (nfullcol_data > 0) {

    for (ib = 0; ib < nbloc; ib++) {
      if (tpltzblocks[ib].idv < nrow) {
        nnew[ib] = min(tpltzblocks[ib].n, nrow - tpltzblocks[ib].idv);  //block used for the product
        nblockOK++;
      }
    }

  }
  else {  //no full column observed

    firstrow = idp%nrow;
    lastrow = (idp + local_V_size - 1)%nrow;

    if (firstrow < lastrow) {  //just one column partially observed

      for (ib = 0; ib < nbloc; ib++) {
        if ((tpltzblocks[ib].idv + tpltzblocks[ib].n > firstrow) && (tpltzblocks[ib].idv < lastrow + 1)) {
          nnew[ib] = min(tpltzblocks[ib].n, nrow - tpltzblocks[ib].idv);  //block used for the product
          nblockOK++;
        }
      }

    }
    else {  //two columns partially observed

      for (ib = 0; ib < nbloc; ib++) {
        if ((tpltzblocks[ib].idv + tpltzblocks[ib].n > firstrow) && (tpltzblocks[ib].idv < nrow)) {  //intersects the first partial column
          nnew[ib] = min(tpltzblocks[ib].n, nrow - tpltzblocks[ib].idv);  //block used for the product
          nblockOK++;
        }

        if ((tpltzblocks[ib].idv < lastrow + 1) && (tpltzblocks[ib].idv + tpltzblocks[ib].n > 0)) {  //intersects the second partial column
          nnew[ib] = min(tpltzblocks[ib].n, nrow - tpltzblocks[ib].idv);  //block used for the product
          nblockOK++;  //may overcount, but we do not care
        }  //could use else instead!
      }
    }
  }

  if (PRINT_RANK == 0 && VERBOSE > 2)
    printf("nblockOK=%d\n", nblockOK);


  if (nblockOK == 0) return (0);  //no blocks overlapping with the data

  //find the first and last relevant blocks for the beginning and end of the local data V

  //first block
  idptmp = idp;

  for (*ifirstBlock = -1; *ifirstBlock == -1;) {
    for (ib = 0; ib < nbloc; ib++) {
      if (nnew[ib] != 0 && idptmp%nrow < tpltzblocks[ib].idv + nnew[ib]) break;
    }

    if (ib < nbloc && tpltzblocks[ib].idv <= idptmp%nrow) {
      *ifirstBlock = ib;
      *idpnew = idptmp;
    }
    else if (ib < nbloc && tpltzblocks[ib].idv > idptmp%nrow) {
      *ifirstBlock = ib;
//      int64_t extrabegining = tpltzblocks[ib].idv - idp%nrow;  //never used
//      *idpnew = idp + extrabegining;  //tpltzblocks[ib].idv;
      int idvfirstcolumn = idptmp/nrow;
      *idpnew = tpltzblocks[ib].idv + idvfirstcolumn*nrow;
    }
    else {  //ib == nbloc: no relevant block in this column;
      idptmp += nrow - idptmp%nrow;  //go to the first element of the next column
    }
  }

  //last block
  idptmp = idp + local_V_size - 1;

  for (*ilastBlock = -1; *ilastBlock == -1;) {
    for (ib = nbloc - 1; ib >= 0; ib--) {
      if (nnew[ib] != 0 && tpltzblocks[ib].idv <= idptmp%nrow) break;
    }

    if (ib >= 0 && idptmp%nrow < tpltzblocks[ib].idv + nnew[ib]) {
      *ilastBlock = ib;
      *local_V_size_new = local_V_size - (*idpnew) + idp;
    }
    else if (ib >= 0 && tpltzblocks[ib].idv + nnew[ib] <= idptmp%nrow) {
      *ilastBlock = ib;
//      int64_t extraend = (local_V_size - 1 + idp)%nrow + 1 - (tpltzblocks[ib].idv + nnew[ib]);
//      *local_V_size_new = local_V_size - (*idpnew) + idp - extraend;  //computed twice - remove this one?
      int idvlastcolumn = idptmp/nrow;
      *local_V_size_new = tpltzblocks[ib].idv + nnew[ib] + idvlastcolumn*nrow - (*idpnew);
    }
    else {  //no relevant block in this column;
      idptmp = idptmp - (idptmp%nrow) - 1;  //go to the last element of the previous column
    }
  }


  return (1);
}
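
/* A worked example (hypothetical numbers, for illustration): with nrow=10,
   a single block at idv=2 with n=5 (rows 2..6) and a process owning one full
   column (idp=0, local_V_size=10), the routine finds nfullcol_data=1, sets
   nnew[0]=5, and returns ifirstBlock=ilastBlock=0 with idpnew=2 and
   local_V_size_new=5: only the five elements of V actually touched by the
   block are kept for the product. */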