88 int mpi_gstbmm(
double **V,
int nrow,
int m,
int m_rowwise,
Block *tpltzblocks,
int nb_blocks_local,
int nb_blocks_all,
int id0p,
int local_V_size, int64_t *id0gap,
int *lgap,
int ngap,
Flag flag_stgy, MPI_Comm comm)
96 MPI_Comm_rank(comm, &rank);
97 MPI_Comm_size(comm, &size);
108 reset_gaps( V, id0p, local_V_size, m, nrow, m_rowwise, id0gap, lgap, ngap);
114 int nb_blockgappy_max;
117 Block *tpltzblocks_gappy;
124 fprintf(file,
"[%d] flag_skip_build_gappy_blocks=%d\n", rank, flag_skip_build_gappy_blocks);
126 if (flag_skip_build_gappy_blocks==1) {
129 mpi_stbmm(V, nrow, m, m_rowwise, tpltzblocks, nb_blocks_local, nb_blocks_all, id0p, local_V_size, flag_stgy, MPI_COMM_WORLD);
134 for(Tsize=i=0;i<nb_blocks_local;i++)
135 Tsize += tpltzblocks[i].lambda;
137 for(i=0;i<nb_blocks_local;i++) {
138 if (tpltzblocks[i].lambda>lambdamax)
139 lambdamax = tpltzblocks[i].
lambda;
143 nb_blockgappy_max = nb_blocks_local+ngap;
144 Tgappysize_max = Tsize + lambdamax*ngap;
147 tpltzblocks_gappy = (
Block *) calloc(nb_blockgappy_max,
sizeof(
Block));
156 build_gappy_blocks(nrow, m, tpltzblocks, nb_blocks_local, nb_blocks_all, id0gap, lgap, ngap, tpltzblocks_gappy, &nb_blocks_gappy, flag_param_distmin_fixed);
160 fprintf(file,
"[%d] nb_blocks_gappy=%d\n", rank, nb_blocks_gappy);
161 for(i=0;i<nb_blocks_gappy;i++)
162 fprintf(file,
"[%d] idvgappy[%d]=%d ; ngappy[%d]=%d\n", rank, i, tpltzblocks_gappy[i].idv, i, tpltzblocks_gappy[i].n );
167 mpi_stbmm(V, nrow, m, m_rowwise, tpltzblocks_gappy, nb_blocks_local, nb_blocks_all, id0p, local_V_size, flag_stgy, MPI_COMM_WORLD);
174 reset_gaps( V, id0p, local_V_size, m, nrow, m_rowwise, id0gap, lgap, ngap);
189 int reset_gaps(
double **V,
int id0,
int local_V_size,
int m,
int nrow,
int m_rowwise, int64_t *id0gap,
int *lgap,
int ngap)
193 for (j=0 ; j<m; j++) {
195 #pragma omp parallel for private(i) schedule(dynamic,1)
196 for (k=0 ; k<ngap; k++)
197 for (i=0 ; i<lgap[k]; i++)
198 if (id0gap[k]+i+j*nrow>=id0 && id0gap[k]+i+j*nrow <id0+local_V_size) {
199 for (l=0 ; l<m_rowwise; l++)
200 (*V)[id0gap[k]+i+j*nrow-id0+l*local_V_size] = 0.;
231 int build_gappy_blocks(
int nrow,
int m,
Block *tpltzblocks,
int nb_blocks_local,
int nb_blocks_all, int64_t *id0gap,
int *lgap,
int ngap,
Block *tpltzblocks_gappy,
int *nb_blocks_gappy_final,
int flag_param_distmin_fixed)
237 int igapfirstblock, igaplastblock;
240 if (flag_param_distmin_fixed!=0)
241 param_distmin = flag_param_distmin_fixed;
245 int igaplastblock_prev=-1;
246 int lambdaShftgappy=0;
250 int flag_igapfirstinside, flag_igaplastinside;
251 int nbloc = nb_blocks_local;
254 int idvtmp_firstblock;
257 int nb_blockgappy_max = nb_blocks_local+ngap;
270 for (k=0;k<ngap;k++) {
273 for( igapfirstblock = -1; igapfirstblock == -1; ) {
276 for(ib=0;ib<nbloc;ib++) {
277 if(tpltzblocks[ib].n != 0 && idtmp%nrow < tpltzblocks[ib].idv+tpltzblocks[ib].n)
break; }
279 if (ib<nbloc && tpltzblocks[ib].idv <= idtmp) {
281 flag_igapfirstinside = 1;
283 else if (ib<nbloc && tpltzblocks[ib].idv > idtmp) {
285 flag_igapfirstinside = 0;
289 flag_igapfirstinside = 0;
293 for( igaplastblock = -1; igaplastblock == -1; ) {
294 idtmp = id0gap[k]+lgap[k]-1;
296 for(ib=nbloc-1;ib>=0;ib--) {
297 if(tpltzblocks[ib].n != 0 && tpltzblocks[ib].idv <= idtmp)
break; }
299 if (ib>=0 && idtmp < tpltzblocks[ib].idv+tpltzblocks[ib].n) {
301 flag_igaplastinside = 1;
303 else if (ib>=0 && tpltzblocks[ib].idv+tpltzblocks[ib].n <= idtmp) {
305 flag_igaplastinside = 0;
309 flag_igaplastinside = 0;
313 if (igapfirstblock==igaplastblock)
314 distcorr_min = tpltzblocks[igapfirstblock].
lambda-1;
320 if (lgap[k]> max(distcorr_min, param_distmin) && igapfirstblock != -2 && igaplastblock != -2) {
322 idvtmp_firstblock = max( tpltzblocks[igapfirstblock].idv, id0gap[k_prev]+lgap[k_prev]);
325 if (igapfirstblock!=igaplastblock) {
329 else if (id0gap[k]-idvtmp_firstblock>=tpltzblocks[igapfirstblock].lambda && tpltzblocks[igaplastblock].idv + tpltzblocks[igaplastblock].n - (id0gap[k]+lgap[k])>=tpltzblocks[igaplastblock].lambda) {
333 else if (igapfirstblock==igaplastblock){
335 int ngappyleft_tmp = id0gap[k]-idvtmp_firstblock;
336 int leftadd = max(0, tpltzblocks[igapfirstblock].lambda - ngappyleft_tmp);
337 int ngappyright_tmp = tpltzblocks[igaplastblock].
idv + tpltzblocks[igaplastblock].
n - (id0gap[k]+lgap[k]);
338 int rightadd = max(0,tpltzblocks[igapfirstblock].lambda - ngappyright_tmp);
339 int restgap = lgap[k] - (leftadd+rightadd);
342 flag_gapok = (restgap >= max(0, param_distmin));
354 for (
id=igaplastblock_prev+1;
id<igapfirstblock;
id++) {
356 tpltzblocks_gappy[nblocks_gappy].
T_block = tpltzblocks[id].
T_block;
357 tpltzblocks_gappy[nblocks_gappy].
lambda = tpltzblocks[id].
lambda;
358 tpltzblocks_gappy[nblocks_gappy].
n = tpltzblocks[id].
n;
359 tpltzblocks_gappy[nblocks_gappy].
idv = tpltzblocks[id].
idv;
361 nblocks_gappy = nblocks_gappy + 1;
366 if (igaplastblock_prev==igapfirstblock && k!= 0) {
367 nblocks_gappy = nblocks_gappy - 1;
373 if (flag_igapfirstinside==1 && (id0gap[k]-idvtmp_firstblock)>0) {
375 tpltzblocks_gappy[nblocks_gappy].
T_block = tpltzblocks[igapfirstblock].
T_block;
376 tpltzblocks_gappy[nblocks_gappy].
lambda = tpltzblocks[id].
lambda;
377 tpltzblocks_gappy[nblocks_gappy].
n = id0gap[k]-idvtmp_firstblock;
378 tpltzblocks_gappy[nblocks_gappy].
n = max( tpltzblocks_gappy[nblocks_gappy].n, tpltzblocks[igapfirstblock].lambda);
380 tpltzblocks_gappy[nblocks_gappy].
idv = idvtmp_firstblock;
381 nblocks_gappy = nblocks_gappy + 1;
386 if (flag_igaplastinside==1 && (tpltzblocks[igaplastblock].idv+tpltzblocks[igaplastblock].n -(id0gap[k]+lgap[k]))>0 ) {
388 tpltzblocks_gappy[nblocks_gappy].
T_block = tpltzblocks[igaplastblock].
T_block;
389 tpltzblocks_gappy[nblocks_gappy].
lambda = tpltzblocks[id].
lambda;
390 tpltzblocks_gappy[nblocks_gappy].
n = tpltzblocks[igaplastblock].
idv+tpltzblocks[igaplastblock].
n-(id0gap[k]+lgap[k]);
391 int rightadd0 = max(0, tpltzblocks[igapfirstblock].lambda - tpltzblocks_gappy[nblocks_gappy].n);
393 tpltzblocks_gappy[nblocks_gappy].
n = max( tpltzblocks_gappy[nblocks_gappy].n , tpltzblocks[igaplastblock].lambda);
395 tpltzblocks_gappy[nblocks_gappy].
idv = id0gap[k]+lgap[k]-rightadd0;
397 nblocks_gappy = nblocks_gappy + 1;
398 lambdaShftgappy = lambdaShftgappy + tpltzblocks[igaplastblock].
lambda;
402 igaplastblock_prev = igaplastblock;
411 for (
id=igaplastblock_prev+1;
id<nb_blocks_local;
id++) {
413 tpltzblocks_gappy[nblocks_gappy].
T_block = tpltzblocks[id].
T_block;
414 tpltzblocks_gappy[nblocks_gappy].
lambda = tpltzblocks[id].
lambda;
415 tpltzblocks_gappy[nblocks_gappy].
n = tpltzblocks[id].
n;
416 tpltzblocks_gappy[nblocks_gappy].
idv = tpltzblocks[id].
idv;
417 nblocks_gappy = nblocks_gappy + 1;
418 lambdaShftgappy = lambdaShftgappy + tpltzblocks[id].
lambda;
422 *nb_blocks_gappy_final = nblocks_gappy;