KMR — kmratoa.c (source listing)
1 /* kmratoa.c (2017-05-18) */
2 /* Copyright (C) 2012-2018 RIKEN R-CCS */
3 
4 /** \file kmratoa.c Communication Routines. KMR makes almost all data
5  exchanges through this. Some exceptions are "kmrmapms.c" and
6  "kmrfiles.c". It provides operations with size_t data length. */
7 
8 /* Used MPI routines: Alltoall, Alltoallv, Allgather, Allgatherv,
9  Allreduce, Gatherv, Exscan. Irecv, Isend, Irsend, Sendrecv,
10  Waitall. */
11 
12 #include <mpi.h>
13 #include <stdlib.h>
14 #include <limits.h>
15 #include <errno.h>
16 #include <assert.h>
17 #include "kmr.h"
18 #include "kmrimpl.h"
19 
20 #define MAX(a,b) (((a)>(b))?(a):(b))
21 #define MIN(a,b) (((a)<(b))?(a):(b))
22 
23 static int kmr_alltoallv_mpi(KMR *mr, void *sbuf, long *scnts, long *sdsps,
24  void *rbuf, long *rcnts, long *rdsps);
25 static int kmr_alltoallv_naive(KMR *mr, void *sbuf, long *scnts, long *sdsps,
26  void *rbuf, long *rcnts, long *rdsps);
27 static int kmr_alltoallv_bruck(KMR *mr, long maxcnt,
28  void *sbuf, long *scnts, long *sdsps,
29  void *rbuf, long *rcnts, long *rdsps);
30 static int kmr_alltoall_bruck(KMR *mr, void *sbuf, void *rbuf, int cnt);
31 static void kmr_atoa_dump_(KMR *mr, void *sbuf, int sz, char *title, int step);
32 
/* Checks if X is a power of two: positive with exactly one bit set
   (clearing the lowest set bit of such a value yields zero).  See
   also kmr_powerof4_p, which builds on this test. */

static inline _Bool
kmr_powerof2_p(int x)
{
    if (x <= 0) {
        return 0;
    }
    return ((x & (x - 1)) == 0);
}
40 
/* Checks if X is a power of four: a positive power of two whose
   single set bit is at an even position (the mask 0x2aaaaaaa covers
   the odd bit positions of a positive int). */

static inline _Bool
kmr_powerof4_p(int x)
{
    _Bool poweroftwo = ((x > 0) && ((x & (x - 1)) == 0));
    return (poweroftwo && ((x & 0x2aaaaaaa) == 0));
}
46 
47 /** Calls all-to-all to exchange one long-integer. */
48 
49 int
50 kmr_exchange_sizes(KMR *mr, long *sbuf, long *rbuf)
51 {
52  MPI_Comm comm = mr->comm;
53  int cc;
54  cc = MPI_Alltoall(sbuf, 1, MPI_LONG, rbuf, 1, MPI_LONG, comm);
55  assert(cc == MPI_SUCCESS);
56  return MPI_SUCCESS;
57 }
58 
59 /** Calls all-gather for collecting one long-integer. */
60 
61 int
62 kmr_gather_sizes(KMR *mr, long siz, long *rbuf)
63 {
64  MPI_Comm comm = mr->comm;
65  int cc;
66  cc = MPI_Allgather(&siz, 1, MPI_LONG, rbuf, 1, MPI_LONG, comm);
67  assert(cc == MPI_SUCCESS);
68  return MPI_SUCCESS;
69 }
70 
71 /** All-gathers data, or gathers data when RANKZEROONLY. */
72 
73 int
74 kmr_allgatherv(KMR *mr, _Bool rankzeroonly, void *sbuf, long scnt,
75  void *rbuf, long *rcnts, long *rdsps)
76 {
77  MPI_Comm comm = mr->comm;
78  int nprocs = mr->nprocs;
79  int self = mr->rank;
80  int *rsz;
81  int *rdp;
82  if (!rankzeroonly || self == 0) {
83  rsz = kmr_malloc(sizeof(int) * (size_t)nprocs);
84  rdp = kmr_malloc(sizeof(int) * (size_t)nprocs);
85  for (int r = 0; r < nprocs; r++) {
86  assert(INT_MIN <= rcnts[r] && rcnts[r] <= INT_MAX);
87  assert(INT_MIN <= rdsps[r] && rdsps[r] <= INT_MAX);
88  rsz[r] = (int)rcnts[r];
89  rdp[r] = (int)rdsps[r];
90  }
91  } else {
92  rsz = 0;
93  rdp = 0;
94  }
95  int cc;
96  if (rankzeroonly) {
97  cc = MPI_Gatherv(sbuf, (int)scnt, MPI_BYTE,
98  rbuf, rsz, rdp, MPI_BYTE, 0, comm);
99  assert(cc == MPI_SUCCESS);
100  } else {
101  cc = MPI_Allgatherv(sbuf, (int)scnt, MPI_BYTE,
102  rbuf, rsz, rdp, MPI_BYTE, comm);
103  assert(cc == MPI_SUCCESS);
104  }
105  if (rsz != 0) {
106  kmr_free(rsz, (sizeof(int) * (size_t)nprocs));
107  }
108  if (rdp != 0) {
109  kmr_free(rdp, (sizeof(int) * (size_t)nprocs));
110  }
111  return MPI_SUCCESS;
112 }
113 
114 /* ================================================================ */
115 
/** Does all-to-all-v, but it takes the arguments by long-integers.
    It switches the methods with regard to the size of the largest
    message among the ranks.  Setting ATOA_THRESHOLD=0 forces to use
    MPI all-to-all-v.  It switches to a naive implementation of
    all-to-all-v when the sizes of messages are very large (larger
    than 16GB).  Counts and displacements are in bytes; the return
    value is always MPI_SUCCESS (errors are trapped by assertions). */

int
kmr_alltoallv(KMR *mr, void *sbuf, long *scnts, long *sdsps,
              void *rbuf, long *rcnts, long *rdsps)
{
    MPI_Comm comm = mr->comm;
    int nprocs = mr->nprocs;

    /* LIMIT is 16GB: INT_MAX elements of 8-byte MPI_LONG, the most
       kmr_alltoallv_mpi() can express with MPI's int counts. */
    long LIMIT = ((long)INT_MAX * 8L);
    long cap = ((mr->atoa_size_limit == 0) ? LIMIT : mr->atoa_size_limit);
    assert(((long)INT_MIN * 8L) <= -cap && cap <= ((long)INT_MAX * 8L));

    int cc;

    /* Take the size of the largest message.  It sets MAXCNT to
       (LIMIT+1), when some messages exceed the 16GB limit. */

    long maxcnt = 0;
    for (int r = 0; r < nprocs; r++) {
        if ((scnts[r] <= cap) && (rcnts[r] <= cap)
            && (sdsps[r] <= cap) && (rdsps[r] <= cap)) {
            maxcnt = MAX(maxcnt, scnts[r]);
        } else {
            /* Sentinel: some count or displacement exceeds the cap. */
            maxcnt = (LIMIT + 1);
            break;
        }
    }
    /* Agree on the global maximum (or the sentinel) across all ranks. */
    cc = MPI_Allreduce(MPI_IN_PLACE, &maxcnt, 1, MPI_LONG, MPI_MAX, comm);
    assert(cc == MPI_SUCCESS);

    /* Switch the methods. */

    if (maxcnt == (LIMIT + 1)) {
        /* Over-large messages somewhere: chunked isend/irecv. */
        cc = kmr_alltoallv_naive(mr, sbuf, scnts, sdsps, rbuf, rcnts, rdsps);
        assert(cc == MPI_SUCCESS);
    } else if (kmr_powerof4_p(nprocs) && nprocs != 1
               && mr->atoa_threshold != 0
               && maxcnt < mr->atoa_threshold) {
        /* Small messages and a power-of-four rank count: Bruck-like
           exchange (kmr_alltoall_bruck asserts these conditions). */
        cc = kmr_alltoallv_bruck(mr, maxcnt,
                                 sbuf, scnts, sdsps, rbuf, rcnts, rdsps);
        assert(cc == MPI_SUCCESS);
    } else {
        /* Default: plain MPI_Alltoallv with int-ranged arguments. */
        assert(maxcnt <= cap);
        cc = kmr_alltoallv_mpi(mr, sbuf, scnts, sdsps, rbuf, rcnts, rdsps);
        assert(cc == MPI_SUCCESS);
    }
    return MPI_SUCCESS;
}
170 
171 /* Does all-to-all-v using MPI_Alltoallv. It takes the sizes and the
172  offsets upto 16 GB (for the restriction using integers of MPI). It
173  assumes data is 8-byte aligned. */
174 
175 static int
176 kmr_alltoallv_mpi(KMR *mr,
177  void *sbuf, long *scnts, long *sdsps,
178  void *rbuf, long *rcnts, long *rdsps)
179 {
180  MPI_Comm comm = mr->comm;
181  int nprocs = mr->nprocs;
182  int *ssz = kmr_malloc(sizeof(int) * (size_t)nprocs);
183  int *sdp = kmr_malloc(sizeof(int) * (size_t)nprocs);
184  int *rsz = kmr_malloc(sizeof(int) * (size_t)nprocs);
185  int *rdp = kmr_malloc(sizeof(int) * (size_t)nprocs);
186 
187  for (int r = 0; r < nprocs; r++) {
188  assert(INT_MIN * 8L <= scnts[r] && scnts[r] <= INT_MAX * 8L);
189  assert(INT_MIN * 8L <= rcnts[r] && rcnts[r] <= INT_MAX * 8L);
190  assert(INT_MIN * 8L <= sdsps[r] && sdsps[r] <= INT_MAX * 8L);
191  assert(INT_MIN * 8L <= rdsps[r] && rdsps[r] <= INT_MAX * 8L);
192  assert(((scnts[r] & 7) == 0)
193  && ((rcnts[r] & 7) == 0)
194  && ((sdsps[r] & 7) == 0)
195  && ((rdsps[r] & 7) == 0));
196  ssz[r] = (int)(scnts[r] / 8L);
197  rsz[r] = (int)(rcnts[r] / 8L);
198  sdp[r] = (int)(sdsps[r] / 8L);
199  rdp[r] = (int)(rdsps[r] / 8L);
200  }
201  int cc;
202  cc = MPI_Alltoallv(sbuf, ssz, sdp, MPI_LONG,
203  rbuf, rsz, rdp, MPI_LONG, comm);
204  assert(cc == MPI_SUCCESS);
205 
206  kmr_free(ssz, (sizeof(int) * (size_t)nprocs));
207  kmr_free(rsz, (sizeof(int) * (size_t)nprocs));
208  kmr_free(sdp, (sizeof(int) * (size_t)nprocs));
209  kmr_free(rdp, (sizeof(int) * (size_t)nprocs));
210  return MPI_SUCCESS;
211 }
212 
213 /* Does all-to-all-v using Bruck all-to-all. It takes the size of the
214  largest message among the ranks as MAXCNT. It uses not-"v"
215  all-to-all by expanding each buffer to the largest one. */
216 
217 static int
218 kmr_alltoallv_bruck(KMR *mr, long maxcnt,
219  void *sbuf, long *scnts, long *sdsps,
220  void *rbuf, long *rcnts, long *rdsps)
221 {
222  int nprocs = mr->nprocs;
223  char *sptr = sbuf;
224  char *rptr = rbuf;
225  int cc;
226 
227  char *sb = kmr_malloc((size_t)(maxcnt * nprocs));
228  char *rb = kmr_malloc((size_t)(maxcnt * nprocs));
229  for (int i = 0; i < nprocs; i++) {
230  memcpy(&sb[maxcnt * i], &sptr[sdsps[i]], (size_t)scnts[i]);
231  }
232  cc = kmr_alltoall_bruck(mr, sb, rb, (int)maxcnt);
233  assert(cc == MPI_SUCCESS);
234  for (int i = 0; i < nprocs; i++) {
235  memcpy(&rptr[rdsps[i]], &rb[maxcnt * i], (size_t)rcnts[i]);
236  }
237  kmr_free(sb, (size_t)(maxcnt * nprocs));
238  kmr_free(rb, (size_t)(maxcnt * nprocs));
239  return MPI_SUCCESS;
240 }
241 
242 /* Waits for some requests (at least one) finish. It is used in
243  kmr_alltoallv_naive(). It returns the number of remaining
244  requests. It cleans the request array RQS by removing finished
245  requests. */
246 
247 static int
248 kmr_alltoallv_wait_requests(KMR *mr, int reqcnt, MPI_Request *rqs,
249  MPI_Status *sts, int *indexes)
250 {
251  //printf("[%03d] kmr_alltoallv_wait_requests\n", mr->rank); fflush(0);
252  int cc;
253  int dones;
254  cc = MPI_Waitsome(reqcnt, rqs, &dones, indexes, sts);
255  assert(dones != MPI_UNDEFINED);
256  if (cc == MPI_ERR_IN_STATUS) {
257  for (int i = 0; i < dones; i++) {
258  assert(0 <= indexes[i] && indexes[i] < reqcnt);
259  assert(sts[indexes[i]].MPI_ERROR == MPI_SUCCESS);
260  }
261  cc = MPI_SUCCESS;
262  }
263 
264  int i;
265  int j;
266  i = 0;
267  j = 0;
268  while (j < reqcnt) {
269  if (rqs[j] != MPI_REQUEST_NULL) {
270  if (i != j) {
271  rqs[i] = rqs[j];
272  }
273  i++;
274  j++;
275  } else {
276  j++;
277  }
278  assert(i <= j);
279  }
280  return i;
281 }
282 
/* Does all-to-all-v naively using isend and irecv.  It is used in
   case of large messages.  Each message is split into chunks of at
   most CHUNK bytes (atoa_size_limit, or INT_MAX by default) so every
   MPI call takes an int count, and ranks are visited in a shifted
   order (receive from self-i, send to self+i) with at most
   REQUESTSLIMIT requests outstanding at once. */

static int
kmr_alltoallv_naive(KMR *mr, void *sbuf, long *scnts, long *sdsps,
                    void *rbuf, long *rcnts, long *rdsps)
{
    //printf("[%03d] kmr_alltoallv_naive\n", mr->rank); fflush(0);
    MPI_Comm comm = mr->comm;
    int nprocs = mr->nprocs;
    int self = mr->rank;
    int tag = KMR_TAG_ATOA;
    /* Chunk size in bytes; it must fit an MPI int count. */
    long chunk = ((mr->atoa_size_limit == 0) ? INT_MAX : mr->atoa_size_limit);
    assert(0 < chunk && chunk <= (long)INT_MAX);
    /* Cap on simultaneously outstanding requests (default 4K). */
    int requestslimit = ((mr->atoa_requests_limit == 0)
                         ? (4 * 1024)
                         : mr->atoa_requests_limit);

    int cc;
    cc = MPI_SUCCESS;

    /* Takes I in the range [-N+1, 2N-2], and returns a value in [0, N-1]. */

#define KMR_WRAPAROUND(I,N) \
    (((I)>=0) ? (((I)<(N)) ? (I) : ((I)-(N))) : ((I)+(N)))

    MPI_Request *rqs = kmr_malloc(sizeof(MPI_Request) * (size_t)requestslimit);
    MPI_Status *sts = kmr_malloc(sizeof(MPI_Status) * (size_t)requestslimit);
    int *indexes = kmr_malloc(sizeof(int) * (size_t)requestslimit);

    char *rptr = rbuf;
    char *sptr = sbuf;

    int reqcnt;
    reqcnt = 0;
    for (int i = 0; i < nprocs; i++) {
        /* Shifted pairing: at step i, receive from (self-i) and send
           to (self+i), so partners match up across ranks. */
        int src = KMR_WRAPAROUND((self - i), nprocs);
        int dst = KMR_WRAPAROUND((self + i), nprocs);

        long rsize = rcnts[src];
        long rchunks = ((rsize + chunk - 1) / chunk);

        long ssize = scnts[dst];
        long schunks = ((ssize + chunk - 1) / chunk);

        assert((rchunks + schunks) <= INT_MAX);

        /* It needs at least requests for one turn to avoid deadlock. */

        if (requestslimit < (int)(rchunks + schunks)) {
            char ee[160];
            snprintf(ee, 160, ("kmr_alltoallv: exceed the limit of requests"
                               " (atoa_requests_limit=%d needed=%ld)"),
                     requestslimit, (rchunks + schunks));
            kmr_error(mr, ee);
        }

        /* Receive from src. */

        long roff;
        roff = 0;
        while (roff < rsize) {
            /* Wait when request slots are in short. */
            while (reqcnt >= requestslimit) {
                reqcnt = kmr_alltoallv_wait_requests(mr, reqcnt, rqs, sts,
                                                     indexes);
            }

            assert(reqcnt < requestslimit);
            int siz = (int)MIN((rsize - roff), chunk);
            cc = MPI_Irecv(&rptr[rdsps[src] + roff], siz, MPI_BYTE,
                           src, tag, comm, &rqs[reqcnt]);
            assert(cc == MPI_SUCCESS);
            roff += siz;
            reqcnt++;
        }

        /* And, send to dst. */

        long soff;
        soff = 0;
        while (soff < ssize) {
            /* Wait when request slots are in short. */
            while (reqcnt >= requestslimit) {
                reqcnt = kmr_alltoallv_wait_requests(mr, reqcnt, rqs, sts,
                                                     indexes);
            }

            assert(reqcnt < requestslimit);
            int siz = (int)MIN((ssize - soff), chunk);
            cc = MPI_Isend(&sptr[sdsps[dst] + soff], siz, MPI_BYTE,
                           dst, tag, comm, &rqs[reqcnt]);
            assert(cc == MPI_SUCCESS);
            soff += siz;
            reqcnt++;
        }
    }

    /* Drain the remaining requests. */
    cc = MPI_Waitall(reqcnt, rqs, sts);
    if (cc == MPI_ERR_IN_STATUS) {
        for (int i = 0; i < reqcnt; i++) {
            assert(sts[i].MPI_ERROR == MPI_SUCCESS);
        }
        cc = MPI_SUCCESS;
    }

    kmr_free(rqs, (sizeof(MPI_Request) * (size_t)requestslimit));
    kmr_free(sts, (sizeof(MPI_Status) * (size_t)requestslimit));
    kmr_free(indexes, (sizeof(int) * (size_t)requestslimit));
    return MPI_SUCCESS;
#undef KMR_WRAPAROUND
}
395 
/* Does all-to-all, using Bruck-like butter-fly pattern.  CNT is the
   per-pair message size in bytes.  The asserts below require the
   number of ranks be a multiple of four and a power of two (together,
   a power of four); the caller kmr_alltoallv() checks this with
   kmr_powerof4_p().  Each stage consumes two bits of the rank
   (radix-4), so there are lognprocs/2 stages. */

static int
kmr_alltoall_bruck(KMR *mr, void *sbuf, void *rbuf, int cnt)
{
#define DUMP_(X0,X1,X2,X3,X4) if (tracing) kmr_atoa_dump_(X0,X1,X2,X3,X4)
    MPI_Comm comm = mr->comm;
    int nprocs = mr->nprocs;
    int rank = mr->rank;
    int tag = KMR_TAG_ATOA;
    _Bool tracing = mr->trace_alltoall;
    assert((nprocs & 3) == 0);
    int nprocs4th = (nprocs / 4);
    int cc;

    int lognprocs = 0;
    while ((1 << lognprocs) < nprocs) {
        lognprocs++;
    }
    assert((1 << lognprocs) == nprocs);

    char *buf0 = kmr_malloc((size_t)(cnt * nprocs));
    char *buf1 = kmr_malloc((size_t)(cnt * nprocs));
    memcpy(buf0, sbuf, (size_t)(cnt * nprocs));

    /* Six requests per stage: a send and a receive for each of the
       three peers k=1..3 (the k=0 quarter is copied locally). */
    MPI_Request rqs[6];
    for (int stage = 0; stage < lognprocs; stage += 2) {
        DUMP_(mr, buf0, cnt, "step", stage);
        /* Regroup slots from (4 x nprocs4th) order to (nprocs4th x 4)
           order, making each quarter contiguous in buf1. */
        for (int j = 0; j < nprocs4th; j++) {
            for (int i = 0; i < 4; i++) {
                void *s = &buf0[cnt * (i + (j * 4))];
                void *r = &buf1[cnt * (nprocs4th * i + j)];
                memcpy(r, s, (size_t)cnt);
            }
        }
        DUMP_(mr, buf1, cnt, "pack", stage);
        for (int k = 0; k < 4; k++) {
            /* Peer differs from this rank in this stage's two bits. */
            int flip = (k << stage);
            int peer = (rank ^ flip);
            int baserank = ((rank >> stage) & 3);
            int basepeer = ((peer >> stage) & 3);
            if (k == 0) {
                /* Own quarter: plain local copy, no communication. */
                void *s = &buf1[cnt * (baserank * nprocs4th)];
                void *r = &buf0[cnt * (baserank * nprocs4th)];
                memcpy(r, s, (size_t)(cnt * nprocs4th));
            } else {
                /* Exchange the basepeer quarter with the peer rank. */
                void *s = &buf1[cnt * (basepeer * nprocs4th)];
                void *r = &buf0[cnt * (basepeer * nprocs4th)];
#if 0
                cc = MPI_Sendrecv(s, (cnt * nprocs4th), MPI_BYTE, peer, tag,
                                  r, (cnt * nprocs4th), MPI_BYTE, peer, tag,
                                  comm, MPI_STATUS_IGNORE);
                assert(cc == MPI_SUCCESS);
#else
                cc = MPI_Isend(s, (cnt * nprocs4th), MPI_BYTE, peer, tag,
                               comm, &rqs[(k - 1) * 2 + 1]);
                assert(cc == MPI_SUCCESS);
                cc = MPI_Irecv(r, (cnt * nprocs4th), MPI_BYTE, peer, tag,
                               comm, &rqs[(k - 1) * 2]);
                assert(cc == MPI_SUCCESS);
#endif
            }
        }
        cc = MPI_Waitall(6, rqs, MPI_STATUSES_IGNORE);
        assert(cc == MPI_SUCCESS);
        DUMP_(mr, buf0, cnt, "exchange", stage);
    }
    memcpy(rbuf, buf0, (size_t)(cnt * nprocs));
    kmr_free(buf0, (size_t)(cnt * nprocs));
    kmr_free(buf1, (size_t)(cnt * nprocs));
    return MPI_SUCCESS;
}
468 
469 /* Displays buffer contents (first byte) in the middle of all-to-all.
470  It does nothing when the number of ranks is large. */
471 
472 static void
473 kmr_atoa_dump_(KMR *mr, void *sbuf, int sz, char *title, int step)
474 {
475  MPI_Comm comm = mr->comm;
476  int nprocs = mr->nprocs;
477  int rank = mr->rank;
478  int cc;
479  if (nprocs <= 64) {
480  char *xbuf;
481  if (rank == 0) {
482  xbuf = malloc((size_t)(sz * nprocs * nprocs));
483  assert(xbuf != 0);
484  } else {
485  xbuf = 0;
486  }
487  cc = MPI_Gather(sbuf, (sz * nprocs), MPI_BYTE,
488  xbuf, (sz * nprocs), MPI_BYTE,
489  0, comm);
490  assert(cc == MPI_SUCCESS);
491  if (rank == 0) {
492  fprintf(stderr, ";;KMR %s (%d)\n", title, step);
493  for (int j = 0; j < nprocs; j++) {
494  fprintf(stderr, ";;KMR ");
495  for (int i = 0; i < nprocs; i++) {
496  fprintf(stderr, "%02x ",
497  (0xff & xbuf[(i * (sz * nprocs)) + (j * sz)]));
498  }
499  fprintf(stderr, "\n");
500  }
501  fprintf(stderr, ";;KMR\n");
502  fflush(0);
503  }
504  if (xbuf != 0) {
505  free(xbuf);
506  }
507  MPI_Barrier(comm);
508  }
509 }
510 
511 /* ================================================================ */
512 
/* NOTE(review): Dead code below, compiled out with "#if 0".  It is an
   unfinished sketch of an exclusive scan: it references identifiers
   that are not defined in this scope (kvs, kvo, ssz, rsz, commute,
   threshold, reduce) and redeclares the COMM parameter, so it would
   not compile if enabled.  Kept for reference only. */
#if 0
int
kmr_exscan(void *sbuf, void *rbuf, int cnt, MPI_Datatype dt, MPI_Op op,
           MPI_Comm comm)
{
    const int SCANTAG = 60;
    MPI_Comm comm = kvs->c.mr->comm;
    int nprocs = kvs->c.mr->nprocs;
    int self = kvs->c.mr->rank;
    int cc;
    /*cc = MPI_Exscan(sbuf, rbuf, cnt, dt, op, comm);*/
    for (int stage = 1; stage < nprocs; stage <<= 1) {
        int peer = (self ^ stage);
        if (peer < nprocs) {
            cc = MPI_Sendrecv(&ssz, 1, MPI_LONG, peer, SCANTAG,
                              &rsz, 1, MPI_LONG, peer, SCANTAG,
                              comm, MPI_STATUS_IGNORE);
            assert(cc == MPI_SUCCESS);
            cc = MPI_Sendrecv(sbuf, ssz, MPI_BYTE, peer, SCANTAG,
                              rbuf, rsz, MPI_BYTE, peer, SCANTAG,
                              comm, MPI_STATUS_IGNORE);
            assert(cc == MPI_SUCCESS);
            if (self > peer) {
                /* Do not include the first element of segment. */
                if ((self & (stage - 1)) != 0) {
                    kmr_add_kv_vector(kvo, rbuf, rsz);
                }
            }
            /* reducevalue*=xbuf */
            if (commute || self > peer) {
                kmr_add_kv_vector(kvs, rbuf, rsz);
            } else {
                /* PUT AT FRONT */
                kmr_add_kv_vector(kvs, rbuf, rsz);
            }
        }
        if (kvs->element_count > threshold) {
            reduce();
        }
    }
    return MPI_SUCCESS;
}
#endif
556 
557 /*
558 Copyright (C) 2012-2018 RIKEN R-CCS
559 This library is distributed WITHOUT ANY WARRANTY. This library can be
560 redistributed and/or modified under the terms of the BSD 2-Clause License.
561 */
int kmr_allgatherv(KMR *mr, _Bool rankzeroonly, void *sbuf, long scnt, void *rbuf, long *rcnts, long *rdsps)
All-gathers data, or gathers data when RANKZEROONLY.
Definition: kmratoa.c:74
Utilities Private Part (do not include from applications).
#define kmr_malloc(Z)
Allocates memory, or aborts when failed.
Definition: kmrimpl.h:177
KMR Context.
Definition: kmr.h:247
int kmr_exchange_sizes(KMR *mr, long *sbuf, long *rbuf)
Calls all-to-all to exchange one long-integer.
Definition: kmratoa.c:50
KMR Interface.
int kmr_alltoallv(KMR *mr, void *sbuf, long *scnts, long *sdsps, void *rbuf, long *rcnts, long *rdsps)
Does all-to-all-v, but it takes the arguments by long-integers.
Definition: kmratoa.c:124
int kmr_gather_sizes(KMR *mr, long siz, long *rbuf)
Calls all-gather for collecting one long-integer.
Definition: kmratoa.c:62