root/trunk/src/io/bmi/bmi_portals/portals.c @ 8015

Revision 8015, 66.1 KB (checked in by pcarns, 4 years ago)

Add a flags field to bmi method ops structure. BMI_METHOD_FLAG_NO_POLLING
can be used to disable implicit busy polling on a particular BMI method.

Line 
1/*
2 * Portals BMI method.
3 *
4 * Copyright (C) 2007-8 Pete Wyckoff <pw@osc.edu>
5 *
6 * See COPYING in top-level directory.
7 */
8#include <string.h>
9#include <errno.h>
10
11#if defined(__LIBCATAMOUNT__) || defined(__CRAYXT_COMPUTE_LINUX_TARGET) || defined(__CRAYXT_SERVICE)
12/* Cray XT3 and XT4 version, both catamount and compute-node-linux */
13#define PTL_IFACE_DEFAULT PTL_IFACE_SS
14#include <portals/portals3.h>
15#include <sys/utsname.h>
16#else
17/* TCP version */
18#include <portals/portals3.h>
19#include <portals/p3nal_utcp.h>  /* sets PTL_IFACE_DEFAULT to UTCP */
20#include <portals/p3api/debug.h>
21#include <netdb.h>  /* gethostbyname */
22#include <arpa/inet.h>  /* inet_ntop */
23#endif
24
25#include <assert.h>
26#include <sys/signal.h>
27#define __PINT_REQPROTO_ENCODE_FUNCS_C  /* include definitions */
28#include <src/io/bmi/bmi-method-support.h>   /* bmi_method_ops */
29#include <src/io/bmi/bmi-method-callback.h>  /* bmi_method_addr_reg_callback */
30#include <src/common/gossip/gossip.h>
31#include <src/common/gen-locks/gen-locks.h>  /* gen_mutex_t */
32#include <src/common/misc/pvfs2-internal.h>  /* lld */
33#include <src/common/id-generator/id-generator.h>
34
35#ifdef HAVE_VALGRIND_H
36#include <memcheck.h>
37#else
38#define VALGRIND_MAKE_MEM_DEFINED(addr,len)
39#endif
40
/*
 * __unused: annotation placed on parameters that a function deliberately
 * ignores.  With a GNU-compatible compiler it expands to the "unused"
 * attribute; otherwise it expands to nothing.
 * NOTE(review): identifiers beginning with a double underscore are reserved
 * for the implementation, and some system headers define __unused
 * themselves -- confirm there is no clash on the supported platforms.
 */
41#ifdef __GNUC__
42#  define __unused __attribute__((unused))
43#else
44#  define __unused
45#endif
46
47/*
48 * Debugging macros.
49 */
/*
 * debug(lvl, fmt, ...): forward a printf-style message to gossip_debug()
 * under the GOSSIP_BMI_DEBUG_PORTALS mask, but only when lvl is at or below
 * DEBUG_LEVEL.  With DEBUG_LEVEL at 2, calls made with a higher level are
 * compiled in but never emit anything.  The enabled variant uses the GNU
 * named-variadic form (args...) together with ##args.  Flipping the "#if 1"
 * to 0 turns every debug() call into an empty statement.
 */
50#if 1
51#define DEBUG_LEVEL 2
52#define debug(lvl,fmt,args...) \
53    do { \
54        if (lvl <= DEBUG_LEVEL) \
55            gossip_debug(GOSSIP_BMI_DEBUG_PORTALS, fmt, ##args); \
56    } while (0)
57#else
58#  define debug(lvl,fmt,...) do { } while (0)
59#endif
60
61/*
62 * No global locking.  Portals has its own library-level locking, but
63 * we need to keep the BMI polling thread away from the main thread that
64 * is doing send/recv.
65 */
/* NOTE(review): not taken in this part of the file; presumably guards NI
 * setup/teardown -- confirm against the init code. */
66static gen_mutex_t ni_mutex = GEN_MUTEX_INITIALIZER;
/* Held around every unlink/append on the work queues by the event handler
 * and by the test and testcontext entry points. */
67static gen_mutex_t list_mutex = GEN_MUTEX_INITIALIZER;
/* NOTE(review): not taken in this part of the file; presumably guards
 * pma_list -- confirm against addr_from_nidpid(). */
68static gen_mutex_t pma_mutex = GEN_MUTEX_INITIALIZER;
/* Held by check_eq() for the whole time the event queue is being drained. */
69static gen_mutex_t eq_mutex = GEN_MUTEX_INITIALIZER;
70
71/*
72 * Handle given by upper layer, which must be handed back to create
73 * method_addrs.
74 */
/* NOTE(review): not assigned or read in this part of the file; presumably
 * stored during method initialization -- confirm. */
75static int bmi_portals_method_id;
76
77/*
78 * Various static ptls objects.  One per instance of the code.
79 */
/* Network interface handle; PTL_INVALID_HANDLE until it is opened. */
80static ptl_handle_ni_t ni = PTL_INVALID_HANDLE;
/* NOTE(review): mark_me is not used in this part of the file -- purpose to
 * be confirmed against the init code. */
81static ptl_handle_me_t mark_me = PTL_INVALID_HANDLE;
/* Match entry and MD for the "zero" descriptor.  handle_event() treats a
 * PUT_END whose md_handle equals zero_md as a truncated non-preposted
 * message that carries no data. */
82static ptl_handle_me_t zero_me = PTL_INVALID_HANDLE;
83static ptl_handle_md_t zero_md;
/* The single event queue polled by __check_eq(). */
84static ptl_handle_eq_t eq = PTL_INVALID_HANDLE;
85static int ni_init_dup;  /* did runtime open the nic for us? */
86
87/*
88 * Server listens at this well-known portal for unexpected messages.  Clients
89 * will let portals pick a portal for them.  There tend not to be too many of
90 * these:  utcp has 8, so pick a small number.
91 *
92 * This is also used for clients.  If they used MEAttachAny, then
93 * we'd have to remember the ptl_index from a sendunexpected over to the
94 * answering send call in the server.  No need for connection state just
95 * for that.
96 *
97 * Cray has a bunch reserved.  SNLers suggest using ARMCI's.
98 */
/* Fixed portal table index shared by servers and clients. */
99static const ptl_pt_index_t ptl_index = 37;
100
101/*
102 * Handy const.  Cray version needs padding, though, so initialize this
103 * elsewhere.
104 */
/* Left zero-initialized here; NOTE(review): presumably filled with the
 * "any" nid/pid wildcards during initialization -- confirm. */
105static ptl_process_id_t any_pid;
106
107/*
108 * Cray does not have these, but TCP does.
109 */
110#ifndef HAVE_PTLERRORSTR
111static const char *PtlErrorStr(unsigned int ptl_errno)
112{
113    return ptl_err_str[ptl_errno];
114}
115#endif
116
117#ifndef HAVE_PTLEVENTKINDSTR
118static const char *PtlEventKindStr(ptl_event_kind_t ev_kind)
119{
120    extern const char *ptl_event_str[];
121
122    return ptl_event_str[ev_kind];
123}
124#endif
125
126/*
127 * Match bits.  The lower 32 bits always carry the bmi_tag.  If this bit
128 * in the top is set, it is an unexpected message.  The secondmost top bit is
129 * used when posting a _send_, strangely enough.  If the send is too long,
130 * and the receiver has not preposted, later the receiver will issue a Get
131 * to us for the data.  That get will use the second set of match bits.
132 *
133 * The rest of the 30 top bits are used to encode a sequence number per
134 * peer.  As BMI can post multiple sends with the same tag, we have to
135 * be careful that if send #2 for a given tag goes to the zero_md, that
136 * when he does the get, he grabs from buffer #2, not buffer #1 because
137 * the sender was too slow in unlinking it.
138 */
/* Bit 63: message was sent as unexpected. */
static const uint64_t match_bits_unexpected = 1ULL << 63;  /* 8... */
/* Bit 62: match entry a sender posts so the receiver can Get a long send. */
static const uint64_t match_bits_long_send = 1ULL << 62;   /* 4... */
/* Sequence numbers occupy 30 bits, starting at bit 32. */
static const uint32_t match_bits_seqno_max = 1UL << 30;
static const int      match_bits_seqno_shift = 32;

/*
 * Compose Portals match bits from a BMI tag and a per-peer sequence number:
 * the tag fills the low 32 bits and the sequence number is placed directly
 * above it.  Neither flag bit is set here; the caller may OR in the long
 * send bit afterwards.
 */
static uint64_t mb_from_tag_and_seqno(uint32_t tag, uint32_t seqno)
{
    const uint64_t seq_part = (uint64_t) seqno << match_bits_seqno_shift;

    return seq_part | tag;
}
154
155/*
156 * Buffer for incoming unexpected send messages.  Only the server needs
157 * a list of these.  Each message can be no more than 8k.  The total memory
158 * devoted to these on the server is 256k.  Not each unexpected message
159 * will take the full 8k, depending on the size of the request.
160 *
161 * Arriving data is copied out of these as quickly as possible.  No refcnt
162 * as in the nonprepost case below, where data sits around in those buffers.
163 */
/*
 * Sizing: 8 KiB maximum per unexpected message, 256 KiB of buffer in total,
 * split across 2 MDs, i.e. 128 KiB per MD.
 */
164#define UNEXPECTED_MESSAGE_SIZE (8 << 10)
165#define UNEXPECTED_QUEUE_SIZE   (256 << 10)
166#define UNEXPECTED_NUM_MD 2
167#define UNEXPECTED_SIZE_PER_MD  (UNEXPECTED_QUEUE_SIZE/UNEXPECTED_NUM_MD)
168
/*
 * Small integer tags carried in an MD's user_ptr to identify the static
 * buffers: unexpected MDs use tags 1..UNEXPECTED_NUM_MD, and the
 * non-prepost MDs use the tags that follow immediately after.
 */
169#define UNEXPECTED_MD_INDEX_OFFSET (1)
170#define NONPREPOST_MD_INDEX_OFFSET (UNEXPECTED_NUM_MD + 1)
171
/* Backing storage for the unexpected MDs; NULL until allocated.
 * NOTE(review): allocation is not visible in this part of the file. */
172static char *unexpected_buf = NULL;
173/* poor-man's circular buffer */
174static ptl_handle_me_t unexpected_me[UNEXPECTED_NUM_MD];
175static ptl_handle_md_t unexpected_md[UNEXPECTED_NUM_MD];
/* Per-MD flag set by handle_event() when less than one full message of
 * space remains in that MD; _sum counts how many flags are currently set. */
176static int unexpected_need_repost[UNEXPECTED_NUM_MD];
177static int unexpected_need_repost_sum;
/* NOTE(review): maintained outside this part of the file. */
178static int unexpected_is_posted[UNEXPECTED_NUM_MD];
179
180/*
181 * This scheme relies on the zero page being unused, i.e. addresses
182 * from 0 up to 4k or so.
183 */
184static int unexpected_md_index(void *user_ptr)
185{
186    int i;
187    uintptr_t d = (uintptr_t) user_ptr;
188
189    if (d >= UNEXPECTED_MD_INDEX_OFFSET &&
190        d < UNEXPECTED_MD_INDEX_OFFSET + UNEXPECTED_NUM_MD)
191        return d - UNEXPECTED_MD_INDEX_OFFSET;
192    else
193        return -1;
194}
195
196/*
197 * More buffers for non-preposted receive handling.  While the unexpected
198 * ones above are for explicit API needs of BMI, these are just to work
199 * around the lack of a requirement to pre-post buffers in BMI.  Portals
200 * would otherwise drop messages when there is no matching receive.  Instead
201 * we grab them (or parts of them) with these buffers.
202 *
203 * Only the first 8k of each non-preposted message is saved, but we leave
204 * room for up to 8M of these.  If the total message size is larger, the
205 * receiver will do a Get to fetch the rest.  Note that each non-preposted
206 * message causes a struct bmip_work to be malloced too.
207 *
208 * The nonprepost_refcnt[] variable handles what happens when the MD is
209 * unlinked.  As nonprepost messages arrive, each new rq adds 1 to the
210 * refcnt.  As receives are posted and consume rqs, the refcnt drops.
211 * If an UNLINK event happens, is_posted goes zero.  If an rq consumption
212 * drops the refcnt to zero, and it is not posted, it is reinitialized
213 * and reposted.  Whew.
214 */
/* Sizing: 8 KiB kept per non-preposted message, 8 MiB of buffer in total,
 * spread over 2 MDs. */
215#define NONPREPOST_MESSAGE_SIZE (8 << 10)
216#define NONPREPOST_QUEUE_SIZE (8 << 20)
217#define NONPREPOST_NUM_MD 2
218
/* Backing storage for the non-prepost MDs; NULL until allocated.
 * NOTE(review): allocation is not visible in this part of the file. */
219static char *nonprepost_buf = NULL;
220/* poor-man's circular buffer */
221static ptl_handle_me_t nonprepost_me[NONPREPOST_NUM_MD];
222static ptl_handle_md_t nonprepost_md[NONPREPOST_NUM_MD];
/* Cleared by handle_event() when the MD's UNLINK event arrives. */
223static int nonprepost_is_posted[NONPREPOST_NUM_MD];
/* Number of queued rqs still pointing into each MD; incremented by
 * handle_event() for every non-truncated non-preposted arrival. */
224static int nonprepost_refcnt[NONPREPOST_NUM_MD];
225
226static int nonprepost_md_index(void *user_ptr)
227{
228    int i;
229    uintptr_t d = (uintptr_t) user_ptr;
230
231    if (d >= NONPREPOST_MD_INDEX_OFFSET &&
232        d < NONPREPOST_MD_INDEX_OFFSET + NONPREPOST_NUM_MD)
233        return d - NONPREPOST_MD_INDEX_OFFSET;
234    else
235        return -1;
236}
237
238/*
239 * "private data" part of method_addr.  These describe peers, so
240 * have the portal and pid of the remote side.  We need to keep a list
241 * to be able to find a pma from a given pid, even though these are
242 * already kept in a list up in BMI.  Some day fix that interface.
243 */
/* Per-peer private data hung off a BMI method address. */
244struct bmip_method_addr {
245    struct qlist_head list;
246    char *hostname;  /* given by user, converted to a nid by us */
247    char *peername;  /* for rev_lookup */
248    ptl_process_id_t pid;  /* this is a struct with u32 nid + u32 pid */
249    uint32_t seqno_out;  /* each send has a separate sequence number */
250    uint32_t seqno_in;   /* NOTE(review): presumably the receive-side counterpart of seqno_out -- confirm */
251};
/* Head of the peer list; entries link through bmip_method_addr.list. */
252static QLIST_HEAD(pma_list);
253
254/*
255 * Work item queue states document the lifetime of a send or receive.
256 */
/* Lifetime states of a work item: SQ_* for sends, RQ_* for receives. */
enum work_state {
    SQ_WAITING_ACK,
    SQ_WAITING_GET,
    SQ_WAITING_USER_TEST,
    SQ_CANCELLED,
    RQ_WAITING_INCOMING,
    RQ_WAITING_GET,
    RQ_WAITING_USER_TEST,
    RQ_WAITING_USER_POST,
    RQ_LEN_ERROR,
    RQ_CANCELLED,
};

/*
 * Return the printable name of a work state for debug output.  Any value
 * that is not one of the enumerators yields "(unknown state)".
 */
static const char *state_name(enum work_state state)
{
    static const char *const names[] = {
        [SQ_WAITING_ACK]       = "SQ_WAITING_ACK",
        [SQ_WAITING_GET]       = "SQ_WAITING_GET",
        [SQ_WAITING_USER_TEST] = "SQ_WAITING_USER_TEST",
        [SQ_CANCELLED]         = "SQ_CANCELLED",
        [RQ_WAITING_INCOMING]  = "RQ_WAITING_INCOMING",
        [RQ_WAITING_GET]       = "RQ_WAITING_GET",
        [RQ_WAITING_USER_TEST] = "RQ_WAITING_USER_TEST",
        [RQ_WAITING_USER_POST] = "RQ_WAITING_USER_POST",
        [RQ_LEN_ERROR]         = "RQ_LEN_ERROR",
        [RQ_CANCELLED]         = "RQ_CANCELLED",
    };
    const unsigned int slot = (unsigned int) state;

    if (slot < sizeof(names) / sizeof(names[0]))
        return names[slot];
    return "(unknown state)";
}
296
297/*
298 * Common structure for both send and recv outstanding work items.  There
299 * is a queue of these for each send and recv that are frequently searched.
300 */
/*
 * One outstanding send or receive.  Items are heap-allocated; fill_done()
 * and bmip_testunexpected() free them when the result is handed to the
 * caller.
 */
301struct bmip_work {
302    struct qlist_head list;
303    int type;               /* BMI_SEND or BMI_RECV */
304    enum work_state state;  /* send or receive state */
305    struct method_op mop;   /* so BMI can use ids to find these */
306
307    bmi_size_t tot_len;     /* size posted for send or recv */
308    bmi_size_t actual_len;  /* recv: possibly shorter than posted */
309
310    bmi_msg_tag_t bmi_tag;  /* recv: unexpected or nonpp tag that arrived */
311    uint64_t match_bits;    /* recv: full match bits, including seqno */
312
313    int is_unexpected;      /* send: if user posted this as unexpected */
314
315    ptl_handle_me_t me;     /* recv: prepost match list entry */
316                            /* send: send me for possible get */
317    ptl_handle_md_t md;     /* recv: prepost or get destination, to cancel */
318                            /* send: send md for possible get */
    /* NOTE(review): tme/tmd are not described anywhere visible; the only
     * visible use is the DEBUG_CNL_ODDITIES unlink of tme in handle_event() --
     * confirm their role against the post_recv code. */
319    ptl_handle_me_t tme;
320    ptl_handle_md_t tmd;
321    int saw_send_end_and_ack; /* send: make sure both states before unlink */
322
323    /* non-preposted receive, keep ref to a nonpp static buffer */
324    const void *nonpp_buf;   /* pointer to nonpp buffer in MD */
325    int nonpp_md;            /* index into nonprepost_md[] */
326
327    /* unexpected receive, unex_buf is malloced to hold the data */
328    void *unex_buf;
329};
330
331/*
332 * All operations are on exactly one list.  Even queue items that cannot
333 * be reached except from BMI lookup are on a list.  The lists are pretty
334 * specific.
335 *
336 * q_send_waiting_ack - sent the message, waiting for ack
337 * q_send_waiting_get - sent the message, ack says truncated, wait his get
338 * q_recv_waiting_incoming - posted recv, waiting for data to arrive
339 * q_recv_waiting_get - he sent before we recvd, now we sent get
340 * q_recv_nonprepost - data arrived before recv was posted
341 * q_unexpected_done - unexpected message arrived, waiting testunexpected
342 * q_done - send or recv completed, waiting caller to test
343 */
344/* lists that are never shared, or even looked at, but keep code prettier so
345 * we don't have to see if we need to qdel before qadd */
/* Of these four, handle_event() only ever appends to q_send_waiting_get
 * (when an ACK reports zero bytes delivered); the others are filled outside
 * this part of the file. */
346static QLIST_HEAD(q_send_waiting_ack);
347static QLIST_HEAD(q_send_waiting_get);
348static QLIST_HEAD(q_recv_waiting_incoming);
349static QLIST_HEAD(q_recv_waiting_get);
350
351/* real lists that need locking between event handler and test/post */
/* handle_event() appends to all three while holding list_mutex. */
352static QLIST_HEAD(q_recv_nonprepost);
353static QLIST_HEAD(q_unexpected_done);
354static QLIST_HEAD(q_done);
355
/*
 * Forward declarations for helpers that are called before their definitions
 * appear (presumably further down in this file -- not visible here).
 */
356static struct bmi_method_addr *addr_from_nidpid(ptl_process_id_t pid);
357static void unexpected_repost(int which);
358static int nonprepost_init(void);
359static int nonprepost_fini(void);
360static int unexpected_fini(void);
361static void nonprepost_repost(int which);
362static const char *bmip_rev_lookup(struct bmi_method_addr *addr);
363
364
365/*----------------------------------------------------------------------------
366 * Test
367 */
368
369/*
370 * Deal with an event, advancing the relevant state machines.
371 */
/*
 * handle_event - process one Portals event and advance the matching send or
 * receive state machine.
 *
 * ev: the event just pulled off the event queue.  ev->md.user_ptr holds
 *     either a struct bmip_work pointer (for MDs tied to a posted send or
 *     receive) or one of the small integer tags that identify the static
 *     unexpected / non-prepost buffers.
 *
 * Returns 0 when the event was handled, including the cases where an
 * allocation failed and the event was dropped after logging.  Returns -EIO
 * when the event reports an NI failure or has a type this function does not
 * know.  Two consistency failures (ACK with mlength != rlength, and an
 * oversized unexpected message) terminate the process with exit(1).
 *
 * list_mutex is taken internally around every queue manipulation.
 */
372static int handle_event(ptl_event_t *ev)
373{
374    struct bmip_work *sq, *rq;
375    int which, ret;
376
    /* a failed network operation: log it and give up on this event */
377    if (ev->ni_fail_type != 0) {
378        gossip_err("%s: ni err %d\n", __func__, ev->ni_fail_type);
379        return -EIO;
380    }
381
    /* level 6 is above DEBUG_LEVEL, so this line never prints as built */
382    debug(6, "%s: event type %s\n", __func__, PtlEventKindStr(ev->type));
383
384    switch (ev->type) {
385    case PTL_EVENT_SEND_END:
386        /*
387         * Sometimes this state happens _after_ the ACK.  Boggle.  Cannot
388         * unlink the sq until this state.  Doing it in the ack state may be
389         * too early.  But we don't know if it is safe to unlink until the
390         * ack comes back and says if he received it, or if he will do a
391         * Get on the MD.  So just mark a flag.  It goes to two only if
392         * the ack indicated the other side will not need to do a get.
393         *
394         * Note that an outgoing get request also triggers this.  Sigh.
395         */
396        sq = ev->md.user_ptr;
        /* a receive work item here means this was our own Get going out */
397        if (sq->type == BMI_RECV) {
398                rq = ev->md.user_ptr;
399                debug(2, "%s, rq %p stat %s get went out\n", __func__, rq,
400                      state_name(rq->state));
401                break;
402        }
403        debug(2, "%s: sq %p went out len %llu/%llu mb %llx\n", __func__, sq,
404              ev->mlength, ev->rlength, ev->match_bits);
        /* expected sends only: unlink once both SEND_END and ACK were seen */
405        if (!sq->is_unexpected && ++sq->saw_send_end_and_ack == 2) {
406                debug(2, "%s: saw end last, unlinking %p me %d (md %d)\n",
407                      __func__, sq, sq->me, sq->md);
408                ret = PtlMEUnlink(sq->me);
409                if (ret)
410                    gossip_err("%s: PtlMEUnlink sq %p: %s\n", __func__,
411                               sq, PtlErrorStr(ret));
412        }
413        break;
414
415    case PTL_EVENT_ACK:
416        /* recv an ack from him, advance the state and unlink */
417        sq = ev->md.user_ptr;
418        debug(2, "%s: sq %p ack rcvd len %llu/%llu\n",
419              __func__, sq, ev->mlength, ev->rlength);
420
421        /*
422         * the rlength always comes back as 0 on catamount, even if we
423         * sent 51200 bytes
424         */
        /* a length mismatch is treated as fatal: the process exits */
425        if (ev->mlength != ev->rlength) {
426            gossip_err("%s: mlen %llu and rlen %llu do not agree\n", __func__,
427                       ev->mlength, ev->rlength);
428            exit(1);
429        }
430
        /* nonzero length: the peer took the data, so the send is complete */
431        if (ev->mlength > 0) {
432            /* make sure both SEND_END and ACK happened for these */
433            if (!sq->is_unexpected && ++sq->saw_send_end_and_ack == 2) {
434                    debug(2, "%s: saw ack last, unlinking %p\n", __func__, sq);
435                    ret = PtlMEUnlink(sq->me);
436                    if (ret)
437                        gossip_err("%s: PtlMEUnlink sq %p: %s\n", __func__,
438                                   sq, PtlErrorStr(ret));
439            }
440            sq->state = SQ_WAITING_USER_TEST;
441            gen_mutex_lock(&list_mutex);
442            qlist_del(&sq->list);
443            qlist_add_tail(&sq->list, &q_done);
444            gen_mutex_unlock(&list_mutex);
445        } else {
446            /* otherside will Get, then automatic unlink on threshold=3 */
447            sq->state = SQ_WAITING_GET;
448            gen_mutex_lock(&list_mutex);
449            qlist_del(&sq->list);
450            qlist_add_tail(&sq->list, &q_send_waiting_get);
451            gen_mutex_unlock(&list_mutex);
452        }
453        break;
454
455    case PTL_EVENT_PUT_END:
456        /*
457         * Peer did a send to us.  Four cases:
458         *   1.  unexpected message, user_ptr is the small integer tag
         *       decoded by unexpected_md_index();
459         *   2a. non-preposted message, user_ptr is the small integer tag
         *       decoded by nonprepost_md_index();
460         *   2b. zero md, non-preposted that was too big and truncated
461         *   3.  expected pre-posted receive, our rq in user_ptr.
462         */
463        which = unexpected_md_index(ev->md.user_ptr);
464        if (which >= 0) {
465            /* build new unexpected rq and copy in the data */
466            debug(2, "%s: unexpected len %lld put to us, mb %llx\n", __func__,
467                  lld(ev->mlength), llu(ev->match_bits));
468            rq = malloc(sizeof(*rq));
469            if (!rq) {
470                    gossip_err("%s: alloc unexpected rq\n", __func__);
471                    break;
472            }
            /* an oversized unexpected message is fatal: the process exits */
473            if (ev->mlength > UNEXPECTED_MESSAGE_SIZE)
474                exit(1);
475
476            /*
477             * malloc this separately to hand to testunexpected caller; that
478             * is the semantics of the call, and makes managing the MDs
479             * easier.
480             */
            /* only type, unex_buf, actual_len, bmi_tag and mop.addr are
             * filled in for an unexpected rq; every other field of the
             * freshly malloced struct is left uninitialized */
481            rq->type = BMI_RECV;
482            rq->unex_buf = malloc(ev->mlength);
483            if (!rq->unex_buf) {
484                    gossip_err("%s: alloc unexpected rq data\n", __func__);
485                    free(rq);
486                    break;
487            }
488            rq->actual_len = ev->mlength;
489            rq->bmi_tag = ev->match_bits & 0xffffffffULL;  /* just 32 bits */
490            rq->mop.addr = addr_from_nidpid(ev->initiator);
491            memcpy(rq->unex_buf, (char *) ev->md.start + ev->offset,
492                   ev->mlength);
493            gen_mutex_lock(&list_mutex);
494            qlist_add_tail(&rq->list, &q_unexpected_done);
495            gen_mutex_unlock(&list_mutex);
496            debug(1, "%s: unexpected %d offset %llu\n", __func__, which,
497                  llu(ev->offset));
            /* less than one full message of room left past this offset:
             * flag the MD as needing a repost */
498            if (UNEXPECTED_SIZE_PER_MD - ev->offset < UNEXPECTED_MESSAGE_SIZE) {
499                debug(1, "%s: reposting unexpected %d\n", __func__, which);
500                if (unexpected_need_repost[which] == 0) {
501                    unexpected_need_repost[which] = 1;
502                    ++unexpected_need_repost_sum;
503                }
504            }
505            /* try to repost some, if they are free now */
            /* note: "which" is reused as the loop index from here on */
506            if (unexpected_need_repost_sum) {
507                for (which = 0; which < UNEXPECTED_NUM_MD; which++) {
508                    if (unexpected_need_repost[which])
509                        unexpected_repost(which);
510                }
511            }
512            break;
513        }
514
515        which = nonprepost_md_index(ev->md.user_ptr);
516        if (which >= 0 || ev->md_handle == zero_md) {
517            /* build new nonprepost rq, but just keep pointer to the data, or
518             * if truncated, build the req but no data to hang onto */
519            debug(1, "%s: nonprepost len %llu/%llu mb %llx%s\n",
520                  __func__, llu(ev->mlength), llu(ev->rlength),
521                  ev->match_bits,
522                  ev->md_handle == zero_md ? ", truncated" : "");
523
            /* the two conditions are meant to be mutually exclusive */
524            if (which >= 0 && ev->md_handle == zero_md) {
525                gossip_err("%s: which %d but zero md\n", __func__, which);
526                exit(1);
527            }
528
529            rq = malloc(sizeof(*rq));
530            if (!rq) {
531                    gossip_err("%s: alloc nonprepost rq\n", __func__);
532                    break;
533            }
534
535            rq->type = BMI_RECV;
536            rq->state = RQ_WAITING_USER_POST;
            /* rlength, not mlength: the length the sender attempted */
537            rq->actual_len = ev->rlength;
538            rq->bmi_tag = ev->match_bits & 0xffffffffULL;  /* just 32 bits */
539            rq->match_bits = ev->match_bits;
540            rq->mop.addr = addr_from_nidpid(ev->initiator);
541            if (ev->md_handle == zero_md) {
542                rq->nonpp_buf = NULL;
543            } else {
544                rq->nonpp_buf = (char *) ev->md.start + ev->offset;
545                rq->nonpp_md = which;
546                /* keep a ref to this md until the recv finishes */
547                ++nonprepost_refcnt[rq->nonpp_md];
548            }
549            debug(2, "%s: rq %p NEW NONPREPOST mb 0x%llx%s\n", __func__,
550                  rq, llu(rq->match_bits),
551                  ev->md_handle == zero_md ? ", truncated" : "");
552            gen_mutex_lock(&list_mutex);
553            qlist_add_tail(&rq->list, &q_recv_nonprepost);
554            gen_mutex_unlock(&list_mutex);
555            break;
556        }
557
558        /* must be something we preposted, with user_ptr is rq */
559        rq = ev->md.user_ptr;
560#ifdef DEBUG_CNL_ODDITIES
561        if ((uintptr_t) rq & 1) {
562            debug(1, "%s: OFF BY 1 rq %p\n", __func__, rq);
563            rq = (void *) ((uintptr_t) rq - 1);
564        }
565#endif
566        rq->actual_len = ev->rlength;  /* attempted length sent */
567        rq->state = RQ_WAITING_USER_TEST;
        /* sender tried to deliver more than was posted: report as an error */
568        if (rq->actual_len > rq->tot_len)
569            rq->state = RQ_LEN_ERROR;
570        debug(1, "%s: rq %p len %lld tag 0x%llx mb 0x%llx thresh %d put to us\n",
571              __func__, rq, lld(rq->actual_len), llu(rq->bmi_tag),
572              llu(rq->match_bits), ev->md.threshold);
573        gen_mutex_lock(&list_mutex);
574        qlist_del(&rq->list);
575        qlist_add_tail(&rq->list, &q_done);
576        gen_mutex_unlock(&list_mutex);
577
578#ifdef DEBUG_CNL_ODDITIES
579        /*
580         * At least on linux compute nodes, the me does not auto-unlink
581         * properly, even though the md did get unlinked.  It is necessary
582         * to undo the ME too.  Note that the MD threshold is not updated
583         * to zero; it still sits at one (or whatever it was originally
584         * set up to be).
585         */
586        /* ret = PtlMDUnlink(rq->md); debug(2, "md unlink %d gives %s\n", rq->md, PtlErrorStr(ret)); */
587        /* ret = PtlMDUnlink(rq->tmd); debug(2, "tmd unlink %d gives %s\n", rq->tmd, PtlErrorStr(ret)); */
588        ret = PtlMEUnlink(rq->me); debug(2, "me unlink %d gives %s\n", rq->me, PtlErrorStr(ret));
589        ret = PtlMEUnlink(rq->tme); debug(2, "tme unlink %d gives %s\n", rq->tme, PtlErrorStr(ret));
590#endif
591        break;
592
593    case PTL_EVENT_GET_END:
594        /* our send, turned into a get from the receiver, is now done, as
595         * far as we are concerned, as he has gotten it from us */
596        sq = ev->md.user_ptr;
597        debug(1, "%s: peer got sq %p len %llu/%llu mb %llx\n", __func__, sq,
598              llu(ev->mlength), llu(ev->rlength), ev->match_bits);
599        sq->state = SQ_WAITING_USER_TEST;
600        gen_mutex_lock(&list_mutex);
601        qlist_del(&sq->list);
602        qlist_add_tail(&sq->list, &q_done);
603        gen_mutex_unlock(&list_mutex);
604        break;
605
606    case PTL_EVENT_REPLY_END:
        /* the data we asked for with a Get has arrived: receive is done */
607        rq = ev->md.user_ptr;
608        debug(2, "%s: get completed, rq %p\n", __func__, rq);
609        rq->state = RQ_WAITING_USER_TEST;
610        gen_mutex_lock(&list_mutex);
611        qlist_del(&rq->list);
612        qlist_add_tail(&rq->list, &q_done);
613        gen_mutex_unlock(&list_mutex);
614        break;
615
616    case PTL_EVENT_UNLINK:
617        /* XXX: does this ever get called on CNL?  Apparently not. */
618        debug(2, "%s: unlink event! user_ptr %p\n", __func__, ev->md.user_ptr);
619        which = nonprepost_md_index(ev->md.user_ptr);
620        if (which >= 0) {
621            debug(1, "%s: unlinked nonprepost md %d, is_posted %d refcnt %d\n",
622                  __func__, which, nonprepost_is_posted[which],
623                  nonprepost_refcnt[which]);
            /* mark it unposted; repost right away only if no queued rq
             * still references its buffer */
624            nonprepost_is_posted[which] = 0;
625            if (nonprepost_refcnt[which] == 0)
626                /* already satisfied all the recvs, can this happen so fast? */
627                nonprepost_repost(which);
628            break;
629        }
630
631        debug(1, "%s: unlinked a send or recv, nothing to do\n", __func__);
632
633        /*
634         * Expected recv, unlink just cleans it up.  Already got the send
635         * event.
636         */
637        break;
638
639    case PTL_EVENT_SEND_START:
640        debug(0, "%s: send start, a debugging message thresh %d\n", __func__,
641              ev->md.threshold);
642        break;
643    case PTL_EVENT_PUT_START:
644        debug(0, "%s: put start, a debugging message, thresh %d\n", __func__,
645              ev->md.threshold);
646        break;
647
648    default:
649        gossip_err("%s: unknown event %s\n", __func__,
650                   PtlEventKindStr(ev->type));
651        return -EIO;
652    }
653
654    return 0;
655}
656
657/*
658 * Try to drain everything off the EQ.  No idling can be done in here
659 * because we have to hold the eq lock for the duration, but if a recv
660 * post comes in, it wants to do the post now, not wait another 10 ms.
661 *
662 * If idle_ms == PTL_TIME_FOREVER, block until an event happens.  This is only
663 * used for PtlMDUpdate where we cannot progress until the event is delivered.
664 * Portals will increase the eq pending count when the first part of a message
665 * comes in, and not generate an event until the end.  This could be lots of
666 * packets.  Sorry this could introduce very long delays.
667 *
668 * Under the hood, utcp cannot implement the timeout anyway.
669 */
670static int __check_eq(int idle_ms)
671{
672    ptl_event_t ev;
673    int ret, i, ms = idle_ms;
674
675    for (;;) {
676        ret = PtlEQPoll(&eq, 1, ms, &ev, &i);
677        if (ret == PTL_OK || ret == PTL_EQ_DROPPED) {
678            VALGRIND_MAKE_MEM_DEFINED(&ev, sizeof(ev));
679            handle_event(&ev);
680            ms = 0;  /* just quickly pull events off */
681            if (ret == PTL_EQ_DROPPED) {
682                /* oh well, hope things retry, just point this out */
683                gossip_err("%s: PtlEQPoll: dropped some completions\n",
684                           __func__);
685            }
686        } else if (ret == PTL_EQ_EMPTY) {
687            ret = 0;
688            break;
689        } else {
690            gossip_err("%s: PtlEQPoll: %s\n", __func__, PtlErrorStr(ret));
691            ret = -EIO;
692            break;
693        }
694    }
695
696    return ret;
697}
698
699/*
700 * While doing a post_recv(), we have to make sure no events get processed
701 * on the EQ.  But other pvfs threads might call testunexpected() etc.  There
702 * is a mutex for the eq that wraps check_eq().  A separate lock/release eq
703 * is used to block out other senders or eq checkers during a send.
704 */
705static int check_eq(int idle_ms __unused)
706{
707    int ret;
708
709    gen_mutex_lock(&eq_mutex);
710    ret = __check_eq(0);  /* never idle */
711    gen_mutex_unlock(&eq_mutex);
712    return ret;
713}
714
715/*
716 * Used by testcontext and test.  Called with the list lock held.
717 */
718static void fill_done(struct bmip_work *w, bmi_op_id_t *id, bmi_size_t *size,
719                      void **user_ptr, bmi_error_code_t *err)
720{
721    *id = w->mop.op_id;
722    *size = (w->type == BMI_SEND) ? w->tot_len : w->actual_len;
723    if (user_ptr)
724        *user_ptr = w->mop.user_ptr;
725    *err = 0;
726    if (w->state == SQ_CANCELLED || w->state == RQ_CANCELLED)
727        *err = -PVFS_ETIMEDOUT;
728    if (w->state == RQ_LEN_ERROR)
729        *err = -PVFS_EOVERFLOW;
730
731    debug(2, "%s: %s %p size %llu peer %s\n", __func__,
732          w->type == BMI_SEND ? "sq" : "rq", w, llu(*size),
733          bmip_rev_lookup(w->mop.addr));
734
735    /* free resources too */
736    id_gen_fast_unregister(w->mop.op_id);
737    qlist_del(&w->list);
738    free(w);
739}
740
741static int bmip_testcontext(int incount, bmi_op_id_t *outids, int *outcount,
742                            bmi_error_code_t *errs, bmi_size_t *sizes,
743                            void **user_ptrs, int max_idle_time,
744                            bmi_context_id context_id __unused)
745{
746    struct bmip_work *w, *wn;
747    int ret, n = 0;
748    int timeout = 0;
749
750    for (;;) {
751        /*
752         * Poll once quickly to grab some completions.  Then if nothing
753         * has finished, come back and wait for the timeout.
754         */
755        ret = check_eq(timeout);
756        if (ret)
757            goto out;
758
759        gen_mutex_lock(&list_mutex);
760        qlist_for_each_entry_safe(w, wn, &q_done, list) {
761            if (n == incount)
762                break;
763            fill_done(w, &outids[n], &sizes[n],
764                      user_ptrs ? &user_ptrs[n] : NULL, &errs[n]);
765            ++n;
766        }
767        gen_mutex_unlock(&list_mutex);
768
769        if (n > 0 || timeout == max_idle_time)
770            break;
771
772        timeout = max_idle_time;
773    }
774
775out:
776    *outcount = n;
777    return ret;
778}
779
780/*
781 * Used by lots of BMI test codes, but never in core PVFS.  Easy enough
782 * to implement though.
783 */
784static int bmip_test(bmi_op_id_t id, int *outcount, bmi_error_code_t *err,
785                     bmi_size_t *size, void **user_ptr, int max_idle_time,
786                     bmi_context_id context_id __unused)
787{
788    struct bmip_work *w, *wn;
789    int ret, n = 0;
790    int timeout = 0;
791
792    for (;;) {
793        ret = check_eq(timeout);
794        if (ret)
795            goto out;
796
797        gen_mutex_lock(&list_mutex);
798        qlist_for_each_entry_safe(w, wn, &q_done, list) {
799            if (w->mop.op_id == id) {
800                fill_done(w, &id, size, user_ptr, err);
801                ++n;
802                break;
803            }
804        }
805        gen_mutex_unlock(&list_mutex);
806
807        if (n > 0 || timeout == max_idle_time)
808            break;
809
810        timeout = max_idle_time;
811    }
812
813out:
814    *outcount = n;
815    return ret;
816}
817
818/*
819 * Only used by one BMI test program, not worth the code space to implement.
820 */
821static int bmip_testsome(int num __unused, bmi_op_id_t *ids __unused,
822                         int *outcount __unused, int *other_thing __unused,
823                         bmi_error_code_t *err __unused,
824                         bmi_size_t *size __unused, void **user_ptr __unused,
825                         int max_idle_time __unused,
826                         bmi_context_id context_id __unused)
827{
828    gossip_err("%s: unimplemented\n", __func__);
829    return bmi_errno_to_pvfs(-ENOSYS);
830}
831
832/*
833 * Check the EQ, briefly, then return any unexpecteds, then wait for up
834 * to the idle time.
835 */
836static int bmip_testunexpected(int incount, int *outcount,
837                               struct bmi_method_unexpected_info *ui,
838                               int max_idle_time)
839{
840    struct bmip_work *w, *wn;
841    int ret, n = 0;
842    int timeout = 0;
843
844    for (;;) {
845        ret = check_eq(0);
846        if (ret)
847            goto out;
848
849        qlist_for_each_entry_safe(w, wn, &q_unexpected_done, list) {
850            if (n == incount)
851                break;
852            ui[n].error_code = 0;
853            ui[n].addr = w->mop.addr;
854            ui[n].size = w->actual_len;
855            ui[n].buffer = w->unex_buf;  /* preallocated above */
856            ui[n].tag = w->bmi_tag;
857            qlist_del(&w->list);
858            free(w);
859            ++n;
860        }
861
862        if (n > 0 || timeout == max_idle_time)
863            break;
864
865        timeout = max_idle_time;
866    }
867
868out:
869    *outcount = n;
870    return ret;
871}
872
873
874/*----------------------------------------------------------------------------
875 * Send
876 */
877
878/*
879 * Clients do not open the NIC until they go to connect to a server.
880 * In theory, which server could dictate which NIC.
881 *
882 * Server also calls this to initialize, but with a non-ANY pid.
883 */
884static int ensure_ni_initialized(struct bmip_method_addr *peer __unused,
885                                 ptl_process_id_t my_pid)
886{
887    int ret = 0;
888    ptl_process_id_t no_pid;
889    int nic_type;
890    ptl_md_t zero_mdesc = {
891        .threshold = PTL_MD_THRESH_INF,
892        .max_size = 0,
893        .options = PTL_MD_OP_PUT | PTL_MD_TRUNCATE | PTL_MD_MAX_SIZE |
894                   PTL_MD_EVENT_START_DISABLE,
895        .user_ptr = 0,
896    };
897
898    /* already initialized */
899    if (ni != PTL_INVALID_HANDLE)
900        return ret;
901
902    gen_mutex_lock(&ni_mutex);
903
904    /* check again now that we have the mutex */
905    if (ni != PTL_INVALID_HANDLE)
906        goto out;
907
908    /*
909     * XXX: should do this properly.  If server, we could look
910     * up our listen address and get an interface from that.  If client,
911     * lookup server, figure out how route would go to it, choose
912     * that interface.  Yeah.
913     */
914
915#if defined(__CRAYXT_SERVICE) || defined(__CRAYXT_COMPUTE_LINUX_TARGET)
916    /*
917     * Magic for Cray XT service nodes and compute node linux.
918     * Catamount uses default, TCP uses default.
919     */
920    nic_type = CRAY_USER_NAL;
921#else
922    nic_type = PTL_IFACE_DEFAULT;
923#endif
924
925    /* needed for TCP */
926    /* setenv("PTL_IFACE", "eth0", 0); */
927
928    ret = PtlNIInit(nic_type, my_pid.pid, NULL, NULL, &ni);
929#if defined(__LIBCATAMOUNT__) || defined(__CRAYXT_COMPUTE_LINUX_TARGET)
930    if (ret == PTL_IFACE_DUP && ni != PTL_INVALID_HANDLE) {
931        ret = 0;  /* already set up by pre-main on catamount nodes */
932        ni_init_dup = 1;
933    }
934#endif
935    if (ret) {
936        /* error number is bogus here, do not try to decode it */
937        gossip_err("%s: PtlNIInit failed: %s\n", __func__, PtlErrorStr(ret));
938        ni = PTL_INVALID_HANDLE;  /* init call nulls it out */
939        ret = -EIO;
940        goto out;
941    }
942
943    /*
944     * May not be able to assign PID to whatever we want.  Let's see
945     * what runtime has assigned.
946     */
947    {
948    ptl_process_id_t id;
949    ret = PtlGetId(ni, &id);
950    if (ret != 0) {
951        gossip_err("%s: PtlGetId failed: %d\n", __func__, ret);
952        ni = PTL_INVALID_HANDLE;
953        ret = -EIO;
954        goto out;
955    }
956    debug(0, "%s: runtime thinks my id is %d.%d\n", __func__, id.nid, id.pid);
957    }
958
959#if !(defined(__LIBCATAMOUNT__) || defined(__CRAYXT_SERVICE) || defined(__CRAYXT_COMPUTE_LINUX_TARGET))
960    /*
961     * Need an access control entry to allow everybody to talk, else root
962     * cannot talk to random user, e.g.  Not implemented on Cray.
963     */
964#ifdef HAVE_PTLACENTRY_JID
965    ret = PtlACEntry(ni, 0, any_pid, (ptl_uid_t) -1, (ptl_jid_t) -1, ptl_index);
966#else
967    ret = PtlACEntry(ni, 0, any_pid, (ptl_uid_t) -1, ptl_index);
968#endif
969    if (ret) {
970        gossip_err("%s: PtlACEntry: %s\n", __func__, PtlErrorStr(ret));
971        ret = -EIO;
972        goto out;
973    }
974#endif
975
976    /* a single EQ for all messages, with some arbitrary depth */
977    ret = PtlEQAlloc(ni, 100, NULL, &eq);
978    if (ret) {
979        gossip_err("%s: PtlEQAlloc: %s\n", __func__, PtlErrorStr(ret));
980        ret = -EIO;
981        goto out;
982    }
983
984    /* "mark" match entry that denotes the bottom of the prepost entries */
985    no_pid.nid = 0;
986    no_pid.pid = 0;
987    ret = PtlMEAttach(ni, ptl_index, no_pid, 0, 0, PTL_RETAIN, PTL_INS_BEFORE,
988                      &mark_me);
989    if (ret) {
990        gossip_err("%s: PtlMEAttach mark: %s\n", __func__, PtlErrorStr(ret));
991        ret = -EIO;
992        goto out;
993    }
994
995    /* "zero" grabs just the header (of nonprepost, not unexpected), drops the
996     * contents */
997    ret = PtlMEAttach(ni, ptl_index, any_pid, 0,
998                      (0x3fffffffULL << 32) | 0xffffffffULL, PTL_RETAIN,
999                      PTL_INS_AFTER, &zero_me);
1000    if (ret) {
1001        gossip_err("%s: PtlMEAttach zero: %s\n", __func__, PtlErrorStr(ret));
1002        ret = -EIO;
1003        goto out;
1004    }
1005
1006    zero_mdesc.eq_handle = eq;
1007    ret = PtlMDAttach(zero_me, zero_mdesc, PTL_RETAIN, &zero_md);
1008    if (ret) {
1009        gossip_err("%s: PtlMDAttach zero: %s\n", __func__, PtlErrorStr(ret));
1010        ret = -EIO;
1011        goto out;
1012    }
1013
1014    /* now it is time to build this queue, once per NI */
1015    nonprepost_init();
1016
1017out:
1018    if (ret) {
1019        if (mark_me != PTL_INVALID_HANDLE)
1020            PtlMEUnlink(zero_me);
1021        if (mark_me != PTL_INVALID_HANDLE)
1022            PtlMEUnlink(mark_me);
1023        if (eq != PTL_INVALID_HANDLE)
1024            PtlEQFree(eq);
1025        if (ni != PTL_INVALID_HANDLE)
1026            PtlNIFini(ni);
1027    }
1028    gen_mutex_unlock(&ni_mutex);
1029    return ret;
1030}
1031
1032/*
1033 * Fill in bits for BMI, used by caller for later test or cancel.
1034 */
1035static void fill_mop(struct bmip_work *w, bmi_op_id_t *id,
1036                     struct bmi_method_addr *addr, void *user_ptr,
1037                     bmi_context_id context_id)
1038{
1039    id_gen_fast_register(&w->mop.op_id, &w->mop);
1040    w->mop.addr = addr;
1041    w->mop.method_data = w;
1042    w->mop.user_ptr = user_ptr;
1043    w->mop.context_id = context_id;
1044    *id = w->mop.op_id;
1045}
1046
1047/*
1048 * Initialize an mdesc for a get or put, either sq or rq side.
1049 */
1050static void build_mdesc(struct bmip_work *w, ptl_md_t *mdesc, int numbufs,
1051                        void *const *buffers, const bmi_size_t *sizes)
1052{
1053    mdesc->threshold = 1;
1054    mdesc->options = 0;  /* PTL_MD_EVENT_START_DISABLE; */
1055    mdesc->eq_handle = eq;
1056    mdesc->user_ptr = w;
1057
1058    if (numbufs > 1) {
1059        int i;
1060        ptl_md_iovec_t *iov = (void *) &w[1];
1061        for (i=0; i<numbufs; i++) {
1062            iov[i].iov_base = buffers[i];
1063            iov[i].iov_len = sizes[i];
1064        }
1065        mdesc->options |= PTL_MD_IOVEC;
1066        mdesc->start = (void *) &w[1];
1067        mdesc->length = numbufs;
1068    } else {
1069        mdesc->start = *buffers;
1070        mdesc->length = *sizes;
1071    }
1072}
1073
1074/*
1075 * Generic interface for both send and sendunexpected, list and non-list send.
1076 */
1077static int
1078post_send(bmi_op_id_t *id, struct bmi_method_addr *addr,
1079          int numbufs, const void *const *buffers, const bmi_size_t *sizes,
1080          bmi_size_t total_size, bmi_msg_tag_t bmi_tag, void *user_ptr,
1081          bmi_context_id context_id, int is_unexpected)
1082{
1083    struct bmip_method_addr *pma = addr->method_data;
1084    struct bmip_work *sq;
1085    uint64_t mb;
1086    int ret;
1087    ptl_md_t mdesc;
1088
1089    /* unexpected messages must fit inside the agreed limit */
1090    if (is_unexpected && total_size > UNEXPECTED_MESSAGE_SIZE)
1091        return -EINVAL;
1092
1093    ret = ensure_ni_initialized(pma, any_pid);
1094    if (ret)
1095        goto out;
1096
1097    sq = malloc(sizeof(*sq) + numbufs * sizeof(ptl_md_iovec_t));
1098    if (!sq) {
1099        ret = -ENOMEM;
1100        goto out;
1101    }
1102    sq->type = BMI_SEND;
1103    sq->saw_send_end_and_ack = 0;
1104    sq->tot_len = total_size;
1105    sq->is_unexpected = is_unexpected;
1106    fill_mop(sq, id, addr, user_ptr, context_id);
1107    /* lose cast to fit in non-const portals iovec */
1108    build_mdesc(sq, &mdesc, numbufs, (void *const *)(uintptr_t) buffers, sizes);
1109    mdesc.threshold = 2;  /* put, ack */
1110
1111    sq->state = SQ_WAITING_ACK;
1112    gen_mutex_lock(&list_mutex);
1113    qlist_add_tail(&sq->list, &q_send_waiting_ack);
1114    gen_mutex_unlock(&list_mutex);
1115
1116    /* if not unexpected, use an ME in case he has to come get it */
1117    if (sq->is_unexpected) {
1118
1119        debug(2, "%s: sq %p len %lld peer %s tag %d unexpected\n", __func__, sq,
1120              lld(total_size), pma->peername, bmi_tag);
1121        /* md without any match entry, for sending */
1122        mb = match_bits_unexpected | bmi_tag;
1123        ret = PtlMDBind(ni, mdesc, PTL_UNLINK, &sq->md);
1124        if (ret) {
1125            gossip_err("%s: PtlMDBind: %s\n", __func__, PtlErrorStr(ret));
1126            return -EIO;
1127        }
1128        debug(2, "%s: bound md %d\n", __func__, sq->md);
1129    } else {
1130        /* seqno increments on every expected send (only) */
1131        if (++pma->seqno_out >= match_bits_seqno_max)
1132            pma->seqno_out = 0;
1133        mb = mb_from_tag_and_seqno(bmi_tag, pma->seqno_out);
1134
1135        debug(2, "%s: sq %p len %lld peer %s tag %d seqno %u mb 0x%llx\n",
1136              __func__, sq, lld(total_size), pma->peername, bmi_tag,
1137              pma->seqno_out, llu(mb));
1138
1139        /* long-send bit only on the ME, not as the outgoing mb in PtlPut */
1140        ret = PtlMEInsert(mark_me, pma->pid, match_bits_long_send | mb,
1141                          0, PTL_UNLINK, PTL_INS_BEFORE, &sq->me);
1142        if (ret) {
1143            gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
1144            return -EIO;
1145        }
1146
1147        /*
1148         * Truncate is used to send just what we have, which may be less
1149         * than he wants to receive.  Otherwise would have to chop down the
1150         * iovec list to fit on the Get-ter, or use GetRegion.
1151         */
1152        mdesc.options |= PTL_MD_OP_GET | PTL_MD_TRUNCATE;
1153        mdesc.threshold = 3;  /* put, ack, maybe get */
1154        ret = PtlMDAttach(sq->me, mdesc, PTL_UNLINK, &sq->md);
1155        if (ret) {
1156            gossip_err("%s: PtlMDBind: %s\n", __func__, PtlErrorStr(ret));
1157            return -EIO;
1158        }
1159    }
1160
1161    sq->bmi_tag = bmi_tag;  /* both for debugging dumps */
1162    sq->match_bits = mb;
1163
1164    ret = PtlPut(sq->md, PTL_ACK_REQ, pma->pid, ptl_index, 0, mb, 0, 0);
1165    if (ret) {
1166        gossip_err("%s: PtlPut: %s\n", __func__, PtlErrorStr(ret));
1167        return -EIO;
1168    }
1169
1170out:
1171    return ret;
1172}
1173
1174static int bmip_post_send(bmi_op_id_t *id, struct bmi_method_addr *remote_map,
1175                          const void *buffer, bmi_size_t total_size,
1176                          enum bmi_buffer_type buffer_flag __unused,
1177                          bmi_msg_tag_t tag, void *user_ptr,
1178                          bmi_context_id context_id)
1179{
1180    return post_send(id, remote_map, 0, &buffer, &total_size,
1181                     total_size, tag, user_ptr, context_id, 0);
1182}
1183
1184static int bmip_post_send_list(bmi_op_id_t *id, struct bmi_method_addr *remote_map,
1185                               const void *const *buffers,
1186                               const bmi_size_t *sizes, int list_count,
1187                               bmi_size_t total_size,
1188                               enum bmi_buffer_type buffer_flag __unused,
1189                               bmi_msg_tag_t tag, void *user_ptr,
1190                               bmi_context_id context_id)
1191{
1192    return post_send(id, remote_map, list_count, buffers, sizes,
1193                     total_size, tag, user_ptr, context_id, 0);
1194}
1195
1196static int bmip_post_sendunexpected(bmi_op_id_t *id,
1197                                    struct bmi_method_addr *remote_map,
1198                                    const void *buffer, bmi_size_t total_size,
1199                                    enum bmi_buffer_type bflag __unused,
1200                                    bmi_msg_tag_t tag, void *user_ptr,
1201                                    bmi_context_id context_id)
1202{
1203    return post_send(id, remote_map, 0, &buffer, &total_size,
1204                     total_size, tag, user_ptr, context_id, 1);
1205}
1206
1207static int bmip_post_sendunexpected_list(bmi_op_id_t *id,
1208                                         struct bmi_method_addr *remote_map,
1209                                         const void *const *buffers,
1210                                         const bmi_size_t *sizes,
1211                                         int list_count, bmi_size_t total_size,
1212                                         enum bmi_buffer_type bflag __unused,
1213                                         bmi_msg_tag_t tag, void *user_ptr,
1214                                         bmi_context_id context_id)
1215{
1216    return post_send(id, remote_map, list_count, buffers, sizes,
1217                     total_size, tag, user_ptr, context_id, 1);
1218}
1219
1220
1221/*----------------------------------------------------------------------------
1222 * Receive
1223 */
1224
1225/*
1226 * Assumes that buf/len will fit in numbufs/buffers/sizes.  For use
1227 * in copying out of non-preposted buffer area to user-supplied iovec.
1228 */
1229static void memcpy_buflist(int numbufs __unused, void *const *buffers,
1230                           const bmi_size_t *sizes, const void *vbuf,
1231                           bmi_size_t len)
1232{
1233    bmi_size_t thislen;
1234    const char *buf = vbuf;
1235    int i = 0;
1236
1237    while (len) {
1238        thislen = len;
1239        if (thislen > sizes[i])
1240            thislen = sizes[i];
1241        memcpy(buffers[i], buf, thislen);
1242        buf += thislen;
1243        len -= thislen;
1244        ++i;
1245    }
1246}
1247
1248/*
1249 * Part of post_recv.  Search the queue of non-preposted receives that the
1250 * other side has sent to us.  If one matches, satisfy this receive now,
1251 * else return and let the receive be preposted.
1252 */
1253static int match_nonprepost_recv(bmi_op_id_t *id, struct bmi_method_addr *addr,
1254                                 int numbufs, void *const *buffers,
1255                                 const bmi_size_t *sizes,
1256                                 bmi_size_t total_size, bmi_msg_tag_t tag,
1257                                 void *user_ptr, bmi_context_id context_id)
1258{
1259    struct bmip_method_addr *pma = addr->method_data;
1260    int found = 0;
1261    int ret = 0;
1262    ptl_md_t mdesc;
1263    struct bmip_work *rq;
1264    uint64_t mb;
1265
1266    /* expected match bits */
1267    mb = mb_from_tag_and_seqno(tag, pma->seqno_in);
1268
1269    /* XXX: remove bmi_tag comparison if match_bits works */
1270    gen_mutex_lock(&list_mutex);
1271    qlist_for_each_entry(rq, &q_recv_nonprepost, list) {
1272        debug(2, "%s: compare rq %p addr %p =? %p tag %u =? %u mb 0x%llx =? 0x%llx\n", __func__,
1273              rq, rq->mop.addr, addr, rq->bmi_tag, tag, llu(rq->match_bits),
1274              llu(mb));
1275        if (rq->mop.addr == addr && rq->bmi_tag == tag && rq->match_bits == mb) {
1276            found = 1;
1277            qlist_del(&rq->list);
1278            break;
1279        }
1280    }
1281    gen_mutex_unlock(&list_mutex);
1282    if (!found)
1283        goto out;   /* not found, proceed with prepost */
1284
1285    /* rq matched, use the addr found at event time */
1286    fill_mop(rq, id, rq->mop.addr, user_ptr, context_id);
1287
1288    /* verify length fits */
1289    if (rq->actual_len > total_size) {
1290        gossip_err("%s: send len %llu longer than posted %lld\n", __func__,
1291                   llu(rq->actual_len), lld(total_size));
1292        rq->state = RQ_LEN_ERROR;
1293        goto foundout;
1294    }
1295
1296    /* short message, copy and release MD buf */
1297    if (rq->nonpp_buf != NULL) {
1298        memcpy_buflist(numbufs, buffers, sizes, rq->nonpp_buf,
1299                       rq->actual_len);
1300        --nonprepost_refcnt[rq->nonpp_md];
1301        if (nonprepost_refcnt[rq->nonpp_md] == 0 &&
1302           !nonprepost_is_posted[rq->nonpp_md]) {
1303            nonprepost_repost(rq->nonpp_md);
1304        }
1305        rq->state = RQ_WAITING_USER_TEST;
1306        debug(2, "%s: found short message rq %p, copied\n", __func__, rq);
1307        goto foundout;
1308    }
1309
1310    /* initiate a get for the truncated message */
1311    if (numbufs > 1) {
1312        /* need room for the iovec somewhere, might as well be here */
1313        struct bmip_work *rq2 = malloc(sizeof(*rq) +
1314                                       numbufs * sizeof(ptl_md_iovec_t));
1315        if (!rq2) {
1316            ret = -ENOMEM;
1317            goto out;
1318        }
1319        memcpy(rq2, rq, sizeof(*rq));
1320        free(rq);
1321        rq = rq2;
1322    }
1323
1324    rq->tot_len = total_size;
1325    build_mdesc(rq, &mdesc, numbufs, buffers, sizes);
1326    mdesc.threshold = 2;  /* XXX: on Cray only, this must be 2, not 1 */
1327
1328    ret = PtlMDBind(ni, mdesc, PTL_UNLINK, &rq->md);
1329    if (ret) {
1330        gossip_err("%s: PtlMDBind: %s\n", __func__, PtlErrorStr(ret));
1331        ret = -EIO;
1332        free(rq);
1333        goto out;
1334    }
1335
1336    mb |= match_bits_long_send;
1337    debug(2, "%s: rq %p doing get mb 0x%llx\n", __func__, rq, llu(mb));
1338    ret = PtlGet(rq->md, pma->pid, ptl_index, 0, mb, 0);
1339    if (ret) {
1340        gossip_err("%s: PtlGet: %s\n", __func__, PtlErrorStr(ret));
1341        ret = -EIO;
1342        free(rq);
1343        goto out;
1344    }
1345
1346    rq->state = RQ_WAITING_GET;
1347    gen_mutex_lock(&list_mutex);
1348    qlist_add_tail(&rq->list, &q_recv_waiting_get);
1349    gen_mutex_unlock(&list_mutex);
1350    ret = 1;
1351    goto out;
1352
1353foundout:
1354    gen_mutex_lock(&list_mutex);
1355    qlist_add_tail(&rq->list, &q_done);
1356    gen_mutex_unlock(&list_mutex);
1357    ret = 1;  /* we handled it */
1358
1359out:
1360    return ret;
1361}
1362
1363static int post_recv(bmi_op_id_t *id, struct bmi_method_addr *addr,
1364                     int numbufs, void *const *buffers, const bmi_size_t *sizes,
1365                     bmi_size_t total_size, bmi_msg_tag_t tag,
1366                     void *user_ptr, bmi_context_id context_id)
1367{
1368    struct bmip_method_addr *pma = addr->method_data;
1369    struct bmip_work *rq = NULL;
1370    ptl_md_t mdesc;
1371    int ret, ms = 0;
1372    uint64_t mb = 0;
1373
1374    ret = ensure_ni_initialized(pma, any_pid);
1375    if (ret)
1376        goto out;
1377
1378    /* increment the expected seqno of the message he will send us */
1379    if (++pma->seqno_in >= match_bits_seqno_max)
1380        pma->seqno_in = 0;
1381
1382    debug(2, "%s: len %lld peer %s tag %d seqno %u\n", __func__,
1383          lld(total_size), pma->peername, tag, pma->seqno_in);
1384
1385    rq = NULL;
1386    gen_mutex_lock(&eq_mutex);  /* do not let test threads manipulate eq */
1387restart:
1388    /* drain the EQ */
1389    debug(2, "%s: check eq\n", __func__);
1390    __check_eq(ms);
1391
1392    /* first check the nonpreposted receive queue */
1393    debug(2, "%s: match nonprepost?\n", __func__);
1394    ret = match_nonprepost_recv(id, addr, numbufs, buffers, sizes,
1395                                total_size, tag, user_ptr, context_id);
1396
1397    if (ret != 0) {
1398        if (ret > 0)  /* handled it via the nonprepost queue */
1399            ret = 0;
1400        goto out;  /* or error */
1401    }
1402
1403    /* multiple trips through this loop caused by many MD_NO_UPDATEs, just
1404     * save the built-up rq and me/md from the first time through */
1405    if (!rq) {
1406        rq = malloc(sizeof(*rq) + numbufs * sizeof(ptl_md_iovec_t));
1407        if (!rq) {
1408            ret = -ENOMEM;
1409            goto out;
1410        }
1411        rq->type = BMI_RECV;
1412        rq->tot_len = total_size;
1413        rq->actual_len = 0;
1414        rq->bmi_tag = tag;
1415        fill_mop(rq, id, addr, user_ptr, context_id);
1416        memset(&mdesc, 0, sizeof(mdesc));
1417        build_mdesc(rq, &mdesc, numbufs, buffers, sizes);
1418        mdesc.threshold = 0;  /* initially inactive */
1419        mdesc.options |= PTL_MD_OP_PUT;
1420
1421        /* put at the end of the preposted list, just before the first
1422         * nonprepost or unex ME. */
1423        rq->me = PTL_INVALID_HANDLE;
1424        debug(2, "%s: me insert\n", __func__);
1425        mb = mb_from_tag_and_seqno(tag, pma->seqno_in);
1426        rq->match_bits = mb;
1427        ret = PtlMEInsert(mark_me, pma->pid, mb, 0, PTL_UNLINK,
1428                          PTL_INS_BEFORE, &rq->me);
1429        if (ret) {
1430            gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
1431            ret = -EIO;
1432            goto out;
1433        }
1434
1435        debug(2, "%s: md attach\n", __func__);
1436        ret = PtlMDAttach(rq->me, mdesc, PTL_UNLINK, &rq->md);
1437        if (ret) {
1438            gossip_err("%s: PtlMDAttach: %s\n", __func__, PtlErrorStr(ret));
1439            ret = -EIO;
1440            goto out;
1441        }
1442        debug(2, "%s: me %d, md %d\n", __func__, rq->me, rq->md);
1443    }
1444
1445    /* now update it atomically with respect to the event stream from the NIC */
1446    mdesc.threshold = 1;
1447    debug(2, "%s: md update threshold to 1\n", __func__);
1448    ret = PtlMDUpdate(rq->md, NULL, &mdesc, eq);
1449    if (ret) {
1450        if (ret == PTL_MD_NO_UPDATE) {
1451            /* cannot block, other thread may have processed the event for us */
1452            debug(2, "%s: md update: no update\n", __func__);
1453            ms = PTL_TIME_FOREVER;
1454            goto restart;
1455        }
1456        gossip_err("%s: PtlMDUpdate: %s\n", __func__, PtlErrorStr(ret));
1457        ret = -EIO;
1458        goto out;
1459    }
1460
1461#ifdef DEBUG_CNL_ODDITIES
1462    {
1463    debug(2, "insert another\n");
1464    ret = PtlMEInsert(mark_me, pma->pid, 0, -1ULL, PTL_UNLINK,
1465                      PTL_INS_BEFORE, &rq->tme);
1466    if (ret) {
1467        gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
1468        ret = -EIO;
1469        goto out;
1470    }
1471
1472    debug(2, "%s: md attach\n", __func__);
1473    mdesc.user_ptr = (void *) ((uintptr_t) mdesc.user_ptr + 1);
1474    ret = PtlMDAttach(rq->tme, mdesc, PTL_UNLINK, &rq->tmd);
1475    if (ret) {
1476        gossip_err("%s: PtlMDAttach: %s\n", __func__, PtlErrorStr(ret));
1477        ret = -EIO;
1478        goto out;
1479    }
1480    debug(2, "%s: me %d, md %d\n", __func__, rq->tme, rq->tmd);
1481    }
1482#endif
1483
1484
1485    debug(2, "%s: rq %p waiting incoming, len %lld peer %s tag %d seqno %u mb 0x%llx\n",
1486          __func__, rq, lld(total_size), pma->peername, tag, pma->seqno_in,
1487          llu(mb));
1488    rq->state = RQ_WAITING_INCOMING;
1489    gen_mutex_lock(&list_mutex);
1490    qlist_add_tail(&rq->list, &q_recv_waiting_incoming);
1491    gen_mutex_unlock(&list_mutex);
1492    rq = NULL;  /* happy rq, keep it */
1493
1494out:
1495    gen_mutex_unlock(&eq_mutex);
1496    if (rq) {
1497        /*
1498         * Alloced, then found MD_NO_UPDATE, and it had completed on the
1499         * unexpected.  Free this temp rq.  (Or error case too.)
1500         */
1501        if (rq->me != PTL_INVALID_HANDLE)
1502            PtlMEUnlink(rq->me);
1503        free(rq);
1504    }
1505    return ret;
1506}
1507
1508static int bmip_post_recv(bmi_op_id_t *id, struct bmi_method_addr *remote_map,
1509                          void *buffer, bmi_size_t expected_len,
1510                          bmi_size_t *actual_len __unused,
1511                          enum bmi_buffer_type buffer_flag __unused,
1512                          bmi_msg_tag_t tag, void *user_ptr,
1513                          bmi_context_id context_id)
1514{
1515    return post_recv(id, remote_map, 0, &buffer, &expected_len,
1516                     expected_len, tag, user_ptr, context_id);
1517}
1518
1519static int bmip_post_recv_list(bmi_op_id_t *id, struct bmi_method_addr *remote_map,
1520                               void *const *buffers, const bmi_size_t *sizes,
1521                               int list_count, bmi_size_t tot_expected_len,
1522                               bmi_size_t *tot_actual_len __unused,
1523                               enum bmi_buffer_type buffer_flag __unused,
1524                               bmi_msg_tag_t tag, void *user_ptr,
1525                               bmi_context_id context_id)
1526{
1527    return post_recv(id, remote_map, list_count, buffers, sizes,
1528                     tot_expected_len, tag, user_ptr, context_id);
1529}
1530
1531/* debugging */
1532#define show_queue(q) do { \
1533    fprintf(stderr, #q "\n"); \
1534    qlist_for_each_entry(w, &q, list) { \
1535        fprintf(stderr, "%s %p state %s len %llu tag 0x%llx mb 0x%0llx\n", \
1536                w->type == BMI_SEND ? "sq" : "rq", \
1537                w, state_name(w->state), \
1538                w->type == BMI_SEND ? llu(w->tot_len) : llu(w->actual_len), \
1539                llu(w->bmi_tag), llu(w->match_bits)); \
1540    } \
1541} while (0)
1542
1543static void dump_queues(int sig __unused)
1544{
1545    struct bmip_work *w;
1546
1547    /* debugging */
1548    show_queue(q_send_waiting_ack);
1549    show_queue(q_send_waiting_get);
1550    show_queue(q_recv_waiting_incoming);
1551    show_queue(q_recv_waiting_get);
1552    show_queue(q_recv_nonprepost);
1553    show_queue(q_unexpected_done);
1554    show_queue(q_done);
1555}
1556
1557/*
1558 * Cancel.  Grab the eq lock to keep things from finishing as we are
1559 * freeing them.  Hopefully this won't lead to core dumps in future
1560 * test operations.
1561 */
1562static int bmip_cancel(bmi_op_id_t id, bmi_context_id context_id __unused)
1563{
1564    int ret;
1565    struct method_op *mop;
1566    struct bmip_work *w;
1567
1568    gen_mutex_lock(&eq_mutex);
1569    __check_eq(0);
1570    mop = id_gen_fast_lookup(id);
1571    w = mop->method_data;
1572    fprintf(stderr, "%s: cancel %p state %s len %llu tag 0x%llx mb 0x%llx\n",
1573            __func__, w, state_name(w->state), llu(w->tot_len),
1574            llu(w->bmi_tag), llu(w->match_bits));
1575    switch (w->state) {
1576
1577    case SQ_WAITING_ACK:
1578        w->state = SQ_CANCELLED;
1579        goto link_done;
1580
1581    case SQ_WAITING_GET:
1582        w->state = SQ_CANCELLED;
1583        ret = PtlMEUnlink(w->me);
1584        if (ret)
1585            gossip_err("%s: PtlMEUnlink: %s\n", __func__, PtlErrorStr(ret));
1586        goto link_done;
1587
1588    case RQ_WAITING_INCOMING:
1589        w->state = RQ_CANCELLED;
1590        ret = PtlMEUnlink(w->me);
1591        if (ret)
1592            gossip_err("%s: PtlMEUnlink: %s\n", __func__, PtlErrorStr(ret));
1593        goto link_done;
1594
1595    case RQ_WAITING_GET:
1596        w->state = RQ_CANCELLED;
1597        ret = PtlMDUnlink(w->md);
1598        if (ret)
1599            /* complain, but might be okay if we raced with the completion */
1600            gossip_err("%s: PtlMDUnlink: %s\n", __func__, PtlErrorStr(ret));
1601        goto link_done;
1602
1603    case SQ_WAITING_USER_TEST:
1604    case RQ_WAITING_USER_TEST:
1605    case RQ_WAITING_USER_POST:
1606    case RQ_LEN_ERROR:
1607    case SQ_CANCELLED:
1608    case RQ_CANCELLED:
1609        /* nothing to do */
1610        break;
1611    }
1612    goto out;
1613
1614link_done:
1615    gen_mutex_lock(&list_mutex);
1616    qlist_del(&w->list);
1617    qlist_add_tail(&w->list, &q_done);
1618    gen_mutex_unlock(&list_mutex);
1619
1620out:
1621    gen_mutex_unlock(&eq_mutex);
1622
1623    /* debugging */
1624    dump_queues(0);
1625
1626    exit(1);
1627    return 0;
1628}
1629
1630static const char *bmip_rev_lookup(struct bmi_method_addr *addr)
1631{
1632    struct bmip_method_addr *pma = addr->method_data;
1633
1634    return pma->peername;
1635}
1636
1637/*
1638 * Build and fill a Portals-specific method_addr structure.  This routine
1639 * copies the hostname if it needs it.
1640 */
1641static struct bmi_method_addr *bmip_alloc_method_addr(const char *hostname,
1642                                                  ptl_process_id_t pid,
1643                                                  int register_with_bmi)
1644{
1645    struct bmi_method_addr *map;
1646    struct bmip_method_addr *pma;
1647    bmi_size_t extra;
1648    int ret;
1649
1650    /*
1651     * First search to see if this one already exists.
1652     */
1653    gen_mutex_lock(&pma_mutex);
1654    qlist_for_each_entry(pma, &pma_list, list) {
1655        if (pma->pid.pid == pid.pid && pma->pid.nid == pid.nid) {
1656            /* relies on alloc_method_addr() working like it does */
1657            map = &((struct bmi_method_addr *) pma)[-1];
1658            debug(2, "%s: found matching peer %s\n", __func__, pma->peername);
1659            goto out;
1660        }
1661    }
1662
1663    /* room for a peername tacked on the end too, no more than 10 digits */
1664    extra = sizeof(*pma) + 2 * (strlen(hostname) + 1) + 10 + 1;
1665
1666    map = bmi_alloc_method_addr(bmi_portals_method_id, extra);
1667    pma = map->method_data;
1668
1669    pma->hostname = (void *) &pma[1];
1670    pma->peername = pma->hostname + strlen(hostname) + 1;
1671    strcpy(pma->hostname, hostname);
1672    /* for debug/error messages via BMI_addr_rev_lookup */
1673    sprintf(pma->peername, "%s:%d", hostname, pid.pid);
1674
1675    pma->pid = pid;
1676    pma->seqno_in = 0;
1677    pma->seqno_out = 0;
1678    qlist_add(&pma->list, &pma_list);
1679
1680    if (register_with_bmi) {
1681        ret = bmi_method_addr_reg_callback(map);
1682        if (!ret) {
1683            gossip_err("%s: bmi_method_addr_reg_callback failed\n", __func__);
1684            free(map);
1685            map = NULL;
1686        }
1687    }
1688    debug(2, "%s: new peer %s\n", __func__, pma->peername);
1689
1690out:
1691    gen_mutex_unlock(&pma_mutex);
1692    return map;
1693}
1694
1695
1696#if !(defined(__LIBCATAMOUNT__) || defined(__CRAYXT_COMPUTE_LINUX_TARGET) || defined(__CRAYXT_SERVICE))
/*
 * Clients give hostnames.  Convert these to Portals nids.  This routine
 * specific for Portals-over-IP (tcp or utcp).
 *
 * On success stores the host's first IPv4 address, in host byte order,
 * in *nid and returns 0.  Returns 1 if the name cannot be resolved or
 * does not resolve to 4-byte addresses.
 */
static int bmip_nid_from_hostname(const char *hostname, uint32_t *nid)
{
    struct hostent *he;
    uint32_t addr_net;

    he = gethostbyname(hostname);
    if (!he) {
        gossip_err("%s: gethostbyname cannot resolve %s\n", __func__, hostname);
        return 1;
    }
    if (he->h_length != 4) {
        gossip_err("%s: gethostbyname returns %d-byte addresses, hoped for 4\n",
                   __func__, he->h_length);
        return 1;
    }
    /*
     * h_addr_list entries are plain char buffers, so copy the bytes out
     * instead of reading them through a uint32_t pointer.
     */
    memcpy(&addr_net, he->h_addr_list[0], sizeof(addr_net));
    /* portals wants host byte order, apparently */
    *nid = ntohl(addr_net);
    return 0;
}
1719
1720/*
1721 * This is called from the server, on seeing an unexpected message from
1722 * a client.  Convert that to a method_addr.  If BMI has never seen it
1723 * before, register it with BMI.
1724 *
1725 * Ugh, since Portals is connectionless, we need to search through
1726 * all the struct bmip_method_addr to find the pid that sent
1727 * this to us.  While BMI has a list of all method_addrs somewhere,
1728 * it is not exported to us.  So we have to maintain our own list
1729 * structure.
1730 */
1731static struct bmi_method_addr *addr_from_nidpid(ptl_process_id_t pid)
1732{
1733    struct bmi_method_addr *map;
1734    struct in_addr inaddr;
1735    char *hostname;
1736
1737    /* temporary, for ip 100.200.200.100 */
1738    hostname = malloc(INET_ADDRSTRLEN + 1);
1739    inaddr.s_addr = htonl(pid.nid);  /* ntop expects network format */
1740    inet_ntop(AF_INET, &inaddr, hostname, INET_ADDRSTRLEN);
1741
1742    map = bmip_alloc_method_addr(hostname, pid, 1);
1743    free(hostname);
1744
1745    return map;
1746}
1747
1748#else
1749
1750/*
1751 * Cray versions
1752 */
/*
 * Convert a Cray hostname of the form nid<decimal digits> into a Portals
 * nid.
 *
 * hostname: node name, e.g. nid00042.
 * nid:      receives the parsed number, or 0 if the name is not usable.
 *
 * Returns 0 on success and -1 otherwise.
 */
static int bmip_nid_from_hostname(const char *hostname, uint32_t *nid)
{
    int ret = -1;
    uint32_t v = 0;

    /*
     * There is apparently no API for this on Cray.  Chop up the hostname,
     * knowing the format.  Also can look at /proc/cray_xt/nid on linux
     * nodes.
     */
    if (strncmp(hostname, "nid", 3) == 0) {
        const char *digits = hostname + 3;
        char *end = NULL;
        unsigned long val = 0;
        /*
         * strtoul() on its own accepts an empty string (yielding 0),
         * leading white space and a sign.  Insist on a leading digit so
         * that a bare "nid" or "nid-1" is rejected.
         */
        int ok = (digits[0] >= '0' && digits[0] <= '9');

        if (ok) {
            errno = 0;
            val = strtoul(digits, &end, 10);
            /* whole string consumed, no overflow, and fits in 32 bits */
            ok = (errno == 0 && *end == '\0'
                  && val == (unsigned long) (uint32_t) val);
        }
        if (ok) {
            v = (uint32_t) val;
            ret = 0;
        } else {
            gossip_err("%s: convert nid<num> hostname %s failed\n", __func__,
                       hostname);
        }
    }
    if (ret)
        debug(0, "%s: no clue how to convert hostname %s\n", __func__,
              hostname);
    *nid = v;
    return ret;
}
1780
1781static struct bmi_method_addr *addr_from_nidpid(ptl_process_id_t pid)
1782{
1783    struct bmi_method_addr *map;
1784    char hostname[9];
1785
1786    sprintf(hostname, "nid%05d", pid.nid);
1787    map = bmip_alloc_method_addr(hostname, pid, 1);
1788    return map;
1789}
1790#endif
1791
1792/*
1793 * Break up a method string like:
1794 *   portals://hostname:pid/filesystem
1795 * into its constituent fields, storing them in an opaque
1796 * type, which is then returned.
1797 */
1798static struct bmi_method_addr *bmip_method_addr_lookup(const char *id)
1799{
1800    char *s, *cp, *cq;
1801    int ret;
1802    ptl_process_id_t pid;
1803    struct bmi_method_addr *map = NULL;
1804
1805    /* parse hostname */
1806    s = string_key("portals", id);  /* allocs a string "node27:3334/pvfs2-fs" */
1807    if (!s)
1808        return NULL;
1809    cp = strchr(s, ':');
1810    if (!cp) {
1811        gossip_err("%s: no ':' found\n", __func__);
1812        goto out;
1813    }
1814
1815    /* terminate hostname from s to cp */
1816    *cp = 0;
1817    ++cp;
1818
1819    /* strip /filesystem part */
1820    cq = strchr(cp, '/');
1821    if (cq)
1822        *cq = 0;
1823
1824    /* parse pid part */
1825    pid.pid = strtoul(cp, &cq, 10);
1826    if (cq == cp) {
1827        gossip_err("%s: invalid pid number\n", __func__);
1828        goto out;
1829    }
1830    if (*cq != '\0') {
1831        gossip_err("%s: extra characters after pid number\n", __func__);
1832        goto out;
1833    }
1834
1835    ret = bmip_nid_from_hostname(s, &pid.nid);
1836    if (ret)
1837        goto out;
1838
1839    /*
1840     * Lookup old one or Alloc new one, but don't call
1841     * bmi_method_addr_reg_callback---this is the client side, BMI is asking us
1842     * to look this up and will register it itself.
1843     */
1844    map = bmip_alloc_method_addr(s, pid, 0);
1845
1846out:
1847    free(s);
1848    return map;
1849}
1850
1851/*
1852 * "Listen" on a particular portal, specified in the address in pvfs2tab.
1853 */
1854static int unexpected_init(struct bmi_method_addr *listen_addr)
1855{
1856    struct bmip_method_addr *pma = listen_addr->method_data;
1857    int i, ret;
1858
1859    unexpected_buf = malloc(UNEXPECTED_QUEUE_SIZE);
1860    if (!unexpected_buf) {
1861        ret = -ENOMEM;
1862        goto out;
1863    }
1864
1865    ret = ensure_ni_initialized(pma, pma->pid);
1866    if (ret)
1867        goto out;
1868
1869    /*
1870     * We use two, half-size, MDs.  When one fills up, it is unlinked and we
1871     * find out about it via an event.  The second one is used to give us time
1872     * to repost the first.  Sort of a circular buffer structure.  This is
1873     * hopefully better than wasting a full 8k for every small control message.
1874     */
1875    unexpected_need_repost_sum = 0;
1876    for (i=0; i<UNEXPECTED_NUM_MD; i++) {
1877        unexpected_is_posted[i] = 0;
1878        unexpected_need_repost[i] = 0;
1879        unexpected_repost(i);
1880    }
1881
1882out:
1883    return ret;
1884}
1885
1886static void unexpected_repost(int which)
1887{
1888    int ret;
1889    ptl_md_t mdesc;
1890
1891    /* unlink used-up one */
1892    if (unexpected_is_posted[which]) {
1893        debug(1, "%s: trying unpost %d\n", __func__, which);
1894        ret = PtlMEUnlink(unexpected_me[which]);
1895        if (ret) {
1896            gossip_err("%s: PtlMEUnlink %d: %s\n", __func__, which,
1897                       PtlErrorStr(ret));
1898            return;
1899        }
1900        debug(1, "%s: unposted %d\n", __func__, which);
1901        unexpected_need_repost[which] = 0;
1902        unexpected_is_posted[which] = 0;
1903        --unexpected_need_repost_sum;
1904    }
1905
1906    /* unexpected messages are limited by the API to a certain size */
1907    mdesc.start = unexpected_buf + which * (UNEXPECTED_QUEUE_SIZE / 2);
1908    mdesc.length = UNEXPECTED_QUEUE_SIZE / 2;
1909    mdesc.threshold = PTL_MD_THRESH_INF;
1910    mdesc.options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE
1911                  | PTL_MD_MAX_SIZE;
1912    mdesc.max_size = UNEXPECTED_MESSAGE_SIZE;
1913    mdesc.eq_handle = eq;
1914    mdesc.user_ptr = (void *) (uintptr_t) (UNEXPECTED_MD_INDEX_OFFSET + which);
1915
1916    /*
1917     * Take any tag, as long as it has the unexpected bit set, and not
1918     * the long send bit.  Not sure if we need both bits for this.  This always
1919     * goes at the very end of the list, just in front of zero.  The nonpp
1920     * and unex ones can be comingled, as they select different things, but
1921     * they must come after the preposts and before the zero md.
1922     */
1923    ret = PtlMEInsert(zero_me, any_pid, match_bits_unexpected,
1924                      (0x3fffffffULL << 32) | 0xffffffffULL, PTL_UNLINK,
1925                      PTL_INS_BEFORE, &unexpected_me[which]);
1926    if (ret) {
1927        gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
1928        return;
1929    }
1930
1931    /*
1932     * Put data here when it matches.  Do not auto-unlink else a new md
1933     * may get stuck in and cause a false match in unexpected_md_index above.
1934     * Do it all manually.  Also have to make sure these do not get reused
1935     * in case things sitting in the queue haven't been looked at yet.  Maybe
1936     * need to use md.user_ptr, or look at the match bits.
1937     */
1938    ret = PtlMDAttach(unexpected_me[which], mdesc, PTL_RETAIN,
1939                      &unexpected_md[which]);
1940    if (ret)
1941        gossip_err("%s: PtlMDAttach: %s\n", __func__, PtlErrorStr(ret));
1942
1943    unexpected_is_posted[which] = 1;
1944    debug(1, "%s: reposted %d\n", __func__, which);
1945}
1946
1947static int unexpected_fini(void)
1948{
1949    int i, ret;
1950
1951    for (i=0; i<UNEXPECTED_NUM_MD; i++) {
1952        /* MDs go away when MEs unlinked */
1953        ret = PtlMEUnlink(unexpected_me[i]);
1954        if (ret) {
1955            gossip_err("%s: PtlMEUnlink %d: %s\n", __func__, i,
1956                       PtlErrorStr(ret));
1957            return ret;
1958        }
1959    }
1960    free(unexpected_buf);
1961    return 0;
1962}
1963
1964/*
1965 * Manage nonprepost buffers.
1966 */
1967static int nonprepost_init(void)
1968{
1969    int i, ret;
1970
1971    nonprepost_buf = malloc(NONPREPOST_QUEUE_SIZE);
1972    if (!nonprepost_buf) {
1973        ret = -ENOMEM;
1974        goto out;
1975    }
1976
1977    /*
1978     * See comments above for unexpected.
1979     */
1980    for (i=0; i<NONPREPOST_NUM_MD; i++) {
1981        nonprepost_is_posted[i] = 0;
1982        nonprepost_refcnt[i] = 0;
1983        nonprepost_repost(i);
1984    }
1985
1986out:
1987    return ret;
1988}
1989
1990static void nonprepost_repost(int which)
1991{
1992    int ret;
1993    ptl_md_t mdesc;
1994    static int count = 0;
1995
1996    debug(0, "%s: WHICH %d\n", __func__, which);
1997    ++count;
1998    if (count > 2)
1999        exit(0);
2000
2001    /* unlink used-up one */
2002    if (unexpected_is_posted[which]) {
2003        debug(1, "%s: trying unpost %d\n", __func__, which);
2004        ret = PtlMEUnlink(unexpected_me[which]);
2005        if (ret) {
2006            gossip_err("%s: PtlMEUnlink %d: %s\n", __func__, which,
2007                       PtlErrorStr(ret));
2008            return;
2009        }
2010        debug(1, "%s: unposted %d\n", __func__, which);
2011        unexpected_need_repost[which] = 0;
2012        unexpected_is_posted[which] = 0;
2013        --unexpected_need_repost_sum;
2014    }
2015
2016    /* only short messages that fit max_size go in here */
2017    mdesc.start = nonprepost_buf + which * (NONPREPOST_QUEUE_SIZE / 2);
2018    mdesc.length = NONPREPOST_QUEUE_SIZE / 2;
2019    mdesc.threshold = PTL_MD_THRESH_INF;
2020    mdesc.options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE
2021                  | PTL_MD_MAX_SIZE;
2022    mdesc.max_size = NONPREPOST_MESSAGE_SIZE;
2023    mdesc.eq_handle = eq;
2024    mdesc.user_ptr = (void *) (uintptr_t) (NONPREPOST_MD_INDEX_OFFSET + which);
2025
2026    /* XXX: maybe need manual unlink like for unexpecteds on CNL */
2027
2028    /* also at the very end of the list */
2029    /* match anything as long as top two bits are zero */
2030    ret = PtlMEInsert(zero_me, any_pid, 0,
2031                      (0x3fffffffULL << 32) | 0xffffffffULL,
2032                      PTL_UNLINK, PTL_INS_BEFORE, &nonprepost_me[which]);
2033    if (ret) {
2034        gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
2035        return;
2036    }
2037
2038    /* put data here when it matches; when full, unlink it */
2039    ret = PtlMDAttach(nonprepost_me[which], mdesc, PTL_UNLINK,
2040                      &nonprepost_md[which]);
2041    if (ret) {
2042        gossip_err("%s: PtlMDAttach: %s\n", __func__, PtlErrorStr(ret));
2043        return;
2044    }
2045
2046    nonprepost_is_posted[which] = 1;
2047}
2048
2049static int nonprepost_fini(void)
2050{
2051    int i, ret;
2052
2053    for (i=0; i<NONPREPOST_NUM_MD; i++) {
2054        if (nonprepost_refcnt[i] != 0)
2055            gossip_err("%s: refcnt %d, should be zero\n", __func__,
2056                       nonprepost_refcnt[i]);
2057        if (!nonprepost_is_posted[i])
2058            continue;
2059        /* MDs go away when MEs unlinked */
2060        ret = PtlMEUnlink(nonprepost_me[i]);
2061        if (ret) {
2062            gossip_err("%s: PtlMEUnlink %d: %s\n", __func__, i,
2063                       PtlErrorStr(ret));
2064        }
2065    }
2066    free(nonprepost_buf);
2067    return 0;
2068}
2069
2070
2071static void *bmip_memalloc(bmi_size_t len, enum bmi_op_type send_recv __unused)
2072{
2073    return malloc(len);
2074}
2075
2076static int bmip_memfree(void *buf, bmi_size_t len __unused,
2077                        enum bmi_op_type send_recv __unused)
2078{
2079    free(buf);
2080    return 0;
2081}
2082
/*
 * BMI unexpected_free hook: release a buffer with free().  Always
 * returns 0.
 */
static int bmip_unexpected_free(void *buf)
{
    const int ok = 0;

    free(buf);
    return ok;
}
2088
2089/*
2090 * No need to track these internally.  Just search the entire queue.
2091 */
2092static int bmip_open_context(bmi_context_id context_id __unused)
2093{
2094    return 0;
2095}
2096
2097static void bmip_close_context(bmi_context_id context_id __unused)
2098{
2099}
2100
2101/*
2102 * Callers sometimes want to know odd pieces of information.  Satisfy
2103 * them.
2104 */
2105static int bmip_get_info(int option, void *param)
2106{
2107    int ret = 0;
2108
2109    switch (option) {
2110        case BMI_CHECK_MAXSIZE:
2111            /* reality is 2^31, but shrink to avoid negative int */
2112            *(int *)param = (1UL << 31) - 1;
2113            break;
2114        case BMI_GET_UNEXP_SIZE:
2115            *(int *)param = UNEXPECTED_MESSAGE_SIZE;
2116            break;
2117        default:
2118            ret = -ENOSYS;
2119    }
2120    return ret;
2121}
2122
2123/*
2124 * Used to set some optional parameters and random functions, like ioctl.
2125 */
2126static int bmip_set_info(int option, void *param)
2127{
2128    switch (option) {
2129    case BMI_DROP_ADDR: {
2130        struct bmi_method_addr *addr = param;
2131        struct bmip_method_addr *pma = addr->method_data;
2132        gen_mutex_lock(&pma_mutex);
2133        qlist_del(&pma->list);
2134        gen_mutex_unlock(&pma_mutex);
2135        free(addr);
2136        break;
2137    }
2138    case BMI_OPTIMISTIC_BUFFER_REG:
2139        break;
2140    default:
2141        /* Should return -ENOSYS, but return 0 for caller ease. */
2142        break;
2143    }
2144    return 0;
2145}
2146
2147/*
2148 * This is called with a method_addr initialized by
2149 * bmip_method_addr_lookup.
2150 */
2151static int bmip_initialize(struct bmi_method_addr *listen_addr,
2152                           int method_id, int init_flags)
2153{
2154    int ret = -ENODEV, numint;
2155
2156    debug(0, "%s: init\n", __func__);
2157
2158    gen_mutex_lock(&ni_mutex);
2159
2160    any_pid.nid = PTL_NID_ANY;
2161    any_pid.pid = PTL_PID_ANY;
2162
2163    /* check params */
2164    if (!!listen_addr ^ (init_flags & BMI_INIT_SERVER)) {
2165        gossip_err("%s: server but no listen address\n", __func__);
2166        ret = -EIO;
2167        goto out;
2168    }
2169
2170    bmi_portals_method_id = method_id;
2171
2172    ret = PtlInit(&numint);
2173    if (ret) {
2174        gossip_err("%s: PtlInit failed\n", __func__);
2175        ret = -EIO;
2176        goto out;
2177    }
2178
2179/*
2180 * utcp has shorter names for debug symbols; define catamount to these
2181 * even though it never prints anything.
2182 */
2183#ifndef PTL_DBG_ALL
2184#define  PTL_DBG_ALL PTL_DEBUG_ALL
2185#define  PTL_DBG_NI_ALL PTL_DEBUG_NI_ALL
2186#endif
2187
2188    PtlNIDebug(PTL_INVALID_HANDLE, PTL_DBG_ALL | PTL_DBG_NI_ALL);
2189    /* PtlNIDebug(PTL_INVALID_HANDLE, PTL_DBG_ALL | 0x001f0000); */
2190    /* PtlNIDebug(PTL_INVALID_HANDLE, PTL_DBG_ALL | 0x00000000); */
2191    /* PtlNIDebug(PTL_INVALID_HANDLE, PTL_DBG_DROP | 0x00000000); */
2192
2193    /* catamount has different debug symbols, but never prints anything */
2194    PtlNIDebug(PTL_INVALID_HANDLE, PTL_DEBUG_ALL | PTL_DEBUG_NI_ALL);
2195    /* PtlNIDebug(PTL_INVALID_HANDLE, PTL_DEBUG_DROP | 0x00000000); */
2196
2197#if defined(__CRAYXT_SERVICE)
2198    /*
2199     * debug
2200     */
2201    signal(SIGUSR1, dump_queues);
2202#endif
2203
2204    /*
2205     * Allocate and build MDs for a queue of unexpected messages from
2206     * all hosts.  Drop lock for coming NI init call.
2207     */
2208    if (init_flags & BMI_INIT_SERVER) {
2209        gen_mutex_unlock(&ni_mutex);
2210        ret = unexpected_init(listen_addr);
2211        if (ret)
2212            PtlFini();
2213        return ret;
2214    }
2215
2216out:
2217    gen_mutex_unlock(&ni_mutex);
2218    return ret;
2219}
2220
2221/*
2222 * Shutdown.
2223 */
2224static int bmip_finalize(void)
2225{
2226    int ret;
2227
2228    /* do not delete pmas, bmi will call DROP on each for us */
2229
2230    /*
2231     * Assuming upper layer has called cancel/test on all the work
2232     * queue items, else we should probably walk the lists and purge those.
2233     */
2234
2235    gen_mutex_lock(&ni_mutex);
2236
2237    if (ni == PTL_INVALID_HANDLE)
2238        goto out;
2239
2240    nonprepost_fini();
2241    if (unexpected_buf)
2242        unexpected_fini();
2243
2244#if 0  /* example code:  stick this somewhere to test if the EQ is freeable */
2245    /* unexpected_fini(); */
2246    nonprepost_fini();
2247    ret = PtlMEUnlink(zero_me);
2248    if (ret)
2249        gossip_err("%s: PtlMEUnlink zero: %s\n", __func__, PtlErrorStr(ret));
2250    ret = PtlEQFree(eq);
2251    if (ret)
2252        gossip_err("%s: PtlEQFree: %s\n", __func__, PtlErrorStr(ret));
2253    printf("eqfree okay\n");
2254    exit(1);
2255#endif
2256
2257    /* destroy connection structures */
2258    ret = PtlMEUnlink(mark_me);
2259    if (ret)
2260        gossip_err("%s: PtlMEUnlink mark: %s\n", __func__, PtlErrorStr(ret));
2261
2262    ret = PtlMEUnlink(zero_me);
2263    if (ret)
2264        gossip_err("%s: PtlMEUnlink zero: %s\n", __func__, PtlErrorStr(ret));
2265
2266    ret = PtlEQFree(eq);
2267    if (ret)
2268        gossip_err("%s: PtlEQFree: %s\n", __func__, PtlErrorStr(ret));
2269
2270    ret = PtlNIFini(ni);
2271    if (ret)
2272        gossip_err("%s: PtlNIFini: %s\n", __func__, PtlErrorStr(ret));
2273
2274    /* do not call this if runtime opened the nic, else oops in sysio */
2275    if (ni_init_dup == 0)
2276        PtlFini();
2277
2278out:
2279    gen_mutex_unlock(&ni_mutex);
2280    return 0;
2281}
2282
2283/*
2284 * All addresses are in the same netmask.
2285 */
2286static int bmip_query_addr_range(struct bmi_method_addr *mop __unused,
2287                                 const char *wildcard __unused,
2288                                 int netmask __unused)
2289{
2290    return 1;
2291}
2292
2293const struct bmi_method_ops bmi_portals_ops =
2294{
2295    .method_name = "bmi_portals",
2296    .flags = 0,
2297    .initialize = bmip_initialize,
2298    .finalize = bmip_finalize,
2299    .set_info = bmip_set_info,
2300    .get_info = bmip_get_info,
2301    .memalloc = bmip_memalloc,
2302    .memfree = bmip_memfree,
2303    .unexpected_free = bmip_unexpected_free,
2304    .post_send = bmip_post_send,
2305    .post_sendunexpected = bmip_post_sendunexpected,
2306    .post_recv = bmip_post_recv,
2307    .test = bmip_test,
2308    .testsome = bmip_testsome,
2309    .testcontext = bmip_testcontext,
2310    .testunexpected = bmip_testunexpected,
2311    .method_addr_lookup = bmip_method_addr_lookup,
2312    .post_send_list = bmip_post_send_list,
2313    .post_recv_list = bmip_post_recv_list,
2314    .post_sendunexpected_list = bmip_post_sendunexpected_list,
2315    .open_context = bmip_open_context,
2316    .close_context = bmip_close_context,
2317    .cancel = bmip_cancel,
2318    .rev_lookup_unexpected = bmip_rev_lookup,
2319    .query_addr_range = bmip_query_addr_range,
2320};
2321
Note: See TracBrowser for help on using the browser.