[Pvfs2-cvs] commit by slang in pvfs2/src/io/bmi/bmi_portals: README
portals.c
CVS commit program
cvs at parl.clemson.edu
Tue Sep 9 16:22:15 EDT 2008
Update of /projects/cvsroot/pvfs2/src/io/bmi/bmi_portals
In directory parlweb1:/tmp/cvs-serv15473/src/io/bmi/bmi_portals
Modified Files:
Tag: directio-branch
README portals.c
Log Message:
reverse merge of changes to HEAD since branch. Includes small file changes.
Index: README
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/io/bmi/bmi_portals/README,v
diff -p -u -r1.1 -r1.1.18.1
--- README 19 Jul 2007 14:54:44 -0000 1.1
+++ README 9 Sep 2008 20:22:15 -0000 1.1.18.1
@@ -92,19 +92,21 @@ sort of user pointer that Portals provid
Match lists
-----------
-Diagram of the various match lists used.
+Diagram of the various match lists used. The two bars in each
+of the match lists separate the 64 bits into: 2 bits, 30 bits, 32 bits,
+in that order.
preposted receives
- match bmi_tag -> preposted buf
+ match 0 | seqno | bmi_tag -> preposted buf
- match bmi_tag -> preposted buf
+ match 0 | seqno | bmi_tag -> preposted buf
...
outgoing sends
- match 2 << 32 | bmi_tag <- preposted buf, respond to get
+ match 1 | seqno | bmi_tag <- preposted buf, respond to get
...
@@ -114,36 +116,41 @@ mark
nonpreopsted receive buffers
- match 0 << 32 | any -> nonprepost buffer1, max size
+ match 0 | any | any -> nonprepost buffer1, max size
- match 0 << 32 | any -> nonprepost buffer2, max size
+ match 0 | any | any -> nonprepost buffer2, max size
unexpected message buffers
- match 1 << 32 | any -> unexpected buffer1, max size
+ match 2 | any | any -> unexpected buffer1, max size
- match 1 << 32 | any -> unexpected buffer2, max size
+ match 2 | any | any -> unexpected buffer2, max size
zero
- match 0 << 32 | any -> no buffer, trunc, max size 0
+ match 0 | any | any -> no buffer, trunc, max size 0
Preposted receives must come first and be in order so that they match
-for expected incoming messages. The order of nonprepost and unexpected
-buffers doesn't matter, so we let them mix up among themselves. The
-mark entry is used to be able to find the point between the prepost
-and other entries, otherwise we'd need lots of code to track that by hand.
+for expected incoming messages. The outgoing posted sends come next, and
+can be mixed up with the receives since they have a unique bit 62 set.
+
+The order of nonprepost and unexpected buffers doesn't matter, so we let them
+mix up among themselves. The mark entry is used to be able to find the point
+between the prepost and other entries, otherwise we'd need lots of code to
+track that by hand.
The nonprepost and unexpected buffers are managed as "circular" lists,
where one is filled up until it is unlinked, then it is reposted after
-the other that has now started to fill up.
+the other that has now started to fill up. They are actually all mixed
+up in the area between mark and zero, as it doesn't matter which order
+they appear in.
Nonpreopst messages are kept in the buffers until the app posts a receive
that matches. If they fill up, later messages fall off the bottom.
Working apps will pre-post their receives before the sender tries to send
to them.
-Unexpected messages are a protocol feature of BMI. A special high-bit
+Unexpected messages are a protocol feature of BMI. A special high bit (63)
indicates this. They are limited in size by the protocol (8k here), and
are always new requests from a client to a server. As they arrive, they
are immediately copied into new buffers that are handed back to the server
@@ -155,6 +162,15 @@ Finally, for nonprepost messages that ar
md at the end that just generates an event on the sender and receiver. The
receiver does a get to read the data from the sender later when the app
finally posts the receive.
+
+Note that BMI will post multiple sends to the same dest with the same
+tag. With this scheme, a race condition can develop where peer A sends tag
+5 and tag 5, then peer B recvs and acks tag 5, then peer B recvs the second
+tag 5 to its zero md, then peer B has an internal post_recv and does a get
+to tag 5. This get hits the _first_ tag 5 on peer A, as peer A has not
+gotten around to processing the ack and unlinking that one. To get around
+this, we add a sequence number that sits in the 30 bits just above the 32
+bits reserved for the tag.
TODO Notes
Index: portals.c
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/io/bmi/bmi_portals/portals.c,v
diff -p -u -r1.8 -r1.8.10.1
--- portals.c 3 Feb 2008 15:36:20 -0000 1.8
+++ portals.c 9 Sep 2008 20:22:15 -0000 1.8.10.1
@@ -8,8 +8,8 @@
#include <string.h>
#include <errno.h>
-#ifdef __LIBCATAMOUNT__
-/* Cray XT3 version */
+#if defined(__LIBCATAMOUNT__) || defined(__CRAYXT_COMPUTE_LINUX_TARGET) || defined(__CRAYXT_SERVICE)
+/* Cray XT3 and XT4 version, both catamount and compute-node-linux */
#define PTL_IFACE_DEFAULT PTL_IFACE_SS
#include <portals/portals3.h>
#include <sys/utsname.h>
@@ -23,6 +23,7 @@
#endif
#include <assert.h>
+#include <sys/signal.h>
#define __PINT_REQPROTO_ENCODE_FUNCS_C /* include definitions */
#include <src/io/bmi/bmi-method-support.h> /* bmi_method_ops */
#include <src/io/bmi/bmi-method-callback.h> /* bmi_method_addr_reg_callback */
@@ -72,7 +73,6 @@ static gen_mutex_t eq_mutex = GEN_MUTEX_
* method_addrs.
*/
static int bmi_portals_method_id;
-static int bmi_portals_nic_type;
/*
* Various static ptls objects. One per instance of the code.
@@ -125,13 +125,32 @@ static const char *PtlEventKindStr(ptl_e
/*
* Match bits. The lower 32 bits always carry the bmi_tag. If this bit
- * in the top is set, it is an unexpected message. The second set is used
- * when posting a _send_, strangely enough. If the send is too long,
+ * in the top is set, it is an unexpected message. The second-highest bit is
+ * used when posting a _send_, strangely enough. If the send is too long,
* and the receiver has not preposted, later the receiver will issue a Get
* to us for the data. That get will use the second set of match bits.
- */
-static const uint64_t match_bits_unexpected = 1ULL << 32;
-static const uint64_t match_bits_long_send = 2ULL << 32;
+ *
+ * The rest of the 30 top bits are used to encode a sequence number per
+ * peer. As BMI can post multiple sends with the same tag, we have to
+ * be careful that if send #2 for a given tag goes to the zero_md, that
+ * when he does the get, he grabs from buffer #2, not buffer #1 because
+ * the sender was too slow in unlinking it.
+ */
+static const uint64_t match_bits_unexpected = 1ULL << 63; /* 8... */
+static const uint64_t match_bits_long_send = 1ULL << 62; /* 4... */
+static const uint32_t match_bits_seqno_max = 1UL << 30;
+static const int match_bits_seqno_shift = 32;
+
+static uint64_t mb_from_tag_and_seqno(uint32_t tag, uint32_t seqno)
+{
+ uint64_t mb;
+
+ mb = seqno;
+ mb <<= match_bits_seqno_shift;
+ mb |= tag;
+ /* caller may set the long send bit too */
+ return mb;
+}
/*
* Buffer for incoming unexpected send messages. Only the server needs
@@ -145,20 +164,33 @@ static const uint64_t match_bits_long_se
#define UNEXPECTED_MESSAGE_SIZE (8 << 10)
#define UNEXPECTED_QUEUE_SIZE (256 << 10)
#define UNEXPECTED_NUM_MD 2
+#define UNEXPECTED_SIZE_PER_MD (UNEXPECTED_QUEUE_SIZE/UNEXPECTED_NUM_MD)
+
+#define UNEXPECTED_MD_INDEX_OFFSET (1)
+#define NONPREPOST_MD_INDEX_OFFSET (UNEXPECTED_NUM_MD + 1)
static char *unexpected_buf = NULL;
/* poor-man's circular buffer */
static ptl_handle_me_t unexpected_me[UNEXPECTED_NUM_MD];
static ptl_handle_md_t unexpected_md[UNEXPECTED_NUM_MD];
+static int unexpected_need_repost[UNEXPECTED_NUM_MD];
+static int unexpected_need_repost_sum;
+static int unexpected_is_posted[UNEXPECTED_NUM_MD];
-static int unexpected_md_index(ptl_handle_md_t md)
+/*
+ * This scheme relies on the zero page being unused, i.e. addresses
+ * from 0 up to 4k or so.
+ */
+static int unexpected_md_index(void *user_ptr)
{
int i;
+ uintptr_t d = (uintptr_t) user_ptr;
- for (i=0; i<UNEXPECTED_NUM_MD; i++)
- if (unexpected_md[i] == md)
- return i;
- return -1;
+ if (d >= UNEXPECTED_MD_INDEX_OFFSET &&
+ d < UNEXPECTED_MD_INDEX_OFFSET + UNEXPECTED_NUM_MD)
+ return d - UNEXPECTED_MD_INDEX_OFFSET;
+ else
+ return -1;
}
/*
@@ -191,14 +223,16 @@ static ptl_handle_md_t nonprepost_md[NON
static int nonprepost_is_posted[NONPREPOST_NUM_MD];
static int nonprepost_refcnt[NONPREPOST_NUM_MD];
-static int nonprepost_md_index(ptl_handle_md_t md)
+static int nonprepost_md_index(void *user_ptr)
{
int i;
+ uintptr_t d = (uintptr_t) user_ptr;
- for (i=0; i<UNEXPECTED_NUM_MD; i++)
- if (nonprepost_md[i] == md)
- return i;
- return -1;
+ if (d >= NONPREPOST_MD_INDEX_OFFSET &&
+ d < NONPREPOST_MD_INDEX_OFFSET + NONPREPOST_NUM_MD)
+ return d - NONPREPOST_MD_INDEX_OFFSET;
+ else
+ return -1;
}
/*
@@ -212,6 +246,8 @@ struct bmip_method_addr {
char *hostname; /* given by user, converted to a nid by us */
char *peername; /* for rev_lookup */
ptl_process_id_t pid; /* this is a struct with u32 nid + u32 pid */
+ uint32_t seqno_out; /* each send has a separate sequence number */
+ uint32_t seqno_in;
};
static QLIST_HEAD(pma_list);
@@ -226,6 +262,7 @@ enum work_state {
RQ_WAITING_INCOMING,
RQ_WAITING_GET,
RQ_WAITING_USER_TEST,
+ RQ_WAITING_USER_POST,
RQ_LEN_ERROR,
RQ_CANCELLED,
};
@@ -247,6 +284,8 @@ static const char *state_name(enum work_
return "RQ_WAITING_GET";
case RQ_WAITING_USER_TEST:
return "RQ_WAITING_USER_TEST";
+ case RQ_WAITING_USER_POST:
+ return "RQ_WAITING_USER_POST";
case RQ_LEN_ERROR:
return "RQ_LEN_ERROR";
case RQ_CANCELLED:
@@ -269,6 +308,7 @@ struct bmip_work {
bmi_size_t actual_len; /* recv: possibly shorter than posted */
bmi_msg_tag_t bmi_tag; /* recv: unexpected or nonpp tag that arrived */
+ uint64_t match_bits; /* recv: full match bits, including seqno */
int is_unexpected; /* send: if user posted this as unexpected */
@@ -276,7 +316,9 @@ struct bmip_work {
/* send: send me for possible get */
ptl_handle_md_t md; /* recv: prepost or get destination, to cancel */
/* send: send md for possible get */
- int me_unlink; /* send: me must be unlinked at test time */
+ ptl_handle_me_t tme;
+ ptl_handle_md_t tmd;
+ int saw_send_end_and_ack; /* send: make sure both states before unlink */
/* non-preposted receive, keep ref to a nonpp static buffer */
const void *nonpp_buf; /* pointer to nonpp buffer in MD */
@@ -314,7 +356,10 @@ static QLIST_HEAD(q_done);
static struct bmi_method_addr *addr_from_nidpid(ptl_process_id_t pid);
static void unexpected_repost(int which);
static int nonprepost_init(void);
+static int nonprepost_fini(void);
+static int unexpected_fini(void);
static void nonprepost_repost(int which);
+static const char *bmip_rev_lookup(struct bmi_method_addr *addr);
/*----------------------------------------------------------------------------
@@ -327,36 +372,71 @@ static void nonprepost_repost(int which)
static int handle_event(ptl_event_t *ev)
{
struct bmip_work *sq, *rq;
- int which;
+ int which, ret;
if (ev->ni_fail_type != 0) {
gossip_err("%s: ni err %d\n", __func__, ev->ni_fail_type);
return -EIO;
}
- debug(2, "%s: event type %s\n", __func__, PtlEventKindStr(ev->type));
+ debug(6, "%s: event type %s\n", __func__, PtlEventKindStr(ev->type));
switch (ev->type) {
case PTL_EVENT_SEND_END:
- /* ignore this state, already on the waiting list */
+ /*
+ * Sometimes this state happens _after_ the ACK. Boggle. Cannot
+ * unlink the sq until this state. Doing it in the ack state may be
+ * too early. But we don't know if it is safe to unlink until the
+ * ack comes back and says if he received it, or if he will do a
+ * Get on the MD. So just mark a flag. It goes to two only if
+ * the ack indicated the other side will not need to do a get.
+ *
+ * Note that an outgoing get request also triggers this. Sigh.
+ */
sq = ev->md.user_ptr;
- debug(2, "%s: sq %p went out\n", __func__, sq);
+ if (sq->type == BMI_RECV) {
+ rq = ev->md.user_ptr;
+ debug(2, "%s, rq %p stat %s get went out\n", __func__, rq,
+ state_name(rq->state));
+ break;
+ }
+ debug(2, "%s: sq %p went out len %llu/%llu mb %llx\n", __func__, sq,
+ ev->mlength, ev->rlength, ev->match_bits);
+ if (!sq->is_unexpected && ++sq->saw_send_end_and_ack == 2) {
+ debug(2, "%s: saw end last, unlinking %p me %d (md %d)\n",
+ __func__, sq, sq->me, sq->md);
+ ret = PtlMEUnlink(sq->me);
+ if (ret)
+ gossip_err("%s: PtlMEUnlink sq %p: %s\n", __func__,
+ sq, PtlErrorStr(ret));
+ }
break;
case PTL_EVENT_ACK:
/* recv an ack from him, advance the state and unlink */
sq = ev->md.user_ptr;
- debug(2, "%s: sq %p ack received\n", __func__, sq);
+ debug(2, "%s: sq %p ack rcvd len %llu/%llu\n",
+ __func__, sq, ev->mlength, ev->rlength);
+
+ /*
+ * the rlength always comes back as 0 on catamount, even if we
+ * sent 51200 bytes
+ */
if (ev->mlength != ev->rlength) {
- assert(ev->mlength == 0);
- debug(2, "%s: truncated, get ready for the get\n", __func__);
+ gossip_err("%s: mlen %llu and rlen %llu do not agree\n", __func__,
+ ev->mlength, ev->rlength);
+ exit(1);
}
if (ev->mlength > 0) {
- /* Would like to unlink here, but "me in use" error happens
- * sometimes. Avoid race by doing it at test time. */
- if (!sq->is_unexpected)
- sq->me_unlink = 1;
+ /* make sure both SEND_END and ACK happened for these */
+ if (!sq->is_unexpected && ++sq->saw_send_end_and_ack == 2) {
+ debug(2, "%s: saw ack last, unlinking %p\n", __func__, sq);
+ ret = PtlMEUnlink(sq->me);
+ if (ret)
+ gossip_err("%s: PtlMEUnlink sq %p: %s\n", __func__,
+ sq, PtlErrorStr(ret));
+ }
sq->state = SQ_WAITING_USER_TEST;
gen_mutex_lock(&list_mutex);
qlist_del(&sq->list);
@@ -375,21 +455,23 @@ static int handle_event(ptl_event_t *ev)
case PTL_EVENT_PUT_END:
/*
* Peer did a send to us. Four cases:
- * 1. expected pre-posted receive, our rq in user_ptr.
- * 2. unexpected message, user_ptr is &unexpected_md[i];
- * 3. non-preposted message, user_ptr is &preposted_md[i];
- * 4. zero md, non-preposted that was too big and truncated
+ * 1. unexpected message, user_ptr is UNEXPECTED_MD_INDEX_OFFSET + i;
+ * 2a. non-preposted message, user_ptr is NONPREPOST_MD_INDEX_OFFSET + i;
+ * 2b. zero md, non-preposted that was too big and truncated
+ * 3. expected pre-posted receive, our rq in user_ptr.
*/
- which = unexpected_md_index(ev->md_handle);
+ which = unexpected_md_index(ev->md.user_ptr);
if (which >= 0) {
/* build new unexpected rq and copy in the data */
- debug(2, "%s: unexpected len %lld put to us\n", __func__,
- lld(ev->mlength));
+ debug(2, "%s: unexpected len %lld put to us, mb %llx\n", __func__,
+ lld(ev->mlength), llu(ev->match_bits));
rq = malloc(sizeof(*rq));
if (!rq) {
gossip_err("%s: alloc unexpected rq\n", __func__);
break;
}
+ if (ev->mlength > UNEXPECTED_MESSAGE_SIZE)
+ exit(1);
/*
* malloc this separately to hand to testunexpected caller; that
@@ -397,7 +479,6 @@ static int handle_event(ptl_event_t *ev)
* easier.
*/
rq->type = BMI_RECV;
- rq->me_unlink = 0;
rq->unex_buf = malloc(ev->mlength);
if (!rq->unex_buf) {
gossip_err("%s: alloc unexpected rq data\n", __func__);
@@ -412,18 +493,39 @@ static int handle_event(ptl_event_t *ev)
gen_mutex_lock(&list_mutex);
qlist_add_tail(&rq->list, &q_unexpected_done);
gen_mutex_unlock(&list_mutex);
+ debug(1, "%s: unexpected %d offset %llu\n", __func__, which,
+ llu(ev->offset));
+ if (UNEXPECTED_SIZE_PER_MD - ev->offset < UNEXPECTED_MESSAGE_SIZE) {
+ debug(1, "%s: reposting unexpected %d\n", __func__, which);
+ if (unexpected_need_repost[which] == 0) {
+ unexpected_need_repost[which] = 1;
+ ++unexpected_need_repost_sum;
+ }
+ }
+ /* try to repost some, if they are free now */
+ if (unexpected_need_repost_sum) {
+ for (which = 0; which < UNEXPECTED_NUM_MD; which++) {
+ if (unexpected_need_repost[which])
+ unexpected_repost(which);
+ }
+ }
break;
}
- which = nonprepost_md_index(ev->md_handle);
+ which = nonprepost_md_index(ev->md.user_ptr);
if (which >= 0 || ev->md_handle == zero_md) {
/* build new nonprepost rq, but just keep pointer to the data, or
* if truncated, build the req but no data to hang onto */
- debug(2, "%s: nonprepost len %llu tag %llu put to us%s\n",
- __func__, llu(ev->rlength),
- llu(ev->match_bits & 0xffffffffULL),
+ debug(1, "%s: nonprepost len %llu/%llu mb %llx%s\n",
+ __func__, llu(ev->mlength), llu(ev->rlength),
+ ev->match_bits,
ev->md_handle == zero_md ? ", truncated" : "");
+ if (which >= 0 && ev->md_handle == zero_md) {
+ gossip_err("%s: which %d but zero md\n", __func__, which);
+ exit(1);
+ }
+
rq = malloc(sizeof(*rq));
if (!rq) {
gossip_err("%s: alloc nonprepost rq\n", __func__);
@@ -431,9 +533,10 @@ static int handle_event(ptl_event_t *ev)
}
rq->type = BMI_RECV;
- rq->me_unlink = 0;
+ rq->state = RQ_WAITING_USER_POST;
rq->actual_len = ev->rlength;
rq->bmi_tag = ev->match_bits & 0xffffffffULL; /* just 32 bits */
+ rq->match_bits = ev->match_bits;
rq->mop.addr = addr_from_nidpid(ev->initiator);
if (ev->md_handle == zero_md) {
rq->nonpp_buf = NULL;
@@ -443,6 +546,9 @@ static int handle_event(ptl_event_t *ev)
/* keep a ref to this md until the recv finishes */
++nonprepost_refcnt[rq->nonpp_md];
}
+ debug(2, "%s: rq %p NEW NONPREPOST mb 0x%llx%s\n", __func__,
+ rq, llu(rq->match_bits),
+ ev->md_handle == zero_md ? ", truncated" : "");
gen_mutex_lock(&list_mutex);
qlist_add_tail(&rq->list, &q_recv_nonprepost);
gen_mutex_unlock(&list_mutex);
@@ -451,21 +557,45 @@ static int handle_event(ptl_event_t *ev)
/* must be something we preposted, with user_ptr is rq */
rq = ev->md.user_ptr;
+#ifdef DEBUG_CNL_ODDITIES
+ if ((uintptr_t) rq & 1) {
+ debug(1, "%s: OFF BY 1 rq %p\n", __func__, rq);
+ rq = (void *) ((uintptr_t) rq - 1);
+ }
+#endif
rq->actual_len = ev->rlength; /* attempted length sent */
rq->state = RQ_WAITING_USER_TEST;
if (rq->actual_len > rq->tot_len)
rq->state = RQ_LEN_ERROR;
- debug(2, "%s: rq %p len %lld tag %d put to us\n", __func__, rq,
- lld(rq->actual_len), rq->bmi_tag);
+ debug(1, "%s: rq %p len %lld tag 0x%llx mb 0x%llx thresh %d put to us\n",
+ __func__, rq, lld(rq->actual_len), llu(rq->bmi_tag),
+ llu(rq->match_bits), ev->md.threshold);
gen_mutex_lock(&list_mutex);
qlist_del(&rq->list);
qlist_add_tail(&rq->list, &q_done);
gen_mutex_unlock(&list_mutex);
+
+#ifdef DEBUG_CNL_ODDITIES
+ /*
+ * At least on linux compute nodes, the me does not auto-unlink
+ * properly, even though the md did get unlinked. It is necessary
+ * to undo the ME too. Note that the MD threshold is not updated
+ * to zero; it still sits at one (or whatever it was originally
+ * set up to be).
+ */
+ /* ret = PtlMDUnlink(rq->md); debug(2, "md unlink %d gives %s\n", rq->md, PtlErrorStr(ret)); */
+ /* ret = PtlMDUnlink(rq->tmd); debug(2, "tmd unlink %d gives %s\n", rq->tmd, PtlErrorStr(ret)); */
+ ret = PtlMEUnlink(rq->me); debug(2, "me unlink %d gives %s\n", rq->me, PtlErrorStr(ret));
+ ret = PtlMEUnlink(rq->tme); debug(2, "tme unlink %d gives %s\n", rq->tme, PtlErrorStr(ret));
+#endif
break;
case PTL_EVENT_GET_END:
- /* our send, turned into a get from the receiver, is now done */
+ /* our send, turned into a get from the receiver, is now done, as
+ * far as we are concerned, as he has gotten it from us */
sq = ev->md.user_ptr;
+ debug(1, "%s: peer got sq %p len %llu/%llu mb %llx\n", __func__, sq,
+ llu(ev->mlength), llu(ev->rlength), ev->match_bits);
sq->state = SQ_WAITING_USER_TEST;
gen_mutex_lock(&list_mutex);
qlist_del(&sq->list);
@@ -474,8 +604,8 @@ static int handle_event(ptl_event_t *ev)
break;
case PTL_EVENT_REPLY_END:
- debug(2, "%s: get completed\n", __func__);
rq = ev->md.user_ptr;
+ debug(2, "%s: get completed, rq %p\n", __func__, rq);
rq->state = RQ_WAITING_USER_TEST;
gen_mutex_lock(&list_mutex);
qlist_del(&rq->list);
@@ -484,19 +614,13 @@ static int handle_event(ptl_event_t *ev)
break;
case PTL_EVENT_UNLINK:
- which = unexpected_md_index(ev->md_handle);
- if (which >= 0) {
- /* me was also unlinked; put both back at the end */
- debug(2, "%s: unlinked unexpected md %d, repost\n", __func__,
- which);
- unexpected_repost(which);
- break;
- }
-
- which = nonprepost_md_index(ev->md_handle);
+ /* XXX: does this ever get called on CNL? Apparently not. */
+ debug(2, "%s: unlink event! user_ptr %p\n", __func__, ev->md.user_ptr);
+ which = nonprepost_md_index(ev->md.user_ptr);
if (which >= 0) {
- debug(2, "%s: unlinked nonprepost md %d, maybe repost\n", __func__,
- which);
+ debug(1, "%s: unlinked nonprepost md %d, is_posted %d refcnt %d\n",
+ __func__, which, nonprepost_is_posted[which],
+ nonprepost_refcnt[which]);
nonprepost_is_posted[which] = 0;
if (nonprepost_refcnt[which] == 0)
/* already satisfied all the recvs, can this happen so fast? */
@@ -504,7 +628,7 @@ static int handle_event(ptl_event_t *ev)
break;
}
- debug(2, "%s: unlinked a send or recv\n", __func__);
+ debug(1, "%s: unlinked a send or recv, nothing to do\n", __func__);
/*
* Expected recv, unlink just cleans it up. Already got the send
@@ -512,6 +636,15 @@ static int handle_event(ptl_event_t *ev)
*/
break;
+ case PTL_EVENT_SEND_START:
+ debug(0, "%s: send start, a debugging message thresh %d\n", __func__,
+ ev->md.threshold);
+ break;
+ case PTL_EVENT_PUT_START:
+ debug(0, "%s: put start, a debugging message, thresh %d\n", __func__,
+ ev->md.threshold);
+ break;
+
default:
gossip_err("%s: unknown event %s\n", __func__,
PtlEventKindStr(ev->type));
@@ -547,14 +680,14 @@ static int __check_eq(int idle_ms)
ms = 0; /* just quickly pull events off */
if (ret == PTL_EQ_DROPPED) {
/* oh well, hope things retry, just point this out */
- gossip_err("%s: PtlEQGet: dropped some completions\n",
+ gossip_err("%s: PtlEQPoll: dropped some completions\n",
__func__);
}
} else if (ret == PTL_EQ_EMPTY) {
ret = 0;
break;
} else {
- gossip_err("%s: PtlEQGet: %s", __func__, PtlErrorStr(ret));
+ gossip_err("%s: PtlEQPoll: %s\n", __func__, PtlErrorStr(ret));
ret = -EIO;
break;
}
@@ -595,15 +728,13 @@ static void fill_done(struct bmip_work *
if (w->state == RQ_LEN_ERROR)
*err = -PVFS_EOVERFLOW;
+ debug(2, "%s: %s %p size %llu peer %s\n", __func__,
+ w->type == BMI_SEND ? "sq" : "rq", w, llu(*size),
+ bmip_rev_lookup(w->mop.addr));
+
/* free resources too */
id_gen_fast_unregister(w->mop.op_id);
qlist_del(&w->list);
- /* work around "me/md in use" problem with doing this in the ack */
- if (w->me_unlink) {
- int ret = PtlMEUnlink(w->me);
- if (ret)
- gossip_err("%s: PtlMEUnlink: %s\n", __func__, PtlErrorStr(ret));
- }
free(w);
}
@@ -754,12 +885,14 @@ static int ensure_ni_initialized(struct
ptl_process_id_t my_pid)
{
int ret = 0;
- static ptl_process_id_t no_pid;
+ ptl_process_id_t no_pid;
+ int nic_type;
ptl_md_t zero_mdesc = {
.threshold = PTL_MD_THRESH_INF,
.max_size = 0,
.options = PTL_MD_OP_PUT | PTL_MD_TRUNCATE | PTL_MD_MAX_SIZE |
PTL_MD_EVENT_START_DISABLE,
+ .user_ptr = 0,
};
/* already initialized */
@@ -778,19 +911,30 @@ static int ensure_ni_initialized(struct
* lookup server, figure out how route would go to it, choose
* that interface. Yeah.
*/
+
+#if defined(__CRAYXT_SERVICE) || defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+ /*
+ * Magic for Cray XT service nodes and compute node linux.
+ * Catamount uses default, TCP uses default.
+ */
+ nic_type = CRAY_USER_NAL;
+#else
+ nic_type = PTL_IFACE_DEFAULT;
+#endif
+
+ /* needed for TCP */
/* setenv("PTL_IFACE", "eth0", 0); */
- ret = PtlNIInit(bmi_portals_nic_type, my_pid.pid, NULL, NULL, &ni);
-#ifdef __LIBCATAMOUNT__
- if (bmi_portals_nic_type == PTL_IFACE_DEFAULT) {
- if (ret == PTL_IFACE_DUP && ni != PTL_INVALID_HANDLE) {
- ret = 0; /* already set up by pre-main on Cray compute nodes */
- ni_init_dup = 1;
- }
+
+ ret = PtlNIInit(nic_type, my_pid.pid, NULL, NULL, &ni);
+#if defined(__LIBCATAMOUNT__) || defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+ if (ret == PTL_IFACE_DUP && ni != PTL_INVALID_HANDLE) {
+ ret = 0; /* already set up by pre-main on catamount nodes */
+ ni_init_dup = 1;
}
#endif
if (ret) {
/* error number is bogus here, do not try to decode it */
- gossip_err("%s: PtlNIInit failed: %d\n", __func__, ret);
+ gossip_err("%s: PtlNIInit failed: %s\n", __func__, PtlErrorStr(ret));
ni = PTL_INVALID_HANDLE; /* init call nulls it out */
ret = -EIO;
goto out;
@@ -812,7 +956,7 @@ static int ensure_ni_initialized(struct
debug(0, "%s: runtime thinks my id is %d.%d\n", __func__, id.nid, id.pid);
}
-#ifndef __LIBCATAMOUNT__
+#if !(defined(__LIBCATAMOUNT__) || defined(__CRAYXT_SERVICE) || defined(__CRAYXT_COMPUTE_LINUX_TARGET))
/*
* Need an access control entry to allow everybody to talk, else root
* cannot talk to random user, e.g. Not implemented on Cray.
@@ -850,7 +994,8 @@ static int ensure_ni_initialized(struct
/* "zero" grabs just the header (of nonprepost, not unexpected), drops the
* contents */
- ret = PtlMEAttach(ni, ptl_index, any_pid, 0, 0xffffffffULL, PTL_RETAIN,
+ ret = PtlMEAttach(ni, ptl_index, any_pid, 0,
+ (0x3fffffffULL << 32) | 0xffffffffULL, PTL_RETAIN,
PTL_INS_AFTER, &zero_me);
if (ret) {
gossip_err("%s: PtlMEAttach zero: %s\n", __func__, PtlErrorStr(ret));
@@ -906,7 +1051,7 @@ static void build_mdesc(struct bmip_work
void *const *buffers, const bmi_size_t *sizes)
{
mdesc->threshold = 1;
- mdesc->options = PTL_MD_EVENT_START_DISABLE;
+ mdesc->options = 0; /* PTL_MD_EVENT_START_DISABLE; */
mdesc->eq_handle = eq;
mdesc->user_ptr = w;
@@ -937,7 +1082,7 @@ post_send(bmi_op_id_t *id, struct bmi_me
{
struct bmip_method_addr *pma = addr->method_data;
struct bmip_work *sq;
- uint64_t tag;
+ uint64_t mb;
int ret;
ptl_md_t mdesc;
@@ -955,7 +1100,7 @@ post_send(bmi_op_id_t *id, struct bmi_me
goto out;
}
sq->type = BMI_SEND;
- sq->me_unlink = 0;
+ sq->saw_send_end_and_ack = 0;
sq->tot_len = total_size;
sq->is_unexpected = is_unexpected;
fill_mop(sq, id, addr, user_ptr, context_id);
@@ -963,26 +1108,36 @@ post_send(bmi_op_id_t *id, struct bmi_me
build_mdesc(sq, &mdesc, numbufs, (void *const *)(uintptr_t) buffers, sizes);
mdesc.threshold = 2; /* put, ack */
- debug(2, "%s: sq %p len %lld peer %s tag %d\n", __func__, sq,
- lld(total_size), pma->peername, bmi_tag);
-
sq->state = SQ_WAITING_ACK;
gen_mutex_lock(&list_mutex);
qlist_add_tail(&sq->list, &q_send_waiting_ack);
gen_mutex_unlock(&list_mutex);
/* if not unexpected, use an ME in case he has to come get it */
- tag = bmi_tag;
if (sq->is_unexpected) {
+
+ debug(2, "%s: sq %p len %lld peer %s tag %d unexpected\n", __func__, sq,
+ lld(total_size), pma->peername, bmi_tag);
/* md without any match entry, for sending */
- tag |= match_bits_unexpected;
+ mb = match_bits_unexpected | bmi_tag;
ret = PtlMDBind(ni, mdesc, PTL_UNLINK, &sq->md);
if (ret) {
gossip_err("%s: PtlMDBind: %s\n", __func__, PtlErrorStr(ret));
return -EIO;
}
+ debug(2, "%s: bound md %d\n", __func__, sq->md);
} else {
- ret = PtlMEInsert(mark_me, pma->pid, match_bits_long_send | tag,
+ /* seqno increments on every expected send (only) */
+ if (++pma->seqno_out >= match_bits_seqno_max)
+ pma->seqno_out = 0;
+ mb = mb_from_tag_and_seqno(bmi_tag, pma->seqno_out);
+
+ debug(2, "%s: sq %p len %lld peer %s tag %d seqno %u mb 0x%llx\n",
+ __func__, sq, lld(total_size), pma->peername, bmi_tag,
+ pma->seqno_out, llu(mb));
+
+ /* long-send bit only on the ME, not as the outgoing mb in PtlPut */
+ ret = PtlMEInsert(mark_me, pma->pid, match_bits_long_send | mb,
0, PTL_UNLINK, PTL_INS_BEFORE, &sq->me);
if (ret) {
gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
@@ -1003,7 +1158,10 @@ post_send(bmi_op_id_t *id, struct bmi_me
}
}
- ret = PtlPut(sq->md, PTL_ACK_REQ, pma->pid, ptl_index, 0, tag, 0, 0);
+ sq->bmi_tag = bmi_tag; /* both for debugging dumps */
+ sq->match_bits = mb;
+
+ ret = PtlPut(sq->md, PTL_ACK_REQ, pma->pid, ptl_index, 0, mb, 0, 0);
if (ret) {
gossip_err("%s: PtlPut: %s\n", __func__, PtlErrorStr(ret));
return -EIO;
@@ -1103,10 +1261,18 @@ static int match_nonprepost_recv(bmi_op_
int ret = 0;
ptl_md_t mdesc;
struct bmip_work *rq;
+ uint64_t mb;
+
+ /* expected match bits */
+ mb = mb_from_tag_and_seqno(tag, pma->seqno_in);
+ /* XXX: remove bmi_tag comparison if match_bits works */
gen_mutex_lock(&list_mutex);
qlist_for_each_entry(rq, &q_recv_nonprepost, list) {
- if (rq->mop.addr == addr && rq->bmi_tag == tag) {
+ debug(2, "%s: compare rq %p addr %p =? %p tag %u =? %u mb 0x%llx =? 0x%llx\n", __func__,
+ rq, rq->mop.addr, addr, rq->bmi_tag, tag, llu(rq->match_bits),
+ llu(mb));
+ if (rq->mop.addr == addr && rq->bmi_tag == tag && rq->match_bits == mb) {
found = 1;
qlist_del(&rq->list);
break;
@@ -1137,6 +1303,7 @@ static int match_nonprepost_recv(bmi_op_
nonprepost_repost(rq->nonpp_md);
}
rq->state = RQ_WAITING_USER_TEST;
+ debug(2, "%s: found short message rq %p, copied\n", __func__, rq);
goto foundout;
}
@@ -1156,7 +1323,7 @@ static int match_nonprepost_recv(bmi_op_
rq->tot_len = total_size;
build_mdesc(rq, &mdesc, numbufs, buffers, sizes);
- mdesc.options |= PTL_MD_OP_GET;
+ mdesc.threshold = 2; /* XXX: on Cray only, this must be 2, not 1 */
ret = PtlMDBind(ni, mdesc, PTL_UNLINK, &rq->md);
if (ret) {
@@ -1166,9 +1333,11 @@ static int match_nonprepost_recv(bmi_op_
goto out;
}
- ret = PtlGet(rq->md, pma->pid, ptl_index, 0, match_bits_long_send | tag, 0);
+ mb |= match_bits_long_send;
+ debug(2, "%s: rq %p doing get mb 0x%llx\n", __func__, rq, llu(mb));
+ ret = PtlGet(rq->md, pma->pid, ptl_index, 0, mb, 0);
if (ret) {
- gossip_err("%s: PtlGetRegion: %s\n", __func__, PtlErrorStr(ret));
+ gossip_err("%s: PtlGet: %s\n", __func__, PtlErrorStr(ret));
ret = -EIO;
free(rq);
goto out;
@@ -1200,25 +1369,31 @@ static int post_recv(bmi_op_id_t *id, st
struct bmip_work *rq = NULL;
ptl_md_t mdesc;
int ret, ms = 0;
+ uint64_t mb = 0;
ret = ensure_ni_initialized(pma, any_pid);
if (ret)
goto out;
- debug(2, "%s: len %lld peer %s tag %d\n", __func__, lld(total_size),
- pma->peername, tag);
+ /* increment the expected seqno of the message he will send us */
+ if (++pma->seqno_in >= match_bits_seqno_max)
+ pma->seqno_in = 0;
+
+ debug(2, "%s: len %lld peer %s tag %d seqno %u\n", __func__,
+ lld(total_size), pma->peername, tag, pma->seqno_in);
rq = NULL;
gen_mutex_lock(&eq_mutex); /* do not let test threads manipulate eq */
restart:
/* drain the EQ */
- debug(4, "%s: check eq\n", __func__);
+ debug(2, "%s: check eq\n", __func__);
__check_eq(ms);
- /* first check the unexpected receive queue */
- debug(4, "%s: match nonprepost?\n", __func__);
+ /* first check the nonpreposted receive queue */
+ debug(2, "%s: match nonprepost?\n", __func__);
ret = match_nonprepost_recv(id, addr, numbufs, buffers, sizes,
total_size, tag, user_ptr, context_id);
+
if (ret != 0) {
if (ret > 0) /* handled it via the nonprepost queue */
ret = 0;
@@ -1234,11 +1409,11 @@ restart:
goto out;
}
rq->type = BMI_RECV;
- rq->me_unlink = 0;
rq->tot_len = total_size;
rq->actual_len = 0;
rq->bmi_tag = tag;
fill_mop(rq, id, addr, user_ptr, context_id);
+ memset(&mdesc, 0, sizeof(mdesc));
build_mdesc(rq, &mdesc, numbufs, buffers, sizes);
mdesc.threshold = 0; /* initially inactive */
mdesc.options |= PTL_MD_OP_PUT;
@@ -1246,8 +1421,10 @@ restart:
/* put at the end of the preposted list, just before the first
* nonprepost or unex ME. */
rq->me = PTL_INVALID_HANDLE;
- debug(4, "%s: me insert\n", __func__);
- ret = PtlMEInsert(mark_me, pma->pid, tag, 0, PTL_UNLINK,
+ debug(2, "%s: me insert\n", __func__);
+ mb = mb_from_tag_and_seqno(tag, pma->seqno_in);
+ rq->match_bits = mb;
+ ret = PtlMEInsert(mark_me, pma->pid, mb, 0, PTL_UNLINK,
PTL_INS_BEFORE, &rq->me);
if (ret) {
gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
@@ -1255,22 +1432,24 @@ restart:
goto out;
}
- debug(4, "%s: md attach\n", __func__);
+ debug(2, "%s: md attach\n", __func__);
ret = PtlMDAttach(rq->me, mdesc, PTL_UNLINK, &rq->md);
if (ret) {
gossip_err("%s: PtlMDAttach: %s\n", __func__, PtlErrorStr(ret));
ret = -EIO;
goto out;
}
+ debug(2, "%s: me %d, md %d\n", __func__, rq->me, rq->md);
}
/* now update it atomically with respect to the event stream from the NIC */
mdesc.threshold = 1;
- debug(4, "%s: md update\n", __func__);
+ debug(2, "%s: md update threshold to 1\n", __func__);
ret = PtlMDUpdate(rq->md, NULL, &mdesc, eq);
if (ret) {
if (ret == PTL_MD_NO_UPDATE) {
/* cannot block, other thread may have processed the event for us */
+ debug(2, "%s: md update: no update\n", __func__);
ms = PTL_TIME_FOREVER;
goto restart;
}
@@ -1279,8 +1458,33 @@ restart:
goto out;
}
+#ifdef DEBUG_CNL_ODDITIES
+ {
+ debug(2, "insert another\n");
+ ret = PtlMEInsert(mark_me, pma->pid, 0, -1ULL, PTL_UNLINK,
+ PTL_INS_BEFORE, &rq->tme);
+ if (ret) {
+ gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
+ ret = -EIO;
+ goto out;
+ }
+
+ debug(2, "%s: md attach\n", __func__);
+ mdesc.user_ptr = (void *) ((uintptr_t) mdesc.user_ptr + 1);
+ ret = PtlMDAttach(rq->tme, mdesc, PTL_UNLINK, &rq->tmd);
+ if (ret) {
+ gossip_err("%s: PtlMDAttach: %s\n", __func__, PtlErrorStr(ret));
+ ret = -EIO;
+ goto out;
+ }
+ debug(2, "%s: me %d, md %d\n", __func__, rq->tme, rq->tmd);
+ }
+#endif
+
- debug(4, "%s: done\n", __func__);
+ debug(2, "%s: rq %p waiting incoming, len %lld peer %s tag %d seqno %u mb 0x%llx\n",
+ __func__, rq, lld(total_size), pma->peername, tag, pma->seqno_in,
+ llu(mb));
rq->state = RQ_WAITING_INCOMING;
gen_mutex_lock(&list_mutex);
qlist_add_tail(&rq->list, &q_recv_waiting_incoming);
@@ -1324,6 +1528,31 @@ static int bmip_post_recv_list(bmi_op_id
tot_expected_len, tag, user_ptr, context_id);
}
+/* debugging */
+#define show_queue(q) do { \
+ fprintf(stderr, #q "\n"); \
+ qlist_for_each_entry(w, &q, list) { \
+ fprintf(stderr, "%s %p state %s len %llu tag 0x%llx mb 0x%0llx\n", \
+ w->type == BMI_SEND ? "sq" : "rq", \
+ w, state_name(w->state), \
+ w->type == BMI_SEND ? llu(w->tot_len) : llu(w->actual_len), \
+ llu(w->bmi_tag), llu(w->match_bits)); \
+ } \
+} while (0)
+
+static void dump_queues(int sig __unused)
+{
+ struct bmip_work *w;
+
+ /* debugging */
+ show_queue(q_send_waiting_ack);
+ show_queue(q_send_waiting_get);
+ show_queue(q_recv_waiting_incoming);
+ show_queue(q_recv_waiting_get);
+ show_queue(q_recv_nonprepost);
+ show_queue(q_unexpected_done);
+ show_queue(q_done);
+}
/*
* Cancel. Grab the eq lock to keep things from finishing as we are
@@ -1340,7 +1569,9 @@ static int bmip_cancel(bmi_op_id_t id, b
__check_eq(0);
mop = id_gen_fast_lookup(id);
w = mop->method_data;
- debug(2, "%s: cancel %p state %s\n", __func__, w, state_name(w->state));
+ fprintf(stderr, "%s: cancel %p state %s len %llu tag 0x%llx mb 0x%llx\n",
+ __func__, w, state_name(w->state), llu(w->tot_len),
+ llu(w->bmi_tag), llu(w->match_bits));
switch (w->state) {
case SQ_WAITING_ACK:
@@ -1371,6 +1602,7 @@ static int bmip_cancel(bmi_op_id_t id, b
case SQ_WAITING_USER_TEST:
case RQ_WAITING_USER_TEST:
+ case RQ_WAITING_USER_POST:
case RQ_LEN_ERROR:
case SQ_CANCELLED:
case RQ_CANCELLED:
@@ -1387,6 +1619,11 @@ link_done:
out:
gen_mutex_unlock(&eq_mutex);
+
+ /* debugging */
+ dump_queues(0);
+
+ exit(1);
return 0;
}
@@ -1418,7 +1655,7 @@ static struct bmi_method_addr *bmip_allo
if (pma->pid.pid == pid.pid && pma->pid.nid == pid.nid) {
/* relies on alloc_method_addr() working like it does */
map = &((struct bmi_method_addr *) pma)[-1];
- debug(2, "%s: found map %p from pma %p\n", __func__, map, pma);
+ debug(2, "%s: found matching peer %s\n", __func__, pma->peername);
goto out;
}
}
@@ -1436,16 +1673,19 @@ static struct bmi_method_addr *bmip_allo
sprintf(pma->peername, "%s:%d", hostname, pid.pid);
pma->pid = pid;
+ pma->seqno_in = 0;
+ pma->seqno_out = 0;
qlist_add(&pma->list, &pma_list);
if (register_with_bmi) {
ret = bmi_method_addr_reg_callback(map);
- if (ret < 0) {
+ if (!ret) {
gossip_err("%s: bmi_method_addr_reg_callback failed\n", __func__);
free(map);
map = NULL;
}
}
+ debug(2, "%s: new peer %s\n", __func__, pma->peername);
out:
gen_mutex_unlock(&pma_mutex);
@@ -1453,7 +1693,7 @@ out:
}
-#ifndef __LIBCATAMOUNT__
+#if !(defined(__LIBCATAMOUNT__) || defined(__CRAYXT_COMPUTE_LINUX_TARGET) || defined(__CRAYXT_SERVICE))
/*
* Clients give hostnames. Convert these to Portals nids. This routine
* specific for Portals-over-IP (tcp or utcp).
@@ -1632,8 +1872,12 @@ static int unexpected_init(struct bmi_me
* to repost the first. Sort of a circular buffer structure. This is
* hopefully better than wasting a full 8k for every small control message.
*/
- for (i=0; i<UNEXPECTED_NUM_MD; i++)
+ unexpected_need_repost_sum = 0;
+ for (i=0; i<UNEXPECTED_NUM_MD; i++) {
+ unexpected_is_posted[i] = 0;
+ unexpected_need_repost[i] = 0;
unexpected_repost(i);
+ }
out:
return ret;
@@ -1644,6 +1888,21 @@ static void unexpected_repost(int which)
int ret;
ptl_md_t mdesc;
+ /* unlink used-up one */
+ if (unexpected_is_posted[which]) {
+ debug(1, "%s: trying unpost %d\n", __func__, which);
+ ret = PtlMEUnlink(unexpected_me[which]);
+ if (ret) {
+ gossip_err("%s: PtlMEUnlink %d: %s\n", __func__, which,
+ PtlErrorStr(ret));
+ return;
+ }
+ debug(1, "%s: unposted %d\n", __func__, which);
+ unexpected_need_repost[which] = 0;
+ unexpected_is_posted[which] = 0;
+ --unexpected_need_repost_sum;
+ }
+
/* unexpected messages are limited by the API to a certain size */
mdesc.start = unexpected_buf + which * (UNEXPECTED_QUEUE_SIZE / 2);
mdesc.length = UNEXPECTED_QUEUE_SIZE / 2;
@@ -1652,24 +1911,37 @@ static void unexpected_repost(int which)
| PTL_MD_MAX_SIZE;
mdesc.max_size = UNEXPECTED_MESSAGE_SIZE;
mdesc.eq_handle = eq;
+ mdesc.user_ptr = (void *) (uintptr_t) (UNEXPECTED_MD_INDEX_OFFSET + which);
/*
- * Take any tag, as long as it has the unexpected bit set. This always
- * goes at the very end of the match list.
+ * Take any tag, as long as it has the unexpected bit set, and not
+ * the long send bit. Not sure if we need both bits for this. This always
+ * goes at the very end of the list, just in front of zero. The nonpp
+ * and unex ones can be commingled, as they select different things, but
+ * they must come after the preposts and before the zero md.
*/
- ret = PtlMEAttach(ni, ptl_index, any_pid, match_bits_unexpected,
- 0xffffffffULL, PTL_UNLINK, PTL_INS_AFTER,
- &unexpected_me[which]);
+ ret = PtlMEInsert(zero_me, any_pid, match_bits_unexpected,
+ (0x3fffffffULL << 32) | 0xffffffffULL, PTL_UNLINK,
+ PTL_INS_BEFORE, &unexpected_me[which]);
if (ret) {
- gossip_err("%s: PtlMEAttach: %s\n", __func__, PtlErrorStr(ret));
+ gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
return;
}
- /* put data here when it matches; when full, unlink it */
- ret = PtlMDAttach(unexpected_me[which], mdesc, PTL_UNLINK,
+ /*
+ * Put data here when it matches. Do not auto-unlink, or else a new md
+ * may get inserted and cause a false match in unexpected_md_index above.
+ * Do it all manually. Also have to make sure these do not get reused
+ * in case things sitting in the queue haven't been looked at yet. May
+ * need to use md.user_ptr, or look at the match bits.
+ */
+ ret = PtlMDAttach(unexpected_me[which], mdesc, PTL_RETAIN,
&unexpected_md[which]);
if (ret)
gossip_err("%s: PtlMDAttach: %s\n", __func__, PtlErrorStr(ret));
+
+ unexpected_is_posted[which] = 1;
+ debug(1, "%s: reposted %d\n", __func__, which);
}
static int unexpected_fini(void)
@@ -1677,14 +1949,14 @@ static int unexpected_fini(void)
int i, ret;
for (i=0; i<UNEXPECTED_NUM_MD; i++) {
- ret = PtlMDUnlink(unexpected_md[i]);
+ /* MDs go away when MEs unlinked */
+ ret = PtlMEUnlink(unexpected_me[i]);
if (ret) {
- gossip_err("%s: PtlMDUnlink %d: %s\n", __func__, i,
+ gossip_err("%s: PtlMEUnlink %d: %s\n", __func__, i,
PtlErrorStr(ret));
return ret;
}
}
- /* MEs are automatically discarded when MDs go away */
free(unexpected_buf);
return 0;
}
@@ -1719,6 +1991,27 @@ static void nonprepost_repost(int which)
{
int ret;
ptl_md_t mdesc;
+ static int count = 0;
+
+ debug(0, "%s: WHICH %d\n", __func__, which);
+ ++count;
+ if (count > 2)
+ exit(0);
+
+ /* unlink used-up one */
+ if (unexpected_is_posted[which]) {
+ debug(1, "%s: trying unpost %d\n", __func__, which);
+ ret = PtlMEUnlink(unexpected_me[which]);
+ if (ret) {
+ gossip_err("%s: PtlMEUnlink %d: %s\n", __func__, which,
+ PtlErrorStr(ret));
+ return;
+ }
+ debug(1, "%s: unposted %d\n", __func__, which);
+ unexpected_need_repost[which] = 0;
+ unexpected_is_posted[which] = 0;
+ --unexpected_need_repost_sum;
+ }
/* only short messages that fit max_size go in here */
mdesc.start = nonprepost_buf + which * (NONPREPOST_QUEUE_SIZE / 2);
@@ -1728,13 +2021,17 @@ static void nonprepost_repost(int which)
| PTL_MD_MAX_SIZE;
mdesc.max_size = NONPREPOST_MESSAGE_SIZE;
mdesc.eq_handle = eq;
+ mdesc.user_ptr = (void *) (uintptr_t) (NONPREPOST_MD_INDEX_OFFSET + which);
+
+ /* XXX: may need a manual unlink, as for the unexpected MDs on CNL */
/* also at the very end of the list */
- ret = PtlMEAttach(ni, ptl_index, any_pid, 0,
- 0xffffffffULL, PTL_UNLINK, PTL_INS_AFTER,
- &nonprepost_me[which]);
+ /* match anything as long as top two bits are zero */
+ ret = PtlMEInsert(zero_me, any_pid, 0,
+ (0x3fffffffULL << 32) | 0xffffffffULL,
+ PTL_UNLINK, PTL_INS_BEFORE, &nonprepost_me[which]);
if (ret) {
- gossip_err("%s: PtlMEAttach: %s\n", __func__, PtlErrorStr(ret));
+ gossip_err("%s: PtlMEInsert: %s\n", __func__, PtlErrorStr(ret));
return;
}
@@ -1759,14 +2056,13 @@ static int nonprepost_fini(void)
nonprepost_refcnt[i]);
if (!nonprepost_is_posted[i])
continue;
- ret = PtlMDUnlink(nonprepost_md[i]);
+ /* MDs go away when MEs unlinked */
+ ret = PtlMEUnlink(nonprepost_me[i]);
if (ret) {
- gossip_err("%s: PtlMDUnlink %d: %s\n", __func__, i,
+ gossip_err("%s: PtlMEUnlink %d: %s\n", __func__, i,
PtlErrorStr(ret));
- return ret;
}
}
- /* MEs are automatically discarded when MDs go away */
free(nonprepost_buf);
return 0;
}
@@ -1873,24 +2169,6 @@ static int bmip_initialize(struct bmi_me
bmi_portals_method_id = method_id;
- bmi_portals_nic_type = PTL_IFACE_DEFAULT;
-#ifdef __LIBCATAMOUNT__
- {
- /* magic for Cray XT3 service nodes only; compute uses default,
- * and TCP uses default */
- struct utsname buf;
-
- ret = uname(&buf);
- if (ret) {
- gossip_err("%s: uname failed: %m\n", __func__);
- ret = -EIO;
- goto out;
- }
- if (strcmp(buf.sysname, "Linux") == 0)
- bmi_portals_nic_type = CRAY_USER_NAL;
- }
-#endif
-
ret = PtlInit(&numint);
if (ret) {
gossip_err("%s: PtlInit failed\n", __func__);
@@ -1912,6 +2190,17 @@ static int bmip_initialize(struct bmi_me
/* PtlNIDebug(PTL_INVALID_HANDLE, PTL_DBG_ALL | 0x00000000); */
/* PtlNIDebug(PTL_INVALID_HANDLE, PTL_DBG_DROP | 0x00000000); */
+ /* catamount has different debug symbols, but never prints anything */
+ PtlNIDebug(PTL_INVALID_HANDLE, PTL_DEBUG_ALL | PTL_DEBUG_NI_ALL);
+ /* PtlNIDebug(PTL_INVALID_HANDLE, PTL_DEBUG_DROP | 0x00000000); */
+
+#if defined(__CRAYXT_SERVICE)
+ /*
+ * debug
+ */
+ signal(SIGUSR1, dump_queues);
+#endif
+
/*
* Allocate and build MDs for a queue of unexpected messages from
* all hosts. Drop lock for coming NI init call.
@@ -1951,6 +2240,19 @@ static int bmip_finalize(void)
nonprepost_fini();
if (unexpected_buf)
unexpected_fini();
+
+#if 0 /* example code: stick this somewhere to test if the EQ is freeable */
+ /* unexpected_fini(); */
+ nonprepost_fini();
+ ret = PtlMEUnlink(zero_me);
+ if (ret)
+ gossip_err("%s: PtlMEUnlink zero: %s\n", __func__, PtlErrorStr(ret));
+ ret = PtlEQFree(eq);
+ if (ret)
+ gossip_err("%s: PtlEQFree: %s\n", __func__, PtlErrorStr(ret));
+ printf("eqfree okay\n");
+ exit(1);
+#endif
/* destroy connection structures */
ret = PtlMEUnlink(mark_me);
More information about the Pvfs2-cvs
mailing list