[Pvfs2-developers] patch: error code bug fixes
Murali Vilayannur
murali.vilayannur at gmail.com
Thu Apr 5 11:38:28 EDT 2007
Hi Phil,
Nice catches!
Should we perhaps augment all such functions to not return a int and
replace them with a PVFS_error or a BMI_error as the case maybe?
thanks,
Murali
On 3/26/07, Sam Lang <slang at mcs.anl.gov> wrote:
>
> Phil,
>
> I've committed this patch. Thanks for all the fixes. It looks like
> some of them may really help in debugging user errors.
>
> -sam
>
>
> On Mar 20, 2007, at 9:22 AM, Phil Carns wrote:
>
> > This patch corrects a variety of error code problems:
> >
> > - several BMI error codes were not tagged with the BMI error class,
> > which is important to allow client state machines to retry on
> > network errors
> > - ditto above for a few flow errors
> > - ECONNRESET was not understood by BMI or included in the error
> > code mapping
> > - error codes were printed in hex by some routines
> > - some kernel operation timeouts were translated as EINTR, which is
> > misleading (these were changed to ETIMEDOUT)
> > - timeouts while waiting for a kernel buffer were reported as -1
> > (EPERM) rather than ETIMEDOUT
> >
> > I also noticed quite a bit of fragility in how sockio.c handles
> > error codes, but I didn't address that in this patch other than to
> > work around one common case. Basically, the issue is that sockio.c
> > still sets errno and returns -1 when it has a problem (most of the
> > PVFS2 source code APIs return -PVFS_ERRORs). bmi_tcp.c then
> > translates it. This is fragile for a few reasons, but one that
> > really stands out now is that only sockio.c knows how to translate
> > h_errno values. That means that now some of the sockio functions
> > do translate error codes immediately, while others defer to let
> > bmi_tcp.c do the work. This is confusing :)
> >
> > The BMI error handling and ECONNRESET parts of this patch are
> > important for failover scenarios so that clients are able to pick
> > back up without error.
> >
> > The ones related to kernel timeouts are important if you are tuning
> > kernel buffer sizes- at some point if your buffers are large enough
> > you may end up with processes waiting for buffers long enough to
> > exhaust the default timeouts.
> >
> > -Phil
> > Index: pvfs2_src/src/io/bmi/bmi_tcp/socket-collection.c
> > ===================================================================
> > --- pvfs2_src/src/io/bmi/bmi_tcp/socket-collection.c (revision 2846)
> > +++ pvfs2_src/src/io/bmi/bmi_tcp/socket-collection.c (revision 2847)
> > @@ -292,7 +292,7 @@
> > if(ret < 0)
> > {
> > gen_mutex_unlock(&scp->mutex);
> > - return(-old_errno);
> > + return(bmi_tcp_errno_to_pvfs(-old_errno));
> > }
> >
> > /* nothing ready, just return */
> > Index: pvfs2_src/src/io/bmi/bmi_tcp/sockio.c
> > ===================================================================
> > --- pvfs2_src/src/io/bmi/bmi_tcp/sockio.c (revision 2846)
> > +++ pvfs2_src/src/io/bmi/bmi_tcp/sockio.c (revision 2847)
> > @@ -95,7 +95,7 @@
> > {
> > if (errno == EINTR)
> > goto connect_sock_restart;
> > - return (-PVFS_ERROR_CODE(errno));
> > + return (-1);
> > }
> > return (sockd);
> > }
> > Index: pvfs2_src/src/io/bmi/bmi_tcp/bmi-tcp.c
> > ===================================================================
> > --- pvfs2_src/src/io/bmi/bmi_tcp/bmi-tcp.c (revision 2846)
> > +++ pvfs2_src/src/io/bmi/bmi_tcp/bmi-tcp.c (revision 2847)
> > @@ -704,7 +704,7 @@
> > struct tcp_allowed_connection_s *tcp_allowed_connection =
> > NULL;
> > if (inout_parameter == NULL)
> > {
> > - ret = -EINVAL;
> > + ret = bmi_tcp_errno_to_pvfs(-EINVAL);
> > break;
> > }
> > else
> > @@ -715,7 +715,7 @@
> > (struct tcp_allowed_connection_s *) calloc(1,
> > sizeof(struct tcp_allowed_connection_s));
> > if (tcp_allowed_connection == NULL)
> > {
> > - ret = -ENOMEM;
> > + ret = bmi_tcp_errno_to_pvfs(-ENOMEM);
> > break;
> > }
> > #ifdef __PVFS2_SERVER__
> > @@ -1831,9 +1831,6 @@
> >
> > if(tcp_addr_data->addr_error)
> > {
> > - /* TODO: make this a debug rather than error message once we have
> > - * tested this out enough
> > - */
> > gossip_debug(GOSSIP_BMI_DEBUG_TCP, "%s: attempting
> > reconnect.\n",
> > __func__);
> > tcp_addr_data->addr_error = 0;
> > @@ -1930,14 +1927,15 @@
> > }
> > else
> > {
> > + tmp_errno = errno;
> > /* BMI_sockio_connect_sock returns a PVFS error */
> > char buff[300];
> >
> > snprintf(buff, 300, "Error: BMI_sockio_connect_sock: (%
> > s):",
> > tcp_addr_data->hostname);
> >
> > - PVFS_perror_gossip(buff, ret);
> > - return (ret);
> > + PVFS_perror_gossip(buff, bmi_tcp_errno_to_pvfs(-
> > tmp_errno));
> > + return (bmi_tcp_errno_to_pvfs(-tmp_errno));
> > }
> > }
> >
> > @@ -2277,6 +2275,7 @@
> > if (ret < 0)
> > {
> > PVFS_perror_gossip("Error: payload_progress", ret);
> > + /* payload_progress() returns BMI error codes */
> > tcp_forget_addr(query_op->addr, 0, ret);
> > return (ret);
> > }
> > @@ -2444,6 +2443,7 @@
> > gen_mutex_lock(&interface_mutex);
> > if (ret < 0)
> > {
> > + /* BMI_socket_collection_testglobal() returns BMI error
> > code */
> > return (ret);
> > }
> >
> > @@ -2457,6 +2457,7 @@
> > /* skip working on addresses in failure mode */
> > if(tcp_addr_data->addr_error)
> > {
> > + /* addr_error field is in BMI error code format */
> > tcp_forget_addr(addr_array[i], 0, tcp_addr_data->addr_error);
> > continue;
> > }
> > @@ -2727,7 +2728,7 @@
> > "...dropping connection.\n");
> > tcp_forget_addr(map, 0, bmi_tcp_errno_to_pvfs(-EPIPE));
> > }
> > - return(ret);
> > + return(0);
> > }
> > else
> > {
> > @@ -2932,6 +2933,7 @@
> > if (ret < 0)
> > {
> > PVFS_perror_gossip("Error: socket failed to init", ret);
> > + /* tcp_sock_init() returns BMI error code */
> > tcp_forget_addr(my_method_op->addr, 0, ret);
> > return (0);
> > }
> > @@ -2956,6 +2958,7 @@
> > if (ret < 0)
> > {
> > PVFS_perror_gossip("Error: payload_progress", ret);
> > + /* payload_progress() returns BMI error codes */
> > tcp_forget_addr(my_method_op->addr, 0, ret);
> > return (0);
> > }
> > @@ -3023,6 +3026,7 @@
> > if (ret < 0)
> > {
> > PVFS_perror_gossip("Error: payload_progress", ret);
> > + /* payload_progress() returns BMI error codes */
> > tcp_forget_addr(my_method_op->addr, 0, ret);
> > return (0);
> > }
> > @@ -3318,7 +3322,7 @@
> > if(!(*peer))
> > {
> > close(*socket);
> > - return(-BMI_ENOMEM);
> > + return(bmi_tcp_errno_to_pvfs(-BMI_ENOMEM));
> > }
> > strcpy(*peer, tmp_peer);
> >
> > @@ -3447,6 +3451,7 @@
> > if (ret < 0)
> > {
> > gossip_debug(GOSSIP_BMI_DEBUG_TCP, "tcp_sock_init() failure.\n");
> > + /* tcp_sock_init() returns BMI error code */
> > tcp_forget_addr(dest, 0, ret);
> > return (ret);
> > }
> > @@ -3489,6 +3494,7 @@
> > if (ret < 0)
> > {
> > PVFS_perror_gossip("Error: payload_progress", ret);
> > + /* payload_progress() returns BMI error codes */
> > tcp_forget_addr(dest, 0, ret);
> > return (ret);
> > }
> > Index: pvfs2_src/include/pvfs2-types.h
> > ===================================================================
> > --- pvfs2_src/include/pvfs2-types.h (revision 2847)
> > +++ pvfs2_src/include/pvfs2-types.h (revision 2848)
> > @@ -529,6 +529,7 @@
> > #define PVFS_EHOSTUNREACH E(56) /* No route to host */
> > #define PVFS_EALREADY E(57) /* Operation already in
> > progress */
> > #define PVFS_EACCES E(58) /* Access not allowed */
> > +#define PVFS_ECONNRESET E(59) /* Connection reset by peer */
> >
> > /***************** non-errno/pvfs2 specific error codes
> > *****************/
> > #define PVFS_ECANCEL (1|(PVFS_NON_ERRNO_ERROR_BIT|PVFS_ERROR_BIT))
> > @@ -547,7 +548,7 @@
> > * UNIX ERRNO VALUE IN THE MACROS BELOW (USED IN
> > * src/common/misc/errno-mapping.c and the kernel module)
> > */
> > -#define PVFS_ERRNO_MAX 59
> > +#define PVFS_ERRNO_MAX 60
> >
> > #ifndef EUNATCH
> > #define EUNATCH -1
> > @@ -633,7 +634,8 @@
> > EHOSTDOWN, \
> > EHOSTUNREACH, \
> > EALREADY, \
> > - EACCES, /* 58 */ \
> > + EACCES, \
> > + ECONNRESET, /* 59 */ \
> > 0 /* PVFS_ERRNO_MAX */ \
> > }; \
> > const char *PINT_non_errno_strerror_mapping[] = { \
> > Index: pvfs2_src/src/io/bmi/bmi-types.h
> > ===================================================================
> > --- pvfs2_src/src/io/bmi/bmi-types.h (revision 2847)
> > +++ pvfs2_src/src/io/bmi/bmi-types.h (revision 2848)
> > @@ -131,6 +131,7 @@
> > #define BMI_ENETUNREACH (PVFS_ENETUNREACH | PVFS_ERROR_BMI)
> > #define BMI_ENETRESET (PVFS_ENETRESET | PVFS_ERROR_BMI)
> > #define BMI_ENOBUFS (PVFS_ENOBUFS | PVFS_ERROR_BMI)
> > +#define BMI_ECONNRESET (PVFS_ECONNRESET | PVFS_ERROR_BMI)
> > #define BMI_ETIMEDOUT (PVFS_ETIMEDOUT | PVFS_ERROR_BMI)
> > #define BMI_ECONNREFUSED (PVFS_ECONNREFUSED | PVFS_ERROR_BMI)
> > #define BMI_EHOSTDOWN (PVFS_EHOSTDOWN | PVFS_ERROR_BMI)
> > Index: pvfs2_src/src/io/bmi/bmi.c
> > ===================================================================
> > --- pvfs2_src/src/io/bmi/bmi.c (revision 2848)
> > +++ pvfs2_src/src/io/bmi/bmi.c (revision 2849)
> > @@ -1861,6 +1861,7 @@
> > __CASE(EHOSTUNREACH);
> > __CASE(EALREADY);
> > __CASE(EACCES);
> > + __CASE(ECONNRESET);
> > #undef __CASE
> > }
> > return bmi_errno;
> > Index: pvfs2_src/src/common/misc/errno-mapping.c
> > ===================================================================
> > --- pvfs2_src/src/common/misc/errno-mapping.c (revision 2849)
> > +++ pvfs2_src/src/common/misc/errno-mapping.c (revision 2850)
> > @@ -73,7 +73,7 @@
> > }
> > else
> > {
> > - fprintf(stderr, "Warning: non PVFS2 error code (%x):\n",
> > + fprintf(stderr, "Warning: non PVFS2 error code (%d):\n",
> > -retcode);
> > fprintf(stderr, "%s: %s\n", text, strerror(-retcode));
> > }
> > @@ -103,7 +103,7 @@
> > }
> > else
> > {
> > - gossip_err("Warning: non PVFS2 error code (%x):\n", -retcode);
> > + gossip_err("Warning: non PVFS2 error code (%d):\n", -retcode);
> > gossip_err("%s: %s\n", text, strerror(-retcode));
> > }
> > return;
> > Index: pvfs2_src/src/apps/kernel/linux/pvfs2-client-core.c
> > ===================================================================
> > --- pvfs2_src/src/apps/kernel/linux/pvfs2-client-core.c (revision
> > 2996)
> > +++ pvfs2_src/src/apps/kernel/linux/pvfs2-client-core.c (revision
> > 2997)
> > @@ -2439,7 +2439,11 @@
> > /* replace non-errno error code to avoid passing to
> > kernel */
> > if (*error_code == -PVFS_ECANCEL)
> > {
> > - *error_code = -PVFS_EINTR;
> > + /* if an ECANCEL shows up here without going
> > through the
> > + * cancel_op_in_progress() path, then -
> > PVFS_ETIMEDOUT is
> > + * a better errno approximation than -PVFS_EINTR
> > + */
> > + *error_code = -PVFS_ETIMEDOUT;
> > }
> > break;
> > case PVFS2_VFS_OP_FILE_IOX:
> > @@ -2469,7 +2473,11 @@
> > /* replace non-errno error code to avoid passing to
> > kernel */
> > if (*error_code == -PVFS_ECANCEL)
> > {
> > - *error_code = -PVFS_EINTR;
> > + /* if an ECANCEL shows up here without going
> > through the
> > + * cancel_op_in_progress() path, then -
> > PVFS_ETIMEDOUT is
> > + * a better errno approximation than -PVFS_EINTR
> > + */
> > + *error_code = -PVFS_ETIMEDOUT;
> > }
> > break;
> > }
> > Index: pvfs2_src/src/io/flow/flowproto-bmi-trove/flowproto-
> > multiqueue.c
> > ===================================================================
> > --- pvfs2_src/src/io/flow/flowproto-bmi-trove/flowproto-
> > multiqueue.c (revision 2997)
> > +++ pvfs2_src/src/io/flow/flowproto-bmi-trove/flowproto-
> > multiqueue.c (revision 2998)
> > @@ -472,7 +472,10 @@
> > "%s: called on active flow, %lld bytes
> > transferred.\n",
> > __func__, lld(flow_d->total_transferred));
> > assert(flow_d->state == FLOW_TRANSMITTING);
> > - handle_io_error(-PVFS_ECANCEL, NULL, flow_data);
> > + /* NOTE: set flow error class bit so that system interface
> > understands
> > + * that this may be a retry-able error
> > + */
> > + handle_io_error(-(PVFS_ECANCEL|PVFS_ERROR_FLOW), NULL,
> > flow_data);
> > if(flow_data->parent->state == FLOW_COMPLETE)
> > {
> > gen_mutex_unlock(flow_data->parent->flow_mutex);
> > Index: pvfs2_src/src/common/misc/errno-mapping.c
> > ===================================================================
> > --- pvfs2_src/src/common/misc/errno-mapping.c (revision 2997)
> > +++ pvfs2_src/src/common/misc/errno-mapping.c (revision 2998)
> > @@ -62,14 +62,15 @@
> > char buf[MAX_PVFS_STRERROR_LEN] = {0};
> > int index = PVFS_get_errno_mapping(-retcode);
> >
> > - snprintf(buf,MAX_PVFS_STRERROR_LEN,"%s: %s\n",text,
> > - PINT_non_errno_strerror_mapping[index]);
> > + snprintf(buf,MAX_PVFS_STRERROR_LEN,"%s: %s (error class: %
> > d)\n",text,
> > + PINT_non_errno_strerror_mapping[index],
> > PVFS_ERROR_CLASS(-retcode));
> > fprintf(stderr, "%s", buf);
> > }
> > else if (IS_PVFS_ERROR(-retcode))
> > {
> > - fprintf(stderr, "%s: %s\n", text,
> > - strerror(PVFS_ERROR_TO_ERRNO(-retcode)));
> > + fprintf(stderr, "%s: %s (error class: %d)\n", text,
> > + strerror(PVFS_ERROR_TO_ERRNO(-retcode)),
> > + PVFS_ERROR_CLASS(-retcode));
> > }
> > else
> > {
> > Index: pvfs2_src/src/kernel/linux-2.6/pvfs2-bufmap.c
> > ===================================================================
> > --- pvfs2_src/src/kernel/linux-2.6/pvfs2-bufmap.c (revision 3000)
> > +++ pvfs2_src/src/kernel/linux-2.6/pvfs2-bufmap.c (revision 3001)
> > @@ -331,6 +239,7 @@
> > if (!schedule_timeout(timeout))
> > {
> > gossip_debug(GOSSIP_BUFMAP_DEBUG, "***
> > wait_for_a_slot timed out\n");
> > + ret = -ETIMEDOUT;
> > break;
> > }
> > continue;
> > Index: pvfs2_src/src/io/bmi/bmi_tcp/sockio.c
> > ===================================================================
> > --- pvfs2_src/src/io/bmi/bmi_tcp/sockio.c (revision 3082)
> > +++ pvfs2_src/src/io/bmi/bmi_tcp/sockio.c (revision 3083)
> > @@ -60,6 +60,7 @@
> > return (sockd);
> > }
> >
> > +/* NOTE: this function returns BMI error codes */
> > int BMI_sockio_bind_sock_specific(int sockd,
> > const char *name,
> > int service)
> > @@ -75,12 +76,13 @@
> > {
> > if (errno == EINTR)
> > goto bind_sock_restart;
> > - return (-1);
> > + return(bmi_errno_to_pvfs(-errno));
> > }
> > return (sockd);
> > }
> >
> >
> > +/* NOTE: this function returns BMI error codes */
> > int BMI_sockio_connect_sock(int sockd,
> > const char *name,
> > int service)
> > @@ -89,13 +91,13 @@
> > int ret;
> >
> > if ((ret = BMI_sockio_init_sock(&saddr, name, service)) != 0)
> > - return (ret); /* converted to PVFS error code below */
> > + return (ret);
> > connect_sock_restart:
> > if (connect(sockd, (struct sockaddr *) &saddr, sizeof(saddr))
> > < 0)
> > {
> > if (errno == EINTR)
> > goto connect_sock_restart;
> > - return (-1);
> > + return(bmi_errno_to_pvfs(-errno));
> > }
> > return (sockd);
> > }
> > Index: pvfs2_src/src/io/bmi/bmi_tcp/bmi-tcp.c
> > ===================================================================
> > --- pvfs2_src/src/io/bmi/bmi_tcp/bmi-tcp.c (revision 3082)
> > +++ pvfs2_src/src/io/bmi/bmi_tcp/bmi-tcp.c (revision 3083)
> > @@ -1978,6 +1978,12 @@
> > ret = BMI_sockio_bind_sock_specific(tcp_addr_data->socket,
> > tcp_addr_data->hostname,
> > tcp_addr_data->port);
> > + /* NOTE: this particular function converts errno in
> > advance */
> > + if(ret < 0)
> > + {
> > + PVFS_perror_gossip("BMI_sockio_bind_sock_specific", ret);
> > + return(ret);
> > + }
> > }
> > else
> > {
> > @@ -2155,22 +2161,21 @@
> >
> > if (ret < 0)
> > {
> > - if (errno == EINPROGRESS)
> > + if (ret == -EINPROGRESS)
> > {
> > tcp_addr_data->not_connected = 1;
> > /* this will have to be connected later with a poll */
> > }
> > else
> > {
> > - tmp_errno = errno;
> > - /* BMI_sockio_connect_sock returns a PVFS error */
> > + /* NOTE: BMI_sockio_connect_sock returns a PVFS error */
> > char buff[300];
> >
> > snprintf(buff, 300, "Error: BMI_sockio_connect_sock: (%
> > s):",
> > tcp_addr_data->hostname);
> >
> > - PVFS_perror_gossip(buff, bmi_tcp_errno_to_pvfs(-
> > tmp_errno));
> > - return (bmi_tcp_errno_to_pvfs(-tmp_errno));
> > + PVFS_perror_gossip(buff, ret);
> > + return (ret);
> > }
> > }
> >
> > _______________________________________________
> > Pvfs2-developers mailing list
> > Pvfs2-developers at beowulf-underground.org
> > http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers
>
> _______________________________________________
> Pvfs2-developers mailing list
> Pvfs2-developers at beowulf-underground.org
> http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers
>
More information about the Pvfs2-developers
mailing list