[PVFS2-CVS] commit by neill in pvfs2/src/client/sysint: acache.c client-state-machine.h shared-state-methods.c sys-getattr.sm sys-io.sm

CVS commit program cvs at parl.clemson.edu
Tue May 4 11:42:59 EDT 2004


Update of /projects/cvsroot/pvfs2/src/client/sysint
In directory parlweb:/tmp/cvs-serv31112/src/client/sysint

Modified Files:
	acache.c client-state-machine.h shared-state-methods.c 
	sys-getattr.sm sys-io.sm 
Log Message:
- added high-level I/O state machine retry logic
- improved some error handling


Index: acache.c
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/client/sysint/acache.c,v
diff -p -u -r1.4 -r1.5
--- acache.c	24 Mar 2004 23:10:30 -0000	1.4
+++ acache.c	4 May 2004 14:42:58 -0000	1.5
@@ -370,7 +370,10 @@ int PINT_acache_object_attr_deep_copy(
 		if ((dest->mask & PVFS_ATTR_META_DFILES) &&
 		    dest->u.meta.dfile_count > 0)
                 {
-                    free(dest->u.meta.dfile_array);
+                    if (dest->u.meta.dfile_array)
+                    {
+                        free(dest->u.meta.dfile_array);
+                    }
                 }
 		dest->u.meta.dfile_array = malloc(df_array_size);
 		if (!dest->u.meta.dfile_array)

Index: client-state-machine.h
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/client/sysint/client-state-machine.h,v
diff -p -u -r1.101 -r1.102
--- client-state-machine.h	30 Apr 2004 15:19:31 -0000	1.101
+++ client-state-machine.h	4 May 2004 14:42:58 -0000	1.102
@@ -180,6 +180,8 @@ struct PINT_client_io_sm {
     PVFS_offset           file_req_offset;
     void                 *buffer;
     PVFS_Request          mem_req;
+    int                   stored_error_code;
+    int                   retry_count;
 
     /* cached from object attributes */
     int                   orig_datafile_count;

Index: shared-state-methods.c
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/client/sysint/shared-state-methods.c,v
diff -p -u -r1.16 -r1.17
--- shared-state-methods.c	28 Apr 2004 15:45:08 -0000	1.16
+++ shared-state-methods.c	4 May 2004 14:42:58 -0000	1.17
@@ -55,7 +55,6 @@ int PINT_sm_common_parent_getattr_setup_
         js_p->error_code = ret;
     }
 
-    /* let 'msgpairarray' handle the 'msgpair' case */
     sm_p->msgarray = &(sm_p->msgpair);
     sm_p->msgarray_count = 1;
 
@@ -107,7 +106,6 @@ int PINT_sm_common_object_getattr_setup_
         js_p->error_code = ret;
     }
 
-    /* let 'msgpairarray' handle the 'msgpair' case */
     sm_p->msgarray = &(sm_p->msgpair);
     sm_p->msgarray_count = 1;
 
@@ -122,10 +120,6 @@ int PINT_sm_common_object_getattr_failur
     return 1;
 }
 
-
-/*
-  shared/common msgpair completion functions
-*/
 int PINT_sm_common_directory_getattr_comp_fn(
     void *v_p,
     struct PVFS_server_resp *resp_p,
@@ -137,12 +131,13 @@ int PINT_sm_common_directory_getattr_com
     
     assert(resp_p->op == PVFS_SERV_GETATTR);
 
+    assert(sm_p->msgarray == &sm_p->msgpair);
+    sm_p->msgarray = NULL;
+    sm_p->msgarray_count = 0;
+
     gossip_debug(GOSSIP_CLIENT_DEBUG,
                  "PINT_sm_common_getattr_directory_comp_fn\n");
 
-    /* if we get an error, just return immediately, don't try to
-     * actually fill anything in.
-     */
     if (resp_p->status != 0)
     {
         gossip_err("Error: getattr failure\n");
@@ -150,9 +145,9 @@ int PINT_sm_common_directory_getattr_com
     }
 
     /*
-      if we didn't get a cache hit, we're making a
-      copy of the attributes here so that we can add
-      a acache entry later in cleanup.
+      if we didn't get a cache hit, we're making a copy of the
+      attributes here so that we can add an acache entry later in
+      cleanup.
     */
     if (!sm_p->acache_hit)
     {
@@ -161,8 +156,8 @@ int PINT_sm_common_directory_getattr_com
     }
 
     /*
-      if we got a cache hit, use those attributes,
-      otherwise use the real server replied attrs
+      if we got a cache hit, use those attributes, otherwise use the
+      real server replied attrs
     */
     attr = (sm_p->acache_hit ?
             &sm_p->pinode->attr :
@@ -230,12 +225,13 @@ int PINT_sm_common_object_getattr_comp_f
     
     assert(resp_p->op == PVFS_SERV_GETATTR);
 
+    assert(sm_p->msgarray == &sm_p->msgpair);
+    sm_p->msgarray = NULL;
+    sm_p->msgarray_count = 0;
+
     gossip_debug(GOSSIP_CLIENT_DEBUG,
                  "PINT_sm_common_getattr_object_comp_fn\n");
 
-    /* if we get an error, just return immediately, don't try to
-     * actually fill anything in.
-     */
     if (resp_p->status != 0)
     {
         gossip_err("Error: getattr failure\n");
@@ -243,12 +239,13 @@ int PINT_sm_common_object_getattr_comp_f
     }
 
     /*
-      if we didn't get a acache hit, we're making a
-      copy of the attributes here so that we can add
-      a acache entry later in cleanup.
+      if we didn't get an acache hit, we're making a copy of the
+      attributes here so that we can add a acache entry later in
+      cleanup.
     */
     if (!sm_p->acache_hit)
     {
+        memset(&sm_p->acache_attr, 0, sizeof(PVFS_object_attr));
         PINT_acache_object_attr_deep_copy(
             &sm_p->acache_attr, &resp_p->u.getattr.attr);
     }

Index: sys-getattr.sm
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/client/sysint/sys-getattr.sm,v
diff -p -u -r1.52 -r1.53
--- sys-getattr.sm	28 Apr 2004 16:32:41 -0000	1.52
+++ sys-getattr.sm	4 May 2004 14:42:59 -0000	1.53
@@ -661,10 +661,13 @@ static int getattr_cleanup(PINT_client_s
         {
 	    free(sm_p->u.getattr.size_array);
 	}
-	if (sm_p->msgarray != NULL && (sm_p->msgarray != &(sm_p->msgpair)) )
+
+	if (sm_p->msgarray != NULL && (sm_p->msgarray != &(sm_p->msgpair)))
         {
 	    free(sm_p->msgarray);
 	}
+        sm_p->msgarray = NULL;
+        sm_p->msgarray_count = 0;
 
         /*
           only free dist and dfile memory if we didn't get a

Index: sys-io.sm
===================================================================
RCS file: /projects/cvsroot/pvfs2/src/client/sysint/sys-io.sm,v
diff -p -u -r1.67 -r1.68
--- sys-io.sm	30 Apr 2004 14:58:41 -0000	1.67
+++ sys-io.sm	4 May 2004 14:42:59 -0000	1.68
@@ -24,9 +24,12 @@ extern job_context_id pint_client_sm_con
 enum
 {
     IO_NO_DATA = 1,
-    IO_DATAFILE_TRANSFERS_COMPLETE = 2
+    IO_DATAFILE_TRANSFERS_COMPLETE = 2,
+    IO_RETRY = 3
 };
 
+static int io_init(
+    PINT_client_sm *sm_p, job_status_s *js_p);
 static int io_object_getattr_failure(
     PINT_client_sm *sm_p, job_status_s *js_p);
 static int io_datafile_setup_msgpairs(
@@ -57,10 +60,12 @@ do {                                    
         free(iosm_p->datafile_index_array); \
         iosm_p->datafile_index_array = NULL;\
     }                                       \
-    if (sm_p->msgarray)                     \
+    if (sm_p->msgarray &&                   \
+        (sm_p->msgarray != &sm_p->msgpair)) \
     {                                       \
         free(sm_p->msgarray);               \
         sm_p->msgarray = NULL;              \
+        sm_p->msgarray_count = 0;           \
     }                                       \
     if (iosm_p->flow_array)                 \
     {                                       \
@@ -86,14 +91,22 @@ do {                                    
 
 %%
 
-machine pvfs2_client_io_sm(io_getattr_setup_msgpair,
-                           io_getattr_xfer_msgpair,
-                           io_getattr_failure,
-                           io_datafile_setup_msgpairs,
-                           io_datafile_post_msgpairs,
-                           io_datafile_complete_msgpairs,
-                           io_analyze_results)
+machine pvfs2_client_io_sm(
+    init,
+    io_getattr_setup_msgpair,
+    io_getattr_xfer_msgpair,
+    io_getattr_failure,
+    io_datafile_setup_msgpairs,
+    io_datafile_post_msgpairs,
+    io_datafile_complete_msgpairs,
+    io_analyze_results)
 {
+    state init
+    {
+        run io_init;
+        default => io_getattr_setup_msgpair;
+    }
+
     state io_getattr_setup_msgpair
     {
         run PINT_sm_common_object_getattr_setup_msgpair;
@@ -138,6 +151,7 @@ machine pvfs2_client_io_sm(io_getattr_se
     state io_analyze_results
     {
         run io_analyze_results;
+        IO_RETRY => init;
         default => terminate;
     }
 }
@@ -145,13 +159,13 @@ machine pvfs2_client_io_sm(io_getattr_se
 %%
 
 int PVFS_sys_io(PVFS_object_ref ref,
-                PVFS_Request          file_req,
-                PVFS_offset           file_req_offset,
-                void                 *buffer,
-                PVFS_Request          mem_req,
-                PVFS_credentials      credentials,
-                PVFS_sysresp_io      *resp_p,
-                enum PVFS_io_type     io_type)
+                PVFS_Request file_req,
+                PVFS_offset file_req_offset,
+                void *buffer,
+                PVFS_Request mem_req,
+                PVFS_credentials credentials,
+                PVFS_sysresp_io *resp_p,
+                enum PVFS_io_type io_type)
 {
     int ret = -PVFS_EINVAL;
     PINT_client_sm *sm_p = NULL;
@@ -195,8 +209,8 @@ int PVFS_sys_io(PVFS_object_ref ref,
     {
         return -PVFS_ENOMEM;
     }
-    memset(sm_p, 0, sizeof(*sm_p));
 
+    memset(sm_p, 0, sizeof(*sm_p));
     sm_p->cred_p = &credentials;
     sm_p->object_ref = ref;
     sm_p->u.io.io_type = io_type;
@@ -207,6 +221,19 @@ int PVFS_sys_io(PVFS_object_ref ref,
     sm_p->u.io.buffer = buffer; 
     sm_p->u.io.flowproto_type = cur_fs->flowproto;
     sm_p->u.io.encoding = cur_fs->encoding;
+    sm_p->u.io.stored_error_code = 0;
+    sm_p->u.io.retry_count = 0;
+    sm_p->u.io.datafile_handles = NULL;
+    sm_p->u.io.datafile_index_array = NULL;
+    sm_p->msgarray = NULL;
+    sm_p->u.io.dist_p = NULL;
+    sm_p->u.io.dist_size = 0;
+    sm_p->u.io.datafile_handles = NULL;
+    sm_p->u.io.datafile_count = 0;
+    sm_p->u.io.flow_array = NULL;
+    sm_p->u.io.flow_status_array = NULL;
+    sm_p->u.io.session_tag_array = NULL;
+    sm_p->u.io.ackarray = NULL;
 
     ret = PINT_client_state_machine_post(sm_p, PVFS_SYS_IO);
     if (ret)
@@ -250,6 +277,28 @@ int PVFS_sys_io(PVFS_object_ref ref,
 
 /*******************************************************************/
 
+static int io_init(PINT_client_sm *sm_p,
+                   job_status_s *js_p)
+{
+    job_id_t tmp_id;
+
+    gossip_debug(GOSSIP_CLIENT_DEBUG, "io state: init\n");
+
+    assert((js_p->error_code == 0) ||
+           (js_p->error_code == IO_RETRY));
+
+    if (js_p->error_code == IO_RETRY)
+    {
+        js_p->error_code = 0;
+
+        return job_req_sched_post_timer(
+            PVFS2_CLIENT_RETRY_DELAY, sm_p, 0, js_p, &tmp_id,
+            pint_client_sm_context);
+    }
+    return 1;
+}
+
+
 /* io_datafile_setup_msgpairs()
  *
  * Sets up msgpairs to send I/O requests to servers holding datafiles
@@ -288,27 +337,49 @@ static int io_datafile_setup_msgpairs(PI
     assert(attr->mask & PVFS_ATTR_META_DFILES);
     assert(attr->mask & PVFS_ATTR_META_DIST);
     assert(attr->u.meta.dist_size > 0);
+    assert(attr->u.meta.dfile_array);
     assert(attr->u.meta.dfile_count > 0);
 
-    /* assign internal io ptrs for convenience here (without copying) */
-    sm_p->u.io.dist_p = attr->u.meta.dist;
-    sm_p->u.io.dist_size = attr->u.meta.dist_size;
-    sm_p->u.io.datafile_handles = attr->u.meta.dfile_array;
-    sm_p->u.io.datafile_count = attr->u.meta.dfile_count;
+    /*
+      assign internal io ptrs for convenience here (without
+      copying) if unassigned
+    */
+    if (!sm_p->u.io.dist_p)
+    {
+        sm_p->u.io.dist_p = attr->u.meta.dist;
+    }
+    if (!sm_p->u.io.dist_size)
+    {
+        sm_p->u.io.dist_size = attr->u.meta.dist_size;
+    }
+    if (!sm_p->u.io.datafile_handles)
+    {
+        sm_p->u.io.datafile_handles = attr->u.meta.dfile_array;
+    }
+    if (!sm_p->u.io.datafile_count)
+    {
+        sm_p->u.io.datafile_count = attr->u.meta.dfile_count;
+    }
 
     ret = PINT_Dist_lookup(iosm_p->dist_p);
     assert(ret == 0);
 
-    target_datafile_array = (PVFS_handle *)malloc(
-        (iosm_p->datafile_count * sizeof(PVFS_handle)));
+    if (!target_datafile_array)
+    {
+        target_datafile_array = (PVFS_handle *)malloc(
+            (iosm_p->datafile_count * sizeof(PVFS_handle)));
+    }
     if (!target_datafile_array)
     {
         js_p->error_code = -PVFS_ENOMEM;
         return 1;
     }
 
-    iosm_p->datafile_index_array = (int *)malloc(
-        (iosm_p->datafile_count * sizeof(int)));
+    if (!iosm_p->datafile_index_array)
+    {
+        iosm_p->datafile_index_array = (int *)malloc(
+            (iosm_p->datafile_count * sizeof(int)));
+    }
     if (!iosm_p->datafile_index_array)
     {
         goto malloc_error_exit;
@@ -328,9 +399,17 @@ static int io_datafile_setup_msgpairs(PI
 
     if (target_datafile_count == 0)
     {
-        free(target_datafile_array);
-        free(iosm_p->datafile_index_array);
-        iosm_p->datafile_index_array = NULL;
+        if (target_datafile_array)
+        {
+            free(target_datafile_array);
+            target_datafile_array = NULL;
+        }
+
+        if (iosm_p->datafile_index_array)
+        {
+            free(iosm_p->datafile_index_array);
+            iosm_p->datafile_index_array = NULL;
+        }
 
         /* the no data case should be caught earlier than this */
         js_p->error_code = IO_NO_DATA;
@@ -342,8 +421,12 @@ static int io_datafile_setup_msgpairs(PI
                  "might have data\n", target_datafile_count);
 
     /* setup msgpair array */
-    sm_p->msgarray = (PINT_client_sm_msgpair_state *)malloc(
-        (target_datafile_count * sizeof(PINT_client_sm_msgpair_state)));
+    if (!sm_p->msgarray)
+    {
+        sm_p->msgarray = (PINT_client_sm_msgpair_state *)malloc(
+            (target_datafile_count *
+             sizeof(PINT_client_sm_msgpair_state)));
+    }
     if (!sm_p->msgarray)
     {
         goto malloc_error_exit;
@@ -351,8 +434,11 @@ static int io_datafile_setup_msgpairs(PI
     sm_p->msgarray_count = target_datafile_count;
 
     /* setup flow descriptor array */
-    iosm_p->flow_array = (flow_descriptor *)malloc(
-        (target_datafile_count * sizeof(flow_descriptor)));
+    if (!iosm_p->flow_array)
+    {
+        iosm_p->flow_array = (flow_descriptor *)malloc(
+            (target_datafile_count * sizeof(flow_descriptor)));
+    }
     if (!iosm_p->flow_array)
     {
         goto malloc_error_exit;
@@ -366,16 +452,22 @@ static int io_datafile_setup_msgpairs(PI
     }
 
     /* setup flow status array */
-    iosm_p->flow_status_array = (job_status_s *)malloc(
-        (target_datafile_count * sizeof(job_status_s)));
+    if (!iosm_p->flow_status_array)
+    {
+        iosm_p->flow_status_array = (job_status_s *)malloc(
+            (target_datafile_count * sizeof(job_status_s)));
+    }
     if (!iosm_p->flow_status_array)
     {
         goto malloc_error_exit;
     }
 
     /* setup session tag array */
-    iosm_p->session_tag_array = (PVFS_msg_tag_t *)malloc(
-        (target_datafile_count * sizeof(PVFS_msg_tag_t)));
+    if (!iosm_p->session_tag_array)
+    {
+        iosm_p->session_tag_array = (PVFS_msg_tag_t *)malloc(
+            (target_datafile_count * sizeof(PVFS_msg_tag_t)));
+    }
     if (!iosm_p->session_tag_array)
     {
         goto malloc_error_exit;
@@ -385,8 +477,12 @@ static int io_datafile_setup_msgpairs(PI
     if (iosm_p->io_type == PVFS_IO_WRITE)
     {
         /* setup the write acknowledgement array */
-        iosm_p->ackarray = (PINT_client_sm_recv_state *)malloc(
-            (target_datafile_count * sizeof(PINT_client_sm_recv_state)));
+        if (!iosm_p->ackarray)
+        {
+            iosm_p->ackarray = (PINT_client_sm_recv_state *)malloc(
+                (target_datafile_count *
+                 sizeof(PINT_client_sm_recv_state)));
+        }
         if (!iosm_p->ackarray)
         {
             goto malloc_error_exit;
@@ -401,6 +497,7 @@ static int io_datafile_setup_msgpairs(PI
     {
         int orig_index = 0;
         PINT_client_sm_msgpair_state *msg_p = &sm_p->msgarray[i];
+        assert(msg_p);
 
         gossip_debug(GOSSIP_IO_DEBUG, "  sending I/O request "
                      "for %Lu\n", Lu(target_datafile_array[i]));
@@ -436,14 +533,28 @@ static int io_datafile_setup_msgpairs(PI
         msg_p->handle = target_datafile_array[i];
         msg_p->retry_flag = PVFS_MSGPAIR_NO_RETRY;
         msg_p->comp_fn = NULL;
+
+        ret = PINT_bucket_map_to_server(&msg_p->svr_addr,
+                                        msg_p->handle,
+                                        msg_p->fs_id);
+        if (ret)
+        {
+            gossip_err("Failed to map meta server address\n");
+            js_p->error_code = ret;
+            return 1;
+        }
     }
 
     /* swap the new list in for the old one, freeing the old list */
-    if (!sm_p->acache_hit)
+    if (!sm_p->acache_hit && iosm_p->datafile_handles)
     {
         free(iosm_p->datafile_handles);
     }
-    iosm_p->datafile_handles = target_datafile_array;
+
+    if (target_datafile_array)
+    {
+        iosm_p->datafile_handles = target_datafile_array;
+    }
 
     /*
       store the original datafile_count before it's modified,
@@ -488,22 +599,13 @@ static int io_datafile_post_msgpairs(PIN
     assert(sm_p->msgarray);
     assert(sm_p->msgarray_count > 0);
 
-    sm_p->msgarray[0].comp_ct = 2 * sm_p->msgarray_count;
+    sm_p->msgarray[0].comp_ct = (2 * sm_p->msgarray_count);
 
     for (i = 0; i < sm_p->msgarray_count; i++)
     {
         PVFS_msg_tag_t session_tag;
         PINT_client_sm_msgpair_state *msg_p = &sm_p->msgarray[i];
-
-        ret = PINT_bucket_map_to_server(&msg_p->svr_addr,
-                                        msg_p->handle,
-                                        msg_p->fs_id);
-        if (ret)
-        {
-            gossip_err("Failed to map meta server address\n");
-            js_p->error_code = ret;
-            return 1;
-        }
+        assert(msg_p);
 
         ret = PINT_encode(&msg_p->req,
                           PINT_ENCODE_REQ,
@@ -635,7 +737,8 @@ static int io_datafile_complete_msgpairs
     job_id_t tmp_id;
     PINT_client_sm_msgpair_state *msg_p = NULL;
 
-    gossip_debug(GOSSIP_CLIENT_DEBUG, "io state: datafile_complete\n");
+    gossip_debug(GOSSIP_CLIENT_DEBUG, "io state: "
+                 "datafile_complete_msgpairs\n");
 
     assert(sm_p->msgarray_count == sm_p->u.io.datafile_count);
     assert(sm_p->msgarray[0].comp_ct >= 0);
@@ -939,11 +1042,9 @@ static int io_datafile_complete_msgpairs
         js_p->error_code = IO_DATAFILE_TRANSFERS_COMPLETE;
         return 1;
     }
-    else
-    {
-        /* there's still something left to transfer */
-        return 0;
-    }
+
+    /* there's still something left to transfer */
+    return 0;
 }
 
 static int io_object_getattr_failure(PINT_client_sm *sm_p,
@@ -953,14 +1054,15 @@ static int io_object_getattr_failure(PIN
                  "io_object_getattr_failure\n", sm_p);
 
     /*
-      NOTE:
-      this can easily happen if we're doing I/O on a file that
+      NOTE: this can easily happen if we're doing I/O on a file that
       was removed by another process
     */
     if (js_p->error_code == 0)
     {
         js_p->error_code = -PVFS_ENOENT;
     }
+
+    sm_p->u.io.stored_error_code = js_p->error_code;
     return 1;
 }
 
@@ -977,6 +1079,9 @@ static int io_analyze_results(PINT_clien
     if (js_p->error_code != IO_DATAFILE_TRANSFERS_COMPLETE)
     {
         /* some sort of error occurred early on */
+        js_p->error_code = (sm_p->u.io.stored_error_code ?
+                            sm_p->u.io.stored_error_code :
+                            js_p->error_code);
         if (js_p->error_code == 0)
         {
             js_p->error_code = -PVFS_EIO;
@@ -1110,12 +1215,42 @@ static int io_analyze_results(PINT_clien
     if (!sm_p->acache_hit)
     {
         free(sm_p->u.io.datafile_handles);
+        sm_p->u.io.datafile_handles = NULL;
     }
+
+    /*
+      FIXME: non bmi errors pop out in flow failures above -- they are
+      not properly marked as flow errors either, so we check for them
+      explicitly here (but not all -- fix it for real).
+    */
+    if (((PVFS_ERROR_CLASS(-error) == PVFS_ERROR_BMI) ||
+         (PVFS_ERROR_CLASS(-error) == PVFS_ERROR_FLOW) ||
+         (error == -ECONNRESET)) &&
+        (sm_p->u.io.retry_count < PVFS2_CLIENT_RETRY_LIMIT))
+    {
+        if (sm_p->acache_hit || sm_p->pinode)
+        {
+            sm_p->acache_hit = 0;
+            PINT_acache_release(sm_p->pinode);
+            sm_p->pinode = NULL;
+        }
+
+        sm_p->u.io.stored_error_code = 0;
+        sm_p->u.io.retry_count++;
+
+        gossip_debug(GOSSIP_IO_DEBUG, "Retrying I/O operation "
+                     "(attempt number %d)\n", sm_p->u.io.retry_count);
+
+        js_p->error_code = IO_RETRY;
+        return 1;
+    }
+
     CLEAN_PRIVATE_MEMBERS(iosm_p);
 
     /* return size, error, and set operation as complete */
     sm_p->u.io.io_resp_p->total_completed = total_size;
-    sm_p->error_code = error;
+    sm_p->error_code = (sm_p->u.io.stored_error_code ?
+                        sm_p->u.io.stored_error_code : error);
     sm_p->op_complete = 1;
 
     return 0;



More information about the PVFS2-CVS mailing list