[PVFS-developers] Faster stats

Porter Don PorterDE@mercury.hendrix.edu
Tue, 16 Sep 2003 16:17:25 -0500


This message is in MIME format. Since your mail reader does not understand
this format, some or all of this message may not be legible.

------_=_NextPart_000_01C37C97.ED30CC50
Content-Type: text/plain


Since you are rolling in patches as of late, I thought I would go ahead and
try to get this one in as well...

I noticed that running ls -l on a pvfs volume that either contains a lot of
files or is under load is extremely slow.  This is because every time a stat
is called, the manager goes to _every_ iod to tally up the file size.

So I thought that perhaps we could store the filesize as of the last stat or
close on the file in the metadata, which happens to have an unused field for
filesize (as well as the one in the stat structure). 

I implemented a check for whether or not a file has changed: from the cached
filesize, calculate what the size should be on the iod that holds the end of
the file, and compare it with the size that iod actually reports.  If the two
differ, then we know that the others have changed as well and we re-tally the
whole thing based on the old algorithm.  If they match, we just return the
cached size.

Further, because the manager tracks which files are open, we only need to do
this at all on an open file.  Because we write the stat to disk at close
anyway, we can assume that a closed file has the correct size cached.

This patch implements this with a run-time flag '-f' (for fast stats).  If
this flag is not set, the old behavior should be used, which makes the switch
a bit more painless.  If everyone likes it, we can take out the old code
and have it always do what is in the 'if(fast_stat){}' blocks.

I have only done performance testing on small clusters with 3 iods, but with
3 clients reading and writing a file, a 4th client's ls -l time dropped from
8.4 s to 0.6 s.  Under no load on this small cluster, there was on average a
50% speedup.  I think that on larger and more loaded clusters the
performance increase should be more dramatic.

Like I said, I was waiting to get to do a little testing on a larger
cluster, but since you are rolling in patches anyway....

Thanks,
don


------_=_NextPart_000_01C37C97.ED30CC50
Content-Type: application/octet-stream;
	name="fast-stat.patch"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
	filename="fast-stat.patch"

diff -ur pvfs-1.6.0/mgr/meta/md_close.c =
pvfs-ls-dev/mgr/meta/md_close.c=0A=
--- pvfs-1.6.0/mgr/meta/md_close.c	Wed May  7 11:37:30 2003=0A=
+++ pvfs-ls-dev/mgr/meta/md_close.c	Wed Sep 10 16:29:34 2003=0A=
@@ -26,19 +26,30 @@=0A=
 #include <meta.h>=0A=
 #include <req.h>=0A=
 #include "metaio.h"=0A=
+#include <fslist.h>=0A=
 =0A=
-int md_close(mreq_p req, char *fname)=0A=
+extern int fast_stat;=0A=
+=0A=
+int md_close(mreq_p req, char *fname, fsinfo_p fs_p)=0A=
 {=0A=
 	time_t ctime;=0A=
 	fmeta meta;=0A=
 	int fd;=0A=
 =0A=
+	int ret, i;=0A=
+	ireq iodreq;=0A=
+	=0A=
+	/* variables used for calculating correct file size */=0A=
+	uint64_t real_file_sz, tmp_file_sz, iod_file_sz;=0A=
+	uint64_t strip_sz, stripe_sz, nr_strips, leftovers;=0A=
+	uint64_t total_blocks, blocks_per_node, remainder, block_remainder, =
terminal_iod_size, terminal_iod_index, term_iod_value;=0A=
+	=0A=
 	/* get fd for meta file */=0A=
 	if ((fd =3D meta_open(fname, O_RDWR)) < 0) {=0A=
 		perror("md_close: meta_open");=0A=
 		return(-1);=0A=
 	}=0A=
-=0A=
+	=0A=
 	/* get metadata */=0A=
 	if (meta_read(fd, &meta) < 0) {=0A=
 		perror("md_close: meta_read");=0A=
@@ -61,7 +72,101 @@=0A=
 	meta.u_stat.mtime =3D req->req.close.meta.u_stat.mtime;=0A=
 	/* don't change the creation time. */=0A=
 =0A=
-/* Write metadata back */=0A=
+	/* lets also start recording file size if -f option enabled*/=0A=
+	if(fast_stat){=0A=
+		/* if regular file, get size from iods */=0A=
+		if (S_ISREG(meta.u_stat.st_mode)){=0A=
+			iodreq.majik_nr       =3D IOD_MAJIK_NR;=0A=
+			iodreq.release_nr     =3D PVFS_RELEASE_NR;=0A=
+			iodreq.type 			 =3D IOD_STAT;=0A=
+			iodreq.dsize 			 =3D 0;=0A=
+			iodreq.req.stat.f_ino =3D meta.u_stat.st_ino;=0A=
+			=0A=
+			if ((ret =3D send_req(fs_p->iod, fs_p->nr_iods, =0A=
+									  meta.p_stat.base, meta.p_stat.pcount, =0A=
+									  &iodreq, fname, NULL)) < 0)=0A=
+				{=0A=
+					perror("md_close: send_req");=0A=
+					return -1;=0A=
+				}	=0A=
+			=0A=
+			strip_sz =3D meta.p_stat.ssize;=0A=
+			stripe_sz =3D meta.p_stat.ssize * =0A=
+				meta.p_stat.pcount;=0A=
+			real_file_sz =3D 0;=0A=
+			=0A=
+		/* We want to figure out the iod where the end of the file=0A=
+		 *	is and what it _ought_ to have as its iod_file_sz based=0A=
+		 *	upon the size we have stored in the metadata.  If these=0A=
+		 *	are the same, nothing has changed.  If not, we need to=0A=
+		 *	actually read all nodes.=0A=
+		 */=0A=
+			=0A=
+			total_blocks =3D meta.fsize / strip_sz;=0A=
+			remainder =3D meta.fsize % strip_sz;=0A=
+			blocks_per_node =3D total_blocks / meta.p_stat.pcount;=0A=
+			block_remainder =3D total_blocks % meta.p_stat.pcount;=0A=
+			terminal_iod_size =3D (blocks_per_node * strip_sz) + (remainder > 0 =
? remainder : strip_sz);=0A=
+			terminal_iod_index =3D remainder > 0 ? block_remainder + 1 : =
block_remainder;=0A=
+			=0A=
+			term_iod_value =3D fs_p->iod[(terminal_iod_index + =
meta.p_stat.base) %=0A=
+												fs_p->nr_iods].ack.ack.stat.fsize;=0A=
+			=0A=
+			if(term_iod_value !=3D terminal_iod_size){ =0A=
+				/* things have changed since we last wrote size to disk */=0A=
+				/* otherwise, we can just skip all this foolishness */=0A=
+				=0A=
+				for (i =3D 0; i < meta.p_stat.pcount; i++) {=0A=
+					/* i gives us the index into the iods, the ith iod in order */=0A=
+					iod_file_sz =3D fs_p->iod[(i + meta.p_stat.base) %=0A=
+													fs_p->nr_iods].ack.ack.stat.fsize;=0A=
+					if (iod_file_sz =3D=3D 0) continue;=0A=
+					=0A=
+					/* the plan is to calculate the file size based on what=0A=
+					 * this particular iod has.  the largest of these sizes=0A=
+					 * is the actual file size, taking into account sparse=0A=
+					 * issues.=0A=
+					 *=0A=
+					 * there are four components to the calculation:=0A=
+					 *=0A=
+					 * soff =3D should be 0 since it isn't implemented :)=0A=
+					 *=0A=
+					 * nr_strips * stripe_sz =3D this is a calculation of how many=0A=
+					 *   complete stripes are present.  note that any partial =
strip=0A=
+					 *   or the last complete strip (if there are no partials)=0A=
+					 *   may not be part of a whole stripe (thus the if() below).=0A=
+					 *=0A=
+					 * i * strip_sz =3D this accounts for full strips that should =
be=0A=
+					 *   present on iods that come before us in the iod ordering=0A=
+					 *=0A=
+					 * leftovers =3D the last remaining bytes=0A=
+					 */=0A=
+					nr_strips =3D iod_file_sz / strip_sz;=0A=
+					leftovers =3D iod_file_sz % strip_sz;=0A=
+					=0A=
+					if (leftovers =3D=3D 0) {=0A=
+						nr_strips--;=0A=
+						leftovers +=3D strip_sz;=0A=
+					}=0A=
+					tmp_file_sz =3D nr_strips * stripe_sz + i * strip_sz +=0A=
+						leftovers;=0A=
+					if (tmp_file_sz > real_file_sz) real_file_sz =3D tmp_file_sz;=0A=
+				}=0A=
+				=0A=
+				/* we're adding a 64-bit size field in addition to the st_size =0A=
+				 * later i'll remove the addition into st_size probably=0A=
+				 * i'm leaving it for now=0A=
+				 */=0A=
+				=0A=
+				=0A=
+				meta.u_stat.st_size =3D real_file_sz;=0A=
+				meta.fsize =3D real_file_sz;=0A=
+			}=0A=
+		}=0A=
+		// end of copy from do_stats=0A=
+	}	=0A=
+=0A=
+	/* Write metadata back */=0A=
 	if (meta_write(fd, &meta) < 0) {=0A=
 		perror("md_close: meta_write.");=0A=
 		return(-1);=0A=
Only in pvfs-ls-dev/mgr/meta: md_close.c~=0A=
Only in pvfs-ls-dev/mgr/meta: md_open.c~=0A=
diff -ur pvfs-1.6.0/mgr/meta/metaio.c pvfs-ls-dev/mgr/meta/metaio.c=0A=
--- pvfs-1.6.0/mgr/meta/metaio.c	Fri Oct 25 12:33:05 2002=0A=
+++ pvfs-ls-dev/mgr/meta/metaio.c	Wed Sep 10 16:10:45 2003=0A=
@@ -121,7 +121,6 @@=0A=
 =0A=
 	ret =3D read(fd, meta_p, sizeof(struct fmeta));=0A=
 	if (ret =3D=3D sizeof(struct fmeta)) {=0A=
-			  meta_p->u_stat.st_size=3D0;=0A=
 			  return 0;=0A=
 	}=0A=
 	/* it's no longer ok to read the old 1_5_0 structure */=0A=
diff -ur pvfs-1.6.0/mgr/mgr.c pvfs-ls-dev/mgr/mgr.c=0A=
--- pvfs-1.6.0/mgr/mgr.c	Mon Jun 16 09:52:41 2003=0A=
+++ pvfs-ls-dev/mgr/mgr.c	Wed Sep 10 16:33:02 2003=0A=
@@ -130,6 +130,7 @@=0A=
 fslist_p active_p;=0A=
 int acc; /* accept socket for mgr */=0A=
 int debug_on  =3D 0;=0A=
+int fast_stat =3D 0;=0A=
 int is_daemon =3D 1;=0A=
 int random_base =3D 0;=0A=
 int mgr_port =3D MGR_REQ_PORT;=0A=
@@ -172,11 +173,12 @@=0A=
 #endif=0A=
 =0A=
 	memset(&ack, 0, sizeof(ack));=0A=
-	while ((opt =3D getopt(argc, argv,"t:dgrp:s:")) !=3D EOF) {=0A=
+	while ((opt =3D getopt(argc, argv,"t:dgrfp:s:")) !=3D EOF) {=0A=
 		switch (opt) {=0A=
 			case 't' : timeout =3D atoi(optarg); break;=0A=
 			case 'd' : is_daemon=3D0; break;=0A=
 			case 'g' : debug_on=3D1; break;=0A=
+			case 'f' : fast_stat=3D1; break;=0A=
 			case 'r' : random_base=3D1; break;=0A=
 			case 'p' : mgr_port=3D atoi(optarg); break;=0A=
 			case 's' :=0A=
@@ -731,8 +733,12 @@=0A=
 	ireq iodreq;=0A=
 =0A=
 	/* variables used for calculating correct file size */=0A=
-	int64_t real_file_sz, tmp_file_sz, iod_file_sz;=0A=
-	int64_t strip_sz, stripe_sz, nr_strips, leftovers;=0A=
+	uint64_t real_file_sz, tmp_file_sz, iod_file_sz;=0A=
+	uint64_t strip_sz, stripe_sz, nr_strips, leftovers;=0A=
+	uint64_t total_blocks, blocks_per_node, remainder, block_remainder, =
terminal_iod_size, terminal_iod_index, term_iod_value;=0A=
+=0A=
+	int fd;=0A=
+=0A=
 =0A=
 	/* check for reserved name first */=0A=
 	if (resv_name(data_p) !=3D 0) {=0A=
@@ -750,8 +756,102 @@=0A=
 		return(send_error(ret, errno, sock, ack_p));=0A=
 	}=0A=
 =0A=
+	if(fast_stat){=0A=
+	  =0A=
+	  if ( (f_search(fs_p->fl_p, ack_p->ack.stat.meta.u_stat.st_ino) !=3D =
NULL) =0A=
+	       || ack_p->ack.stat.meta.u_stat.st_size =3D=3D 0 =0A=
+	       || ack_p->ack.stat.meta.fsize =3D=3D 0 )=0A=
+	    { /*file is open - we don't actually need the return value of =
f_search after this*/=0A=
+	      /* find last iod in sequence and multiply by numinodes */=0A=
+	      =0A=
+	      /* if regular file, get size from iods */=0A=
+	      if (S_ISREG(ack_p->ack.stat.meta.u_stat.st_mode)) {=0A=
+		iodreq.majik_nr       =3D IOD_MAJIK_NR;=0A=
+		iodreq.release_nr     =3D PVFS_RELEASE_NR;=0A=
+		iodreq.type 			 =3D IOD_STAT;=0A=
+		iodreq.dsize 			 =3D 0;=0A=
+		iodreq.req.stat.f_ino =3D ack_p->ack.stat.meta.u_stat.st_ino;=0A=
+		if ((ret =3D send_req(fs_p->iod, fs_p->nr_iods, =0A=
+				    ack_p->ack.stat.meta.p_stat.base, =
ack_p->ack.stat.meta.p_stat.pcount, =0A=
+				    &iodreq, data_p, NULL)) < 0)=0A=
+		  {=0A=
+		    return(send_error(ret, errno, sock, ack_p));=0A=
+		  }	=0A=
+		=0A=
+		strip_sz =3D ack_p->ack.stat.meta.p_stat.ssize;=0A=
+		stripe_sz =3D ack_p->ack.stat.meta.p_stat.ssize * =0A=
+		  ack_p->ack.stat.meta.p_stat.pcount;=0A=
+		real_file_sz =3D 0;=0A=
+=0A=
+		if( ack_p->ack.stat.meta.u_stat.st_size !=3D =
ack_p->ack.stat.meta.fsize){ /*sanity checking */=0A=
+		  ERR("do_stat: For some VERY strange reason st_size !=3D =
fsize\n");=0A=
+		  /* just pick one if they are out of sync */=0A=
+		  ack_p->ack.stat.meta.u_stat.st_size =3D =
ack_p->ack.stat.meta.fsize;=0A=
+		}=0A=
+=0A=
+		/* We want to figure out the iod where the end of the file=0A=
+		 *	is and what it _ought_ to have as its iod_file_sz based=0A=
+		 *	upon the size we have stored in the metadata.  If these=0A=
+		 *	are the same, nothing has changed.  If not, we need to=0A=
+		 *	actually read all nodes.=0A=
+		 */=0A=
 =0A=
-	/* if regular file, get size from iods */=0A=
+		total_blocks =3D ack_p->ack.stat.meta.fsize / strip_sz;=0A=
+		remainder =3D ack_p->ack.stat.meta.fsize % strip_sz;=0A=
+		blocks_per_node =3D total_blocks / =
ack_p->ack.stat.meta.p_stat.pcount;=0A=
+		block_remainder =3D total_blocks % =
ack_p->ack.stat.meta.p_stat.pcount;=0A=
+		terminal_iod_size =3D (blocks_per_node * strip_sz) + (remainder > 0 =
? remainder : strip_sz);=0A=
+		terminal_iod_index =3D remainder > 0 ? block_remainder + 1 : =
block_remainder;=0A=
+		=0A=
+		term_iod_value =3D fs_p->iod[(terminal_iod_index + =
ack_p->ack.stat.meta.p_stat.base) %=0A=
+					   fs_p->nr_iods].ack.ack.stat.fsize;=0A=
+		=0A=
+		if(term_iod_value !=3D terminal_iod_size){ =0A=
+		  /* things have changed since we last wrote size to disk */=0A=
+		  /* otherwise, we can just skip all this foolishness */=0A=
+		  for (i =3D 0; i < ack_p->ack.stat.meta.p_stat.pcount; i++) {=0A=
+		    /* i gives us the index into the iods, the ith iod in order =
*/=0A=
+=0A=
+		    if(i =3D=3D terminal_iod_index){ /*no need to do this call again =
*/=0A=
+		      iod_file_sz =3D term_iod_value;=0A=
+		    } else {=0A=
+		      iod_file_sz =3D fs_p->iod[(i + =
ack_p->ack.stat.meta.p_stat.base) %=0A=
+					      fs_p->nr_iods].ack.ack.stat.fsize;=0A=
+		    }=0A=
+=0A=
+		    //if (iod_file_sz =3D=3D 0) continue; =0A=
+		    if (iod_file_sz =3D=3D 0) break; /*there really shouldn't be =
anything else out there */=0A=
+		    =0A=
+		    nr_strips =3D iod_file_sz / strip_sz;=0A=
+		    leftovers =3D iod_file_sz % strip_sz;=0A=
+		    =0A=
+		    if (leftovers =3D=3D 0) {=0A=
+		      nr_strips--;=0A=
+		      leftovers +=3D strip_sz;=0A=
+		    }=0A=
+		    real_file_sz =3D nr_strips * stripe_sz + i * strip_sz +=0A=
+		      leftovers;=0A=
+		  }=0A=
+=0A=
+		  ack_p->ack.stat.meta.u_stat.st_size =3D real_file_sz;=0A=
+		  ack_p->ack.stat.meta.fsize =3D real_file_sz;=0A=
+		  =0A=
+		  /* store this info in the metadata so that we don't pay again =
*/=0A=
+		  fd =3D meta_open(data_p, O_WRONLY);=0A=
+		  if((meta_write(fd, &(ack_p->ack.stat.meta))) !=3D 0){=0A=
+		    PERROR("do_stat: meta_write");=0A=
+		  }=0A=
+		  if((meta_close(fd)) !=3D 0){=0A=
+		    PERROR("do_stat: meta_close");=0A=
+		  }=0A=
+		}=0A=
+	      }=0A=
+	    }=0A=
+	  /* else do nothing because it is already saved */=0A=
+	  =0A=
+	} else { /* old fashioned-way */=0A=
+	  =0A=
+	  /* if regular file, get size from iods */=0A=
 	if (S_ISREG(ack_p->ack.stat.meta.u_stat.st_mode)) {=0A=
 		iodreq.majik_nr       =3D IOD_MAJIK_NR;=0A=
 		iodreq.release_nr     =3D PVFS_RELEASE_NR;=0A=
@@ -815,6 +915,8 @@=0A=
 		ack_p->ack.stat.meta.fsize =3D real_file_sz;=0A=
 	}=0A=
 =0A=
+	} /* end of unfast stat */=0A=
+=0A=
 	if (!ack_p->status) return(bsend(sock, ack_p, sizeof(mack)));=0A=
 	else send_error(-1, ack_p->eno, sock, ack_p);=0A=
 =0A=
@@ -1076,7 +1178,7 @@=0A=
 		}=0A=
 		else {=0A=
 			/* call md_close() to update times and such */=0A=
-			md_close(req_p, f_p->f_name);=0A=
+			md_close(req_p, f_p->f_name, fs_p);=0A=
 		}=0A=
 		f_rem(fs_p->fl_p,f_p->f_ino); /* wax file info */=0A=
 	}=0A=
@@ -1808,7 +1910,7 @@=0A=
 			}=0A=
 			req.req.close.meta.u_stat.atime =3D time(NULL);=0A=
 			/* store the time values in the metadata file */=0A=
-			md_close(&req, f_p->f_name);=0A=
+			md_close(&req, f_p->f_name, iclose_fsp);=0A=
 =0A=
 			if (f_p->unlinked >=3D 0) /* tell IODs to remove the file too */ =
{=0A=
 				iodreq.majik_nr         =3D IOD_MAJIK_NR;=0A=
Only in pvfs-ls-dev/mgr: mgr.c~=0A=
diff -ur pvfs-1.6.0/mgr/mgr.h pvfs-ls-dev/mgr/mgr.h=0A=
--- pvfs-1.6.0/mgr/mgr.h	Thu Oct 24 10:09:33 2002=0A=
+++ pvfs-ls-dev/mgr/mgr.h	Tue Sep  9 15:27:58 2003=0A=
@@ -59,7 +59,7 @@=0A=
 /* METADATA FN PROTOTYPES */=0A=
 int md_chmod(mreq_p request, char *fname);=0A=
 int md_chown(mreq_p request, char *fname);=0A=
-int md_close(mreq_p req, char *fname);=0A=
+int md_close(mreq_p req, char *fname, fsinfo_p fs_p);=0A=
 int md_open(char *name, mreq_p request, fmeta_p metar_p);=0A=
 int md_rmdir(mreq_p req_p, char *fname);=0A=
 int md_mkdir(char *dirpath, dmeta_p dir);=0A=

------_=_NextPart_000_01C37C97.ED30CC50--