[Pvfs2-developers] Copy commands segfault on 2.4 kernels
Phil Carns
carns at mcs.anl.gov
Wed Apr 8 09:56:42 EDT 2009
You might also want to turn on all of the kernel module debugging by
echo'ing 32767 into /proc/sys/pvfs2/debug and looking at what it
generated in dmesg and/or /var/log/messages in the crashing cp case.
thanks,
-Phil
Phil Carns wrote:
> I don't see anything offensive in the stat output. There are no
> messages in dmesg or pvfs2-client.log, right?
>
> Two other possible ways to proceed may be to:
>
> 1) try the same strace'd cp on a different file system to compare the
> output and see what system call is supposed to happen next after the
> fstat's
>
> 2) try downloading the source for your version of core-utils
> (http://ftp.gnu.org/gnu/coreutils/) and compiling it with debugging
> symbols so that you can actually see the segfault in gdb or valgrind.
> You can probably just set the CFLAGS env variable to "-g" before running
> configure in core-utils to get debugging symbols.
>
> Actually, running valgrind on the cp command that you already have might
> possibly indicate something interesting, even if it can't map it to a
> particular line number.
>
> -Phil
>
> Bart Taylor wrote:
>> Here is a full strace -v output:
>>
>>
>>
>> [root at node1 root]# strace -v cp test.file /mnt/pvfs2/
>> execve("/bin/cp", ["cp", "test.file", "/mnt/pvfs2/"], [/* 22 vars */])
>> = 0
>> uname({sysname="Linux", nodename="node1",
>> release="2.4.21-27.0.2.ELsmp", version="#1 SMP Wed Jan 12 23:35:44 EST
>> 2005", machine="i686"}) = 0
>> brk(0) = 0x9692000
>> open("/etc/ld.so.preload", O_RDONLY) = -1 ENOENT (No such file or
>> directory)
>> open("/etc/ld.so.cache", O_RDONLY) = 3
>> fstat64(3, {st_dev=makedev(104, 3), st_ino=229475,
>> st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096,
>> st_blocks=32, st_size=14525, st_atime=2009/04/07-15:54:03,
>> st_mtime=2009/04/07-13:38:35, st_ctime=2009/04/07-13:38:35}) = 0
>> old_mmap(NULL, 14525, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb75f5000
>> close(3) = 0
>> open("/lib/libacl.so.1", O_RDONLY) = 3
>> read(3,
>> "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0@\24\0\000"..., 512) =
>> 512
>> fstat64(3, {st_dev=makedev(104, 3), st_ino=524363,
>> st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096,
>> st_blocks=40, st_size=19248, st_atime=2009/04/07-15:54:03,
>> st_mtime=2003/01/28-18:42:21, st_ctime=2009/04/07-13:37:22}) = 0
>> old_mmap(NULL, 22224, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x89c000
>> old_mmap(0x8a1000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED,
>> 3, 0x4000) = 0x8a1000
>> close(3) = 0
>> mprotect(0xbfffa000, 4096,
>> PROT_READ|PROT_WRITE|PROT_EXEC|PROT_GROWSDOWN) = 0
>> open("/lib/tls/libc.so.6", O_RDONLY) = 3
>> read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\200X\1"...,
>> 512) = 512
>> old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
>> -1, 0) = 0xb75f4000
>> fstat64(3, {st_dev=makedev(104, 3), st_ino=14172162,
>> st_mode=S_IFREG|0755, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096,
>> st_blocks=3080, st_size=1571692, st_atime=2009/04/07-15:54:03,
>> st_mtime=2004/10/22-04:01:20, st_ctime=2009/04/07-13:37:20}) = 0
>> old_mmap(NULL, 1275340, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) =
>> 0xd4c000
>> old_mmap(0xe7e000, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED,
>> 3, 0x132000) = 0xe7e000
>> old_mmap(0xe81000, 9676, PROT_READ|PROT_WRITE,
>> MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xe81000
>> close(3) = 0
>> open("/lib/libattr.so.1", O_RDONLY) = 3
>> read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\320\n\0"...,
>> 512) = 512
>> fstat64(3, {st_dev=makedev(104, 3), st_ino=524361,
>> st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096,
>> st_blocks=16, st_size=7148, st_atime=2009/04/07-15:54:03,
>> st_mtime=2003/01/28-18:09:10, st_ctime=2009/04/07-13:37:22}) = 0
>> old_mmap(NULL, 10124, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0xb1e000
>> old_mmap(0xb20000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED,
>> 3, 0x1000) = 0xb20000
>> close(3) = 0
>> set_thread_area({entry_number:-1 -> 6, base_addr:0xb75f4ae0,
>> limit:1048575, seg_32bit:1, contents:0, read_exec_only:0,
>> limit_in_pages:1, seg_not_present:0, useable:1}) = 0
>> munmap(0xb75f5000, 14525) = 0
>> brk(0) = 0x9692000
>> brk(0x96b3000) = 0x96b3000
>> brk(0) = 0x96b3000
>> open("/usr/lib/locale/locale-archive", O_RDONLY|O_LARGEFILE) = 3
>> fstat64(3, {st_dev=makedev(104, 3), st_ino=3325956,
>> st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096,
>> st_blocks=62808, st_size=32148976, st_atime=2009/04/07-15:29:59,
>> st_mtime=2009/04/07-13:37:19, st_ctime=2009/04/07-13:37:20}) = 0
>> mmap2(NULL, 2097152, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb73f4000
>> close(3) = 0
>> geteuid32() = 0
>> lstat64("/mnt/pvfs2/", {st_dev=makedev(0, 10), st_ino=1048576,
>> st_mode=S_IFDIR|S_ISVTX|0777, st_nlink=1, st_uid=0, st_gid=0,
>> st_blksize=33554432, st_blocks=8, st_size=4096,
>> st_atime=2009/04/07-15:31:17, st_mtime=2009/04/07-15:31:17,
>> st_ctime=2009/04/07-15:31:17}) = 0
>> stat64("/mnt/pvfs2/", {st_dev=makedev(0, 10), st_ino=1048576,
>> st_mode=S_IFDIR|S_ISVTX|0777, st_nlink=1, st_uid=0, st_gid=0,
>> st_blksize=33554432, st_blocks=8, st_size=4096,
>> st_atime=2009/04/07-15:31:17, st_mtime=2009/04/07-15:31:17,
>> st_ctime=2009/04/07-15:31:17}) = 0
>> stat64("test.file", {st_dev=makedev(104, 3), st_ino=294926,
>> st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096,
>> st_blocks=8, st_size=5, st_atime=2009/04/07-15:29:59,
>> st_mtime=2009/04/07-14:45:08, st_ctime=2009/04/07-14:46:32}) = 0
>> stat64("/mnt/pvfs2/test.file", {st_dev=makedev(0, 10), st_ino=1048571,
>> st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0,
>> st_blksize=33554432, st_blocks=8, st_size=5,
>> st_atime=2009/04/07-15:31:17, st_mtime=2009/04/07-15:31:17,
>> st_ctime=2009/04/07-15:31:17}) = 0
>> open("test.file", O_RDONLY|O_LARGEFILE) = 3
>> fstat64(3, {st_dev=makedev(104, 3), st_ino=294926,
>> st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096,
>> st_blocks=8, st_size=5, st_atime=2009/04/07-15:29:59,
>> st_mtime=2009/04/07-14:45:08, st_ctime=2009/04/07-14:46:32}) = 0
>> open("/mnt/pvfs2/test.file", O_WRONLY|O_TRUNC|O_LARGEFILE) = 4
>> fstat64(4, {st_dev=makedev(0, 10), st_ino=1048571,
>> st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0,
>> st_blksize=33554432, st_blocks=8, st_size=0,
>> st_atime=2009/04/07-15:31:17, st_mtime=2009/04/07-15:31:17,
>> st_ctime=2009/04/07-15:55:20}) = 0
>> fstat64(3, {st_dev=makedev(104, 3), st_ino=294926,
>> st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096,
>> st_blocks=8, st_size=5, st_atime=2009/04/07-15:29:59,
>> st_mtime=2009/04/07-14:45:08, st_ctime=2009/04/07-14:46:32}) = 0
>> --- SIGSEGV (Segmentation fault) @ 0 (0) ---
>> +++ killed by SIGSEGV +++
>> [root at node1 root]#
>>
>>
>>
>>
>>
>>
>>
>> On Tue, Apr 7, 2009 at 2:53 PM, Phil Carns <carns at mcs.anl.gov
>> <mailto:carns at mcs.anl.gov>> wrote:
>>
>> Hi Bart,
>>
>> From your strace output, my guess is that cp is running into
>> trouble with the value of one of the fstat() fields, but it's hard to
>> say which one.
>>
>> Are you able to reproduce this reliably? Could you run the strace
>> again with the -v option to see if it gives a full listing of what
>> values were in the stat structs it got before crashing?
>>
>> -Phil
>>
>> Bart Taylor wrote:
>>
>> Hey guys,
>>
>> I am running into a problem with a system copy command
>> segfaulting on 2.4 kernels. Specifically, I am seeing this show
>> up on RHEL3 machines running a patched version of PVFS 2.6.
>> Machines running Linux 2.6 kernels do not experience this
>> problem. I believe we may have mentioned this recently but
>> hoped it would be fixed by some updates pulled into dcache.
>> That, apparently, is not the case.
>>
>> The segfault is extremely consistent; it happens every time a cp
>> is executed with a PVFS2 file system as the target. The target
>> file is always created with a size of zero, so at least part of
>> the command is completing. 'dd' commands execute normally.
>>
>> The setup is simple: 1 server node (RHEL4 2.6 kernel) with the
>> default interactive genconfig output, and 1 client with a 2.4
>> kernel. Mount the file system, execute a copy onto the file
>> system.
>> Here is the conf file contents:
>>
>> <Defaults>
>> UnexpectedRequests 50
>> EventLogging none
>> LogStamp datetime
>> BMIModules bmi_tcp
>> FlowModules flowproto_multiqueue
>> PerfUpdateInterval 1000
>> ServerJobBMITimeoutSecs 30
>> ServerJobFlowTimeoutSecs 30
>> ClientJobBMITimeoutSecs 300
>> ClientJobFlowTimeoutSecs 300
>> ClientRetryLimit 5
>> ClientRetryDelayMilliSecs 2000
>> TCPBindSpecific yes
>> </Defaults>
>>
>> <Aliases>
>> Alias node1 tcp://node1:3334
>> </Aliases>
>>
>> <Filesystem>
>> Name pvfs2-fs
>> ID 1227216139
>> RootHandle 1048576
>> <MetaHandleRanges>
>> Range node1 4-2147483650
>> </MetaHandleRanges>
>> <DataHandleRanges>
>> Range node1 2147483651-4294967297
>> </DataHandleRanges>
>> <StorageHints>
>> TroveSyncMeta no
>> TroveSyncData no
>> CoalescingHighWatermark infinity
>> CoalescingLowWatermark 0
>> TroveSyncMetaTimerSecs 5
>> DBCacheSizeBytes 1073741824
>> </StorageHints>
>> </Filesystem>
>>
>> And here is the last bit of an strace on a copy command:
>>
>> [root at node1 root]# strace cp test.file /mnt/pvfs2/
>> .....
>> brk(0) = 0x95ce000
>> open("/usr/lib/locale/locale-archive", O_RDONLY|O_LARGEFILE) = 3
>> fstat64(3, {st_mode=S_IFREG|0644, st_size=32148976, ...}) = 0
>> mmap2(NULL, 2097152, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb73f4000
>> close(3) = 0
>> geteuid32() = 0
>> lstat64("/mnt/pvfs2/", {st_mode=S_IFDIR|S_ISVTX|0777,
>> st_size=4096, ...}) = 0
>> stat64("/mnt/pvfs2/", {st_mode=S_IFDIR|S_ISVTX|0777,
>> st_size=4096, ...}) = 0
>> stat64("test.file", {st_mode=S_IFREG|0644, st_size=5, ...}) = 0
>> stat64("/mnt/pvfs2/test.file", {st_mode=S_IFREG|0644, st_size=0,
>> ...}) = 0
>> open("test.file", O_RDONLY|O_LARGEFILE) = 3
>> fstat64(3, {st_mode=S_IFREG|0644, st_size=5, ...}) = 0
>> open("/mnt/pvfs2/test.file", O_WRONLY|O_TRUNC|O_LARGEFILE) = 4
>> fstat64(4, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0
>> fstat64(3, {st_mode=S_IFREG|0644, st_size=5, ...}) = 0
>> --- SIGSEGV (Segmentation fault) @ 0 (0) ---
>> +++ killed by SIGSEGV +++
>>
>>
>> There is nothing in the client or server logs without turning on
>> additional logging.
>>
>> Are there any suggestions on what might be causing this? Can I
>> provide any additional information that will be helpful for
>> debugging?
>>
>> Bart.
>>
>>
>>
>> ------------------------------------------------------------------------
>>
>> _______________________________________________
>> Pvfs2-developers mailing list
>> Pvfs2-developers at beowulf-underground.org
>> <mailto:Pvfs2-developers at beowulf-underground.org>
>>
>> http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers
>>
>>
>>
>
> _______________________________________________
> Pvfs2-developers mailing list
> Pvfs2-developers at beowulf-underground.org
> http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers
More information about the Pvfs2-developers
mailing list