Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: Re: zsh hangs (3.0.6-pre-2, 3.1.5-pws-16)



On Apr 29,  9:01am, Bart Schaefer wrote:
} Subject: Re: zsh hangs (3.0.6-pre-2, 3.1.5-pws-16)
}
} The hang appears to be a race condition in exec.c: getoutput().  Zsh is
} blocked forever in sigsuspend() waiting for the SIGCHLD that will tell
} it `resize` has exited.  Probably that signal arrived while zsh was
} handling the SIGWINCH sent by "resize" and zsh either improperly handled
} it then or dropped it on the floor.

This is indeed a race condition; I can't reproduce it under strace, but
the output I do get makes me suspicious of what's going on.  Here's a
fragment starting with the first WINCH and ending with the second:

--- SIGWINCH (Window size changed) ---
sigprocmask(SIG_BLOCK, ~[], [WINCH])    = 0
sigprocmask(SIG_SETMASK, [WINCH], ~[KILL STOP]) = 0
ioctl(10, TIOCGWINSZ, {ws_row=28, ws_col=86, ws_xpixel=535, ws_ypixel=368}) = 0
ioctl(10, TIOCSPGRP, [19992])           = 0
ioctl(10, SNDCTL_TMR_STOP, {B9600 opost isig -icanon -echo ...}) = 0
geteuid()                               = 674
write(10, "\r\33[m\17\33[27m\33[24m\33[Jzag"..., 26) = 26
write(10, "\33[K\33[67C(p6)\r\33[8C", 17) = 17
sigprocmask(SIG_BLOCK, [CHLD], [WINCH]) = 0
time(NULL)                              = 925778145
pipe([4, 5])                            = 0
fcntl(4, F_DUPFD, 10)                   = 11
close(4)                                = 0
fcntl(5, F_DUPFD, 10)                   = 12
close(5)                                = 0
sigprocmask(SIG_BLOCK, [CHLD], [CHLD WINCH]) = 0
fork()                                  = 19998
close(12)                               = 0
fcntl(11, F_GETFL)                      = 0 (flags O_RDONLY)
fstat(11, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x40008000
lseek(11, 0, SEEK_CUR)                  = -1 ESPIPE (Illegal seek)
read(11, "TERMCAP=\'xterm|vs100|xterms|xte"..., 4096) = 990
read(11, "", 4096)                      = 0
close(11)                               = 0
munmap(0x40008000, 4096)                = 0
sigsuspend(~[HUP CHLD] <unfinished ...>
--- SIGCHLD (Child exited) ---
<... sigsuspend resumed> )              = -1 EINTR (Interrupted system call)
sigprocmask(SIG_BLOCK, ~[], ~[HUP KILL STOP]) = 0
sigprocmask(SIG_SETMASK, ~[HUP KILL STOP], ~[KILL STOP]) = 0
wait4(-1, [WIFEXITED(s) && WEXITSTATUS(s) == 0], WNOHANG|WUNTRACED, NULL) = 19998
times({tms_utime=3, tms_stime=6, tms_cutime=5, tms_cstime=3}) = 22268196
wait4(-1, 0xbfffefb4, WNOHANG|WUNTRACED, NULL) = -1 ECHILD (No child processes)
sigreturn()                             = ? (mask now [CHLD WINCH])
brk(0x80b4000)                          = 0x80b4000
sigprocmask(SIG_BLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_BLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [WINCH]) = 0
sigprocmask(SIG_BLOCK, [CHLD], [WINCH]) = 0
ioctl(10, TIOCGWINSZ, {ws_row=28, ws_col=86, ws_xpixel=516, ws_ypixel=364}) = 0
ioctl(10, TIOCSWINSZ, {ws_row=28, ws_col=86, ws_xpixel=516, ws_ypixel=364}) = 0
ioctl(10, TIOCSPGRP, [19992])           = 0
ioctl(10, SNDCTL_TMR_STOP, {B9600 opost isig -icanon -echo ...}) = 0
geteuid()                               = 674
write(10, "\r\33[m\17\33[27m\33[24m\33[Jzag"..., 26) = 26
write(10, "\33[K\33[73C(p6)\r\33[8C", 17) = 17
sigprocmask(SIG_BLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [WINCH]) = 0
sigprocmask(SIG_BLOCK, [CHLD], [WINCH]) = 0
ioctl(10, TIOCGWINSZ, {ws_row=28, ws_col=86, ws_xpixel=516, ws_ypixel=364}) = 0
ioctl(10, TIOCSWINSZ, {ws_row=28, ws_col=86, ws_xpixel=516, ws_ypixel=364}) = 0
ioctl(10, TIOCSPGRP, [19992])           = 0
ioctl(10, SNDCTL_TMR_STOP, {B9600 opost isig -icanon -echo ...}) = 0
geteuid()                               = 674
write(10, "\r\33[m\17\33[27m\33[24m\33[Jzag"..., 26) = 26
write(10, "\33[K\33[73C(p6)\r\33[8C", 17) = 17
sigprocmask(SIG_BLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [WINCH]) = 0
sigprocmask(SIG_BLOCK, [CHLD], [WINCH]) = 0
sigprocmask(SIG_BLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [WINCH]) = 0
sigprocmask(SIG_BLOCK, [CHLD], [WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [WINCH]) = 0
sigprocmask(SIG_BLOCK, [CHLD], [WINCH]) = 0
write(1, "\n", 1)                       = 1
sigprocmask(SIG_BLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [WINCH]) = 0
sigprocmask(SIG_BLOCK, [CHLD], [WINCH]) = 0
write(1, "resized\n", 8)                = 8
sigprocmask(SIG_BLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [WINCH]) = 0
sigprocmask(SIG_BLOCK, [CHLD], [WINCH]) = 0
write(1, "\n", 1)                       = 1
sigprocmask(SIG_BLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [CHLD WINCH]) = 0
sigprocmask(SIG_UNBLOCK, [CHLD], [WINCH]) = 0
sigreturn()                             = ? (mask now [])
--- SIGWINCH (Window size changed) ---

Note that the second WINCH isn't delivered until the first handler is done,
which is as it should be.  However, also note that during execution of the
handler, there are several instances where CHLD is blocked twice and then
unblocked twice.  Of course, the second block and the second unblock are
no-ops -- which is very bad in the case of the second unblock, because it
means that a SIGCHLD could be received before whatever code first blocked
the signal is ready to handle it.

I think that's what's happening -- getoutput() has called child_block()
and will call child_unblock() right before child_suspend(); but some other
code [probably in execpline()] has already called child_unblock(), so the
SIGCHLD gets handled before child_suspend() is called.  This results in
deadlock, because there are no children left.

"Premature" CHLD-unblocking doesn't cause a lockup at other times because
the code in waitjob() checks whether any children exist before calling
child_suspend().  Unfortunately, the process forked in getoutput() is not
added to the job table, so it's not directly possible to check whether it's
already been waited for.  The processes forked by getpipe() and getproc()
are similarly not job-tabled, but zsh never explicitly waits for them.

However, there is code in jobs.c to handle waiting for a process that is
not in the job table.  So I think the fix is the following; it doesn't
appear to break anything in simple tests.  The only behavioral change is
that zsh becomes interruptible with SIGINT (^C) during the wait for the
$(command), but that shouldn't matter because the output from the command
has already been consumed by readoutput().

Index: Src/exec.c
===================================================================
RCS file: /extra/cvsroot/zsh/zsh-3.0/Src/exec.c,v
retrieving revision 1.1.1.5.2.3
diff -u -r1.1.1.5.2.3 exec.c
--- exec.c	1999/04/28 05:21:34	1.1.1.5.2.3
+++ exec.c	1999/05/04 16:34:15
@@ -2071,7 +2071,7 @@
 	zclose(pipes[1]);
 	retval = readoutput(pipes[0], qt);
 	fdtable[pipes[0]] = 0;
-	child_suspend(0);		/* unblocks */
+	waitforpid(pid);		/* unblocks */
 	lastval = cmdoutval;
 	return retval;
     }

-- 
Bart Schaefer                                 Brass Lantern Enterprises
http://www.well.com/user/barts              http://www.brasslantern.com



Messages sorted by: Reverse Date, Date, Thread, Author