Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

Re: Zsh hangs sometimes?



On Fri, 2 May 2008 19:39:43 +0200
Kamil Jońca <kjonca@xxxxxxxxxxxxxx> wrote:
> And sometimes this hangs :( 
> there's not child processes.

OK, as you already know this isn't a very good way of doing things
but it does tickle a bug and I've managed to get it to happen.

With a bit of luck, I think this fixes it.  On attaching to the
shell I happened to notice that there were two jobs in the job table
with the same process number.  One was marked as done, and the other
wasn't:  the shell was waiting for the second job to finish.
Further probing confirmed my suspicion that the shell had decided
to fish out the PID for a job that was already done and mark it
as, well, even more well done.  So the newly done job never got
cleared and the shell waited for it for ever.  This would only happen if
you had lots and lots and lots of processes so that there was a chance
that the process numbers would wrap before an entry in the table was
cleared (an essentially random process depending on when children
died)---not something that happens with most programmes.

The fix is not to fish out jobs that are already done.  I think the use
of findproc() is restricted to occasions when you've just found out, or
want to find out, something new about a process and hence ignoring those
attached to done and dusted jobs is OK.

I've tidied up zhandler() because the comments were all over the place
and I took against the way a goto in a for loop was written, and I've
tidied up the STAT_ definitions because I can't do powers of two in my
head well enough.  The only actual fix is the new test in the first hunk,
so you can ignore the rest until I commit it.

I'm extremely relieved this wasn't a horrible race.

Index: Src/jobs.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/jobs.c,v
retrieving revision 1.62
diff -u -r1.62 jobs.c
--- Src/jobs.c	25 Mar 2008 18:17:08 -0000	1.62
+++ Src/jobs.c	2 May 2008 22:33:14 -0000
@@ -153,6 +153,15 @@
 
     for (i = 1; i <= maxjob; i++)
     {
+	/*
+	 * We are only interested in jobs with processes still
+	 * marked as live.  Careful in case there's an identical
+	 * process number in a job we haven't quite got around
+	 * to deleting.
+	 */
+	if (jobtab[i].stat & STAT_DONE)
+	    continue;
+
 	for (pn = aux ? jobtab[i].auxprocs : jobtab[i].procs;
 	     pn; pn = pn->next)
 	    if (pn->pid == pid) {
Index: Src/signals.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/signals.c,v
retrieving revision 1.45
diff -u -r1.45 signals.c
--- Src/signals.c	1 May 2007 09:35:05 -0000	1.45
+++ Src/signals.c	2 May 2008 22:33:14 -0000
@@ -408,15 +408,21 @@
     signal_process(sig);
  
     sigfillset(&newmask);
-    oldmask = signal_block(newmask);        /* Block all signals temporarily           */
+    /* Block all signals temporarily           */
+    oldmask = signal_block(newmask);
  
 #if defined(NO_SIGNAL_BLOCKING)
-    do_jump = suspend_longjmp;              /* do we need to longjmp to signal_suspend */
-    suspend_longjmp = 0;                    /* In case a SIGCHLD somehow arrives       */
-
-    if (sig == SIGCHLD) {                   /* Traps can cause nested signal_suspend()  */
-        if (do_jump)
-            jump_to = suspend_jmp_buf;      /* Copy suspend_jmp_buf                    */
+    /* do we need to longjmp to signal_suspend */
+    do_jump = suspend_longjmp;
+    /* In case a SIGCHLD somehow arrives       */
+    suspend_longjmp = 0;
+
+    /* Traps can cause nested signal_suspend()  */
+    if (sig == SIGCHLD) {
+        if (do_jump) {
+	    /* Copy suspend_jmp_buf                    */
+            jump_to = suspend_jmp_buf;
+	}
     }
 #endif
 
@@ -425,30 +431,36 @@
         int temp_rear = ++queue_rear % MAX_QUEUE_SIZE;
 
 	DPUTS(temp_rear == queue_front, "BUG: signal queue full");
-        if (temp_rear != queue_front) { /* Make sure it's not full (extremely unlikely) */
-            queue_rear = temp_rear;                  /* ok, not full, so add to queue   */
-            signal_queue[queue_rear] = sig;          /* save signal caught              */
-            signal_mask_queue[queue_rear] = oldmask; /* save current signal mask        */
+	/* Make sure it's not full (extremely unlikely) */
+        if (temp_rear != queue_front) {
+	    /* ok, not full, so add to queue   */
+            queue_rear = temp_rear;
+	    /* save signal caught              */
+            signal_queue[queue_rear] = sig;
+	    /* save current signal mask        */
+            signal_mask_queue[queue_rear] = oldmask;
         }
         signal_reset(sig);
         return;
     }
  
-    signal_setmask(oldmask);          /* Reset signal mask, signal traps ok now */
+    /* Reset signal mask, signal traps ok now */
+    signal_setmask(oldmask);
  
     switch (sig) {
     case SIGCHLD:
 
 	/* keep WAITING until no more child processes to reap */
-	for (;;)
-	  cont: {
-            int old_errno = errno; /* save the errno, since WAIT may change it */
+	for (;;) {
+	    /* save the errno, since WAIT may change it */
+	    int old_errno = errno;
 	    int status;
 	    Job jn;
 	    Process pn;
-            pid_t pid;
+	    pid_t pid;
 	    pid_t *procsubpid = &cmdoutpid;
 	    int *procsubval = &cmdoutval;
+	    int cont = 0;
 	    struct execstack *es = exstack;
 
 	    /*
@@ -471,8 +483,8 @@
 # endif
 #endif
 
-            if (!pid)  /* no more children to reap */
-                break;
+	    if (!pid)  /* no more children to reap */
+		break;
 
 	    /* check if child returned was from process substitution */
 	    for (;;) {
@@ -483,7 +495,8 @@
 		    else
 			*procsubval = WEXITSTATUS(status);
 		    get_usage();
-		    goto cont;
+		    cont = 1;
+		    break;
 		}
 		if (!es)
 		    break;
@@ -491,16 +504,22 @@
 		procsubval = &es->cmdoutval;
 		es = es->next;
 	    }
+	    if (cont)
+		continue;
 
 	    /* check for WAIT error */
-            if (pid == -1) {
-                if (errno != ECHILD)
-                    zerr("wait failed: %e", errno);
-                errno = old_errno;    /* WAIT changed errno, so restore the original */
-                break;
-            }
+	    if (pid == -1) {
+		if (errno != ECHILD)
+		    zerr("wait failed: %e", errno);
+		/* WAIT changed errno, so restore the original */
+		errno = old_errno;
+		break;
+	    }
 
-	    /* Find the process and job containing this pid and update it. */
+	    /*
+	     * Find the process and job containing this pid and
+	     * update it.
+	     */
 	    if (findproc(pid, &jn, &pn, 0)) {
 #if defined(HAVE_WAIT3) && defined(HAVE_GETRUSAGE)
 		struct timezone dummy_tz;
@@ -517,11 +536,12 @@
 	    } else {
 		/* If not found, update the shell record of time spent by
 		 * children in sub processes anyway:  otherwise, this
-		 * will get added on to the next found process that terminates.
+		 * will get added on to the next found process that
+		 * terminates.
 		 */
 		get_usage();
 	    }
-        }
+	}
         break;
  
     case SIGHUP:
Index: Src/zsh.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/zsh.h,v
retrieving revision 1.128
diff -u -r1.128 zsh.h
--- Src/zsh.h	29 Apr 2008 17:19:26 -0000	1.128
+++ Src/zsh.h	2 May 2008 22:33:14 -0000
@@ -857,24 +857,24 @@
     struct ttyinfo *ty;		/* the modes specified by STTY       */
 };
 
-#define STAT_CHANGED	(1<<0)	/* status changed and not reported      */
-#define STAT_STOPPED	(1<<1)	/* all procs stopped or exited          */
-#define STAT_TIMED	(1<<2)	/* job is being timed                   */
-#define STAT_DONE	(1<<3)	/* job is done                          */
-#define STAT_LOCKED	(1<<4)	/* shell is finished creating this job, */
-                                /*   may be deleted from job table      */
-#define STAT_NOPRINT	(1<<5)	/* job was killed internally,           */
-                                /*   we don't want to show that         */
-#define STAT_INUSE	(1<<6)	/* this job entry is in use             */
-#define STAT_SUPERJOB	(1<<7)	/* job has a subjob                     */
-#define STAT_SUBJOB	(1<<8)	/* job is a subjob                      */
-#define STAT_WASSUPER   (1<<9)  /* was a super-job, sub-job needs to be */
-				/* deleted */
-#define STAT_CURSH	(1<<10)	/* last command is in current shell     */
-#define STAT_NOSTTY	(1<<11)	/* the tty settings are not inherited   */
-				/* from this job when it exits.         */
-#define STAT_ATTACH	(1<<12)	/* delay reattaching shell to tty       */
-#define STAT_SUBLEADER  (1<<13) /* is super-job, but leader is sub-shell */
+#define STAT_CHANGED	(0x0001) /* status changed and not reported      */
+#define STAT_STOPPED	(0x0002) /* all procs stopped or exited          */
+#define STAT_TIMED	(0x0004) /* job is being timed                   */
+#define STAT_DONE	(0x0008) /* job is done                          */
+#define STAT_LOCKED	(0x0010) /* shell is finished creating this job, */
+                                 /*   may be deleted from job table      */
+#define STAT_NOPRINT	(0x0020) /* job was killed internally,           */
+                                 /*   we don't want to show that         */
+#define STAT_INUSE	(0x0040) /* this job entry is in use             */
+#define STAT_SUPERJOB	(0x0080) /* job has a subjob                     */
+#define STAT_SUBJOB	(0x0100) /* job is a subjob                      */
+#define STAT_WASSUPER   (0x0200) /* was a super-job, sub-job needs to be */
+				 /* deleted */
+#define STAT_CURSH	(0x0400) /* last command is in current shell     */
+#define STAT_NOSTTY	(0x0800) /* the tty settings are not inherited   */
+				 /* from this job when it exits.         */
+#define STAT_ATTACH	(0x1000) /* delay reattaching shell to tty       */
+#define STAT_SUBLEADER  (0x2000) /* is super-job, but leader is sub-shell */
 
 #define SP_RUNNING -1		/* fake status for jobs currently running */
 


-- 
Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/



Messages sorted by: Reverse Date, Date, Thread, Author