Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: short-circuiting glob exclusion operator



This is a request for comments for now.  I'm not yet entirely sure the
pattern syntax with two tildes isn't going to cause problems (though I
don't see what they would be), or that I've identified all the side
effects that can happen.

As an ultra brief summary, ~~ in a glob introduces a pattern that will
prune directories on the way through in addition to acting in the normal
exclusion fashion.  So **/*~~*/foo stops at and ignores any directories
called foo at any level(#1) on the way down.  This is unlike a single
tilde where exclusions are only performed after the complete set of
matches is generated, so it's potentially much more efficient.

(#1) Except the top.  It's a bit anhoying you need  (|*/).

pws

diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo
index c6e7b6f..13ef139 100644
--- a/Doc/Zsh/expn.yo
+++ b/Doc/Zsh/expn.yo
@@ -2083,6 +2083,31 @@ Multiple patterns can be excluded by
 In the exclusion pattern (var(y)), `tt(/)' and `tt(.)' are not treated
 specially the way they usually are in globbing.
 )
+item(var(x)tt(~~)var(y))(
+(Requires tt(EXTENDED_GLOB) to be set.)
+This alternative form of the tt(~) pattern is only handled during
+filename generation (globbing).  The second tilde is not special
+in other forms of pattern matching.
+
+This is similar to the form with a single tilde in that the pattern var(x)
+must match and anything matched by var(y) is excluded from the match.
+However, in this case var(y) is used additionally to prune intermediate
+directories.  In other words, var(y) can match not just a complete file
+name to be returned, but also any directory by which the file is
+reached.  This is much more efficient than the previous form for deep
+directory hierarchies as the directory is excluded as soon as it is
+reached and is never entered.  As with the single-tilde form, tt(/)
+is not special in var(y) and so may be matched by pattern characters.
+
+Only a single tt(~~) is useful in each filename generation pattern.
+
+For example, `tt(**/*.o~~(*/|)lib)' searches recursively for files with
+the suffix `tt(.o)', but if a directory named `tt(lib)' is reached
+at the top level or in any subdirectory found recursively it is not
+searched.  With the pattern `tt(**/*~~(*/|)lib+LPAR()/+RPAR()), which searches
+recursively for directories, directories name tt(lib) are not only not
+searched but are not returned as a match.
+)
 item(var(x)tt(#))(
 (Requires tt(EXTENDED_GLOB) to be set.)
 Matches zero or more occurrences of the pattern var(x).
diff --git a/Src/glob.c b/Src/glob.c
index 2051016..a7a44f4 100644
--- a/Src/glob.c
+++ b/Src/glob.c
@@ -185,6 +185,7 @@ struct globdata {
     int gd_gf_follow, gd_gf_sorts, gd_gf_nsorts;
     struct globsort gd_gf_sortlist[MAX_SORTS];
     LinkList gd_gf_pre_words, gd_gf_post_words;
+    Patprog gd_exclude;
 
     char *gd_glob_pre, *gd_glob_suf;
 };
@@ -259,7 +260,7 @@ struct complist {
 
 /**/
 static void
-addpath(char *s, int l)
+addpath(const char *s, int l)
 {
     DPUTS(!pathbuf, "BUG: pathbuf not initialised");
     while (pathpos + l + 1 >= pathbufsz)
@@ -459,6 +460,38 @@ insert(char *s, int checked)
     return;
 }
 
+/*
+ * Decide whether to prune at this point in scanning. 1 = yes.
+ *
+ * We'll make temporary use of pathbuf for this; it's convenient because
+ * most of a long path won't need copying and if we are later going to
+ * add this file to it we know the buffer will be large enough.
+ *
+ * The filename fn is here metafied.
+ */
+/**/
+static int
+scanner_prune(const char *fn, int len)
+{
+    int ret, savepathpos, saverrsfound, savforceerrs;
+    if (!curglobdata.gd_exclude)
+	return 0;
+    savepathpos = pathpos;
+    addpath(fn, len);
+    if (pathpos && pathbuf[pathpos-1] == '/')
+	pathbuf[pathpos-1] = '\0';
+    /*
+     * As we're in the middle of a glob, be careful about
+     * the counters needed for approximation across directories.
+     */
+    patsaveerrs(&saverrsfound, &savforceerrs);
+    ret = pattry(curglobdata.gd_exclude, pathbuf);
+    patrestoreerrs(saverrsfound, savforceerrs);
+    pathbuf[pathpos = savepathpos] = '\0';
+
+    return ret;
+}
+
 /* Do the globbing:  scanner is called recursively *
  * with successive bits of the path until we've    *
  * tried all of it.                                */
@@ -510,6 +543,8 @@ scanner(Complist q, int shortcircuit)
 	    }
 	    pathbufcwd = pathpos;
 	}
+	if (scanner_prune(str, l))
+	    return;
 	if (q->next) {
 	    /* Not the last path section. Just add it to the path. */
 	    int oppos = pathpos;
@@ -563,6 +598,8 @@ scanner(Complist q, int shortcircuit)
 		continue;
 	    errsfound = errssofar;
 	    if (pattry(p, fn)) {
+		if (scanner_prune(fn, strlen(fn)))
+		    continue;
 		/* if this name matchs the pattern... */
 		if (pbcwdsav == pathbufcwd &&
 		    strlen(fn) + pathpos - pathbufcwd >= PATH_MAX) {
@@ -675,9 +712,10 @@ scanner(Complist q, int shortcircuit)
 
 /**/
 static Complist
-parsecomplist(char *instr)
+parsecomplist(char *instr, char **post_exclude)
 {
     Patprog p1;
+    struct patcompargs compargs;
     Complist l1;
     char *str;
     int compflags = gf_noglobdots ? (PAT_FILE|PAT_NOGLD) : PAT_FILE;
@@ -699,11 +737,11 @@ parsecomplist(char *instr)
 
 	    /* Now get the next path component if there is one. */
 	    l1 = (Complist) zhalloc(sizeof *l1);
-	    if ((l1->next = parsecomplist(instr)) == NULL) {
+	    if ((l1->next = parsecomplist(instr, post_exclude)) == NULL) {
 		errflag |= ERRFLAG_ERROR;
 		return NULL;
 	    }
-	    l1->pat = patcompile(NULL, compflags | PAT_ANY, NULL);
+	    l1->pat = patcompile(NULL, compflags | PAT_ANY, &compargs);
 	    l1->closure = 1;		/* ...zero or more times. */
 	    l1->follow = follow;
 	    return l1;
@@ -715,8 +753,9 @@ parsecomplist(char *instr)
 	!skipparens(Inpar, Outpar, (char **)&str) &&
         *str == zpc_special[ZPC_HASH] && str[-2] == '/') {
 	instr++;
-	if (!(p1 = patcompile(instr, compflags, &instr)))
+	if (!(p1 = patcompile(instr, compflags, &compargs)))
 	    return NULL;
+	instr = compargs.endexp;
 	if (instr[0] == '/' && instr[1] == Outpar && instr[2] == Pound) {
 	    int pdflag = 0;
 
@@ -730,21 +769,24 @@ parsecomplist(char *instr)
 	    /* special case (/)# to avoid infinite recursion */
 	    l1->closure = (*((char *)p1 + p1->startoff)) ? 1 + pdflag : 0;
 	    l1->follow = 0;
-	    l1->next = parsecomplist(instr);
+	    l1->next = parsecomplist(instr, NULL);
 	    return (l1->pat) ? l1 : NULL;
 	}
     } else {
 	/* parse single path component */
-	if (!(p1 = patcompile(instr, compflags|PAT_FILET, &instr)))
+	if (!(p1 = patcompile(instr, compflags|PAT_FILET, &compargs)))
 	    return NULL;
+	instr = compargs.endexp;
 	/* then do the remaining path components */
 	if (*instr == '/' || !*instr) {
 	    int ef = *instr == '/';
 
+	    if (post_exclude && compargs.post_exclude)
+		*post_exclude = dupstring(compargs.post_exclude);
 	    l1 = (Complist) zhalloc(sizeof *l1);
 	    l1->pat = p1;
 	    l1->closure = 0;
-	    l1->next = ef ? parsecomplist(instr+1) : NULL;
+	    l1->next = ef ? parsecomplist(instr+1, post_exclude) : NULL;
 	    return (ef && !l1->next) ? NULL : l1;
 	}
     }
@@ -756,7 +798,7 @@ parsecomplist(char *instr)
 
 /**/
 static Complist
-parsepat(char *str)
+parsepat(char *str, char **post_exclude)
 {
     long assert;
     int ignore;
@@ -784,8 +826,9 @@ parsepat(char *str)
 	pathbuf[pathpos = 1] = '\0';
     } else			/* pattern is relative to pwd */
 	pathbuf[pathpos = 0] = '\0';
+    *post_exclude = NULL;
 
-    return parsecomplist(str);
+    return parsecomplist(str, post_exclude);
 }
 
 /* get number after qualifier */
@@ -1187,6 +1230,7 @@ zglob(LinkList list, LinkNode np, int nountok)
     int first = 0, end = -1;		/* index of first match to return */
 					/* and index+1 of the last match */
     struct globdata saved;		/* saved glob state              */
+    char *post_exclude;			/* text after ~~ */
     int nobareglob = !isset(BAREGLOBQUAL);
     int shortcircuit = 0;		/* How many files to match;      */
 					/* 0 means no limit              */
@@ -1774,7 +1818,7 @@ zglob(LinkList list, LinkNode np, int nountok)
 			    qfirst = qn;
 			    for (qlast = qfirst; qlast->next;
 				 qlast = qlast->next)
-				;			    
+				;
 			} else
 			    qfirst = dup_qual_list(qn, &qlast);
 			/* ... link into new `or' chain ... */
@@ -1804,8 +1848,15 @@ zglob(LinkList list, LinkNode np, int nountok)
 	} else if (newquals)
 	    quals = newquals;
     }
-    q = parsepat(str);
-    if (!q || errflag) {	/* if parsing failed */
+    q = parsepat(str, &post_exclude);
+    if (q && !errflag && post_exclude) {
+	/* Exclusion after ~~: this is a normal pattern, not a file. */
+	curglobdata.gd_exclude = patcompile(post_exclude, 0, NULL);
+    } else {
+	curglobdata.gd_exclude = NULL;
+    }
+    if (!q || errflag || (post_exclude && !curglobdata.gd_exclude)) {
+	/* if parsing failed */
 	restore_globstate(saved);
 	if (unset(BADPATTERN)) {
 	    if (!nountok)
diff --git a/Src/pattern.c b/Src/pattern.c
index 72c7d97..4e5cc0c 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -521,21 +521,24 @@ patcompstart(void)
  * Top level pattern compilation subroutine
  * exp is a null-terminated, metafied string.
  * inflags is an or of some PAT_* flags.
- * endexp, if non-null, is set to a pointer to the end of the
- *   part of exp which was compiled.  This is used when
- *   compiling patterns for directories which must be
+ *
+ * compargs, if non-null, contains pointers that may be set
+ * to indicate the following. This is used when matching files
+ * as in this case the pattern has multiple parts.
+ * - endexp is set to the end of the part of exp which was compiled.
+ *   This is used when compiling patterns for directories which must be
  *   matched recursively.
  */
 
 /**/
 mod_export Patprog
-patcompile(char *exp, int inflags, char **endexp)
+patcompile(char *exp, int inflags, Patcompargs compargs)
 {
     int flags = 0;
     long len = 0;
     long startoff;
     Upat pscan;
-    char *lng, *strp = NULL;
+    char *lng, *strp = NULL, *post_exclude = NULL;
     Patprog p;
 
     queue_signals();
@@ -602,7 +605,7 @@ patcompile(char *exp, int inflags, char **endexp)
 	if (!strp || (*strp && *strp != '/')) {
 	    /* No, do normal compilation. */
 	    strp = NULL;
-	    if (patcompswitch(0, &flags) == 0) {
+	    if (patcompswitch(0, &flags, &post_exclude) == 0) {
 		unqueue_signals();
 		return NULL;
 	    }
@@ -735,13 +738,53 @@ patcompile(char *exp, int inflags, char **endexp)
 	p = newp;
     }
 
-    if (endexp)
-	*endexp = patparse;
+    if (compargs) {
+	compargs->endexp = patparse;
+	if ((compargs->post_exclude = post_exclude))
+	{
+	    /*
+	     * The top-level ~~ is the last thing that can appear.
+	     */
+	    compargs->endexp += strlen(patparse);
+	}
+    }
 
     unqueue_signals();
     return p;
 }
 
+/* Flags for exclusion with ~ */
+enum {
+    TILDE_SINGLE = 1,
+    TILDE_DOUBLE = 2,
+    TILDE_SLASH = 4,
+};
+
+/*
+ * Look for an active tilde; if 2 doubled, which can only happen
+ * at top level (not in parentheses) for a file glob.
+ *
+ * If term_match, we're looking for a terminator (something that can't
+ * be part of a pattern itself) to follow; otherwise, we're
+ * looking for a tilde without a terminator following.
+ */
+
+static int
+pattilde(int toplevel, int term_match)
+{
+    int ret;
+    if (*patparse != zpc_special[ZPC_TILDE])
+	return 0;
+    ret = (toplevel &&
+	   (patflags & PAT_FILET) &&
+	   patparse[1] == zpc_special[ZPC_TILDE]) ? TILDE_DOUBLE : TILDE_SINGLE;
+    if (patparse[ret] == '/' || !memchr(zpc_special, patparse[ret],
+					ZPC_SEG_COUNT))
+	return term_match ? ret : 0;
+    else
+	return term_match ? 0 : ret;
+}
+
 /*
  * Main body or parenthesized subexpression in pattern
  * Parenthesis (and any ksh_glob gubbins) will have been removed.
@@ -749,7 +792,7 @@ patcompile(char *exp, int inflags, char **endexp)
 
 /**/
 static long
-patcompswitch(int paren, int *flagp)
+patcompswitch(int paren, int *flagp, char **post_exclude)
 {
     long starter, br, ender, excsync = 0;
     int parno = 0;
@@ -783,17 +826,35 @@ patcompswitch(int paren, int *flagp)
 
     *flagp |= flags & (P_HSTART|P_PURESTR);
 
-    while (*patparse == zpc_chars[ZPC_BAR] ||
-	   (*patparse == zpc_special[ZPC_TILDE] &&
-	    (patparse[1] == '/' ||
-	     !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
-	int tilde = *patparse++ == zpc_special[ZPC_TILDE];
-	long gfnode = 0, newbr;
+    for (;;) {
+	int tilde;
+	long gfnode, newbr;
+	if (*patparse == zpc_chars[ZPC_BAR]) {
+	    /* handle alternation */
+	    tilde = 0;
+	    patparse++;
+	} else if ((tilde = pattilde(!paren, 1))) {
+	    patparse += tilde;
+	    /* handle exclusion... */
+	} else {
+	    break;
+	}
 
+	gfnode = 0;
 	*flagp &= ~P_PURESTR;
 
 	if (tilde) {
 	    union upat up;
+	    if (tilde == TILDE_DOUBLE) {
+		/*
+		 * Double tilde in top-level glob indicates pruning a
+		 * path: handle specially
+		 */
+		*post_exclude = patparse;
+		patparse += strlen(patparse);
+		break;
+	    }
+
 	    /* excsync remembers the P_EXCSYNC node before a chain of
 	     * exclusions:  all point back to this.  only the
 	     * original (non-excluded) branch gets a trailing P_EXCSYNC.
@@ -825,7 +886,7 @@ patcompswitch(int paren, int *flagp)
 	    patadd((char *)&up, 0, sizeof(up), 0);
 	    /* / is not treated as special if we are at top level */
 	    if (!paren && zpc_special[ZPC_SLASH] == '/') {
-		tilde++;
+		tilde |= TILDE_SLASH;
 		zpc_special[ZPC_SLASH] = Marker;
 	    }
 	} else {
@@ -864,7 +925,7 @@ patcompswitch(int paren, int *flagp)
 	    }
 	}
 	newbr = patcompbranch(&flags, paren);
-	if (tilde == 2) {
+	if (tilde & TILDE_SLASH) {
 	    /* restore special treatment of / */
 	    zpc_special[ZPC_SLASH] = '/';
 	}
@@ -935,8 +996,7 @@ patcompbranch(int *flagp, int paren)
 
     starter = chain = 0;
     while (!memchr(zpc_special, *patparse, ZPC_SEG_COUNT) ||
-	   (*patparse == zpc_special[ZPC_TILDE] && patparse[1] != '/' &&
-	    memchr(zpc_special, patparse[1], ZPC_SEG_COUNT))) {
+	   pattilde(!paren, 0)) {
 	if ((*patparse == zpc_special[ZPC_INPAR] &&
 	     patparse[1] == zpc_special[ZPC_HASH]) ||
 	    (*patparse == zpc_special[ZPC_KSH_AT] && patparse[1] == Inpar &&
@@ -1298,9 +1358,7 @@ patcomppiece(int *flagp, int paren)
 	     */
 	    if (kshchar ||
 		(memchr(zpc_special, *patparse, ZPC_NO_KSH_GLOB) &&
-		 (*patparse != zpc_special[ZPC_TILDE] ||
-		  patparse[1] == '/' ||
-		  !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
+		 !pattilde(!paren, 0))) {
 		break;
 	    }
     	}
@@ -1507,7 +1565,7 @@ patcomppiece(int *flagp, int paren)
 		 */
 		if (!(starter = patcompnot(1, &flags2)))
 		    return 0;
-	    } else if (!(starter = patcompswitch(1, &flags2)))
+	    } else if (!(starter = patcompswitch(1, &flags2, NULL)))
 		return 0;
 	    flags |= flags2 & P_HSTART;
 	    break;
@@ -1759,7 +1817,8 @@ patcompnot(int paren, int *flagsp)
     pattail(starter, excl = patnode(P_EXCLUDE));
     up.p = NULL;
     patadd((char *)&up, 0, sizeof(up), 0);
-    if (!(br = (paren ? patcompswitch(1, &dummy) : patcompbranch(&dummy, 0))))
+    if (!(br = (paren ? patcompswitch(1, &dummy, NULL) :
+		patcompbranch(&dummy, 0))))
 	return 0;
     pattail(br, patnode(P_EXCEND));
     n = patnode(P_NOTHING); /* just so much easier */
@@ -2024,6 +2083,26 @@ pattrystart(void)
     errsfound = 0;
 }
 
+/* Save the error count during a glob. */
+
+/**/
+void
+patsaveerrs(int *errsfoundp, int *forceerrsp)
+{
+    *errsfoundp = errsfound;
+    *forceerrsp = forceerrs;
+}
+
+/* Restore the error count during a glob. */
+
+/**/
+void
+patrestoreerrs(int errsfoundin, int forceerrsin)
+{
+    errsfound = errsfoundin;
+    forceerrs = forceerrsin;
+}
+
 /*
  * Fix up string length stuff.
  *
diff --git a/Src/zsh.h b/Src/zsh.h
index eee31da..765392a 100644
--- a/Src/zsh.h
+++ b/Src/zsh.h
@@ -511,6 +511,7 @@ typedef struct options	 *Options;
 typedef struct optname   *Optname;
 typedef struct param     *Param;
 typedef struct paramdef  *Paramdef;
+typedef struct patcompargs *Patcompargs;
 typedef struct patstralloc  *Patstralloc;
 typedef struct patprog   *Patprog;
 typedef struct prepromptfn *Prepromptfn;
@@ -1500,6 +1501,14 @@ struct patstralloc {
     int progstrunmetalen;	/* Length of the foregoing */
 };
 
+struct patcompargs {
+    char *endexp;		/* End of the part of the input
+				 * expression that was compiled.*/
+    char *post_exclude;         /* Expression following ~ in top-level
+				 * file match: handled specially.
+				 * Enabled by PAT_FILET flag. */
+};
+
 /* Flags used in pattern matchers (Patprog) and passed down to patcompile */
 
 #define PAT_FILE	0x0001	/* Pattern is a file name */
diff --git a/Test/D02glob.ztst b/Test/D02glob.ztst
index a6b704a..f140b30 100644
--- a/Test/D02glob.ztst
+++ b/Test/D02glob.ztst
@@ -655,3 +655,28 @@
   [[ "1" = [$~cset] ]] || print Fail 5
   [[ "b" != [$~cset] ]] || print Fail 6
 0:character set specified as active variabe
+
+  (cd glob.tmp
+   setopt extendedglob
+   mkdir DT DT/one DT/two DT/three
+   for dir in one two three; do
+     mkdir DT/$dir/sub{1..3}
+     touch DT/$dir/sub{1..3}/file.{a,b,c}
+   done
+   print -l DT/o*/*3/file.*
+   print -l DT/**/*~~DT(.N)
+   # Attention: EXCLUSIONS BLOW YOUR MIND warning...
+   print -l DT/**/*.*~~(*sub[123]*.[ab]|*sub[13]*)
+   print -l DT/**/*.*~~*/(sub[12]|one|two)
+  )
+0:~~ exclusions
+>DT/one/sub3/file.a
+>DT/one/sub3/file.b
+>DT/one/sub3/file.c
+>
+>DT/one/sub2/file.c
+>DT/three/sub2/file.c
+>DT/two/sub2/file.c
+>DT/three/sub3/file.a
+>DT/three/sub3/file.b
+>DT/three/sub3/file.c



Messages sorted by: Reverse Date, Date, Thread, Author