Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

Re: [BUG] quoting within bracket patterns has no effect



On Mon, 18 Jan 2016 17:24:34 +0000
Peter Stephenson <p.stephenson@xxxxxxxxxxx> wrote:
> Dash is a pattern special, but not shell special, character.  These
> haven't had much attention --- I have a vague memory, which could be
> fallacious, that some time ago the state of the art (whether or not the
> standard) was that it wasn't actually possible to quote these (other
> than by putting them in special positions) in most shells.
> 
> There could therefore be others like this.

We need to change "^" and "!" for negation of character sets.  "^" is
easy; just remove the special case that checks for the non-tokenised
version.  "!" needs a new token along the same lines as "-".  It's not
used very much in zsh for this purpose, being inconvenient with history
substitution at the command line.

The changes make the following code behave differently:

  seq="a-z"
  [[ $char = [$seq] ]]

(except with GLOBSUBST in sh emulation where pattern characters from
unquoted variables are active).  Now you need

  [[ $char = [$~seq] ]]

I took a brief look at the completion code to see if anything would be
affected by this, but nothing stood out.

I had to change the code behind $~ so that it always tokenized "!" and
"-", not just after a "[", to get the above case to work.  I don't think
this actually makes much difference --- haswilds(), which looks to
see if globbing is needed, already does a more careful check than
just looking for tokens, so the only difference I can think of is
optimisation of a pattern to a pure string match, which could be
optimised to ignore Dash and Bang as they're only active if we
have Inbrack.

Another not very pleasant case is kshglob where "!(...)" expressions now
may have an untokenised or tokenised "!" --- the unquoted parentheses are
what trigger it to be a glob expression.  However, unless you start
fiddling with "disable -p" to turn off this form of globbing, which you
don't (please), no one's going to notice.

pws

diff --git a/README b/README
index 2e2ebce..8ec148e 100644
--- a/README
+++ b/README
@@ -29,17 +29,43 @@ Zsh is a shell with lots of features.  For a list of some of these, see the
 file FEATURES, and for the latest changes see NEWS.  For more
 details, see the documentation.
 
-Incompatibilities between 5.1 and 5.2
+Incompatibilities between 5.2 and 5.3
 -------------------------------------
 
+In character classes delimited by "[" and "]" within patterns, whether
+used for filename generation (globbing) or other forms of pattern
+matching, it used not to be possible to quote "-" when used for a range,
+or "^" and "!" when used for negating a character set.  The characters can
+now be quoted by any of the standard shell means, but note that
+the "[" and "]" must not be quoted.  For example,
+
+  [[ $a = ['a-z'] ]]
+
+matches if the variable a contains just one of the characters "a", "-"
+or "z" only.  Previously this would have matched any lower case ASCII
+letter.  Note therefore the useful fact that
+
+  [[ $a = ["$cset"] ]]
+
+matches any character contained in the variable "cset".  A consequence
+of this change is that variables that should have active ranges need
+(with default zsh options) to be indicated explicitly, e.g.
+
+  cset="a-z"
+  [[ b = [${~cset}] ]]
+
+The "~" causes the "-" character to be active.  In sh emulation the
+"~" is unnecessary in this example and double quotes must be used to
+suppress the range behaviour of the "-".
+
+Incompatibilities between 5.0.8 and 5.2
+---------------------------------------
+
 The behaviour of the parameter flag (P) has changed when it appears
 in a nested parameter group, in order to make it more useful in
 such cases.  A (P) in the outermost parameter group behaves as
 before.  See NEWS for more.
 
-Incompatibilities between 5.0.8 and 5.1
----------------------------------------
-
 The default behaviour when text is pasted into an X Windows terminal has
 changed significantly (unless you are using a very old terminal emulator
 that doesn't support this mode).  Now, the new "bracketed paste mode"
diff --git a/Src/glob.c b/Src/glob.c
index e5d8956..c799281 100644
--- a/Src/glob.c
+++ b/Src/glob.c
@@ -3476,7 +3476,7 @@ static void
 zshtokenize(char *s, int flags)
 {
     char *t;
-    int bslash = 0, seen_brct = 0;
+    int bslash = 0;
 
     for (; *s; s++) {
       cont:
@@ -3507,20 +3507,6 @@ zshtokenize(char *s, int flags)
 	    *t = Inang;
 	    *s = Outang;
 	    break;
-	case '[':
-	    if (bslash)
-		s[-1] = (flags & ZSHTOK_SUBST) ? Bnullkeep : Bnull;
-	    else {
-		seen_brct = 1;
-		*s = Inbrack;
-	    }
-	    break;
-	case '-':
-	    if (bslash)
-		s[-1] = (flags & ZSHTOK_SUBST) ? Bnullkeep : Bnull;
-	    else if (seen_brct) /* see corresonding code in lex.c */
-		*s = Dash;
-	    break;
 	case '(':
 	case '|':
 	case ')':
@@ -3531,10 +3517,13 @@ zshtokenize(char *s, int flags)
 	case '^':
 	case '#':
 	case '~':
+	case '[':
 	case ']':
 	case '*':
 	case '?':
 	case '=':
+	case '-':
+	case '!':
 	    for (t = ztokens; *t; t++) {
 		if (*t == *s) {
 		    if (bslash)
diff --git a/Src/lex.c b/Src/lex.c
index 9a7e3b8..0202d25 100644
--- a/Src/lex.c
+++ b/Src/lex.c
@@ -35,7 +35,7 @@
 /* tokens */
 
 /**/
-mod_export char ztokens[] = "#$^*(())$=|{}[]`<>>?~`,-'\"\\\\";
+mod_export char ztokens[] = "#$^*(())$=|{}[]`<>>?~`,-!'\"\\\\";
 
 /* parts of the current token */
 
@@ -395,8 +395,9 @@ ctxtlex(void)
 #define LX2_BQUOTE 16
 #define LX2_COMMA 17
 #define LX2_DASH 18
-#define LX2_OTHER 19
-#define LX2_META 20
+#define LX2_BANG 19
+#define LX2_OTHER 20
+#define LX2_META 21
 
 static unsigned char lexact1[256], lexact2[256], lextok2[256];
 
@@ -406,10 +407,10 @@ initlextabs(void)
 {
     int t0;
     static char *lx1 = "\\q\n;!&|(){}[]<>";
-    static char *lx2 = ";)|$[]~({}><=\\\'\"`,-";
+    static char *lx2 = ";)|$[]~({}><=\\\'\"`,-!";
 
     for (t0 = 0; t0 != 256; t0++) {
-	lexact1[t0] = LX1_OTHER;
+       lexact1[t0] = LX1_OTHER;
 	lexact2[t0] = LX2_OTHER;
 	lextok2[t0] = t0;
     }
@@ -1361,12 +1362,20 @@ gettokstr(int c, int sub)
 	     */
 	    if (seen_brct)
 		c = Dash;
-	    else
-		c = '-';
-	    break;
-	}
-	add(c);
-	c = hgetc();
+           else
+               c = '-';
+           break;
+       case LX2_BANG:
+           /*
+            * Same logic as Dash, for ! to perform negation in range.
+            */
+           if (seen_brct)
+               c = Bang;
+           else
+               c = '!';
+       }
+       add(c);
+       c = hgetc();
 	if (intpos)
 	    intpos--;
 	if (lexstop)
diff --git a/Src/pattern.c b/Src/pattern.c
index d2b8c59..72c7d97 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -247,7 +247,7 @@ typedef unsigned long zrange_t;
  */
 static const char zpc_chars[ZPC_COUNT] = {
     '/', '\0', Bar, Outpar, Tilde, Inpar, Quest, Star, Inbrack, Inang,
-    Hat, Pound, Bnullkeep, Quest, Star, '+', '!', '@'
+    Hat, Pound, Bnullkeep, Quest, Star, '+', Bang, '!', '@'
 };
 
 /*
@@ -257,7 +257,7 @@ static const char zpc_chars[ZPC_COUNT] = {
 /**/
 mod_export const char *zpc_strings[ZPC_COUNT] = {
    NULL, NULL, "|", NULL, "~", "(", "?", "*", "[", "<",
-   "^", "#", NULL, "?(", "*(", "+(", "!(", "@("
+   "^", "#", NULL, "?(", "*(", "+(", "!(", "\\!(", "@("
 };
 
 /*
@@ -481,7 +481,7 @@ patcompcharsset(void)
 	 */
 	zpc_special[ZPC_KSH_QUEST] = zpc_special[ZPC_KSH_STAR] =
 	    zpc_special[ZPC_KSH_PLUS] = zpc_special[ZPC_KSH_BANG] =
-	    zpc_special[ZPC_KSH_AT] = Marker;
+	    zpc_special[ZPC_KSH_BANG2] = zpc_special[ZPC_KSH_AT] = Marker;
     }
     /*
      * Note that if we are using KSHGLOB, then we test for a following
@@ -1268,6 +1268,8 @@ patcomppiece(int *flagp, int paren)
 		kshchar = STOUC('+');
 	    else if (*patparse == zpc_special[ZPC_KSH_BANG])
 		kshchar = STOUC('!');
+	    else if (*patparse == zpc_special[ZPC_KSH_BANG2])
+		kshchar = STOUC('!');
 	    else if (*patparse == zpc_special[ZPC_KSH_AT])
 		kshchar = STOUC('@');
 	    else if (*patparse == zpc_special[ZPC_KSH_STAR])
@@ -1424,7 +1426,7 @@ patcomppiece(int *flagp, int paren)
 	    DPUTS(zpc_special[ZPC_INBRACK] == Marker,
 		  "Treating '[' as pattern character although disabled");
 	    flags |= P_SIMPLE;
-	    if (*patparse == Hat || *patparse == '^' || *patparse == '!') {
+	    if (*patparse == Hat || *patparse == Bang) {
 		patparse++;
 		starter = patnode(P_ANYBUT);
 	    } else
@@ -4245,7 +4247,8 @@ haswilds(char *str)
 		     ((str[-1] == Quest && !zpc_disables[ZPC_KSH_QUEST]) ||
 		      (str[-1] == Star && !zpc_disables[ZPC_KSH_STAR]) ||
 		      (str[-1] == '+' && !zpc_disables[ZPC_KSH_PLUS]) ||
-		      (str[-1] == '!' && !zpc_disables[ZPC_KSH_BANG]) ||
+		      (str[-1] == Bang && !zpc_disables[ZPC_KSH_BANG]) ||
+		      (str[-1] == '!' && !zpc_disables[ZPC_KSH_BANG2]) ||
 		      (str[-1] == '@' && !zpc_disables[ZPC_KSH_AT]))))
 		    return 1;
 		break;
diff --git a/Src/zsh.h b/Src/zsh.h
index 6ee2a9c..0120ad7 100644
--- a/Src/zsh.h
+++ b/Src/zsh.h
@@ -193,29 +193,30 @@ struct mathfunc {
 #define Qtick		((char) 0x99)
 #define Comma		((char) 0x9a)
 #define Dash            ((char) 0x9b) /* Only in patterns */
+#define Bang            ((char) 0x9c) /* Only in patterns */
 /*
  * Marks the last of the group above.
  * Remaining tokens are even more special.
  */
-#define LAST_NORMAL_TOK Dash
+#define LAST_NORMAL_TOK Bang
 /*
  * Null arguments: placeholders for single and double quotes
  * and backslashes.
  */
-#define Snull		((char) 0x9c)
-#define Dnull		((char) 0x9d)
-#define Bnull		((char) 0x9e)
+#define Snull		((char) 0x9d)
+#define Dnull		((char) 0x9e)
+#define Bnull		((char) 0x9f)
 /*
  * Backslash which will be returned to "\" instead of being stripped
  * when we turn the string into a printable format.
  */
-#define Bnullkeep       ((char) 0x9f)
+#define Bnullkeep       ((char) 0xa0)
 /*
  * Null argument that does not correspond to any character.
  * This should be last as it does not appear in ztokens and
  * is used to initialise the IMETA type in inittyptab().
  */
-#define Nularg		((char) 0xa0)
+#define Nularg		((char) 0xa1)
 
 /*
  * Take care to update the use of IMETA appropriately when adding
@@ -226,7 +227,7 @@ struct mathfunc {
  * Also used in pattern character arrays as guaranteed not to
  * mark a character in a string.
  */
-#define Marker		((char) 0xa1)
+#define Marker		((char) 0xa2)
 
 /* chars that need to be quoted if meant literally */
 
@@ -1549,6 +1550,7 @@ enum zpc_chars {
     ZPC_KSH_STAR,               /* * for *(...) in KSH_GLOB */
     ZPC_KSH_PLUS,               /* + for +(...) in KSH_GLOB */
     ZPC_KSH_BANG,               /* ! for !(...) in KSH_GLOB */
+    ZPC_KSH_BANG2,              /* ! for !(...) in KSH_GLOB, untokenised */
     ZPC_KSH_AT,                 /* @ for @(...) in KSH_GLOB */
     ZPC_COUNT			/* Number of special chararacters */
 };
diff --git a/Test/D02glob.ztst b/Test/D02glob.ztst
index 89256e3..a6b704a 100644
--- a/Test/D02glob.ztst
+++ b/Test/D02glob.ztst
@@ -622,3 +622,36 @@
 0:quoted - works in pattern in parameter
 >bcdef
 >cdef
+
+  [[ a != [^a] ]]
+0:^ active in character class if not quoted
+
+  [[ a = ['^a'] ]]
+0:^ not active in character class if quoted
+
+  [[ a != [!a] ]]
+0:! active in character class if not quoted
+
+  [[ a = ['!a'] ]]
+0:! not active in character class if quoted
+
+  # Actually, we don't need the quoting here,
+  # c.f. the next test.  This just makes it look
+  # more standard.
+  cset="^a-z"
+  [[ "^" = ["$cset"] ]] || print Fail 1
+  [[ "a" = ["$cset"] ]] || print Fail 2
+  [[ "-" = ["$cset"] ]] || print Fail 3
+  [[ "z" = ["$cset"] ]] || print Fail 4
+  [[ "1" != ["$cset"] ]] || print Fail 5
+  [[ "b" != ["$cset"] ]] || print Fail 6
+0:character set specified as quoted variable
+
+  cset="^a-z"
+  [[ "^" = [$~cset] ]] || print Fail 1
+  [[ "a" != [$~cset] ]] || print Fail 2
+  [[ "-" = [$~cset] ]] || print Fail 3
+  [[ "z" != [$~cset] ]] || print Fail 4
+  [[ "1" = [$~cset] ]] || print Fail 5
+  [[ "b" != [$~cset] ]] || print Fail 6
+0:character set specified as active variable



Messages sorted by: Reverse Date, Date, Thread, Author