Zsh Mailing List Archive Messages sorted by: Reverse Date, Date, Thread, Author

Re: [BUG] quoting within bracket patterns has no effect

X-seq: zsh-workers 37678
From: Peter Stephenson <p.stephenson@xxxxxxxxxxx>
To: Zsh Hackers' List <zsh-workers@xxxxxxx>
Subject: Re: [BUG] quoting within bracket patterns has no effect
Date: Mon, 18 Jan 2016 17:24:34 +0000
In-reply-to: <569C68AB.2010806@inlv.org>
List-help: <mailto:zsh-workers-help@zsh.org>
List-id: Zsh Workers List <zsh-workers.zsh.org>
List-post: <mailto:zsh-workers@zsh.org>
Mailing-list: contact zsh-workers-help@xxxxxxx; run by ezmlm
Organization: Samsung Cambridge Solution Centre
References: <569C68AB.2010806@inlv.org>

On Mon, 18 Jan 2016 05:23:07 +0100
Martijn Dekker <martijn@xxxxxxxx> wrote:
> Quotes should disable the special meaning of characters in glob
> patterns[*].
>
> [*] "If any character (ordinary, shell special, or pattern special) is
> quoted, that pattern shall match the character itself."
> http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_13_01

Dash is a pattern-special, but not shell-special, character.  Characters
of this kind haven't had much attention --- I have a vague memory, which
could be fallacious, that some time ago the state of the art (whether or
not the standard) was that it wasn't actually possible to quote them
(other than by putting them in special positions) in most shells.

There could therefore be others like this.

Prior art for characters that are only sometimes special, but need to be
first class tokens when they are, exists in the case of "," as used in
subscripts, suggesting the following isn't hopelessly optimistic.

One thing that makes me think there's something I've missed is that
activating "-" in the case of a "[" within a brace parameter --- which I
did in case there was a pattern inside --- caused three tests to fall
over, and I can't see why.  However, it seems the case ${foo#[a-z]} does
work without that (again I don't know why), so it looks like that tweak
isn't needed.

You can read the code and the tests for the various gotchas I did manage
to think about.  "[]a-z]" being a valid range was one.

pws

diff --git a/Src/glob.c b/Src/glob.c
index 8bd2fc4..e5d8956 100644
--- a/Src/glob.c
+++ b/Src/glob.c
@@ -3476,7 +3476,7 @@ static void
 zshtokenize(char *s, int flags)
 {
     char *t;
-    int bslash = 0;
+    int bslash = 0, seen_brct = 0;
 
     for (; *s; s++) {
       cont:
@@ -3507,21 +3507,35 @@ zshtokenize(char *s, int flags)
 	    *t = Inang;
 	    *s = Outang;
 	    break;
+	case '[':
+	    if (bslash)
+		s[-1] = (flags & ZSHTOK_SUBST) ? Bnullkeep : Bnull;
+	    else {
+		seen_brct = 1;
+		*s = Inbrack;
+	    }
+	    break;
+	case '-':
+	    if (bslash)
+		s[-1] = (flags & ZSHTOK_SUBST) ? Bnullkeep : Bnull;
+	    else if (seen_brct) /* see corresponding code in lex.c */
+		*s = Dash;
+	    break;
 	case '(':
 	case '|':
 	case ')':
 	    if (flags & ZSHTOK_SHGLOB)
 		break;
+	    /*FALLTHROUGH*/
 	case '>':
 	case '^':
 	case '#':
 	case '~':
-	case '[':
 	case ']':
 	case '*':
 	case '?':
 	case '=':
-	    for (t = ztokens; *t; t++)
+	    for (t = ztokens; *t; t++) {
 		if (*t == *s) {
 		    if (bslash)
 			s[-1] = (flags & ZSHTOK_SUBST) ? Bnullkeep : Bnull;
@@ -3529,6 +3543,8 @@ zshtokenize(char *s, int flags)
 			*s = (t - ztokens) + Pound;
 		    break;
 		}
+	    }
+	    break;
 	}
 	bslash = 0;
     }
diff --git a/Src/lex.c b/Src/lex.c
index 0f260d0..9a7e3b8 100644
--- a/Src/lex.c
+++ b/Src/lex.c
@@ -35,7 +35,7 @@
 /* tokens */
 
 /**/
-mod_export char ztokens[] = "#$^*(())$=|{}[]`<>>?~`,'\"\\\\";
+mod_export char ztokens[] = "#$^*(())$=|{}[]`<>>?~`,-'\"\\\\";
 
 /* parts of the current token */
 
@@ -394,8 +394,9 @@ ctxtlex(void)
 #define LX2_DQUOTE 15
 #define LX2_BQUOTE 16
 #define LX2_COMMA 17
-#define LX2_OTHER 18
-#define LX2_META 19
+#define LX2_DASH 18
+#define LX2_OTHER 19
+#define LX2_META 20
 
 static unsigned char lexact1[256], lexact2[256], lextok2[256];
 
@@ -405,7 +406,7 @@ initlextabs(void)
 {
     int t0;
     static char *lx1 = "\\q\n;!&|(){}[]<>";
-    static char *lx2 = ";)|$[]~({}><=\\\'\"`,";
+    static char *lx2 = ";)|$[]~({}><=\\\'\"`,-";
 
     for (t0 = 0; t0 != 256; t0++) {
 	lexact1[t0] = LX1_OTHER;
@@ -919,7 +920,7 @@ gettok(void)
 static enum lextok
 gettokstr(int c, int sub)
 {
-    int bct = 0, pct = 0, brct = 0, fdpar = 0;
+    int bct = 0, pct = 0, brct = 0, seen_brct = 0, fdpar = 0;
     int intpos = 1, in_brace_param = 0;
     int inquote, unmatched = 0;
     enum lextok peek;
@@ -1033,8 +1034,10 @@ gettokstr(int c, int sub)
 	    }
 	    break;
 	case LX2_INBRACK:
-	    if (!in_brace_param)
+	    if (!in_brace_param) {
 		brct++;
+		seen_brct = 1;
+	    }
 	    c = Inbrack;
 	    break;
 	case LX2_OUTBRACK:
@@ -1346,6 +1349,21 @@ gettokstr(int c, int sub)
 	    c = Tick;
 	    SETPAREND
 	    break;
+	case LX2_DASH:
+	    /*
+	     * - shouldn't be treated as a special character unless
+	     * we're in a pattern.  However, simply counting "[" doesn't
+	     * work as []a-z] is a valid expression and we don't know
+	     * down here what this "[" is for as $foo[stuff] is valid
+	     * in zsh.  So just detect an opening [, which is enough
+	     * to turn this into a pattern; the Dash will be harmlessly
+	     * untokenised if not wanted.
+	     */
+	    if (seen_brct)
+		c = Dash;
+	    else
+		c = '-';
+	    break;
 	}
 	add(c);
 	c = hgetc();
diff --git a/Src/pattern.c b/Src/pattern.c
index 9e8a80a..d2b8c59 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -1459,7 +1459,7 @@ patcomppiece(int *flagp, int paren)
 		charstart = patparse;
 		METACHARINC(patparse);
 
-		if (*patparse == '-' && patparse[1] &&
+		if (*patparse == Dash && patparse[1] &&
 		    patparse[1] != Outbrack) {
 		    patadd(NULL, STOUC(Meta)+PP_RANGE, 1, PA_NOALIGN);
 		    if (itok(*charstart)) {
@@ -1468,7 +1468,7 @@ patcomppiece(int *flagp, int paren)
 		    } else {
 			patadd(charstart, 0, patparse-charstart, PA_NOALIGN);
 		    }
-		    charstart = ++patparse;	/* skip ASCII '-' */
+		    charstart = ++patparse;	/* skip Dash token */
 		    METACHARINC(patparse);
 		}
 		if (itok(*charstart)) {
diff --git a/Src/utils.c b/Src/utils.c
index 788eba9..fd0bab3 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -3888,7 +3888,7 @@ inittyptab(void)
     typtab['\0'] |= IMETA;
     typtab[STOUC(Meta)  ] |= IMETA;
     typtab[STOUC(Marker)] |= IMETA;
-    for (t0 = (int)STOUC(Pound); t0 <= (int)STOUC(Comma); t0++)
+    for (t0 = (int)STOUC(Pound); t0 <= (int)STOUC(LAST_NORMAL_TOK); t0++)
 	typtab[t0] |= ITOK | IMETA;
     for (t0 = (int)STOUC(Snull); t0 <= (int)STOUC(Nularg); t0++)
 	typtab[t0] |= ITOK | IMETA | INULL;
diff --git a/Src/zsh.h b/Src/zsh.h
index 0302d68..6ee2a9c 100644
--- a/Src/zsh.h
+++ b/Src/zsh.h
@@ -192,24 +192,30 @@ struct mathfunc {
 #define Tilde		((char) 0x98)
 #define Qtick		((char) 0x99)
 #define Comma		((char) 0x9a)
+#define Dash            ((char) 0x9b) /* Only in patterns */
+/*
+ * Marks the last of the group above.
+ * Remaining tokens are even more special.
+ */
+#define LAST_NORMAL_TOK Dash
 /*
  * Null arguments: placeholders for single and double quotes
  * and backslashes.
  */
-#define Snull		((char) 0x9b)
-#define Dnull		((char) 0x9c)
-#define Bnull		((char) 0x9d)
+#define Snull		((char) 0x9c)
+#define Dnull		((char) 0x9d)
+#define Bnull		((char) 0x9e)
 /*
  * Backslash which will be returned to "\" instead of being stripped
  * when we turn the string into a printable format.
  */
-#define Bnullkeep       ((char) 0x9e)
+#define Bnullkeep       ((char) 0x9f)
 /*
  * Null argument that does not correspond to any character.
  * This should be last as it does not appear in ztokens and
  * is used to initialise the IMETA type in inittyptab().
  */
-#define Nularg		((char) 0x9f)
+#define Nularg		((char) 0xa0)
 
 /*
  * Take care to update the use of IMETA appropriately when adding
@@ -220,7 +226,7 @@ struct mathfunc {
  * Also used in pattern character arrays as guaranteed not to
  * mark a character in a string.
  */
-#define Marker		((char) 0xa0)
+#define Marker		((char) 0xa1)
 
 /* chars that need to be quoted if meant literally */
 
diff --git a/Test/D02glob.ztst b/Test/D02glob.ztst
index f944a4f..86133b0 100644
--- a/Test/D02glob.ztst
+++ b/Test/D02glob.ztst
@@ -582,3 +582,43 @@
 >1 OK
 >2 OK
 >3 OK
+
+  [[ foo = 'f'\o"o" ]]
+0:Stripping of quotes from patterns (1)
+
+  [[ foo = 'f'('o'|'a')('o'|'b') ]]
+0:Stripping of quotes from patterns (2)
+
+  [[ fob = 'f'('o'|'a')('o'|'b') ]]
+0:Stripping of quotes from patterns (3)
+
+  [[ fab = 'f'('o'|'a')('o'|'b') ]]
+0:Stripping of quotes from patterns (4)
+
+  [[ fib != 'f'('o'|'a')('o'|'b') ]]
+0:Stripping of quotes from patterns (5)
+
+  [[ - != [a-z] ]]
+0:- is a special character in ranges
+
+  [[ - = ['a-z'] ]]
+0:- is not a special character in ranges if quoted
+
+  [[ b-1 = [a-z]-[0-9] ]]
+0:- untokenized following a bracketed subexpression
+
+  [[ b-1 = []a-z]-[]0-9] ]]
+0:- "]" after "[" is normal range character and - still works
+
+  headremove="bcdef"
+  print ${headremove#[a-z]}
+0:active - works in pattern in parameter
+>cdef
+
+  headremove="bcdef"
+  print ${headremove#['a-z']}
+  headremove="-cdef"
+  print ${headremove#['a-z']}
+0:quoted - works in pattern in parameter
+>bcdef
+>cdef

Follow-Ups:
- Re: [BUG] quoting within bracket patterns has no effect
  - From: Martijn Dekker
- Re: [BUG] quoting within bracket patterns has no effect
  - From: Peter Stephenson
- Re: [BUG] quoting within bracket patterns has no effect
  - From: Jun T.

References:
- [BUG] quoting within bracket patterns has no effect
  - From: Martijn Dekker

Messages sorted by: Reverse Date, Date, Thread, Author