Zsh Mailing List Archive Messages sorted by: Reverse Date, Date, Thread, Author
PATCH: 4.1.4: POSIX ranges

X-seq: zsh-workers 4209
From: Peter Stephenson <pws@xxxxxxxxxxxxxxxxx>
To: zsh-workers@xxxxxxxxxxxxxxx (Zsh hackers list)
Subject: PATCH: 4.1.4: POSIX ranges
Date: Wed, 08 Jul 1998 15:36:15 +0200
Somebody was complaining about the lack of these.  See the manual page
change.

I implemented the lookup of 'alnum', 'alpha' etc. in the trivial way:
firstly because setting up a hash table uses even more code and
memory, secondly because the corresponding isalnum() etc. are usually
macros rather than functions so are untableable.  I'm hoping there
aren't too many underendowed ctype macro sets out there, or we're
going to have to add a configure test.

As usual with zsh, there were some extra issues buried in there.

First, the guarantees in the manual page that []...] would match a
literal ']' and [^]...] wouldn't, were being upheld by a test in
lex.c, instead of the globbing code.  This had the effect that
e.g. [...[] didn't work.  I've moved the test to where I think it
should be in glob.c: this means that all ]'s, even those not preceded by a
[, get tokenised.  The globbing code is supposed to be able to cope
with this sort of thing.

Second, the tokenise() code --- on the fly tokenisation for
substituted variables that should turn into glob patterns, and so on
--- was also getting its paws onto ['s and ]'s and trying to second
guess the globbing code.  As above, I've simply made the routine
blindly tokenise every unquoted [ and ] it comes across.

In both cases, if anyone knows somewhere where tokenising an unquoted
[ or ] is definitely wrong, say.  As far as I know, the only
difference is that glob() will be called unnecessarily a few times and
will harmlessly untokenise the bracket in question.

By the way, I haven't touched this, but what's happened to
nobadpattern?

% setopt nobadpattern
% [[ [ = [ ]]
zsh: bad pattern: [

Is it really not supposed to work inside tests?  Maybe it's safer that
way.

*** Doc/Zsh/expn.yo.range	Mon Jun 15 09:52:33 1998
--- Doc/Zsh/expn.yo	Wed Jul  8 12:23:06 1998
***************
*** 735,740 ****
--- 735,756 ----
  can be specified by separating two characters by a `tt(-)'.
  A `tt(-)' or `tt(])' may be matched by including it as the
  first character in the list.
+ There are also several named classes of characters, in the form
+ `tt([:)var(name)tt(:])' with the following meanings:  `tt([:alnum:])'
+ alphanumeric, `tt([:alpha:])' alphabetic,
+ `tt([:cntrl:])' control character, `tt([:digit:])' decimal
+ digit, `tt([:graph:])' printing character except space,
+ `tt([:lower:])' lowercase letter, `tt([:print:])' printable character,
+ `tt([:punct:])' printing character neither alphanumeric nor space,
+ `tt([:space:])' whitespace character, `tt([:upper:])' uppercase letter, 
+ `tt([:xdigit:])' hexadecimal digit.  These use the macros provided by
+ the operating system to test for the given character combinations,
+ including any modifications due to local language settings:  see
+ manref(ctype)(3).  Note that the square brackets are additional
+ to those enclosing the whole set of characters, so to test for a
+ single alphanumeric character you need `tt([[:alnum:]])'.  Named
+ character sets can be used alongside other types,
+ e.g. `tt([[:alpha:]0-9])'.
  )
  xitem(tt([^)...tt(]))
  item(tt([!)...tt(]))(
*** Misc/globtests.range	Sat Apr 25 19:09:19 1998
--- Misc/globtests	Wed Jul  8 15:26:52 1998
***************
*** 1,6 ****
  #!/usr/local/bin/zsh -f
  
! setopt extendedglob
  unsetopt kshglob
  
  failed=0
--- 1,6 ----
  #!/usr/local/bin/zsh -f
  
! setopt extendedglob badpattern
  unsetopt kshglob
  
  failed=0
***************
*** 95,99 ****
--- 95,107 ----
  f foob          (^foo)b*
  t foobb         (^foo)b*
  f zsh           ^z*
+ t a%1X          [[:alpha:][:punct:]]#[[:digit:]][^[:lower:]]
+ f a%1           [[:alpha:][:punct:]]#[[:digit:]][^[:lower:]]
+ t [:            [[:]#
+ t :]            []:]#
+ t :]            [:]]#
+ t [             [[]
+ t ]             []]
+ t []            [^]]]
  EOT
  print "$failed tests failed."
*** Src/glob.c.range	Mon May 11 09:43:29 1998
--- Src/glob.c	Wed Jul  8 15:25:05 1998
***************
*** 633,641 ****
  		return NULL;
  	} else if (*pptr == Inbrack) {
  	    /* Character set: brackets had better match */
! 	    while (*++pptr && *pptr != Outbrack)
! 		if (itok(*pptr))
  		    *pptr = ztokens[*pptr - Pound];
  	    if (*pptr != Outbrack)
  		return NULL;
  	} else if (itok(*pptr) && *pptr != Star && *pptr != Quest)
--- 633,655 ----
  		return NULL;
  	} else if (*pptr == Inbrack) {
  	    /* Character set: brackets had better match */
! 	    if (pptr[1] == Outbrack)
! 		*++pptr = ']';
! 	    else if ((pptr[1] == Hat || pptr[1] == '^' || pptr[1] == '!') &&
! 		     pptr[2] == Outbrack)
! 		*(pptr += 2) = ']';
! 	    while (*++pptr && *pptr != Outbrack) {
! 		if (itok(*pptr)) {
! 		    /* POSIX classes: make sure it's a real one, *
! 		     * leave the Inbrack tokenised if so.        */
! 		    char *nptr;
! 		    if (*pptr == Inbrack && pptr[1] == ':'
! 			&& (nptr = strchr(pptr+2, ':')) && 
! 			*++nptr == Outbrack)
! 			pptr = nptr;
  		    *pptr = ztokens[*pptr - Pound];
+ 		}
+ 	    }
  	    if (*pptr != Outbrack)
  		return NULL;
  	} else if (itok(*pptr) && *pptr != Star && *pptr != Quest)
***************
*** 2192,2197 ****
--- 2206,2293 ----
  
  /**/
  static int
+ posix_range(char **patptr, int ch)
+ {
+     /* Match POSIX ranges, which correspond to ctype macros,  *
+      * e.g. [:alpha:] -> isalpha.  It just doesn't seem worth * 
+      * the palaver of creating a hash table for this.           */
+     char *start = *patptr;
+     int len;
+ 
+     /* we made sure in parsecomp() there was a ':' to search for */
+     *patptr = strchr(start, ':');
+     len = *patptr++ - start;
+ 
+     if (!strncmp(start, "alpha", len))
+ 	return isalpha(ch);
+     if (!strncmp(start, "alnum", len))
+ 	return isalnum(ch);
+     if (!strncmp(start, "cntrl", len))
+ 	return iscntrl(ch);
+     if (!strncmp(start, "digit", len))
+ 	return isdigit(ch);
+     if (!strncmp(start, "graph", len))
+ 	return isgraph(ch);
+     if (!strncmp(start, "lower", len))
+ 	return islower(ch);
+     if (!strncmp(start, "print", len))
+ 	return isprint(ch);
+     if (!strncmp(start, "punct", len))
+ 	return ispunct(ch);
+     if (!strncmp(start, "space", len))
+ 	return isspace(ch);
+     if (!strncmp(start, "upper", len))
+ 	return isupper(ch);
+     if (!strncmp(start, "xdigit", len))
+ 	return isxdigit(ch);
+     return 0;
+ }
+ 
+ /**/
+ static void
+ rangematch(char **patptr, int ch, int rchar)
+ {
+     /* Check for a character in a [...] or [^...].  The [ *
+      * and optional ^ have already been skipped.          */
+ 
+     char *pat = *patptr;
+ #ifdef HAVE_STRCOLL
+     char l_buf[2], r_buf[2], ch_buf[2];
+ 
+     ch_buf[0] = ch;
+     l_buf[1] = r_buf[1] = ch_buf[1] = '\0';
+ #endif
+ 
+ #define PAT(X) (pat[X] == Meta ? pat[(X)+1] ^ 32 : untok(pat[X]))
+ #define PPAT(X) (pat[(X)-1] == Meta ? pat[X] ^ 32 : untok(pat[X]))
+ 
+     for (pat++; *pat != Outbrack && *pat;
+ 	 *pat == Meta ? pat += 2 : pat++) {
+ 	if (*pat == Inbrack) {
+ 	    /* Inbrack can only occur inside a range if we found [:...:]. */
+ 	    pat += 2;
+ 	    if (posix_range(&pat, ch))
+ 		break;
+ 	} else if (*pat == '-' && pat[-1] != rchar &&
+ 		   pat[1] != Outbrack) {
+ #ifdef HAVE_STRCOLL
+ 	    l_buf[0] = PPAT(-1);
+ 	    r_buf[0] = PAT(1);
+ 	    if (strcoll(l_buf, ch_buf) <= 0 &&
+ 		strcoll(ch_buf, r_buf) <= 0)
+ #else
+ 		if (PPAT(-1) <= ch && PAT(1) >= ch)
+ #endif
+ 		    break;
+ 	} else if (ch == PAT(0))
+ 	    break;
+     }
+ 
+     *patptr = pat;
+ }
+ 
+ /**/
+ static int
  matchonce(Comp c)
  {
      char *pat = c->str;
***************
*** 2304,2341 ****
  	}
  	if (*pat == Inbrack) {
  	    /* Match groups of characters */
- #define PAT(X) (pat[X] == Meta ? pat[(X)+1] ^ 32 : untok(pat[X]))
- #define PPAT(X) (pat[(X)-1] == Meta ? pat[X] ^ 32 : untok(pat[X]))
  	    char ch;
- #ifdef HAVE_STRCOLL
- 	    char l_buf[2], r_buf[2], ch_buf[2];
- 
- 	    l_buf[1] = r_buf[1] = ch_buf[1] = '\0';
- #endif
  
  	    if (!*pptr)
  		break;
  	    ch = *pptr == Meta ? pptr[1] ^ 32 : *pptr;
- #ifdef HAVE_STRCOLL
- 	    ch_buf[0] = ch;
- #endif
  	    if (pat[1] == Hat || pat[1] == '^' || pat[1] == '!') {
  		/* group is negated */
! 		pat[1] = Hat;
! 		for (pat += 2; *pat != Outbrack && *pat;
! 		     *pat == Meta ? pat += 2 : pat++)
! 		    if (*pat == '-' && pat[-1] != Hat && pat[1] != Outbrack) {
! #ifdef HAVE_STRCOLL
! 			l_buf[0] = PPAT(-1);
! 			r_buf[0] = PAT(1);
! 			if (strcoll(l_buf, ch_buf) <= 0 &&
! 			    strcoll(ch_buf, r_buf) <= 0)
! #else
! 			if (PPAT(-1) <= ch && PAT(1) >= ch)
! #endif
! 			    break;
! 		    } else if (ch == PAT(0))
! 			break;
  		DPUTS(!*pat, "BUG: something is very wrong in doesmatch()");
  		if (*pat != Outbrack)
  		    break;
--- 2400,2414 ----
  	}
  	if (*pat == Inbrack) {
  	    /* Match groups of characters */
  	    char ch;
  
  	    if (!*pptr)
  		break;
  	    ch = *pptr == Meta ? pptr[1] ^ 32 : *pptr;
  	    if (pat[1] == Hat || pat[1] == '^' || pat[1] == '!') {
  		/* group is negated */
! 		*++pat = Hat;
! 		rangematch(&pat, ch, Hat);
  		DPUTS(!*pat, "BUG: something is very wrong in doesmatch()");
  		if (*pat != Outbrack)
  		    break;
***************
*** 2344,2364 ****
  		continue;
  	    } else {
  		/* pattern is not negated (affirmed? asserted?) */
! 		for (pat++; *pat != Outbrack && *pat;
! 		     *pat == Meta ? pat += 2 : pat++)
! 		    if (*pat == '-' && pat[-1] != Inbrack &&
! 			       pat[1] != Outbrack) {
! #ifdef HAVE_STRCOLL
! 			l_buf[0] = PPAT(-1);
! 			r_buf[0] = PAT(1);
! 			if (strcoll(l_buf, ch_buf) <= 0 &&
! 			    strcoll(ch_buf, r_buf) <= 0)
! #else
! 			if (PPAT(-1) <= ch && PAT(1) >= ch)
! #endif
! 			    break;
! 		    } else if (ch == PAT(0))
! 			break;
  		DPUTS(!pat || !*pat, "BUG: something is very wrong in doesmatch()");
  		if (*pat == Outbrack)
  		    break;
--- 2417,2423 ----
  		continue;
  	    } else {
  		/* pattern is not negated (affirmed? asserted?) */
! 		rangematch(&pat, ch, Inbrack);
  		DPUTS(!pat || !*pat, "BUG: something is very wrong in doesmatch()");
  		if (*pat == Outbrack)
  		    break;
***************
*** 2461,2480 ****
  	    }
  	    bslash = 1;
  	    continue;
- 	case '[':
- 	    if (bslash) {
- 		s[-1] = Bnull;
- 		break;
- 	    }
- 	    t = s;
- 	    if (*++s == '^' || *s == '!')
- 		s++;
- 	    while (*s && *++s != ']');
- 	    if (!*s)
- 		return;
- 	    *t = Inbrack;
- 	    *s = Outbrack;
- 	    break;
  	case '<':
  	    if (isset(SHGLOB))
  		break;
--- 2520,2525 ----
***************
*** 2502,2507 ****
--- 2547,2554 ----
  	case ')':
  	    if (isset(SHGLOB))
  		break;
+ 	case '[':
+ 	case ']':
  	case '*':
  	case '?':
  	    for (t = ztokens; *t; t++)
*** Src/lex.c.range	Wed Jul  8 14:20:46 1998
--- Src/lex.c	Wed Jul  8 15:22:14 1998
***************
*** 876,902 ****
  	    }
  	    break;
  	case LX2_INBRACK:
- 	    add(c);
  	    if (!in_brace_param)
  		brct++;
! 	    c = hgetc();
! 	    if (c == '!' || c == '^') {
! 		add(c);
! 		c = hgetc();
! 	    }
! 	    if (c == ']')
! 		break;
! 	    if (lexstop)
! 		goto brk;
! 	    intpos = 0;
! 	    continue;
  	case LX2_OUTBRACK:
  	    if (!in_brace_param)
  		brct--;
! 	    if (brct < 0) {
  		brct = 0;
- 		break;
- 	    }
  	    c = Outbrack;
  	    break;
  	case LX2_INPAR:
--- 876,890 ----
  	    }
  	    break;
  	case LX2_INBRACK:
  	    if (!in_brace_param)
  		brct++;
! 	    c = Inbrack;
! 	    break;
  	case LX2_OUTBRACK:
  	    if (!in_brace_param)
  		brct--;
! 	    if (brct < 0)
  		brct = 0;
  	    c = Outbrack;
  	    break;
  	case LX2_INPAR:

-- 
Peter Stephenson <pws@xxxxxxxxxxxxxxxxx>       Tel: +39 50 844536
WWW:  http://www.ifh.de/~pws/
Gruppo Teorico, Dipartimento di Fisica
Piazza Torricelli 2, 56100 Pisa, Italy
Follow-Ups:
- Re: PATCH: 4.1.4: POSIX ranges
  - From: Bart Schaefer
Messages sorted by: Reverse Date, Date, Thread, Author