Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: rewrite of completion matching



This is the long-hoped-for rewrite of the completion matching code so as
to handle arbitary characters instead of just a set of 256 single-byte
characters.  However, this is only half the story:  I have not modified
it to use wide characters yet, so it won't work with multibyte character
sets.  That second step ought to be much more mechanical than this one,
though it's still not trivial to identify anything as standard as a
well-defined interface to the matching code so it will take some
fiddling.

Astonishingly, after I'd removed all the trivial errors, it passed all
the compmatch tests straight away; so, although there are bound to be
glitches, it's going to need wider testing to find them out.

It's now possible (and recommended) to use m:{[:lower:]}={[:upper:]}
etc. for case matching, but until I change the code for wide characters
this won't actually gain you much.  I've updated compinstall to do
this.

By the way, you can now do partial word completion with
'r:|[[:punct:]]=**' instead of 'r:|[.,_-]=**', should you want to.

There are some limitations.  I'm aware of one place which is plain
wrong: if you use a negative character class assertion, [^...], in file
matching, then compfiles -p/-P is supposed to generate a match that
matches either that class, or the character on the command line.  Cases
like "[^[:upper:]]" with the character on the command line "A" stumped
me, so this doesn't work.  (It ought to be possible to do this as
"(A|[^[:upper:]])" but I've left such subtleties for now.)  What the
effect of this is likely to be is obscure, given that none of the
compfiles code is documented and nor is the point where it's called in
_path_files.  However, I don't think negative character class assertions
are very common in completion.

Other places marked "TODO" are mostly points where I noticed the
existing code was probably imperfect but haven't done anything about it,
rather than new problems.

I've handled the difficult case, the one where Bart and Andrey provided
commentary on what was supposed to be going on, as follows.  Instead of
attempting to assemble the string for the command line immediately and
then testing whether it was correct, I have assembled an appropriate set
of pattern match assertions (or restrictions, as I called them in the
code) for the string, and passed that down to the point where pattern
matching is done.  At this point the assertions are used to attempt to
recover the character on the line.  If it remained ambiguous even at
this point I have given up---clearly there's not a lot of point in
completion if it's simply guessing characters, so I don't think this is
likely to be a problem in practice.  Actually, I don't think this code
is used much for anything apart from correspondence classes, which is
the easy case.

I know that the matching test suite does tickle that code, but only in
small ways---individual characters being replaced by others.  Hence I
remain dubious that it's all perfect.  (In fact, I just discovered a
major error and the tests passed even before I'd fixed it.)  However,
by the same token I've no reason to believe the code it replaces was
perfect either.


Index: Completion/compinstall
===================================================================
RCS file: /cvsroot/zsh/zsh/Completion/compinstall,v
retrieving revision 1.12
diff -u -r1.12 compinstall
--- Completion/compinstall	18 Mar 2008 15:14:22 -0000	1.12
+++ Completion/compinstall	7 Jun 2008 20:19:34 -0000
@@ -985,8 +985,14 @@
   for (( eltcnt = 1; eltcnt <= $#mlist; eltcnt++ )); do
     [[ $mlist[eltcnt] == "+"* ]] && a_or_r[$eltcnt]='+'
     [[ -z $mlist[$eltcnt] ]] && n_list[$eltcnt]=$eltcnt
+    # Accept the old form of lower/upper correspondence, but we'll
+    # output the new one instead.
     [[ $mlist[$eltcnt] = *"m:{a-z}={A-Z}"* ]] && c_list[$eltcnt]=$eltcnt
+    [[ $mlist[$eltcnt] = *"m:{[:lower:]}={[:upper:]}"* ]] &&
+      c_list[$eltcnt]=$eltcnt
     [[ $mlist[$eltcnt] = *"m:{a-zA-Z}={A-Za-z}"* ]] && C_list[$eltcnt]=$eltcnt
+    [[ $mlist[$eltcnt] = *"m:{[:lower:][:upper:]}={[:upper:][:lower:]}"* ]] &&
+      C_list[$eltcnt]=$eltcnt
     # For partial word stuff, we use backreferences to find out what
     # the set of separators was.
     if [[ $mlist[$eltcnt] = (#b)*"r:|["([^\]]#)"]=*"#" r:|=*"* ]]; then
@@ -1105,8 +1111,10 @@
   # and reconstructing the elements of the matcher array.
   for (( eltcnt = 1; eltcnt <= 4; eltcnt++ )); do
     elt=
-    [[ $c_list[$eltcnt] != ' ' ]] && elt="${elt:+$elt }m:{a-z}={A-Z}"
-    [[ $C_list[$eltcnt] != ' ' ]] && elt="${elt:+$elt }m:{a-zA-Z}={A-Za-z}"
+    [[ $c_list[$eltcnt] != ' ' ]] &&
+      elt="${elt:+$elt }m:{[:lower:]}={[:upper:]}"
+    [[ $C_list[$eltcnt] != ' ' ]] &&
+      elt="${elt:+$elt }m:{[:lower:][:upper:]}={[:upper:][:lower:]}"
     [[ $p_list[$eltcnt] != ' ' ]] &&
       elt="${elt:+$elt }r:|[${pw_seps[$eltcnt]}]=*${pw_dstar[$eltcnt]}\
  r:|=*${pw_dstar[$eltcnt]}"
Index: Doc/Zsh/compwid.yo
===================================================================
RCS file: /cvsroot/zsh/zsh/Doc/Zsh/compwid.yo,v
retrieving revision 1.40
diff -u -r1.40 compwid.yo
--- Doc/Zsh/compwid.yo	7 Oct 2006 12:15:57 -0000	1.40
+++ Doc/Zsh/compwid.yo	7 Jun 2008 20:19:35 -0000
@@ -918,15 +918,35 @@
 meaning directly after the opening brace.  They indicate that a range of
 characters on the line match a range of characters in the trial
 completion, but (unlike ordinary character classes) paired according to
-the corresponding position in the sequence. For example, to make any
-lowercase letter on the line match the corresponding uppercase letter in
-the trial completion, you can use `tt(m:{a-z}={A-Z})'.  More than one
-pair of classes can occur, in which case the first class before the
-tt(=) corresponds to the first after it, and so on.  If one side has
+the corresponding position in the sequence.  For example, to make any
+ASCII lower case letter on the line match the corresponding upper case
+letter in the trial completion, you can use `tt(m:{a-z}={A-Z})'
+(however, see below for the recommended form for this).  More
+than one pair of classes can occur, in which case the first class before
+the tt(=) corresponds to the first after it, and so on.  If one side has
 more such classes than the other side, the superfluous classes behave
 like normal character classes.  In anchor patterns correspondence classes
 also behave like normal character classes.
 
+The standard `tt([:)var(name)tt(:])' forms described for standard shell
+patterns,
+ifnzman(noderef(Filename Generation))\
+ifzman(see the section FILENAME GENERATION in zmanref(zshexpn)),
+may appear in correspondence classes as well as normal character
+classes.  The only special behaviour in correspondence classes is if
+the form on the left and the form on the right are each one of
+tt([:upper:]), tt([:lower:]).  In these cases the
+character in the word and the character on the line must be the same up
+to a difference in case.  Hence to make any lower case character on the
+line match the corresponding upper case character in the trial
+completion you can use `tt(m:{[:lower:]}={[:upper:]})'.  Although the
+matching system does not yet handle multibyte characters, this is likely
+to be a future extension, at which point this syntax will handle
+arbitrary alphabets ; hence this form, rather than the use of explicit
+ranges, is thee recommended form.  In other cases
+`tt([:)var(name)tt(:])' forms are allowed, but imply no special
+constraint on the characters beyond that implied by the test itself.
+
 The pattern var(tpat) may also be one or two stars, `tt(*)' or
 `tt(**)'. This means that the pattern on the command line can match
 any number of characters in the trial completion. In this case the
@@ -948,7 +968,7 @@
 generated and uppercase letters on the line match the corresponding
 lowercase letters in the words:
 
-example(compadd -M 'L:|[nN][oO]= M:_= M:{A-Z}={a-z}' - \ 
+example(compadd -M 'L:|[nN][oO]= M:_= M:{[:up[er:]}={[:lower:]}' - \ 
   ${(k)options} )
 
 The first part says that the pattern `tt([nN][oO])' at the beginning
@@ -979,12 +999,12 @@
 the same as in the option example, except here we wish to retain the
 characters in the list of completions:
 
-example(compadd -M 'm:{a-z}={A-Z}' ... )
+example(compadd -M 'm:{[:lower:]}={[:upper:]}' ... )
 
 This makes lowercase letters match their uppercase counterparts.
 To make uppercase letters match the lowercase forms as well:
 
-example(compadd -M 'm:{a-zA-Z}={A-Za-z}' ... )
+example(compadd -M 'm:{[:lower:][:upper:]}={[:upper:][:lower:]}' ... )
 
 A nice example for the use of tt(*) patterns is partial word
 completion. Sometimes you would like to make strings like `tt(c.s.u)'
@@ -1046,23 +1066,23 @@
 complete strings with trailing numbers. Here one could use the simple
 form with only one anchor as in:
 
-example(compadd -M 'r:|[A-Z0-9]=* r:|=*' LikeTHIS FooHoo 5foo123 5bar234)
+example(compadd -M 'r:|[[:upper:]0-9]=* r:|=*' LikeTHIS FooHoo 5foo123 5bar234)
 
 But with this, the string `tt(H)' would neither complete to `tt(FooHoo)'
 nor to `tt(LikeTHIS)' because in each case there is an uppercase
 letter before the `tt(H)' and that is matched by the anchor. Likewise, 
 a `tt(2)' would not be completed. In both cases this could be changed
-by using `tt(r:|[A-Z0-9]=**)', but then `tt(H)' completes to both
+by using `tt(r:|[[:upper:]0-9]=**)', but then `tt(H)' completes to both
 `tt(LikeTHIS)' and `tt(FooHoo)' and a `tt(2)' matches the other
 strings because characters can be inserted before every uppercase
 letter and digit. To avoid this one would use:
 
-example(compadd -M 'r:[^A-Z0-9]||[A-Z0-9]=** r:|=*' \ 
+example(compadd -M 'r:[^[:upper:]0-9]||[[:upper:]0-9]=** r:|=*' \ 
     LikeTHIS FooHoo foo123 bar234)
 
 By using these two anchors, a `tt(H)' matches only uppercase `tt(H)'s that 
 are immediately preceded by something matching the left anchor
-`tt([^A-Z0-9])'. The effect is, of course, that `tt(H)' matches only
+`tt([^[:upper:]0-9])'. The effect is, of course, that `tt(H)' matches only
 the string `tt(FooHoo)', a `tt(2)' matches only `tt(bar234)' and so on.
 
 When using the completion system (see
Index: Src/pattern.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/pattern.c,v
retrieving revision 1.46
diff -u -r1.46 pattern.c
--- Src/pattern.c	17 May 2008 12:15:19 -0000	1.46
+++ Src/pattern.c	7 Jun 2008 20:19:37 -0000
@@ -193,25 +193,6 @@
  *	     	    v      v  	  	    ^
  *	            ------------------------
  */
-#define PP_ALPHA  1
-#define PP_ALNUM  2
-#define PP_ASCII  3
-#define PP_BLANK  4
-#define PP_CNTRL  5
-#define PP_DIGIT  6
-#define PP_GRAPH  7
-#define PP_LOWER  8
-#define PP_PRINT  9
-#define PP_PUNCT  10
-#define PP_SPACE  11
-#define PP_UPPER  12
-#define PP_XDIGIT 13
-#define PP_IDENT  14
-#define PP_IFS    15
-#define PP_IFSSPACE   16
-#define PP_WORD   17
-#define PP_UNKWN  18
-#define PP_RANGE  19
 
 #define	P_OP(p)		((p)->l & 0xff)
 #define	P_NEXT(p)	((p)->l >> 8)
@@ -1057,6 +1038,127 @@
     return 1;
 }
 
+
+static const char *colon_stuffs[]  = {
+    "alpha", "alnum", "ascii", "blank", "cntrl", "digit", "graph", 
+    "lower", "print", "punct", "space", "upper", "xdigit", "IDENT",
+    "IFS", "IFSSPACE", "WORD", NULL
+};
+
+/*
+ * Handle the guts of a [:stuff:] character class element.
+ * start is the beginning of "stuff" and len is its length.
+ * This code is exported for the benefit of completion matching.
+ */
+
+/**/
+mod_export int
+range_type(char *start, int len)
+{
+    const char **csp;
+
+    for (csp = colon_stuffs; *csp; csp++) {
+	if (!strncmp(start, *csp, len))
+	    return (csp - colon_stuffs) + PP_FIRST;
+    }
+
+    return PP_UNKWN;
+}
+
+
+/*
+ * Convert the contents of a [...] or [^...] expression (just the
+ * ... part) back into a string.  This is used by compfiles -p/-P
+ * for some reason.  The compiled form (a metafied string) is
+ * passed in rangestr.
+ *
+ * If outstr is non-NULL the compiled form is placed there.  It
+ * must be sufficiently long.  A terminating NULL is appended.
+ *
+ * Return the length required, not including the terminating NULL.
+ *
+ * TODO: this is non-multibyte for now.  It will need to be defined
+ * appropriately with MULTIBYTE_SUPPORT when the completion matching
+ * code catches up.
+ */
+
+/**/
+mod_export int
+pattern_range_to_string(char *rangestr, char *outstr)
+{
+    int len = 0;
+
+    while (*rangestr) {
+	if (imeta(STOUC(*rangestr))) {
+	    int swtype = STOUC(*rangestr) - STOUC(Meta);
+
+	    if (swtype == 0) {
+		/* Ordindary metafied character */
+		if (outstr)
+		{
+		    *outstr++ = Meta;
+		    *outstr++ = rangestr[1] ^ 32;
+		}
+		len += 2;
+		rangestr += 2;
+	    } else if (swtype == PP_RANGE) {
+		/* X-Y range */
+		int i;
+
+		for (i = 0; i < 2; i++) {
+		    if (*rangestr == Meta) {
+			if (outstr) {
+			    *outstr++ = Meta;
+			    *outstr++ = rangestr[1];
+			}
+			len += 2;
+			rangestr += 2;
+		    } else {
+			if (outstr)
+			    *outstr++ = *rangestr;
+			len++;
+			rangestr++;
+		    }
+
+		    if (i == 0) {
+			if (outstr)
+			    *outstr++ = '-';
+			len++;
+		    }
+		}
+	    } else if (swtype >= PP_FIRST && swtype <= PP_LAST) {
+		/* [:stuff:]; we need to output [: and :] */
+		const char *found = colon_stuffs[swtype - PP_FIRST];
+		int newlen = strlen(found);
+		if (outstr) {
+		    strcpy(outstr, "[:");
+		    outstr += 2;
+		    memcpy(outstr, found, newlen);
+		    outstr += newlen;
+		    strcpy(outstr, ":]");
+		    outstr += 2;
+		}
+		len += newlen + 4;
+		rangestr++;
+	    } else {
+		/* shouldn't happen */
+		DPUTS(1, "BUG: unknown PP_ code in pattern range");
+		rangestr++;
+	    }
+	} else {
+	    /* ordinary character, guaranteed no Meta handling needed */
+	    if (outstr)
+		*outstr++ = *rangestr;
+	    len++;
+	    rangestr++;
+	}
+    }
+
+    if (outstr)
+	*outstr = '\0';
+    return len;
+}
+
 /*
  * compile a chunk such as a literal string or a [...] followed
  * by a possible hash operator
@@ -1230,45 +1332,10 @@
 			/* Posix range. */
 			patparse += 2;
 			len = nptr - patparse;
-			if (!strncmp(patparse, "alpha", len))
-			    ch = PP_ALPHA;
-			else if (!strncmp(patparse, "alnum", len))
-			    ch = PP_ALNUM;
-			else if (!strncmp(patparse, "ascii", len))
-			    ch = PP_ASCII;
-			else if (!strncmp(patparse, "blank", len))
-			    ch = PP_BLANK;
-			else if (!strncmp(patparse, "cntrl", len))
-			    ch = PP_CNTRL;
-			else if (!strncmp(patparse, "digit", len))
-			    ch = PP_DIGIT;
-			else if (!strncmp(patparse, "graph", len))
-			    ch = PP_GRAPH;
-			else if (!strncmp(patparse, "lower", len))
-			    ch = PP_LOWER;
-			else if (!strncmp(patparse, "print", len))
-			    ch = PP_PRINT;
-			else if (!strncmp(patparse, "punct", len))
-			    ch = PP_PUNCT;
-			else if (!strncmp(patparse, "space", len))
-			    ch = PP_SPACE;
-			else if (!strncmp(patparse, "upper", len))
-			    ch = PP_UPPER;
-			else if (!strncmp(patparse, "xdigit", len))
-			    ch = PP_XDIGIT;
-			else if (!strncmp(patparse, "IDENT", len))
-			    ch = PP_IDENT;
-			else if (!strncmp(patparse, "IFS", len))
-			    ch = PP_IFS;
-			else if (!strncmp(patparse, "IFSSPACE", len))
-			    ch = PP_IFSSPACE;
-			else if (!strncmp(patparse, "WORD", len))
-			    ch = PP_WORD;
-			else
-			    ch = PP_UNKWN;
+			ch = range_type(patparse, len);
 			patparse = nptr + 2;
 			if (ch != PP_UNKWN)
-			    patadd(NULL, STOUC(Meta+ch), 1, PA_NOALIGN);
+			    patadd(NULL, STOUC(Meta) + ch, 1, PA_NOALIGN);
 			continue;
 		}
 		charstart = patparse;
@@ -1276,7 +1343,7 @@
 
 		if (*patparse == '-' && patparse[1] &&
 		    patparse[1] != Outbrack) {
-		    patadd(NULL, STOUC(Meta+PP_RANGE), 1, PA_NOALIGN);
+		    patadd(NULL, STOUC(Meta)+PP_RANGE, 1, PA_NOALIGN);
 		    if (itok(*charstart)) {
 			patadd(0, STOUC(ztokens[*charstart - Pound]), 1,
 			       PA_NOALIGN);
@@ -2369,19 +2436,19 @@
 		wchar_t cr = CHARREF(patinput, patinend);
 		char *scanop = (char *)P_OPERAND(scan);
 		if (patglobflags & GF_MULTIBYTE) {
-		    if (mb_patmatchrange(scanop, cr) ^
+		    if (mb_patmatchrange(scanop, cr, NULL, NULL) ^
 			(P_OP(scan) == P_ANYOF))
 			fail = 1;
 		    else
 			CHARINC(patinput, patinend);
-		} else if (patmatchrange(scanop, (int)cr) ^
+		} else if (patmatchrange(scanop, (int)cr, NULL, NULL) ^
 			   (P_OP(scan) == P_ANYOF))
 		    fail = 1;
 		else
 		    CHARINC(patinput, patinend);
 #else
 		if (patmatchrange((char *)P_OPERAND(scan),
-				   CHARREF(patinput, patinend)) ^
+				  CHARREF(patinput, patinend), NULL, NULL) ^
 		    (P_OP(scan) == P_ANYOF))
 		    fail = 1;
 		else
@@ -3122,12 +3189,33 @@
 /**/
 #ifdef MULTIBYTE_SUPPORT
 
+/*
+ * See if character ch matches a pattern range specification.
+ * The null-terminated specification is in range; the test
+ * character is in ch.
+ *
+ * indptr is used by completion matching, which is why this
+ * function is exported.  If indptr is not NULL we set *indptr
+ * to the index of the character in the range string, adjusted
+ * in the case of "A-B" ranges such that A would count as its
+ * normal index (say IA), B would count as IA + (B-A), and any
+ * character within the range as appropriate.  We're not strictly
+ * guaranteed this fits within a wint_t, but if this is Unicode
+ * in 32 bits we have a fair amount of distance left over.
+ *
+ * mtp is used in the same circumstances.  *mtp returns the match type:
+ * 0 for a standard character, else the PP_ index.  It's not
+ * useful if the match failed.
+ */
+
 /**/
-static int
-mb_patmatchrange(char *range, wchar_t ch)
+mod_export int
+mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp)
 {
     wchar_t r1, r2;
 
+    if (indptr)
+	*indptr = 0;
     /*
      * Careful here: unlike other strings, range is a NULL-terminated,
      * metafied string, because we need to treat the Posix and hyphenated
@@ -3135,7 +3223,10 @@
      */
     while (*range) {
 	if (imeta(STOUC(*range))) {
-	    switch (STOUC(*range++) - STOUC(Meta)) {
+	    int swtype = STOUC(*range++) - STOUC(Meta);
+	    if (mtp)
+		*mtp = swtype;
+	    switch (swtype) {
 	    case 0:
 		/* ordinary metafied character */
 		range--;
@@ -3214,8 +3305,19 @@
 	    case PP_RANGE:
 		r1 = metacharinc(&range);
 		r2 = metacharinc(&range);
-		if (r1 <= ch && ch <= r2)
+		if (r1 <= ch && ch <= r2) {
+		    if (indptr)
+			*indptr += ch - r1;
 		    return 1;
+		}
+		/* Careful not to screw up counting with bogus range */
+		if (indptr && r1 < r2) {
+		    /*
+		     * This gets incremented again below to get
+		     * us past the range end.  This is correct.
+		     */
+		    *indptr += r2 - r1;
+		}
 		break;
 	    case PP_UNKWN:
 		DPUTS(1, "BUG: unknown posix range passed through.\n");
@@ -3224,21 +3326,130 @@
 		DPUTS(1, "BUG: unknown metacharacter in range.");
 		break;
 	    }
-	} else if (metacharinc(&range) == ch)
+	} else if (metacharinc(&range) == ch) {
+	    if (mtp)
+		*mtp = 0;
 	    return 1;
+	}
+	if (indptr)
+	    (*indptr)++;
     }
     return 0;
 }
 
+
+#if 0
+/*
+ * This is effectively the reverse of mb_patmatchrange().
+ * Given a range descriptor of the same form, and an index into it,
+ * try to determine the character that is matched.  If the index
+ * points to a [:...:] generic style match, set chr to WEOF and
+ * return the type in mtp instead.  Return 1 if successful, 0 if
+ * there was no corresponding index.  Note all pointer arguments
+ * must be non-null.
+ *
+ * TODO: for now the completion matching code does not handle
+ * multibyte.  When it does, we will need either this, or
+ * patmatchindex(), but not both---unlike user-initiated pattern
+ * matching, multibyte mode in the line editor is always on when available.
+ */
+
 /**/
+mod_export int
+mb_patmatchindex(char *range, wint_t ind, wint_t *chr, int *mtp)
+{
+    wchar_t r1, r2, rchr;
+    wint_t rdiff;
+
+    *chr = WEOF;
+    *mtp = 0;
+
+    while (*range) {
+	if (imeta(STOUC(*range))) {
+	    int swtype = STOUC(*range++) - STOUC(Meta);
+	    switch (swtype) {
+	    case 0:
+		range--;
+		rchr = metacharinc(&range);
+		if (!ind) {
+		    *chr = (wint_t) rchr;
+		    return 1;
+		}
+		break;
+
+	    case PP_ALPHA:
+	    case PP_ALNUM:
+	    case PP_ASCII:
+	    case PP_BLANK:
+	    case PP_CNTRL:
+	    case PP_DIGIT:
+	    case PP_GRAPH:
+	    case PP_LOWER:
+	    case PP_PRINT:
+	    case PP_PUNCT:
+	    case PP_SPACE:
+	    case PP_UPPER:
+	    case PP_XDIGIT:
+	    case PP_IDENT:
+	    case PP_IFS:
+	    case PP_IFSSPACE:
+	    case PP_WORD:
+		if (!ind) {
+		    *mtp = swtype;
+		    return 1;
+		}
+		break;
+
+	    case PP_RANGE:
+		r1 = metacharinc(&range);
+		r2 = metacharinc(&range);
+		rdiff = (wint_t)r2 - (wint_t)r1; 
+		if (rdiff >= ind) {
+		    *chr = (wint_t)r1 + ind;
+		    return 1;
+		}
+		/* note the extra decrement to ind below */
+		ind -= rdiff;
+		break;
+	    case PP_UNKWN:
+		DPUTS(1, "BUG: unknown posix range passed through.\n");
+		break;
+	    default:
+		DPUTS(1, "BUG: unknown metacharacter in range.");
+		break;
+	    }
+	} else {
+	    rchr = metacharinc(&range);
+	    if (!ind) {
+		*chr = (wint_t)rchr;
+		return 1;
+	    }
+	}
+	if (!ind--)
+	    break;
+    }
+
+    /* No corresponding index. */
+    return 0;
+}
 #endif
 
 /**/
-static int
-patmatchrange(char *range, int ch)
+#endif
+
+/*
+ * Identical function to mb_patmatchrange() above for single-byte
+ * characters.
+ */
+
+/**/
+mod_export int
+patmatchrange(char *range, int ch, int *indptr, int *mtp)
 {
     int r1, r2;
 
+    if (indptr)
+	*indptr = 0;
     /*
      * Careful here: unlike other strings, range is a NULL-terminated,
      * metafied string, because we need to treat the Posix and hyphenated
@@ -3246,7 +3457,10 @@
      */
     for (; *range; range++) {
 	if (imeta(STOUC(*range))) {
-	    switch (STOUC(*range)-STOUC(Meta)) {
+	    int swtype = STOUC(*range) - STOUC(Meta);
+	    if (mtp)
+		*mtp = swtype;
+	    switch (swtype) {
 	    case 0:
 		if (STOUC(*++range ^ 32) == ch)
 		    return 1;
@@ -3326,8 +3540,13 @@
 		r2 = STOUC(UNMETA(range));
 		if (*range == Meta)
 		    range++;
-		if (r1 <= ch && ch <= r2)
+		if (r1 <= ch && ch <= r2) {
+		    if (indptr)
+			*indptr += ch - r1;
 		    return 1;
+		}
+		if (indptr && r1 < r2)
+		    *indptr += r2 - r1;
 		break;
 	    case PP_UNKWN:
 		DPUTS(1, "BUG: unknown posix range passed through.\n");
@@ -3336,9 +3555,100 @@
 		DPUTS(1, "BUG: unknown metacharacter in range.");
 		break;
 	    }
-	} else if (STOUC(*range) == ch)
+	} else if (STOUC(*range) == ch) {
+	    if (mtp)
+		*mtp = 0;
 	    return 1;
+	}
+	if (indptr)
+	    (*indptr)++;
+    }
+    return 0;
+}
+
+/*
+ * Identical function to mb_patmatchindex() above for single-byte
+ * characters.  Here -1 represents a character that needs a special type.
+ */
+
+/**/
+mod_export int
+patmatchindex(char *range, int ind, int *chr, int *mtp)
+{
+    int r1, r2, rdiff, rchr;
+
+    *chr = -1;
+    *mtp = 0;
+
+    for (; *range; range++) {
+	if (imeta(STOUC(*range))) {
+	    int swtype = STOUC(*range) - STOUC(Meta);
+	    switch (swtype) {
+	    case 0:
+		/* ordinary metafied character */
+		rchr = STOUC(*++range) ^ 32;
+		if (!ind) {
+		    *chr = rchr;
+		    return 1;
+		}
+		break;
+
+	    case PP_ALPHA:
+	    case PP_ALNUM:
+	    case PP_ASCII:
+	    case PP_BLANK:
+	    case PP_CNTRL:
+	    case PP_DIGIT:
+	    case PP_GRAPH:
+	    case PP_LOWER:
+	    case PP_PRINT:
+	    case PP_PUNCT:
+	    case PP_SPACE:
+	    case PP_UPPER:
+	    case PP_XDIGIT:
+	    case PP_IDENT:
+	    case PP_IFS:
+	    case PP_IFSSPACE:
+	    case PP_WORD:
+		if (!ind) {
+		    *mtp = swtype;
+		    return 1;
+		}
+		break;
+
+	    case PP_RANGE:
+		range++;
+		r1 = STOUC(UNMETA(range));
+		METACHARINC(range);
+		r2 = STOUC(UNMETA(range));
+		if (*range == Meta)
+		    range++;
+		rdiff = r2 - r1; 
+		if (rdiff >= ind) {
+		    *chr = r1 + ind;
+		    return 1;
+		}
+		/* note the extra decrement to ind below */
+		ind -= rdiff;
+		break;
+	    case PP_UNKWN:
+		DPUTS(1, "BUG: unknown posix range passed through.\n");
+		break;
+	    default:
+		DPUTS(1, "BUG: unknown metacharacter in range.");
+		break;
+	    }
+	} else {
+	    if (!ind) {
+		*chr = STOUC(*range);
+		return 1;
+	    }
+	}
+	if (!ind--)
+	    break;
     }
+
+    /* No corresponding index. */
     return 0;
 }
 
@@ -3382,14 +3692,14 @@
 #ifdef MULTIBYTE_SUPPORT
 	    wchar_t cr = CHARREF(scan, patinend);
 	    if (patglobflags & GF_MULTIBYTE) {
-		if (mb_patmatchrange(opnd, cr) ^
+		if (mb_patmatchrange(opnd, cr, NULL, NULL) ^
 		    (P_OP(p) == P_ANYOF))
 		    break;
-	    } else if (patmatchrange(opnd, (int)cr) ^
+	    } else if (patmatchrange(opnd, (int)cr, NULL, NULL) ^
 		       (P_OP(p) == P_ANYOF))
 		break;
 #else
-	    if (patmatchrange(opnd, CHARREF(scan, patinend)) ^
+	    if (patmatchrange(opnd, CHARREF(scan, patinend), NULL, NULL) ^
 		(P_OP(p) == P_ANYOF))
 		break;
 #endif
Index: Src/zsh.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/zsh.h,v
retrieving revision 1.136
diff -u -r1.136 zsh.h
--- Src/zsh.h	1 Jun 2008 17:58:42 -0000	1.136
+++ Src/zsh.h	7 Jun 2008 20:19:38 -0000
@@ -1307,6 +1307,48 @@
 #define PAT_HAS_EXCLUDP	0x0800	/* (internal): top-level path1~path2. */
 #define PAT_LCMATCHUC   0x1000  /* equivalent to setting (#l) */
 
+/*
+ * Special match types used in character classes.  These
+ * are represented as tokens, with Meta added.  The character
+ * class is represented as a metafied string, with only these
+ * tokens special.  Note that an active leading "!" or "^" for
+ * negation is not part of the string but is flagged in the
+ * surrounding context.
+ *
+ * These types are also used in character and equivalence classes
+ * in completion matching.
+ *
+ * This must be kept ordered by the array colon_stuffs in pattern.c.
+ */
+/* Special value for first definition */
+#define PP_FIRST  1
+/* POSIX-defined types:  [:alpha:] etc. */
+#define PP_ALPHA  1
+#define PP_ALNUM  2
+#define PP_ASCII  3
+#define PP_BLANK  4
+#define PP_CNTRL  5
+#define PP_DIGIT  6
+#define PP_GRAPH  7
+#define PP_LOWER  8
+#define PP_PRINT  9
+#define PP_PUNCT  10
+#define PP_SPACE  11
+#define PP_UPPER  12
+#define PP_XDIGIT 13
+/* Zsh additions:  [:IDENT:] etc. */
+#define PP_IDENT  14
+#define PP_IFS    15
+#define PP_IFSSPACE   16
+#define PP_WORD   17
+/* Special value for last definition */
+#define PP_LAST   17
+
+/* Unknown type.  Not used in a valid token. */
+#define PP_UNKWN  18
+/* Range: token followed by the (possibly multibyte) start and end */
+#define PP_RANGE  19
+
 /* Globbing flags: lower 8 bits gives approx count */
 #define GF_LCMATCHUC	0x0100
 #define GF_IGNCASE	0x0200
Index: Src/Zle/comp.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/comp.h,v
retrieving revision 1.17
diff -u -r1.17 comp.h
--- Src/Zle/comp.h	6 May 2008 16:01:19 -0000	1.17
+++ Src/Zle/comp.h	7 Jun 2008 20:19:38 -0000
@@ -162,12 +162,49 @@
 #define CMF_RIGHT 4
 #define CMF_INTER 8
 
+/*
+ * Types of cpattern structure.
+ * Note freecpattern() assumes any <= CPAT_EQUIV have string.
+ */
+enum {
+    CPAT_CCLASS,		/* [...]: ordinary character class */
+    CPAT_NCLASS,		/* [!...]: ordinary character class, negated */
+    CPAT_EQUIV,			/* {...}: equivalence class */
+    CPAT_ANY,			/* ?: any character */
+    CPAT_CHAR			/* Single character given explicitly */
+};
+
+/*
+ * A pattern element in a matcher specification.
+ * Unlike normal patterns this only presents one character in
+ * either the test completion or the word on the command line.
+ */
 struct cpattern {
     Cpattern next;		/* next sub-pattern */
-    unsigned char tab[256];	/* table of matched characters */
-    int equiv;			/* if this is a {...} class */
+    int tp;			/* type of object as above */
+    union {
+	char *str;		/* if a character class, the objects
+				 * in it in a similar form to normal
+				 * pattern matching (a metafied string
+				 * with tokens).
+				 * Note the allocated length may be longer
+				 * than the null-terminated string.
+				 */
+	int chr;		/* if a single character, it
+				 * TODO: eventually should be a
+				 * convchar_t.
+				 */
+    } u;
 };
 
+/*
+ * For now this just handles single-byte characters.
+ * TODO: this will change.
+ */
+#define PATMATCHRANGE(r, c, ip, mtp)	patmatchrange(r, c, ip, mtp)
+#define PATMATCHINDEX(r, i, cp, mtp)	patmatchindex(r, i, cp, mtp)
+#define CONVCAST(c)	(c)
+
 /* This is a special return value for parse_cmatcher(), *
  * signalling an error. */
 
Index: Src/Zle/complete.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/complete.c,v
retrieving revision 1.39
diff -u -r1.39 complete.c
--- Src/Zle/complete.c	6 Jul 2007 21:52:40 -0000	1.39
+++ Src/Zle/complete.c	7 Jun 2008 20:19:38 -0000
@@ -122,13 +122,15 @@
 
     while (p) {
 	n = p->next;
+	if (p->tp <= CPAT_EQUIV)
+	    free(p->u.str);
 	zfree(p, sizeof(struct cpattern));
 
 	p = n;
     }
 }
 
-/* Copy a completion matcher list. */
+/* Copy a completion matcher list into permanent storage. */
 
 /**/
 mod_export Cmatcher
@@ -157,22 +159,51 @@
     return r;
 }
 
+/*
+ * Copy a single entry in a matcher pattern.
+ * If useheap is 1, it comes from the heap.
+ */
+
+/**/
+mod_export Cpattern
+cp_cpattern_element(Cpattern o)
+{
+    Cpattern n = zalloc(sizeof(struct cpattern));
+
+    n->next = NULL;
+
+    n->tp = o->tp;
+    switch (o->tp)
+    {
+    case CPAT_CCLASS:
+    case CPAT_NCLASS:
+    case CPAT_EQUIV:
+	n->u.str = ztrdup(o->u.str);
+	break;
+
+    case CPAT_CHAR:
+	n->u.chr = o->u.chr;
+	break;
+
+    default:
+	/* just to keep compiler quiet */
+	break;
+    }
+
+    return n;
+}
+
 /* Copy a completion matcher pattern. */
 
 /**/
 static Cpattern
 cpcpattern(Cpattern o)
 {
-    Cpattern r = NULL, *p = &r, n;
+    Cpattern r = NULL, *p = &r;
 
     while (o) {
-	*p = n = (Cpattern) zalloc(sizeof(struct cpattern));
-
-	n->next = NULL;
-	memcpy(n->tab, o->tab, 256);
-	n->equiv = o->equiv;
-
-	p = &(n->next);
+	*p = cp_cpattern_element(o);
+	p = &((*p)->next);
 	o = o->next;
     }
     return r;
@@ -331,14 +362,26 @@
     return ret;
 }
 
-/* Parse a pattern for matcher control. */
+/*
+ * Parse a pattern for matcher control. 
+ * name is the name of the builtin from which this is called, for errors.
+ * *sp is the input string and will be updated to the end of the parsed
+ *   pattern.
+ * *lp will be set to the number of characters (possibly multibyte)
+ *   that the pattern will match.  This must be deterministic, given
+ *   the syntax allowed here.
+ * e, if non-zero, is the ASCII end character to match; if zero,
+ *   stop on a blank.
+ * *err is set to 1 to indicate an error, else to 0.
+ */
 
 /**/
 static Cpattern
 parse_pattern(char *name, char **sp, int *lp, char e, int *err)
 {
     Cpattern ret = NULL, r = NULL, n;
-    unsigned char *s = (unsigned char *) *sp;
+    char *s = *sp;
+    int inchar;
     int l = 0;
 
     *err = 0;
@@ -346,25 +389,18 @@
     while (*s && (e ? (*s != e) : !inblank(*s))) {
 	n = (Cpattern) hcalloc(sizeof(*n));
 	n->next = NULL;
-	n->equiv = 0;
 
-	if (*s == '[') {
-	    s = parse_class(n, s + 1, ']');
-	    if (!*s) {
-		*err = 1;
-		zwarnnam(name, "unterminated character class");
-		return NULL;
-	    }
-	} else if (*s == '{') {
-	    n->equiv = 1;
-	    s = parse_class(n, s + 1, '}');
+	if (*s == '[' || *s == '{') {
+	    s = parse_class(n, s);
 	    if (!*s) {
 		*err = 1;
 		zwarnnam(name, "unterminated character class");
 		return NULL;
 	    }
+	    s++;
 	} else if (*s == '?') {
-	    memset(n->tab, 1, 256);
+	    n->tp = CPAT_ANY;
+	    s++;
 	} else if (*s == '*' || *s == '(' || *s == ')' || *s == '=') {
 	    *err = 1;
 	    zwarnnam(name, "invalid pattern character `%c'", *s);
@@ -373,8 +409,13 @@
 	    if (*s == '\\' && s[1])
 		s++;
 
-	    memset(n->tab, 0, 256);
-	    n->tab[*s] = 1;
+	    if (*s == Meta)
+		inchar = STOUC(*++s) ^ 32;
+	    else
+		inchar = STOUC(*s);
+	    s++;
+	    n->tp = CPAT_CHAR;
+	    n->u.chr = inchar;
 	}
 	if (ret)
 	    r->next = n;
@@ -384,7 +425,6 @@
 	r = n;
 
 	l++;
-	s++;
     }
     *sp = (char *) s;
     *lp = l;
@@ -394,28 +434,86 @@
 /* Parse a character class for matcher control. */
 
 /**/
-static unsigned char *
-parse_class(Cpattern p, unsigned char *s, unsigned char e)
+static char *
+parse_class(Cpattern p, char *iptr)
 {
-    int n = 0, i = 1, j, eq = (e == '}'), k = 1;
-
-    if (!eq && (*s == '!' || *s == '^') && s[1] != e) { n = 1; s++; }
-
-    memset(p->tab, n, 256);
-
-    n = !n;
-    while (*s && (k || *s != e)) {
-	if (s[1] == '-' && s[2] && s[2] != e) {
-	    /* a run of characters */
-	    for (j = (int) *s; j <= (int) s[2]; j++)
-		p->tab[j] = (eq ? i++ : n);
+    int endchar, firsttime = 1;
+    char *optr, *nptr;
 
-	    s += 3;
+    if (*iptr++ == '[') {
+	endchar = ']';
+	/* TODO: surely [^]] is valid? */
+	if ((*iptr == '!' || *iptr == '^') && iptr[1] != ']') {
+	    p->tp = CPAT_NCLASS;
+	    iptr++;
 	} else
-	    p->tab[*s++] = (eq ? i++ : n);
-	k = 0;
+	    p->tp = CPAT_CCLASS;
+    } else {
+	endchar = '}';
+	p->tp = CPAT_EQUIV;
+    }
+
+    /* find end of class.  End character can appear literally first. */
+    for (optr = iptr; optr == iptr || *optr != endchar; optr++)
+	if (!*optr)
+	    return optr;
+    /*
+     * We can always fit the parsed class within the same length
+     * because of the tokenization (including a null byte).
+     *
+     * As the input string is metafied, but shouldn't contain shell
+     * tokens, we can just add our own tokens willy nilly.
+     */
+    optr = p->u.str = zalloc((optr-iptr) + 1);
+
+    while (firsttime || *iptr != endchar) {
+	int ch;
+
+	if (*iptr == '[' && iptr[1] == ':' &&
+	    (nptr = strchr((char *)iptr + 2, ':')) && nptr[1] == ']') {
+	    /* Range type */
+	    iptr += 2;
+	    ch = range_type((char *)iptr, nptr-iptr);
+	    iptr = nptr + 2;
+	    if (ch != PP_UNKWN)
+		*optr++ = STOUC(Meta) + ch;
+	} else {
+	    /* characters stay metafied */
+	    char *ptr1 = iptr;
+	    if (*iptr == Meta)
+		iptr++;
+	    iptr++;
+	    if (*iptr == '-' && iptr[1] && iptr[1] != endchar) {
+		/* a run of characters */
+		iptr++;
+		/* range token */
+		*optr++ = Meta + PP_RANGE;
+
+		/* start of range character */
+		if (*ptr1 == Meta) {
+		    *optr++ = Meta;
+		    *optr++ = ptr1[1] ^ 32;
+		} else
+		    *optr++ = *ptr1;
+
+		if (*iptr == Meta) {
+		    *optr++ = *iptr++;
+		    *optr++ = *iptr++;
+		} else
+		    *optr++ = *iptr++;
+	    } else {
+		if (*ptr1 == Meta) {
+		    *optr++ = Meta;
+		    *optr++ = ptr1[1] ^ 32;
+		} else
+		    *optr++ = *ptr1;
+	    }
+	}
+	firsttime = 0;
     }
-    return s;
+
+    *optr = '\0';
+    return iptr;
 }
 
 /**/
Index: Src/Zle/compmatch.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/compmatch.c,v
retrieving revision 1.53
diff -u -r1.53 compmatch.c
--- Src/Zle/compmatch.c	18 Jan 2008 16:41:36 -0000	1.53
+++ Src/Zle/compmatch.c	7 Jun 2008 20:19:39 -0000
@@ -30,37 +30,68 @@
 #include "complete.mdh"
 #include "compmatch.pro"
 
-/* This compares two cpattern lists and returns non-zero if they are
- * equal. */
+/*
+ * This compares two cpattern lists and returns non-zero if they are
+ * equal (N.B. opposite sense to usual *cmp()).
+ *
+ * The old version of this didn't worry about whether the lists
+ * were the same length.  This one does.  It's hard to see how
+ * that can be wrong even if it's unnecessary.
+ */
 
 /**/
 static int
-cmp_cpatterns(Cpattern a, Cpattern b)
+cpatterns_same(Cpattern a, Cpattern b)
 {
     while (a) {
-	if (a->equiv != b->equiv || memcmp(a->tab, b->tab, 256))
+	if (!b)
 	    return 0;
+	if (a->tp != b->tp)
+	    return 0;
+	switch (a->tp) {
+	case CPAT_CCLASS:
+	case CPAT_NCLASS:
+	case CPAT_EQUIV:
+	    /*
+	     * Patterns can actually match the same even if
+	     * the range strings don't compare differently, but
+	     * I don't think we need to handle that subtlety.
+	     */
+	    if (strcmp(a->u.str, b->u.str) != 0)
+		return 0;
+	    break;
+
+	case CPAT_CHAR:
+	    if (a->u.chr != b->u.chr)
+		return 0;
+	    break;
+
+	default:
+	    /* here to silence compiler */
+	    break;
+	}
+
 	a = a->next;
 	b = b->next;
     }
-    return 1;
+    return !b;
 }
 
 /* This compares two cmatchers and returns non-zero if they are equal. */
 
 /**/
 static int
-cmp_cmatchers(Cmatcher a, Cmatcher b)
+cmatchers_same(Cmatcher a, Cmatcher b)
 {
     return (a == b ||
 	    (a->flags == b->flags &&
 	     a->llen == b->llen && a->wlen == b->wlen &&
-	     (!a->llen || cmp_cpatterns(a->line, b->line)) &&
-	     (a->wlen <= 0 || cmp_cpatterns(a->word, b->word)) &&
+	     (!a->llen || cpatterns_same(a->line, b->line)) &&
+	     (a->wlen <= 0 || cpatterns_same(a->word, b->word)) &&
 	     (!(a->flags & (CMF_LEFT | CMF_RIGHT)) ||
 	      (a->lalen == b->lalen && a->ralen == b->ralen &&
-	       (!a->lalen || cmp_cpatterns(a->left, b->left)) &&
-	       (!a->ralen || cmp_cpatterns(a->right, b->right))))));
+	       (!a->lalen || cpatterns_same(a->left, b->left)) &&
+	       (!a->ralen || cpatterns_same(a->right, b->right))))));
 }
 
 /* Add the given matchers to the bmatcher list. */
@@ -97,7 +128,7 @@
 	t = 0;
 	for (ms = mstack; ms && !t; ms = ms->next)
 	    for (mp = ms->matcher; mp && !t; mp = mp->next)
-		t = cmp_cmatchers(mp, p->matcher);
+		t = cmatchers_same(mp, p->matcher);
 
 	p = p->next;
 	if (!t) {
@@ -449,7 +480,7 @@
     }
 }
 
-/* This tests if the string from the line l matches the word w. In bp
+/* This tests if the string from the line l matches the word w. In *bpp
  * the offset for the brace is returned, in rwlp the length of the
  * matched prefix or suffix, not including the stuff before or after
  * the last anchor is given. When sfx is non-zero matching is done from
@@ -1113,55 +1144,330 @@
     return r;
 }
 
-/* Check if the given pattern matches the given string.             *
+
+/*
+ * Guts of a single pattern for pattern_match().
+ * Return non-zero if match successful.
+ * If the class was an equivalence, return 1 + the index into
+ * the equivalence class (see pattern.c for how this is calculated).
+ */
+
+/**/
+mod_export int
+pattern_match1(Cpattern p, int c, int *mtp)
+{
+    /* TODO: should become convchar_t */
+    int ind;
+
+    *mtp = 0;
+    switch (p->tp) {
+    case CPAT_CCLASS:
+	return PATMATCHRANGE(p->u.str, CONVCAST(c), NULL, NULL);
+
+    case CPAT_NCLASS:
+	return !PATMATCHRANGE(p->u.str, CONVCAST(c), NULL, NULL);
+
+    case CPAT_EQUIV:
+	if (PATMATCHRANGE(p->u.str, CONVCAST(c), &ind, mtp))
+	    return ind + 1;
+	else
+	    return 0;
+
+    case CPAT_ANY:
+	return 1;
+
+    case CPAT_CHAR:
+	return (p->u.chr == c);
+
+    default:
+	DPUTS(1, "bad matcher pattern type");
+	return 0;
+    }
+}
+
+
+/*
+ * Use an equivalence to deduce the line character from the word, or
+ * vice versa.  (If vice versa, then "line" and "word" are reversed
+ * in what follows.  The logic is symmetric.)
+ * lp is the line pattern.
+ * wind is the index returned by a pattern match on the word pattern,
+ * with type wmtp.
+ * wchr is the word character.
+ * Return -1 if no matching character, else the character.
+ *
+ * Only makes sense if lp->tp == CPAT_EQUIV and the (unseen) word
+ * pattern also has that type.
+ */
+static int
+pattern_match_equivalence(Cpattern lp, int wind, int wmtp, int wchr)
+{
+    int lchr, lmtp;
+
+    if (!PATMATCHINDEX(lp->u.str, wind-1, &lchr, &lmtp)) {
+	/*
+	 * No equivalent.  No possible match; give up.
+	 */
+	return -1;
+    }
+    /*
+     * If we matched an exact character rather than a range
+     * type, return it.
+     */
+    if (lchr != -1)
+	return lchr;
+
+    /*
+     * Check the match types.  We may want a case-changed
+     * version of the word character.
+     */
+    if (wmtp == PP_UPPER && lmtp == PP_LOWER)
+	return tulower(wchr);
+    else if (wmtp == PP_LOWER && lmtp == PP_UPPER)
+	return tuupper(wchr);
+    else if (wmtp == lmtp) {
+	/*
+	 * Be lenient and allow identical replacements
+	 * for character classes, although in fact this
+	 * doesn't give special functionality for equivalence
+	 * classes.
+	 */
+	return wchr;
+    } else {
+	/*
+	 * Non-matching generic types; this can't work.
+	 */
+	return -1;
+    }
+}
+
+/*
+ * Check if the given pattern matches the given string.
  *  p and  s are either anchor or line pattern and string;
  * wp and ws are word (candidate) pattern and string
  *
- * If only one pattern is given, we just check if characters match
+ * If only one pattern is given, we just check if characters match.
  * If both line and word are given, we check that characters match
- * for {...} classes by comparing relative numbers in sequence.
+ * for {...} classes by comparing positions in the strings.
  *
  * Patterns and strings are always passed in pairs, so it is enough
  * to check for non-NULL wp. p should always be present.
+ *
+ * If prestrict is not NULL, it is a chain of patterns at least as long
+ * as the line string.  In this case we are still assembling the line at
+ * s (which has been allocated but doesn't yet contain anything useful)
+ * and must continue to do so as we go along; prestrict gives
+ * restrictions on the line character to be applied along side the other
+ * patterns.  In the simple case a restriction is a character to be put
+ * in place; otherwise it is a set of possible characters and we have to
+ * deduce an actual matching character.  Note prestrict is never an
+ * equivalence class.  In extreme cases we can't deduce a unique
+ * character; then the match fails.
  */
 
 /**/
 mod_export int
-pattern_match(Cpattern p, char *s, Cpattern wp, char *ws)
+pattern_match_restrict(Cpattern p, char *s, Cpattern wp, char *ws,
+		       Cpattern prestrict)
 {
-    unsigned char c;
-    unsigned char wc;
+    int c, ind;
+    int wc, wind;
+    int len, wlen, mt, wmt;
 
     while (p && wp && *s && *ws) {
-	c = p->tab[*((unsigned char *) s)];
-	wc = wp->tab[*((unsigned char *) ws)];
-
-	if (!c || !wc || c != wc)
+	/* First test the word character */
+	if (*ws == Meta) {
+	    wc = STOUC(ws[1]) ^ 32;
+	    wlen = 2;
+	} else {
+	    wc = STOUC(*ws);
+	    wlen = 1;
+	}
+	wind = pattern_match1(wp, wc, &wmt);
+	if (!wind)
 	    return 0;
 
-	s++;
-	ws++;
+	/*
+	 * Now the line character; deal with the case where
+	 * we don't yet have it, only a restriction on it.
+	 */
+	if (prestrict) {
+	    if (prestrict->tp == CPAT_CHAR) {
+		/*
+		 * Easy case: restricted to an exact character on
+		 * the line.  Procede as normal.
+		 */
+		c = prestrict->u.chr;
+	    } else {
+		if (p->tp == CPAT_CHAR) {
+		    /*
+		     * Normal line pattern is an exact character:  as
+		     * long as this matches prestrict, we can proceed
+		     * as usual.
+		     */
+		    c = p->u.chr;
+		} else if (p->tp == CPAT_EQUIV) {
+		    /*
+		     * An equivalence, so we can deduce the character
+		     * backwards from the word pattern and see if it
+		     * matches prestrict.
+		     */
+		    if ((c = pattern_match_equivalence(p, wind, wmt, wc)) == -1)
+			return 0;
+		} else {
+		    /*
+		     * Not an equivalence, so that means we must match
+		     * the word (not just the word pattern), so grab it
+		     * and make sure it fulfills our needs.  I think.
+		     * Not 100% sure about that, but what else can
+		     * we do?  We haven't actually been passed a string
+		     * from the command line.
+		     */
+		    c = wc;
+		}
+		/* Character so deduced must match the restriction. */
+		if (!pattern_match1(prestrict, c, &mt))
+		    return 0;
+	    }
+	    len = imeta(c) ? 2 : 1;
+	} else {
+	    /* We have the character itself. */
+	    if (*s == Meta) {
+		c = STOUC(s[1]) ^ 32;
+		len = 2;
+	    } else {
+		c = STOUC(*s);
+		len = 1;
+	    }
+	}
+	/*
+	 * If either is "?", they match each other; no further tests.
+	 * Apply this even if the character wasn't convertable;
+	 * there's no point trying to be clever in that case.
+	 */
+	if (p->tp != CPAT_ANY || wp->tp != CPAT_ANY)
+	{
+	    ind = pattern_match1(p, c, &mt);
+	    if (!ind)
+		return 0;
+	    if (ind != wind)
+		return 0;
+	    if (mt != wmt) {
+		/*
+		 * Special case if matching lower vs. upper or
+		 * vice versa.  The transformed characters must match.
+		 * We don't need to check the transformation is
+		 * the appropriate one for each character separately,
+		 * since that was done in pattern_match1(), so just
+		 * compare lower-cased versions of both.
+		 */
+		if ((mt == PP_LOWER || mt == PP_UPPER) &&
+		    (wmt == PP_LOWER || wmt == PP_UPPER)) {
+		    if (tulower(c) != tulower(wc))
+			return 0;
+		} else {
+		    /* Other different classes can't match. */
+		    return 0;
+		}
+	    }
+	}
+
+	if (prestrict) {
+	    /* We need to assemble the line */
+	    if (imeta(c)) {
+		*s++ = Meta;
+		*s++ = c ^ 32;
+	    } else {
+		*s++ = c;
+	    }
+	    prestrict = prestrict->next;
+	} else
+	    s += len;
+	ws += wlen;
 	p = p->next;
 	wp = wp->next;
     }
 
     while (p && *s) {
-	if (!p->tab[*((unsigned char *) s)])
+	if (prestrict) {
+	    /*
+	     * As above, but with even less info to go on.
+	     * (Can this happen?)  At least handle the cases where
+	     * one of our patterns has given us a specific character.
+	     */
+	    if (prestrict->tp == CPAT_CHAR) {
+		c = prestrict->u.chr;
+	    } else {
+		if (p->tp == CPAT_CHAR) {
+		    c = p->u.chr;
+		} else {
+		    /*
+		     * OK.  Here we are in a function with just a line
+		     * pattern and another pattern to restrict the
+		     * characters that can go on the line, and no actual
+		     * characters.  We're matching two patterns against
+		     * one another to generate a character to insert.
+		     * This is a bit too psychedelic, so I'm going to
+		     * bale out now.  See you on the ground.
+		     */
+		    return 0;
+		}
+		if (!pattern_match1(prestrict, c, &mt))
+		    return 0;
+	    }
+	} else {
+	    if (*s == Meta) {
+		c = STOUC(s[1]) ^ 32;
+		len = 2;
+	    } else {
+		c = STOUC(*s);
+		len = 1;
+	    }
+	}
+	if (!pattern_match1(p, c, &mt))
 	    return 0;
 	p = p->next;
-	s++;
+	if (prestrict) {
+	    if (imeta(c)) {
+		*s++ = Meta;
+		*s++ = c ^ 32;
+	    } else {
+		*s++ = c;
+	    }
+	    prestrict = prestrict->next;
+	} else
+	    s += len;
     }
 
     while (wp && *ws) {
-	if (!wp->tab[*((unsigned char *) ws)])
+	/* No funny business when we only have the word pattern. */
+	if (*ws == Meta) {
+	    wc = STOUC(ws[1]) ^ 32;
+	    wlen = 2;
+	} else {
+	    wc = STOUC(*ws);
+	    wlen = 1;
+	}
+	if (!pattern_match1(wp, wc, &wmt))
 	    return 0;
 	wp = wp->next;
-	ws++;
+	ws += wlen;
     }
 
     return 1;
 }
 
+/*
+ * The usual version of pattern matching, without the line string
+ * being handled by restriction.
+ */
+/**/
+mod_export int
+pattern_match(Cpattern p, char *s, Cpattern wp, char *ws)
+{
+    return pattern_match_restrict(p, s, wp, ws, NULL);
+}
+
 /* This splits the given string into a list of cline structs, separated
  * at those places where one of the anchors of an `*' pattern was found.
  * plen gives the number of characters on the line that matched this
@@ -1256,11 +1562,11 @@
     return ret;
 }
 
-/* This builds all the possible line patterns for the pattern pat in the
- * buffer line. Initially line is the same as lp, but during recursive
- * calls lp is incremented for storing successive characters. Whenever
- * a full possible string is build, we test if this line matches the
- * string given by wlen and word.
+
+/*
+ * This builds all the possible line patterns for the pattern pat in the
+ * buffer line.  Then we test if this line matches the string given by
+ * wlen and word.
  *
  * wpat contains pattern that matched previously
  * lpat contains the pattern for line we build
@@ -1269,91 +1575,297 @@
  *
  * The return value is the length of the string matched in the word, it
  * is zero if we couldn't build a line that matches the word.
+ *
+ * TODO: a lot of the nastiness associated with variable string
+ * lengths can go when we switch to wide characters.  (Why didn't
+ * I just keep line unmetafied and metafy into place at the end?  Er...)
  */
 
-
 /**/
 static int
-bld_line(Cpattern wpat, Cpattern lpat, char *line, char *lp,
-	 char *mword, char *word, int wlen, int sfx)
+bld_line(Cmatcher mp, char **linep, char *mword, char *word, int wlen, int sfx)
 {
-    if (lpat) {
-	/* Still working on the pattern. */
-
-	int i, l;
-	unsigned char c = 0;
+    Cpattern lpat = mp->line;
+    Cpattern wpat = mp->word;
+    Cpattern curgenpat;
+    VARARR(struct cpattern, genpatarr, mp->llen);
+    Cmlist ms;
+    int llen, rl;
+    char *oword = word, *line = *linep;
 
-	/* Get the number of the character for a correspondence class
-	 * if it has a corresponding class. */
-	if (lpat->equiv)
-	    if (wpat && *mword) {
-		c = wpat->tab[STOUC(*mword)];
-		wpat = wpat->next;
-		mword++;
+    /*
+     * Loop over all characters.  At this stage, line is an empty
+     * space of length llen (not counting the null byte) which we assemble as
+     * we go along.
+     *
+     * However, first we need to know what characters can appear at each
+     * point in the line.  For this we assemble an list genpatarr of the
+     * same length as the line.  (It's convenient to store this as an
+     * array but it's linked as a list, too.)  If there are equivalences
+     * we use mword to derive the equivalent character; when we've
+     * reached the end of mword, equivalences are treated just like
+     * ordinary character classes.  For character classes we just attach
+     * the class to the genpatarr list and apply it as a restriction
+     * when we finally match the line against the set of matchers.
+     */
+    curgenpat = genpatarr;
+    while (lpat) {
+	int wchr = (*mword == Meta) ? STOUC(mword[1]) ^ 32 : STOUC(*mword);
+	int wmtp, wind;
+	/*
+	 * If the line pattern is an equivalence, query wpat to find the
+	 * word part of the equivalence.  If we don't find one we don't try
+	 * equivalencing but use lpat as an ordinary match.  (It's not
+	 * entirely clear to me this is the correct behaviour on a
+	 * failed character match within the equivalence, but that was
+	 * the behaviour of the old logic that this replaces.)
+	 */
+	if (lpat->tp == CPAT_EQUIV && wpat && *mword) {
+	    wind = pattern_match1(wpat, wchr, &wmtp);
+	    wpat = wpat->next;
+	    mword += (*mword == Meta) ? 2 : 1;
+	} else
+	    wind = 0;
+	if (wind) {
+	    /*
+	     * Successful match for word side of equivalence.
+	     * Find the line equivalent.
+	     */
+	    int lchr;
+	    if ((lchr = pattern_match_equivalence(lpat, wind, wmtp, wchr))
+		== -1) {
+		/*
+		 * No equivalent.  No possible match; give up.
+		 */
+		return 0;
 	    }
+	    /*
+	     * We now have an exact character to match,
+	     * so make up a pattern element for it.
+	     */
+	    curgenpat->tp = CPAT_CHAR;
+	    curgenpat->u.chr = lchr;
+	} else {
+	    /*
+	     * Not an equivalence class, so we just keep the
+	     * test in the lpat as it is.
+	     */
+	    curgenpat->tp = lpat->tp;
+	    if (lpat->tp == CPAT_CHAR)
+		curgenpat->u.chr = lpat->u.chr;
+	    else if (lpat->tp != CPAT_ANY) {
+		/*
+		 * The string isn't modified and is only needed in calls from
+		 * this function, so we don't even need to copy it.
+		 */
+		curgenpat->u.str = lpat->u.str;
+	    }
+	}
+	lpat = lpat->next;
+	/*
+	 * This linked list is defined above as an array.
+	 * We could get away with just keeping it as an array
+	 * and passing it down as such, but that's a bit icky
+	 * since the generic linkage of Cpatterns is as a linked
+	 * list and we should keep our local memory management
+	 * problems to ourselvess.
+	 */
+	if (lpat)
+	    curgenpat->next = curgenpat+1;
+	else
+	    curgenpat->next = NULL;
+	curgenpat++;
+    }
 
+    /*
+     * We now know how to match the word with the line patterns; let's
+     * see if it does.  We will use the information in curgenpat if we
+     * are successful to work out what character goes on the line.  This
+     * is a bit hairy, as in "the Yeti is a creature that is a bit
+     * hairy".
+     */
+    llen = mp->llen;
+    rl = 0;
 
-	/* Walk through the table in the pattern and try the characters
-	 * that may appear in the current position. */
-	for (i = 0; i < 256; i++)
-	    if ((lpat->equiv && c) ? (c == lpat->tab[i]) : lpat->tab[i]) {
-		*lp = i;
-		/* We stored the character, now call ourselves to build
-		 * the rest. */
-		if ((l = bld_line(wpat, lpat->next, line, lp + 1,
-				  mword, word, wlen, sfx)))
-		    return l;
-	    }
-    } else {
-	/* We reached the end, i.e. the line string is fully build, now
-	 * see if it matches the given word. */
+    if (sfx)
+    {
+	/*
+	 * We need to work backwards from the end of both the
+	 * word and the line strings.
+	 *
+	 * Position at the end of the word by counting characters.
+	 */
+	int l = wlen;
+	while (l--)
+	    word += (*word == Meta) ? 2 : 1;
 
-	Cmlist ms;
-	Cmatcher mp;
-	int l = lp - line, t, rl = 0, ind, add;
+	/*
+	 * We construct the line from the end.  We've left
+	 * enough space for possible Meta's.
+	 */
+	line += 2 * llen;
+	*line = '\0';
+	curgenpat = genpatarr + llen;
+    } else
+	curgenpat = genpatarr;
 
-	/* Quick test if the strings are exactly the same. */
-	if (l == wlen && !strncmp(line, word, l))
-	    return l;
+    /* we now reuse mp, lpat, wpat for the global matchers */
+    while (llen && wlen) {
+	int wchr, wmtp;
+	char *wp;
+	Cpattern tmpgenpat;
 
 	if (sfx) {
-	    line = lp; word += wlen;
-	    ind = -1; add = -1;
-	} else {
-	    ind = 0; add = 1;
-	}
-	/* We loop through the whole line string built. */
-	while (l && wlen) {
-	    if (word[ind] == line[ind]) {
-		/* The same character in both strings, skip over. */
-		line += add; word += add;
-		l--; wlen--; rl++;
+	    if (word > oword + 1 && word[-2] == Meta)
+		wp = word - 2;
+	    else
+		wp = word - 1;
+	    curgenpat--;
+	} else
+	    wp = word;
+	if (*wp == Meta)
+	    wchr = STOUC(wp[1]) ^ 32;
+	else
+	    wchr = STOUC(*wp);
+	if (pattern_match1(curgenpat, wchr, &wmtp))
+	{
+	    int lchr;
+	    /*
+	     * We can match the line character directly with the word
+	     * character.  If the line character is a fixed one,
+	     * keep it, since we went to all that trouble above,
+	     * else if it's generic, keep the word character,
+	     * since we have no choice.
+	     */
+	    if (curgenpat->tp == CPAT_CHAR)
+		lchr = curgenpat->u.chr;
+	    else
+		lchr = wchr;
+	    if (imeta(lchr)) {
+		if (sfx)
+		    line -= 2;
+		line[0] = Meta;
+		line[1] = lchr ^ 32;
+		if (!sfx)
+		    line += 2;
 	    } else {
-		t = 0;
-		for (ms = bmatchers; ms && !t; ms = ms->next) {
-		    mp = ms->matcher;
-		    if (mp && !mp->flags && mp->wlen <= wlen && mp->llen <= l &&
-			pattern_match(mp->line, (sfx ? line - mp->llen : line),
-				      mp->word, (sfx ? word - mp->wlen : word))) {
-			/* Both the line and the word pattern matched,
-			 * now skip over the matched portions. */
+		if (sfx)
+		    line--;
+		line[0] = lchr;
+		if (!sfx)
+		    line++;
+	    }
+
+	    llen--;
+	    wlen--;
+	    rl++;
+
+	    if (sfx)
+		word = wp;
+	    else {
+		if (llen)
+		    curgenpat++;
+		word += (*word == Meta) ? 2 : 1;
+	    }
+	}
+	else
+	{
+	    char *lp;
+	    /*
+	     * Need to loop over pattern matchers.
+	     */
+	    for (ms = bmatchers; ms; ms = ms->next) {
+		mp = ms->matcher;
+		/*
+		 * This is the nightmare case: we have line and
+		 * and word matchers and some pattern which restricts
+		 * the value on the line without us knowing exactly
+		 * what it is.  Despatch to the special function
+		 * for that.
+		 */
+		if (mp && !mp->flags && mp->wlen <= wlen &&
+		    mp->llen <= llen)
+		{
+		    if (sfx) {
+			/*
+			 * We haven't assembled the line yet, and with
+			 * Meta characters we don't yet know the length.
+			 * We'll fix this up later.
+			 */
+			lp = line - 2 * mp->llen;
+		    } else
+			lp = line;
+		    wp = word;
+		    if (sfx) {
+			int l = mp->wlen;
+			while (l--) {
+			    if (wp > oword + 1 && wp[-2] == Meta)
+				wp -= 2;
+			    else
+				wp--;
+			}
+
+			tmpgenpat = curgenpat - mp->llen;
+		    } else
+			tmpgenpat = curgenpat;
+		    if (pattern_match_restrict(mp->line, lp,
+					       mp->word, wp, tmpgenpat)) {
+			/*
+			 * Matched: advance over as many characters
+			 * of the patterns and strings as
+			 * we've done matches.
+			 */
 			if (sfx) {
-			    line -= mp->llen; word -= mp->wlen;
+			    int imove = mp->llen, nchar;
+			    char *pmove = lp;
+			    word = wp;
+			    
+			    /* Close the gap we left in the line string */
+			    while (imove--)
+				pmove += (*pmove == Meta) ? 2 : 1;
+			    /* Number of bytes to move */
+			    nchar = (int)(pmove - lp);
+			    /* The size of the gap */
+			    imove = 2 * mp->llen - nchar;
+			    if (imove) {
+				lp = line - imove;
+				/* Moving up, so start at the top */
+				while (nchar--)
+				    *--line = *--lp;
+				/* line is at the start of the moved text */
+			    }
+
+			    curgenpat = tmpgenpat;
 			} else {
-			    line += mp->llen; word += mp->wlen;
+			    int cnt = mp->llen;
+			    while (cnt--) {
+				line += (*line == Meta) ? 2 : 1;
+			    }
+
+			    cnt = mp->wlen;
+			    while (cnt--)
+				word += (*word == Meta) ? 2 : 1;
+
+			    curgenpat += mp->llen;
 			}
-			l -= mp->llen; wlen -= mp->wlen; rl += mp->wlen;
-			t = 1;
+			llen -= mp->llen;
+			wlen -= mp->wlen;
+			rl += mp->wlen;
+			break;
 		    }
 		}
-		if (!t)
-		    /* Didn't match, give up. */
-		    return 0;
 	    }
+	    if (!ms)
+		return 0;	/* Didn't match, give up */
 	}
-	if (!l)
-	    /* Unmatched portion in the line built, return matched length. */
-	    return rl;
+    }
+    if (!llen) {
+	/* Unmatched portion in the line built, return matched length. */
+	if (sfx)
+	    *linep = line;
+	else
+	    *line = '\0';
+	return rl;
     }
     return 0;
 }
@@ -1386,8 +1898,10 @@
 		    if ((t = pattern_match(mp->word, sa, NULL, NULL)) ||
 			pattern_match(mp->word, sb, NULL, NULL)) {
 			/* It matched one of the strings, t says which one. */
-			VARARR(char, line, mp->llen + 1);
-			char **ap, **bp;
+			/* TODO: double to allow Meta, not necessary
+			   when properly unmetafied */
+			VARARR(char, linearr, 2*mp->llen + 1);
+			char **ap, **bp, *line = linearr;
 			int *alp, *blp;
 
 			if (t) {
@@ -1399,10 +1913,8 @@
 			}
 			/* Now try to build a string that matches the other
 			 * string. */
-			if ((bl = bld_line(mp->word, mp->line, line, line,
-					   *ap, *bp, *blp, 0))) {
+			if ((bl = bld_line(mp, &line, *ap, *bp, *blp, 0))) {
 			    /* Found one, put it into the return string. */
-			    line[mp->llen] = '\0';
 			    if (rr <= mp->llen) {
 				char *or = rs;
 
@@ -1444,7 +1956,11 @@
     return rs;
 }
 
-/* This compares the anchors stored in two top-level clines. */
+/*
+ * This compares the anchors stored in two top-level clines.
+ * It returns 1 if the anchors are the same, 2 if they are
+ * compatible (and have been combined in "o"), 0 otherwise.
+ */
 
 /**/
 static int
@@ -1591,9 +2107,11 @@
 				       NULL, NULL)) ||
 		     pattern_match(mp->word, nw - (sfx ? mp->wlen : 0),
 				   NULL, NULL))) {
-		    VARARR(char, line, mp->llen + 1);
+		    /* TODO: doubled to allow Meta, not necessary
+		     * when properly unmetafied */
+		    VARARR(char, linearr, 2*mp->llen + 1);
 		    int bl;
-		    char *mw;
+		    char *mw, *line = linearr;
 
 		    /* Then build all the possible lines and see
 		     * if one of them matches the other string. */
@@ -1602,11 +2120,10 @@
 		    else
 			mw = nw - (sfx ? mp->wlen : 0);
 
-		    if ((bl = bld_line(mp->word, mp->line, line, line,
-				       mw, (t ? nw : ow), (t ? nl : ol), sfx)))  {
+		    if ((bl = bld_line(mp, &line, mw, (t ? nw : ow),
+				       (t ? nl : ol), sfx)))  {
 			/* Yep, one of the lines matched the other
 			 * string. */
-			line[mp->llen] = '\0';
 
 			if (t) {
 			    ol = mp->wlen; nl = bl;
Index: Src/Zle/computil.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/computil.c,v
retrieving revision 1.106
diff -u -r1.106 computil.c
--- Src/Zle/computil.c	7 Mar 2008 23:06:42 -0000	1.106
+++ Src/Zle/computil.c	7 Jun 2008 20:19:40 -0000
@@ -3997,6 +3997,239 @@
     return (found ? ret : NULL);
 }
 
+
+/*
+ * This code constructs (from heap) and returns a string that
+ * corresponds to a series of matches; when compiled as a pattern, at
+ * each position it matches either the character from the string "add"
+ * or the corresponding single-character match from the set of matchers.
+ * To take a simple case, if add is "a" and the single matcher for the
+ * character position matches "[0-9]", the pattern returned is "[0-9a]".
+ * We take account of equivalences between the word and line, too.
+ *
+ * As there are virtually no comments in this file, I don't really
+ * know why we're doing this, but it's to do with a matcher which
+ * is passed as an argument to the utility compfiles -p/-P.
+ */
+static char *
+cfp_matcher_range(Cmatcher *ms, char *add)
+{
+    Cmatcher *mp, m;
+    int len = 0, mt;
+    char *ret = NULL, *p = NULL, *adds = add;
+
+    /*
+     * Do this twice:  once to work out the length of the
+     * string in len, the second time to build it in ret.
+     * This is probably worthwhile because otherwise memory
+     * management is difficult.
+     */
+    for (;;) {
+	for (mp = ms; *add; add++, mp++) {
+	    if (!(m = *mp)) {
+		/*
+		 * No matcher, so just match the character
+		 * itself.
+		 *
+		 * TODO: surely this needs quoting if it's a
+		 * metacharacter?
+		 */
+		if (ret) {
+		    if (imeta(*add)) {
+			*p++ = Meta;
+			*p++ = *add ^ 32;
+		    } else
+			*p++ = *add;
+		} else
+		    len += imeta(*add) ? 2 : 1;
+	    } else if (m->flags & CMF_RIGHT) {
+		/*
+		 * Right-anchored:  match anything followed
+		 * by the character itself.
+		 */
+		if (ret) {
+		    *p++ = '*';
+		    /* TODO: quote again? */
+		    if (imeta(*add)) {
+			*p++ = Meta;
+			*p++ = *add ^ 32;
+		    } else
+			*p++ = *add;
+		} else
+		    len += imeta(*add) ? 3 : 2;
+	    } else {
+		/* The usual set of matcher possibilities. */
+		int ind;
+		if (m->line->tp == CPAT_EQUIV &&
+		    m->word->tp == CPAT_EQUIV) {
+		    /*
+		     * Genuine equivalence.  Add the character to match
+		     * and the equivalent character from the word
+		     * pattern.
+		     *
+		     * TODO: we could be more careful here with special
+		     * cases as we are in the basic character class
+		     * code below.
+		     */
+		    if (ret) {
+			*p++ = '[';
+			if (imeta(*add)) {
+			    *p++ = Meta;
+			    *p++ = *add ^ 32;
+			} else
+			    *p++ = *add;
+		    } else
+			len += imeta(*add) ? 3 : 2;
+		    if (PATMATCHRANGE(m->line->u.str, CONVCAST(*add),
+				      &ind, &mt)) {
+			/*
+			 * Find the equivalent match for ind in the
+			 * word pattern.
+			 */
+			if ((ind = pattern_match_equivalence
+			     (m->word, ind, mt, CONVCAST(*add))) != -1) {
+			    if (ret) {
+				if (imeta(ind)) {
+				    *p++ = Meta;
+				    *p++ = ind ^ 32;
+				} else
+				    *p++ = ind;
+			    } else
+				len += imeta(ind) ? 2 : 1;
+			}
+		    }
+		    if (ret)
+			*p++ = ']';
+		    else
+			len++;
+		} else {
+		    int newlen, addadd;
+
+		    switch (m->word->tp) {
+		    case CPAT_NCLASS:
+			/*
+			 * TODO: the old logic implies that we need to
+			 * match *add, i.e. it should be deleted from
+			 * the set of character's we're not allowed to
+			 * match.  That's too much like hard work for
+			 * now.  Indeed, in general it's impossible
+			 * without trickery.  Consider *add == 'A',
+			 * range == "[^[:upper:]]": we would have to
+			 * resort to something like "(A|[^[:upper:]])";
+			 * and in an expression like that *add may or
+			 * may not need backslashing.  So we're deep
+			 * into see-if-we-can-get-away-without
+			 * territory.
+			 */
+			if (ret) {
+			    *p++ = '[';
+			    *p++ = '^';
+			} else
+			    len += 2;
+			/*
+			 * Convert the compiled range string back
+			 * to an ordinary string.
+			 */
+			newlen =
+			    pattern_range_to_string(m->word->u.str, p);
+			DPUTS(!newlen, "empty character range");
+			if (ret) {
+			    p += newlen;
+			    *p++ = ']';
+			} else
+			    len += newlen + 1;
+			break;
+			    
+		    case CPAT_CCLASS:
+			/*
+			 * If there is an equivalence only on one
+			 * side it's not equivalent to anything.
+			 * Treat it as an ordinary character class.
+			 */ 
+		    case CPAT_EQUIV:
+		    case CPAT_CHAR:
+			if (ret)
+			    *p++ = '[';
+			else
+			    len++;
+			/*
+			 * We needed to add *add specially only if
+			 * it is not covered by the range.  This
+			 * is necessary for correct syntax---consider
+			 * if *add is ] and ] is also the first
+			 * character in the range.
+			 */
+			addadd = !pattern_match1(m->word, CONVCAST(*add), &mt);
+			if (addadd && *add == ']') {
+			    if (ret)
+				*p++ = *add;
+			    else
+				len++;
+			}
+			if (m->word->tp == CPAT_CHAR) {
+			    /*
+			     * The matcher just matches a single
+			     * character, but we need to be able
+			     * to match *add, too, hence we do
+			     * this as a [...].
+			     */
+			    if (ret) {
+				if (imeta(m->word->u.chr)) {
+				    *p++ = Meta;
+				    *p++ = m->word->u.chr ^ 32;
+				} else
+				    *p++ = m->word->u.chr;
+			    } else
+				len += imeta(m->word->u.chr) ? 2 : 1;
+			} else {
+			    /*
+			     * Convert the compiled range string back
+			     * to an ordinary string.
+			     */
+			    newlen =
+				pattern_range_to_string(m->word->u.str, p);
+			    DPUTS(!newlen, "empty character range");
+			    if (ret)
+				p += newlen;
+			    else
+				len += newlen;
+			}
+			if (addadd && *add != ']') {
+			    if (ret) {
+				if (imeta(*add)) {
+				    *p++ = Meta;
+				    *p++ = *add ^ 32;
+				} else
+				    *p++ = *add;
+			    } else
+				len += imeta(*add) ? 2 : 1;
+			}
+			if (ret)
+			    *p++ = ']';
+			else
+			    len++;
+			break;
+
+		    case CPAT_ANY:
+			if (ret)
+			    *p++ = '?';
+			else
+			    len++;
+			break;
+		    }
+		}
+	    }
+	}
+	if (ret) {
+	    *p = '\0';
+	    return ret;
+	}
+	p = ret = zhalloc(len + 1);
+	add = adds;
+    }
+}
+
+
 static char *
 cfp_matcher_pats(char *matcher, char *add)
 {
@@ -4064,64 +4297,8 @@
 			break;
 		    }
 	}
-	if (*add) {
-	    char *ret = "", buf[259];
-
-	    for (mp = ms; *add; add++, mp++) {
-		if (!(m = *mp)) {
-		    buf[0] = *add;
-		    buf[1] = '\0';
-		} else if (m->flags & CMF_RIGHT) {
-		    buf[0] = '*';
-		    buf[1] = *add;
-		    buf[2] = '\0';
-		} else {
-		    unsigned char *t, c;
-		    char *p = buf;
-		    int i;
-
-		    for (i = 256, t = m->word->tab; i--; t++)
-			if (*t)
-			    break;
-		    if (i) {
-			t = m->word->tab;
-			*p++ = '[';
-			if (m->line->equiv && m->word->equiv) {
-			    *p++ = *add;
-			    c = m->line->tab[STOUC(*add)];
-			    for (i = 0; i < 256; i++)
-				if (m->word->tab[i] == c) {
-				    *p++ = (char) i;
-				    break;
-				}
-			} else {
-			    if (*add == ']' || t[STOUC(']')])
-				*p++ = ']';
-			    for (i = 0; i < 256; i++, t++)
-				if (*t && ((char) i) != *add &&
-				    i != ']' && i != '-' &&
-				    i != '^' && i != '!')
-				    *p++ = (char) i;
-			    *p++ = *add;
-			    t = m->word->tab;
-			    if (*add != '^' && t[STOUC('^')])
-				*p++ = '^';
-			    if (*add != '!' && t[STOUC('!')])
-				*p++ = '!';
-			    if (*add != '-' && t[STOUC('-')])
-				*p++ = '-';
-			}
-			*p++ = ']';
-			*p = '\0';
-		    } else {
-			*p = '?';
-			p[1] = '\0';
-		    }
-		}
-		ret = dyncat(ret, buf);
-	    }
-	    return ret;
-	}
+	if (*add)
+	    return cfp_matcher_range(ms, add);
     }
     return add;
 }


-- 
Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/



Messages sorted by: Reverse Date, Date, Thread, Author