Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

Subject: PATCH: multibyte separators and delimiters



This fixes IFS, splitting and joining, the read builtin, and various
associated bits and pieces to use multibyte mode.

The point of nomultibyte mode is to be consistent, not to be clever, so
I've altered part of the pattern code not to use wide character tests when the mode is turned off.  This revealed some missing STOUC()s.

With this patch, I think the main shell is in pretty good shape. I'm
about to turn on MULTIBYTE by default where MULTIBYTE_SUPPORT is
defined: it's become clear this is the only way to get proper clean and
consistent behaviour.  This will leave the option to be turned off by
those in special cases who know they want to deal with single bytes.

One example of the damage done by the current default is hinted at by
the change to insert-composed-char.  With the "emulate" at the start,
multibyte mode was turned off in the main shell (though zle always
trusts the locale).  This meant the IFS was set inconsistently, and,
worse, wasn't restored properly since multibyte mode wasn't turned on at
that point.  I've used an explicit split on spaces, so it's no longer a
problem, but it does illustrate why turning off multibyte mode needs to
be left to specialists in private.

I needed some non-ASCII text for the tests.  For some reason copying
between Firefox and the terminal worked fine while copying into and out
of Emacs messed up, though Emacs worked fine if the same text was
written to a file.  There must be some setting I'm missing, but Emacs
doesn't exactly make it obvious.  (I think I've convinced Sylpheed
the result is UTF-8.)

Index: Doc/Zsh/builtins.yo
===================================================================
RCS file: /cvsroot/zsh/zsh/Doc/Zsh/builtins.yo,v
retrieving revision 1.85
diff -u -r1.85 builtins.yo
--- Doc/Zsh/builtins.yo	19 Apr 2006 16:09:07 -0000	1.85
+++ Doc/Zsh/builtins.yo	24 Jul 2006 21:41:19 -0000
@@ -1003,6 +1003,10 @@
 var(name), without word splitting.  This flag is ignored when tt(-q) is
 present.  Input is read from the terminal unless one of tt(-u) or tt(-p)
 is present.  This option may also be used within zle widgets.
+
+Note that despite the mnemonic `key' this option does read full
+characters, which may consist of multiple bytes if the option
+tt(MULTIBYTE) is set.
 )
 item(tt(-z))(
 Read one entry from the editor buffer stack and assign it to the first
Index: Functions/Zle/insert-composed-char
===================================================================
RCS file: /cvsroot/zsh/zsh/Functions/Zle/insert-composed-char,v
retrieving revision 1.3
diff -u -r1.3 insert-composed-char
--- Functions/Zle/insert-composed-char	21 Oct 2005 09:51:55 -0000	1.3
+++ Functions/Zle/insert-composed-char	24 Jul 2006 21:41:19 -0000
@@ -128,7 +128,7 @@
 # 'm            Macron
 # ''            Acute
 
-emulate -LR zsh
+emulate -L zsh
 setopt cbases extendedglob printeightbit
 
 local accent basechar ochar error
@@ -165,7 +165,8 @@
 fi
 
 local -A charmap
-charmap=(${=zsh_accented_chars[$accent]})
+# just in case someone is monkeying with IFS...
+charmap=(${(s. .)zsh_accented_chars[$accent]})
 
 if [[ ${#charmap} -eq 0 || -z $charmap[$basechar] ]]; then
   $error "Combination ${basechar}${accent} is not available."
Index: Src/builtin.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/builtin.c,v
retrieving revision 1.159
diff -u -r1.159 builtin.c
--- Src/builtin.c	10 Jul 2006 13:08:23 -0000	1.159
+++ Src/builtin.c	24 Jul 2006 21:41:23 -0000
@@ -4266,7 +4266,7 @@
 	    zerrnam(name, "not in while, until, select, or repeat loop");
 	    return 1;
 	}
-	contflag = 1;   /* ARE WE SUPPOSED TO FALL THROUGH HERE? */
+	contflag = 1; /* FALLTHROUGH */
     case BIN_BREAK:
 	if (!loops) {   /* break is only permitted in loops */
 	    zerrnam(name, "not in while, until, select, or repeat loop");
@@ -4560,7 +4560,14 @@
     int readchar = -1, val, resettty = 0;
     struct ttyinfo saveti;
     char d;
+#ifdef MULTIBYTE_SUPPORT
+    wchar_t delim = L'\n', wc;
+    mbstate_t mbs;
+    char *laststart;
+    size_t ret;
+#else
     char delim = '\n';
+#endif
 
     if (OPT_HASARG(ops,c='k')) {
 	char *eptr, *optarg = OPT_ARG(ops,c);
@@ -4666,7 +4673,23 @@
     }
     if (OPT_ISSET(ops,'d')) {
 	char *delimstr = OPT_ARG(ops,'d');
+#ifdef MULTIBYTE_SUPPORT
+	wint_t wc;
+
+	if (isset(MULTIBYTE)) {
+	    mb_metacharinit();
+	    (void)mb_metacharlenconv(delimstr, &wc);
+	}
+	else
+	    wc = WEOF;
+	if (wc != WEOF)
+	    delim = (wchar_t)wc;
+	else
+	    delim = (wchar_t)((delimstr[0] == Meta) ?
+			      delimstr[1] ^ 32 : delimstr[0]);
+#else
         delim = (delimstr[0] == Meta) ? delimstr[1] ^ 32 : delimstr[0];
+#endif
 	if (SHTTY != -1) {
 	    struct ttyinfo ti;
 	    gettyinfo(&ti);
@@ -4710,26 +4733,74 @@
 	}
     }
 
+#ifdef MULTIBYTE_SUPPORT
+    memset(&mbs, 0, sizeof(mbs));
+#endif
+
     /* option -k means read only a given number of characters (default 1) */
     if (OPT_ISSET(ops,'k')) {
+	int eof = 0;
 	/* allocate buffer space for result */
 	bptr = buf = (char *)zalloc(nchars+1);
 
 	do {
 	    if (izle) {
-		if ((val = getkeyptr(0, NULL)) < 0)
+		if ((val = getkeyptr(0, NULL)) < 0) {
+		    eof = 1;
 		    break;
-		*bptr++ = (char) val;
+		}
+		*bptr = (char) val;
+#ifdef MULTIBYTE_SUPPORT	
+		if (isset(MULTIBYTE)) {
+		    ret = mbrlen(bptr++, 1, &mbs);
+		    if (ret == MB_INVALID)
+			memset(&mbs, 0, sizeof(mbs));
+		    /* treat invalid as single character */
+		    if (ret != MB_INCOMPLETE)
+			nchars--;
+		    continue;
+		} else {
+		    bptr++;
+		    nchars--;
+		}
+#else
+		bptr++;
 		nchars--;
+#endif
 	    } else {
 		/* If read returns 0, is end of file */
 		if (readchar >= 0) {
 		    *bptr = readchar;
 		    val = 1;
 		    readchar = -1;
-		} else if ((val = read(readfd, bptr, nchars)) <= 0)
+		} else if ((val = read(readfd, bptr, nchars)) <= 0) {
+		    eof = 1;
 		    break;
+		}
 	    
+#ifdef MULTIBYTE_SUPPORT	
+		if (isset(MULTIBYTE)) {
+		    while (val > 0) {
+			ret = mbrlen(bptr, val, &mbs);
+			if (ret == MB_INCOMPLETE) {
+			    bptr += val;
+			    break;
+			} else {
+			    if (ret == MB_INVALID) {
+				memset(&mbs, 0, sizeof(mbs));
+				/* treat as single byte */
+				ret = 1;
+			    }
+			    else if (ret == 0) /* handle null as normal char */
+				ret = 1;
+			    nchars--;
+			    val -= ret;
+			    bptr += ret;
+			}
+		    }
+		    continue;
+		}
+#endif
 		/* decrement number of characters read from number required */
 		nchars -= val;
 
@@ -4761,7 +4832,7 @@
 	    zfree(buf, bptr - buf + 1);
 	if (resettty && SHTTY != -1)
 	    settyinfo(&saveti);
-	return val <= 0;
+	return eof;
     }
 
     /* option -q means get one character, and interpret it as a Y or N */
@@ -4770,10 +4841,25 @@
 
 	/* set up the buffer */
 	readbuf[1] = '\0';
-
+	
 	/* get, and store, reply */
 	if (izle) {
+#ifdef MULTIBYTE_SUPPORT
+	    int key;
+
+	    while ((key = getkeyptr(0, NULL)) >= 0) {
+		char c = (char)key;
+		/*
+		 * If multibyte, it can't be y, so we don't care
+		 * what key gets set to; just read to end of character.
+		 */
+		if (!isset(MULTIBYTE) ||
+		    mbrlen(&c, 1, &mbs) != MB_INCOMPLETE)
+		    break;
+	    }
+#else
 	    int key = getkeyptr(0, NULL);
+#endif
 
 	    readbuf[0] = (key == 'y' ? 'y' : 'n');
 	} else {
@@ -4786,6 +4872,7 @@
 		SHTTY = -1;
 	    }
 	}
+
 	if (OPT_ISSET(ops,'e') || OPT_ISSET(ops,'E'))
 	    printf("%s\n", readbuf);
 	if (!OPT_ISSET(ops,'e'))
@@ -4808,16 +4895,79 @@
     while (*args || (OPT_ISSET(ops,'A') && !gotnl)) {
 	sigset_t s = child_unblock();
 	buf = bptr = (char *)zalloc(bsiz = 64);
+#ifdef MULTIBYTE_SUPPORT
+	laststart = buf;
+	ret = MB_INCOMPLETE;
+#endif
 	/* get input, a character at a time */
 	while (!gotnl) {
 	    c = zread(izle, &readchar);
 	    /* \ at the end of a line indicates a continuation *
 	     * line, except in raw mode (-r option)            */
+#ifdef MULTIBYTE_SUPPORT
+	    if (c == EOF) {
+		/* not waiting to be completed any more */
+		ret = 0;
+		break;
+	    } 
+	    *bptr = (char)c;
+	    if (isset(MULTIBYTE)) {
+		ret = mbrtowc(&wc, bptr, 1, &mbs);
+		if (!ret)	/* NULL */
+		    ret = 1;
+	    } else {
+		ret = 1;
+		wc = (wchar_t)c;
+	    }
+	    if (ret != MB_INCOMPLETE) {
+		if (ret == MB_INVALID)
+		    memset(&mbs, 0, sizeof(mbs));
+		if (bslash && wc == delim) {
+		    bslash = 0;
+		    continue;
+		}
+		if (wc == delim)
+		    break;
+		/*
+		 * `first' is non-zero if any separator we encounter is a
+		 * non-whitespace separator, which means that anything
+		 * (even an empty string) between, before or after separators
+		 * is significant.  If it is zero, we have a whitespace
+		 * separator, which shouldn't cause extra empty strings to
+		 * be emitted.  Hence the test for (*buf || first) when
+		 * we assign the result of reading a word.
+		 */
+		if (!bslash && wcsitype(wc, ISEP)) {
+		    if (bptr != buf ||
+			(!(c < 128 && iwsep(c)) && first)) {
+			first |= !(c < 128 && iwsep(c));
+			break;
+		    }
+		    first |= !(c < 128 && iwsep(c));
+		    continue;
+		}
+		bslash = (wc == L'\\' && !bslash && !OPT_ISSET(ops,'r'));
+		if (bslash)
+		    continue;
+		first = 0;
+	    }
+	    if (imeta(STOUC(*bptr))) {
+		bptr[1] = bptr[0] ^ 32;
+		bptr[0] = Meta;
+		bptr += 2;
+	    }
+	    else
+		bptr++;
+	    if (ret != MB_INCOMPLETE)
+		laststart = bptr;
+#else
+	    if (c == EOF)
+		break;
 	    if (bslash && c == delim) {
 		bslash = 0;
 		continue;
 	    }
-	    if (c == EOF || c == delim)
+	    if (c == delim)
 		break;
 	    /*
 	     * `first' is non-zero if any separator we encounter is a
@@ -4845,18 +4995,42 @@
 		*bptr++ = c ^ 32;
 	    } else
 		*bptr++ = c;
+#endif
 	    /* increase the buffer size, if necessary */
 	    if (bptr >= buf + bsiz - 1) {
 		int blen = bptr - buf;
+#ifdef MULTIBYTE_SUPPORT
+		int llen = laststart - buf;
+#endif
 
 		buf = realloc(buf, bsiz *= 2);
 		bptr = buf + blen;
+#ifdef MULTIBYTE_SUPPORT
+		laststart = buf + llen;
+#endif
 	    }
 	}
 	signal_setmask(s);
+#ifdef MULTIBYTE_SUPPORT
+	if (c == EOF)
+	    gotnl = 1;
+	if (ret == MB_INCOMPLETE) {
+	    /*
+	     * We can only get here if there is an EOF in the
+	     * middle of a character... safest to keep the debris,
+	     * I suppose.
+	     */
+	    *bptr = '\0';
+	} else {
+	    if (wc == delim)
+		gotnl = 1;
+	    *laststart = '\0';
+	}
+#else
 	if (c == delim || c == EOF)
 	    gotnl = 1;
 	*bptr = '\0';
+#endif
 	/* dispose of word appropriately */
 	if (OPT_ISSET(ops,'e') || OPT_ISSET(ops,'E')) {
 	    zputs(buf, stdout);
@@ -4908,12 +5082,66 @@
 	return c == EOF;
     }
     buf = bptr = (char *)zalloc(bsiz = 64);
+#ifdef MULTIBYTE_SUPPORT
+    laststart = buf;
+    ret = MB_INCOMPLETE;
+#endif
     /* any remaining part of the line goes into one parameter */
     bslash = 0;
     if (!gotnl) {
 	sigset_t s = child_unblock();
 	for (;;) {
 	    c = zread(izle, &readchar);
+#ifdef MULTIBYTE_SUPPORT
+	    if (c == EOF) {
+		/* not waiting to be completed any more */
+		ret = 0;
+		break;
+	    }
+	    *bptr = (char)c;
+	    if (isset(MULTIBYTE)) {
+		ret = mbrtowc(&wc, bptr, 1, &mbs);
+		if (!ret)	/* NULL */
+		    ret = 1;
+	    } else {
+		ret = 1;
+		wc = (wchar_t)c;
+	    }
+	    if (ret != MB_INCOMPLETE) {
+		if (ret == MB_INVALID)
+		    memset(&mbs, 0, sizeof(mbs));
+		/*
+		 * \ at the end of a line introduces a continuation line,
+		 * except in raw mode (-r option)
+		 */
+		if (bslash && wc == delim) {
+		    bslash = 0;
+		    continue;
+		}
+		if (wc == delim && !zbuf)
+		    break;
+		if (!bslash && bptr == buf && wcsitype(wc, ISEP)) {
+		    if (c < 128 && iwsep(c))
+			continue;
+		    else if (!first) {
+			first = 1;
+			continue;
+		    }
+		}
+		bslash = (wc == L'\\' && !bslash && !OPT_ISSET(ops,'r'));
+		if (bslash)
+		    continue;
+	    }
+	    if (imeta(STOUC(*bptr))) {
+		bptr[1] = bptr[0] ^ 32;
+		bptr[0] = Meta;
+		bptr += 2;
+	    }
+	    else
+		bptr++;
+	    if (ret != MB_INCOMPLETE)
+		laststart = bptr;
+#else
 	    /* \ at the end of a line introduces a continuation line, except in
 	       raw mode (-r option) */
 	    if (bslash && c == delim) {
@@ -4938,22 +5166,36 @@
 		*bptr++ = c ^ 32;
 	    } else
 		*bptr++ = c;
+#endif
 	    /* increase the buffer size, if necessary */
 	    if (bptr >= buf + bsiz - 1) {
 		int blen = bptr - buf;
+#ifdef MULTIBYTE_SUPPORT
+		int llen = laststart - buf;
+#endif
 
 		buf = realloc(buf, bsiz *= 2);
 		bptr = buf + blen;
+#ifdef MULTIBYTE_SUPPORT
+		laststart = buf + llen;
+#endif
 	    }
 	}
 	signal_setmask(s);
     }
+#ifdef MULTIBYTE_SUPPORT
+    if (ret != MB_INCOMPLETE)
+	bptr = laststart;
+#endif
+    /*
+     * Strip trailing IFS whitespace.
+     * iwsep can only be certain single-byte ASCII bytes, but we
+     * must check the byte isn't metafied.
+     */
     while (bptr > buf) {
 	if (bptr > buf + 1 && bptr[-2] == Meta) {
-	    if (iwsep(bptr[-1] ^ 32))
-		bptr -= 2;
-	    else
-		break;
+	    /* non-ASCII, can't be IWSEP */
+	    break;
 	} else if (iwsep(bptr[-1]))
 	    bptr--;
 	else
Index: Src/pattern.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/pattern.c,v
retrieving revision 1.35
diff -u -r1.35 pattern.c
--- Src/pattern.c	28 Jun 2006 13:12:55 -0000	1.35
+++ Src/pattern.c	24 Jul 2006 21:41:25 -0000
@@ -318,7 +318,7 @@
 	    inchar = *inptr++;
 	}
 	*x = inptr;
-	return (wchar_t)inchar;
+	return (wchar_t)STOUC(inchar);
     }
 
     while (*inptr) {
@@ -352,12 +352,14 @@
 #define PEOF EOF
 
 #define METACHARINC(x)	((void)((x) += (*(x) == Meta) ? 2 : 1))
+#endif
+
 /*
- * Return unmetafied char from string (x is any char *)
+ * Return unmetafied char from string (x is any char *).
+ * Used with MULTIBYTE_SUPPORT if the GF_MULTIBYTE is not
+ * in effect.
  */
 #define UNMETA(x)	(*(x) == Meta ? (x)[1] ^ 32 : *(x))
-#endif
-
 
 /* Add n more characters, ensuring there is enough space. */
 
@@ -1575,7 +1577,7 @@
     size_t ret;
 
     if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*x) & 0x80))
-	return (wchar_t) *x;
+	return (wchar_t) STOUC(*x);
 
     ret = mbrtowc(&wc, x, y-x, &shiftstate);
 
@@ -1583,7 +1585,7 @@
 	/* Error.  Treat as single byte. */
 	/* Reset the shift state for next time. */
 	memset(&shiftstate, 0, sizeof(shiftstate));
-	return (wchar_t) *x;
+	return (wchar_t) STOUC(*x);
     }
 
     return wc;
@@ -1626,7 +1628,7 @@
     size_t ret;
 
     if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80))
-	return (wchar_t) *(*x)++;
+	return (wchar_t) STOUC(*(*x)++);
 
     ret = mbrtowc(&wc, *x, y-*x, &shiftstate);
 
@@ -1634,7 +1636,7 @@
 	/* Error.  Treat as single byte. */
 	/* Reset the shift state for next time. */
 	memset(&shiftstate, 0, sizeof(shiftstate));
-	return (wchar_t) *(*x)++;
+	return (wchar_t) STOUC(*(*x)++);
     }
 
     /* Nulls here are normal characters */
@@ -2222,20 +2224,33 @@
 	    }
 	    break;
 	case P_ANYOF:
-	    if (patinput == patinend ||
-		!patmatchrange((char *)P_OPERAND(scan),
-			       CHARREF(patinput, patinend)))
-		fail = 1;
-	    else
-		CHARINC(patinput, patinend);
-	    break;
 	case P_ANYBUT:
-	    if (patinput == patinend ||
-		patmatchrange((char *)P_OPERAND(scan),
-			      CHARREF(patinput, patinend)))
+	    if (patinput == patinend)
 		fail = 1;
-	    else
-		CHARINC(patinput, patinend);
+	    else {
+#ifdef MULTIBYTE_SUPPORT
+		wchar_t cr = CHARREF(patinput, patinend);
+		char *scanop = (char *)P_OPERAND(scan);
+		if (patglobflags & GF_MULTIBYTE) {
+		    if (mb_patmatchrange(scanop, cr) ^
+			(P_OP(scan) == P_ANYOF))
+			fail = 1;
+		    else
+			CHARINC(patinput, patinend);
+		} else if (patmatchrange(scanop, (int)cr) ^
+			   (P_OP(scan) == P_ANYOF))
+		    fail = 1;
+		else
+		    CHARINC(patinput, patinend);
+#else
+		if (patmatchrange((char *)P_OPERAND(scan),
+				   CHARREF(patinput, patinend)) ^
+		    (P_OP(scan) == P_ANYOF))
+		    fail = 1;
+		else
+		    CHARINC(patinput, patinend);
+#endif
+	    }
 	    break;
 	case P_NUMRNG:
 	case P_NUMFROM:
@@ -2923,7 +2938,7 @@
 
 /**/
 static int
-patmatchrange(char *range, wchar_t ch)
+mb_patmatchrange(char *range, wchar_t ch)
 {
     wchar_t r1, r2;
 
@@ -2994,21 +3009,20 @@
 		    return 1;
 		break;
 	    case PP_IDENT:
-		if (wcsiident(ch))
+		if (wcsitype(ch, IIDENT))
 		    return 1;
 		break;
 	    case PP_IFS:
-		/* TODO */
-		if (isep(ch))
+		if (wcsitype(ch, ISEP))
 		    return 1;
 		break;
 	    case PP_IFSSPACE:
-		/* TODO */
-		if (iwsep(ch))
+		/* must be ASCII space character */
+		if (ch < 128 && iwsep((int)ch))
 		    return 1;
 		break;
 	    case PP_WORD:
-		if (wcsiword(ch))
+		if (wcsitype(ch, IWORD))
 		    return 1;
 		break;
 	    case PP_RANGE:
@@ -3031,7 +3045,7 @@
 }
 
 /**/
-#else
+#endif
 
 /**/
 static int
@@ -3142,9 +3156,6 @@
     return 0;
 }
 
-/**/
-#endif
-
 /*
  * Repeatedly match something simple and say how many times.
  * charstart is an array parallel to that starting at patinput
@@ -3180,20 +3191,26 @@
 	}
 	break;
     case P_ANYOF:
-	while (scan < patinend &&
-	       patmatchrange(opnd, CHARREF(scan, patinend))) {
-	    charstart[scan-patinput] = 1;
-	    count++;
-	    CHARINC(scan, patinend);
-    	}
-	break;
     case P_ANYBUT:
-	while (scan < patinend &&
-	       !patmatchrange(opnd, CHARREF(scan, patinend))) {
+	while (scan < patinend) {
+#ifdef MULTIBYTE_SUPPORT
+	    wchar_t cr = CHARREF(scan, patinend);
+	    if (patglobflags & GF_MULTIBYTE) {
+		if (mb_patmatchrange(opnd, cr) ^
+		    (P_OP(p) == P_ANYOF))
+		    break;
+	    } else if (patmatchrange(opnd, (int)cr) ^
+		       (P_OP(p) == P_ANYOF))
+		break;
+#else
+	    if (patmatchrange(opnd, CHARREF(scan, patinend)) ^
+		P_OP(p) == P_ANYOF)
+		break;
+#endif
 	    charstart[scan-patinput] = 1;
 	    count++;
 	    CHARINC(scan, patinend);
-    	}
+	}
 	break;
 #ifdef DEBUG
     default:
Index: Src/subst.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/subst.c,v
retrieving revision 1.54
diff -u -r1.54 subst.c
--- Src/subst.c	10 Jul 2006 13:08:23 -0000	1.54
+++ Src/subst.c	24 Jul 2006 21:41:27 -0000
@@ -316,9 +316,14 @@
     local_list1(foo);
 
     if (split) {
-	for ( ; *x; x += l+1) {
+	/*
+	 * This doesn't handle multibyte characters, but we're
+	 * looking for whitespace separators which must be ASCII.
+	 */
+	for ( ; *x; x += l) {
 	    char c = (l = *x == Meta) ? x[1] ^ 32 : *x;
-	    if (!iwsep(c))
+	    l++;
+	    if (!iwsep(STOUC(c)))
 		break;
 	}
     }
@@ -328,20 +333,35 @@
     if (split) {
 	LinkNode n = firstnode(&foo);
 	int inq = 0, inp = 0;
-	for ( ; *x; x += l+1) {
-	    char c = (l = *x == Meta) ? x[1] ^ 32 : *x;
-	    if (!inq && !inp && isep(c)) {
-		*x = '\0';
-		for (x += l+1; *x; x += l+1) {
-		    c = (l = *x == Meta) ? x[1] ^ 32 : *x;
-		    if (!isep(c))
+	MB_METACHARINIT();
+	for ( ; *x; x += l) {
+	    int rawc = -1;
+	    convchar_t c;
+	    if (itok(STOUC(*x))) {
+		/* token, can't be separator, must be single byte */
+		rawc = *x;
+		l = 1;
+	    } else {
+		l = MB_METACHARLENCONV(x, &c);
+		if (!inq && !inp && MB_ZISTYPE(c, ISEP)) {
+		    *x = '\0';
+		    for (x += l; *x; x += l) {
+			if (itok(STOUC(*x))) {
+			    /* as above */
+			    rawc = *x;
+			    l = 1;
+			    break;
+			}
+			l = MB_METACHARLENCONV(x, &c);
+			if (!MB_ZISTYPE(c, ISEP))
+			    break;
+		    }
+		    if (!*x)
 			break;
+		    insertlinknode(&foo, n, (void *)x), incnode(n);
 		}
-		if (!*x)
-		    break;
-		insertlinknode(&foo, n, (void *)x), incnode(n);
 	    }
-	    switch (c) {
+	    switch (rawc) {
 	    case Dnull:  /* " */
 	    case Snull:  /* ' */
 	    case Tick:   /* ` (note: no Qtick!) */
@@ -357,8 +377,8 @@
 	    case Bnull:  /* \ */
 	    case Bnullkeep:
 		/* The parser verified the following char's existence. */
-		x += l+1;
-		l = *x == Meta;
+		x += l;
+		l = MB_METACHARLEN(x);
 		break;
 	    }
 	}
@@ -685,12 +705,14 @@
 static char *
 dopadding(char *str, int prenum, int postnum, char *preone, char *postone, char *premul, char *postmul)
 {
-    char def[3], *ret, *t, *r;
+    char *def, *ret, *t, *r;
     int ls, ls2, lpreone, lpostone, lpremul, lpostmul, lr, f, m, c, cc;
 
-    def[0] = *ifs ? *ifs : ' ';
-    def[1] = *ifs == Meta ? ifs[1] ^ 32 : '\0';
-    def[2] = '\0';
+    MB_METACHARINIT();
+    if (*ifs)
+	def = dupstrpfx(ifs, MB_METACHARLEN(ifs));
+    else
+	def = "";
     if (preone && !*preone)
 	preone = def;
     if (postone && !*postone)
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.127
diff -u -r1.127 utils.c
--- Src/utils.c	10 Jul 2006 13:08:23 -0000	1.127
+++ Src/utils.c	24 Jul 2006 21:41:30 -0000
@@ -35,16 +35,65 @@
 /**/
 char *scriptname;
 
-/**/
 #ifdef MULTIBYTE_SUPPORT
+struct widechar_array {
+    wchar_t *chars;
+    size_t len;
+};
+typedef struct widechar_array *Widechar_array;
+
 /*
  * The wordchars variable turned into a wide character array.
  * This is much more convenient for testing.
  */
+struct widechar_array wordchars_wide;
 
-/**/
-mod_export wchar_t *wordchars_wide;
-/**/
+/*
+ * The same for the separators (IFS) array.
+ */
+struct widechar_array ifs_wide;
+
+/* Set *wca from multibyte string mb_array (any old contents are freed first).  *wca is
+ * left empty if MULTIBYTE is unset, mb_array is NULL, or a character converts to WEOF. */
+static void
+set_widearray(char *mb_array, Widechar_array wca)
+{
+    if (wca->chars) {
+	free(wca->chars);
+	wca->chars = NULL;
+    }
+    wca->len = 0;
+
+    if (!isset(MULTIBYTE))
+	return;
+
+    if (mb_array) {
+	VARARR(wchar_t, tmpwcs, strlen(mb_array));
+	wchar_t *wcptr = tmpwcs;
+	wint_t wci;
+
+	mb_metacharinit();
+	while (*mb_array) {
+	    int mblen = mb_metacharlenconv(mb_array, &wci);
+
+	    if (!mblen)
+		break;
+	    /* No good unless all characters are convertible: test wci, *wcptr is unset */
+	    if (wci == WEOF)
+		return;
+	    *wcptr++ = (wchar_t)wci;
+#ifdef DEBUG
+	    if (wcptr[-1] < 0)
+		fprintf(stderr, "BUG: Bad cast to wchar_t\n");
+#endif
+	    mb_array += mblen;
+	}
+
+	wca->len = wcptr - tmpwcs;
+	wca->chars = (wchar_t *)zalloc(wca->len * sizeof(wchar_t));
+	wmemcpy(wca->chars, tmpwcs, wca->len);
+    }
+}
 #endif
 
 
@@ -1853,9 +1902,34 @@
 	if (c != '\n')
 	    while ((d = read1char()) >= 0 && d != '\n');
     } else {
-	settyinfo(&shttyinfo);
-	if (c != '\n' && !valid_chars)
+	if (c != '\n' && !valid_chars) {
+#ifdef MULTIBYTE_SUPPORT
+	    if (isset(MULTIBYTE) && c >= 0) {
+		/*
+		 * No waiting for a valid character, and no draining;
+		 * we should ensure we haven't stopped in the middle
+		 * of a multibyte character.
+		 */
+		mbstate_t mbs;
+		char cc = (char)c;
+		memset(&mbs, 0, sizeof(mbs));
+		for (;;) {
+		    size_t ret = mbrlen(&cc, 1, &mbs);
+
+		    if (ret != MB_INCOMPLETE)
+			break;
+		    c = read1char();
+		    if (c < 0)
+			break;
+		    cc = (char)c;
+		}
+	    }
+#endif
+	    settyinfo(&shttyinfo);
 	    write(SHTTY, "\n", 1);
+	}
+	else
+	    settyinfo(&shttyinfo);
     }
     return c;
 }
@@ -2253,6 +2327,10 @@
     char *t = *s;
     int i = 0;
 
+    /*
+     * Don't need to handle multibyte characters, they can't
+     * be IWSEP.  Do need to check for metafication.
+     */
     while (*t && iwsep(*t == Meta ? t[1] ^ 32 : *t)) {
 	if (*t == Meta)
 	    t++;
@@ -2293,19 +2371,23 @@
 
     t = s;
     skipwsep(&s);
-    if (*s && isep(*s == Meta ? s[1] ^ 32 : *s))
+    MB_METACHARINIT();
+    if (*s && itype_end(s, ISEP, 1) != s)
 	*ptr++ = dup(allownull ? "" : nulstring);
     else if (!allownull && t != s)
 	*ptr++ = dup("");
     while (*s) {
-	if (isep(*s == Meta ? s[1] ^ 32 : *s) || (quote && *s == '\\')) {
-	    if (*s == Meta)
-		s++;
+	char *iend = itype_end(s, ISEP, 1);
+	if (iend != s) {
+	    s = iend;
+	    skipwsep(&s);
+	}
+	else if (quote && *s == '\\') {
 	    s++;
 	    skipwsep(&s);
 	}
 	t = s;
-	findsep(&s, NULL, quote);
+	(void)findsep(&s, NULL, quote);
 	if (s > t || allownull) {
 	    *ptr = (heap ? (char *) hcalloc((s - t) + 1) :
 		    (char *) zshcalloc((s - t) + 1));
@@ -2321,68 +2403,87 @@
     return ret;
 }
 
+/*
+ * Find a separator.  Return 0 if already at separator, 1 if separator
+ * found later, else -1.  (Historical note: used to return length into
+ * string but this is all that is necessary and is less ambiguous with
+ * multibyte characters around.)
+ *
+ * *s is the string we are looking along, which will be updated
+ * to the point we have got to.
+ *
+ * sep is a possibly multicharacter separator to look for.  If NULL,
+ * use normal separator characters.  If *sep is NUL, split on individual
+ * characters.
+ *
+ * quote is a flag that '\<sep>' should not be treated as a separator.
+ * in this case we need to be able to strip the backslash directly
+ * in the string, so the calling function must have sent us something
+ * modifiable.  currently this only works for sep == NULL.  also in
+ * in this case only, we need to turn \\ into \.
+ */
+
 /**/
 static int
 findsep(char **s, char *sep, int quote)
 {
     /*
-     * *s is the string we are looking along, which will be updated
-     * to the point we have got to.
-     *
-     * sep is a possibly multicharacter separator to look for.  If NULL,
-     * use normal separator characters.
-     *
-     * quote is a flag that '\<sep>' should not be treated as a separator.
-     * in this case we need to be able to strip the backslash directly
-     * in the string, so the calling function must have sent us something
-     * modifiable.  currently this only works for sep == NULL.  also in
-     * in this case only, we need to turn \\ into \.
      */
-    int i;
+    int i, ilen;
     char *t, *tt;
+    convchar_t c;
 
+    MB_METACHARINIT();
     if (!sep) {
-	for (t = *s; *t; t++) {
-	    if (quote && *t == '\\' &&
-		(isep(t[1] == Meta ? (t[2] ^ 32) : t[1]) || t[1] == '\\')) {
-		chuck(t);
-		if (*t == Meta)
-		    t++;
-		continue;
-	    }
-	    if (*t == Meta) {
-		if (isep(t[1] ^ 32))
+	for (t = *s; *t; t += ilen) {
+	    if (quote && *t == '\\') {
+		if (t[1] == '\\') {
+		    chuck(t);
+		    ilen = 1;
+		    continue;
+		} else {
+		    ilen = MB_METACHARLENCONV(t+1, &c);
+		    if (MB_ZISTYPE(c, ISEP)) {
+			chuck(t);
+			/* then advance over new character, length ilen */
+		    } else {
+			/* treat *t (backslash) as normal byte */
+			if (isep(*t))
+			    break;
+			ilen = 1;
+		    }
+		}
+	    } else {
+		ilen = MB_METACHARLENCONV(t, &c);
+		if (MB_ZISTYPE(c, ISEP))
 		    break;
-		t++;
-	    } else if (isep(*t))
-		break;
+	    }
 	}
-	i = t - *s;
+	i = (t > *s);
 	*s = t;
 	return i;
     }
     if (!sep[0]) {
+	/*
+	 * Empty separator just means advance past first character,
+	 * if any.
+	 */
 	if (**s) {
-	    if (**s == Meta)
-		*s += 2;
-	    else
-		++*s;
+	    *s += MB_METACHARLEN(*s);
 	    return 1;
 	}
 	return -1;
     }
     for (i = 0; **s; i++) {
+	/*
+	 * The following works for multibyte characters by virtue of
+	 * the fact that sep may be a string (and we don't care how
+	 * it divides up, we need to match all of it).
+	 */
 	for (t = sep, tt = *s; *t && *tt && *t == *tt; t++, tt++);
 	if (!*t)
-	    return i;
-	if (*(*s)++ == Meta) {
-#ifdef DEBUG
-	    if (! *(*s)++)
-		fprintf(stderr, "BUG: unexpected end of string in findsep()\n");
-#else
-	    (*s)++;
-#endif
-	}
+	    return (i > 0);
+	*s += MB_METACHARLEN(*s);
     }
     return -1;
 }
@@ -2405,16 +2506,15 @@
 	}
 	return r;
     }
-    for (t = *s; *t; t++) {
-	if (*t == Meta) {
-	    if (! isep(t[1] ^ 32))
-		break;
-	    t++;
-	} else if (! isep(*t))
+    MB_METACHARINIT();
+    for (t = *s; *t; t += sl) {
+	convchar_t c;
+	sl = MB_METACHARLENCONV(t, &c);
+	if (!MB_ZISTYPE(c, ISEP))
 	    break;
     }
     *s = t;
-    findsep(s, sep, 0);
+    (void)findsep(s, sep, 0);
     return t;
 }
 
@@ -2436,18 +2536,17 @@
 	r = 0;
 	if (mul <= 0)
 	    skipwsep(&s);
-	if ((*s && isep(*s == Meta ? s[1] ^ 32 : *s)) ||
+	if ((*s && itype_end(s, ISEP, 1) != s) ||
 	    (mul < 0 && t != s))
 	    r++;
 	for (; *s; r++) {
-	    if (isep(*s == Meta ? s[1] ^ 32 : *s)) {
-		if (*s == Meta)
-		    s++;
-		s++;
+	    char *ie = itype_end(s, ISEP, 1);
+	    if (ie != s) {
+		s = ie;
 		if (mul <= 0)
 		    skipwsep(&s);
 	    }
-	    findsep(&s, NULL, 0);
+	    (void)findsep(&s, NULL, 0);
 	    t = s;
 	    if (mul <= 0)
 		skipwsep(&s);
@@ -2464,19 +2563,20 @@
 {
     char *r, *p, **t;
     int l, sl;
-    char sepbuf[3];
+    char sepbuf[2];
 
     if (!*s)
 	return heap ? "" : ztrdup("");
     if (!sep) {
-	p = sep = sepbuf;
-	if (ifs) {
-	    *p++ = *ifs;
-	    *p++ = *ifs == Meta ? ifs[1] ^ 32 : '\0';
+	/* optimise common case that ifs[0] is space */
+	if (ifs && *ifs != ' ') {
+	    MB_METACHARINIT();
+	    sep = dupstrpfx(ifs, MB_METACHARLEN(ifs));
 	} else {
+	    p = sep = sepbuf;
 	    *p++ = ' ';
+	    *p = '\0';
 	}
-	*p = '\0';
     }
     sl = strlen(sep);
     for (t = s, l = 1 - sl; *t; l += strlen(*t) + sl, t++);
@@ -2508,7 +2608,7 @@
 
     for (t = s; n--;) {
 	tt = t;
-	findsep(&t, sep, 0);
+	(void)findsep(&t, sep, 0);
 	*p = (heap ? (char *) hcalloc(t - tt + 1) :
 	      (char *) zshcalloc(t - tt + 1));
 	strncpy(*p, tt, t - tt);
@@ -2637,39 +2737,21 @@
     for (t0 = (int)STOUC(Snull); t0 <= (int)STOUC(Nularg); t0++)
 	typtab[t0] |= ITOK | IMETA | INULL;
     for (s = ifs ? ifs : DEFAULT_IFS; *s; s++) {
-	if (inblank(*s)) {
-	    if (s[1] == *s)
+	int c = STOUC(*s == Meta ? *++s ^ 32 : *s);
+#ifdef MULTIBYTE_SUPPORT
+	if (!isascii(c)) {
+	    /* see comment for wordchars below */
+	    continue;
+	}
+#endif
+	if (inblank(c)) {
+	    if (s[1] == c)
 		s++;
 	    else
-		typtab[STOUC(*s)] |= IWSEP;
-	}
-	typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= ISEP;
-    }
-#ifdef MULTIBYTE_SUPPORT
-    if (wordchars) {
-	char *wordchars_unmeta;
-	const char *wordchars_ptr;
-	mbstate_t mbs;
-	size_t nchars;
-	int unmetalen;
-
-	wordchars_unmeta = dupstring(wordchars);
-	wordchars_ptr = unmetafy(wordchars_unmeta, &unmetalen);
-
-	memset(&mbs, 0, sizeof(mbs));
-	wordchars_wide = (wchar_t *)
-	    zrealloc(wordchars_wide, (unmetalen+1)*sizeof(wchar_t));
-	nchars = mbsrtowcs(wordchars_wide, &wordchars_ptr, unmetalen, &mbs);
-	if (nchars == MB_INVALID || nchars == MB_INCOMPLETE) {
-	    /* Conversion state is undefined: better just set to null */
-	    nchars = 0;
+		typtab[c] |= IWSEP;
 	}
-	wordchars_wide[nchars] = L'\0';
-    } else {
-	wordchars_wide = zrealloc(wordchars_wide, sizeof(wchar_t));
-	*wordchars_wide = L'\0';
+	typtab[c] |= ISEP;
     }
-#endif
     for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++) {
 	int c = STOUC(*s == Meta ? *++s ^ 32 : *s);
 #ifdef MULTIBYTE_SUPPORT
@@ -2686,6 +2768,10 @@
 #endif
 	typtab[c] |= IWORD;
     }
+#ifdef MULTIBYTE_SUPPORT
+    set_widearray(wordchars, &wordchars_wide);
+    set_widearray(ifs, &ifs_wide);
+#endif
     for (s = SPECCHARS; *s; s++)
 	typtab[STOUC(*s)] |= ISPECIAL;
     if (specialcomma)
@@ -2718,62 +2804,60 @@
 }
 
 /*
- * iword() macro extended to support wide characters.
+ * zistype macro extended to support wide characters.
+ * Works for IIDENT, IWORD, IALNUM, ISEP.
+ * We don't need this for IWSEP because that only applies to
+ * a fixed set of ASCII characters.
+ * Note that the function itself falls back to the plain zistype()
+ * test when the MULTIBYTE option is unset, so callers that always
+ * pass wide characters (such as ZLE) get that behaviour too.
  */
 
 /**/
 mod_export int
-wcsiword(wchar_t c)
+wcsitype(wchar_t c, int itype)
 {
     int len;
     VARARR(char, outstr, MB_CUR_MAX);
+
+    if (!isset(MULTIBYTE))
+	return zistype(c, itype);
+
     /*
      * Strategy:  the shell requires that the multibyte representation
      * be an extension of ASCII.  So see if converting the character
-     * produces an ASCII character.  If it does, use iword on that.
-     * If it doesn't, use iswalnum on the original character.  This
-     * is pretty good most of the time.
+     * produces an ASCII character.  If it does, use zistype on that.
+     * If it doesn't, use iswalnum on the original character.
+     * If that fails, resort to the appropriate wide character array.
      */
     len = wctomb(outstr, c);
 
     if (len == 0) {
 	/* NULL is special */
-	return iword(0);
+	return zistype(0, itype);
     } else if (len == 1 && iascii(*outstr)) {
-	return iword(*outstr);
+	return zistype(*outstr, itype);
     } else {
-	return iswalnum(c) || wcschr(wordchars_wide, c);
-    }
-}
-
-/*
- * iident() macro extended to support wide characters.
- *
- * The macro is intended to test if a character is allowed in an
- * internal zsh identifier.  We allow all alphanumerics outside
- * the ASCII range unless POSIXIDENTIFIERS is set.
- *
- * Otherwise similar to wcsiword.
- */
+	switch (itype) {
+	case IIDENT:
+	    /* non-ASCII alphanumerics are identifier characters
+	     * unless POSIXIDENTIFIERS restricts us to ASCII */
+	    return !isset(POSIXIDENTIFIERS) && iswalnum(c);
 
-/**/
-mod_export int
-wcsiident(wchar_t c)
-{
-    int len;
-    VARARR(char, outstr, MB_CUR_MAX);
+	case IWORD:
+	    if (iswalnum(c))
+		return 1;
+	    return !!wmemchr(wordchars_wide.chars, c, wordchars_wide.len);
 
-    len = wctomb(outstr, c);
+	case ISEP:
+	    return !!wmemchr(ifs_wide.chars, c, ifs_wide.len);
 
-    if (len == 0) {
-	/* NULL is special */
-	return 0;
-    } else if (len == 1 && iascii(*outstr)) {
-	return iident(*outstr);
-    } else {
-	return !isset(POSIXIDENTIFIERS) && iswalnum(c);
+	default:
+	    return iswalnum(c);
+	}
     }
 }
+
 /**/
 #endif
 
@@ -2789,7 +2873,7 @@
  * If "once" is set, just test the first character, i.e. (outptr !=
  * inptr) tests whether the first character is valid in an identifier.
  *
- * Currently this is only called with itype IIDENT or IUSER.
+ * Currently this is only called with itype IIDENT, IUSER or ISEP.
  */
 
 /**/
@@ -2819,12 +2903,25 @@
 		    break;
 	    } else {
 		/*
-		 * Valid non-ASCII character.  Allow all alphanumerics;
-		 * if testing for words, allow all wordchars.
+		 * Valid non-ASCII character.
 		 */
-		if (!(iswalnum(wc) ||
-		      (itype == IWORD && wcschr(wordchars_wide, wc))))
+		switch (itype) {
+		case IWORD:
+		    if (!iswalnum(wc) && 
+			!wmemchr(wordchars_wide.chars, wc,
+				 wordchars_wide.len))
+			return (char *)ptr;
 		    break;
+
+		case ISEP:
+		    if (!wmemchr(ifs_wide.chars, wc, ifs_wide.len))
+			return (char *)ptr;
+		    break;
+
+		default:
+		    if (!iswalnum(wc))
+			return (char *)ptr;
+		}
 	    }
 	    ptr += len;
 
@@ -3791,16 +3888,22 @@
     wchar_t wc;
 
     if (!isset(MULTIBYTE)) {
+	/* treat as single byte, possibly metafied */
 	if (wcp)
-	    *wcp = WEOF;
+	    *wcp = (wint_t)(*s == Meta ? s[1] ^ 32 : *s);
 	return 1 + (*s == Meta);
     }
 
     ret = MB_INVALID;
     for (ptr = s; *ptr; ) {
-	if (*ptr == Meta)
+	if (*ptr == Meta) {
 	    inchar = *++ptr ^ 32;
-	else
+#ifdef DEBUG
+	    if (!*ptr)
+		fprintf(stderr,
+			"BUG: unexpected end of string in mb_metacharlen()\n");
+#endif
+	} else
 	    inchar = *ptr;
 	ptr++;
 	ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate);
@@ -3874,6 +3977,23 @@
 }
 
 /**/
+#else
+
+/* Single-byte mb_metacharlenconv: store unmetafied byte in *c, return length */
+int
+metacharlenconv(char *x, int *c)
+{
+    if (*x == Meta) {
+	if (c)
+	    *c = STOUC(x[1] ^ 32);	/* assign (was ==) and unmetafy */
+	return 2;
+    }
+    if (c)
+	*c = STOUC(*x);
+    return 1;
+}
+
+/**/
 #endif /* MULTIBYTE_SUPPORT */
 
 /* check for special characters in the string */
Index: Src/zsh.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/zsh.h,v
retrieving revision 1.93
diff -u -r1.93 zsh.h
--- Src/zsh.h	10 Jul 2006 13:08:23 -0000	1.93
+++ Src/zsh.h	24 Jul 2006 21:41:33 -0000
@@ -1925,6 +1925,8 @@
 #ifdef MULTIBYTE_SUPPORT
 #define nicezputs(str, outs)	(void)mb_niceformat((str), (outs), NULL, 0)
 #define MB_METACHARINIT()	mb_metacharinit()
+typedef wint_t convchar_t;
+#define MB_METACHARLENCONV(str, cp)	mb_metacharlenconv((str), (cp))
 #define MB_METACHARLEN(str)	mb_metacharlenconv(str, NULL)
 #define MB_METASTRLEN(str)	mb_metastrlen(str)
 
@@ -1948,6 +1950,8 @@
 
 #else
 #define MB_METACHARINIT()
+typedef int convchar_t;
+#define MB_METACHARLENCONV(str, cp)	metacharlenconv((str), (cp))
 #define MB_METACHARLEN(str)	(*(str) == Meta ? 2 : 1)
 #define MB_METASTRLEN(str)	ztrlen(str)
 
Index: Src/ztype.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/ztype.h,v
retrieving revision 1.4
diff -u -r1.4 ztype.h
--- Src/ztype.h	10 Jul 2006 13:08:23 -0000	1.4
+++ Src/ztype.h	24 Jul 2006 21:41:33 -0000
@@ -59,6 +59,12 @@
 #define iwsep(X) zistype(X,IWSEP)
 #define inull(X) zistype(X,INULL)
 
+#ifdef MULTIBYTE_SUPPORT
+#define MB_ZISTYPE(X,Y) wcsitype((X),(Y))
+#else
+#define MB_ZISTYPE(X,Y)	zistype((X),(Y))
+#endif
+
 #define iascii(X) isascii(STOUC(X))
 #define ilower(X) islower(STOUC(X))
 #define iprint(X) isprint(STOUC(X))
Index: Src/Zle/zle.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle.h,v
retrieving revision 1.30
diff -u -r1.30 zle.h
--- Src/Zle/zle.h	9 Apr 2006 21:47:22 -0000	1.30
+++ Src/Zle/zle.h	24 Jul 2006 21:41:33 -0000
@@ -62,11 +62,11 @@
 #define ZC_iblank wcsiblank
 #define ZC_icntrl iswcntrl
 #define ZC_idigit iswdigit
-#define ZC_iident wcsiident
+#define ZC_iident(x) wcsitype((x), IIDENT)
 #define ZC_ilower iswlower
 #define ZC_inblank iswspace
 #define ZC_iupper iswupper
-#define ZC_iword wcsiword
+#define ZC_iword(x) wcsitype((x), IWORD)
 
 #define ZC_tolower towlower
 #define ZC_toupper towupper
Index: Src/Zle/zle_main.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_main.c,v
retrieving revision 1.88
diff -u -r1.88 zle_main.c
--- Src/Zle/zle_main.c	30 May 2006 22:35:04 -0000	1.88
+++ Src/Zle/zle_main.c	24 Jul 2006 21:41:34 -0000
@@ -1290,32 +1290,40 @@
 	    char **arr = getarrvalue(v), **aptr, **tmparr, **tptr;
 	    tptr = tmparr = (char **)zhalloc(sizeof(char *)*(arrlen(arr)+1));
 	    for (aptr = arr; *aptr; aptr++) {
-		int sepcount = 0;
+		int sepcount = 0, clen;
+		convchar_t c;
 		/*
 		 * See if this word contains a separator character
 		 * or backslash
 		 */
-		for (t = *aptr; *t; t++) {
-		    if (*t == Meta) {
-			if (isep(t[1] ^ 32))
-			    sepcount++;
+		MB_METACHARINIT();
+		for (t = *aptr; *t; ) {
+		    if (*t == '\\') {
 			t++;
-		    } else if (isep(*t) || *t == '\\')
 			sepcount++;
+		    } else {
+			t += MB_METACHARLENCONV(t, &c);
+			if (MB_ZISTYPE(c, ISEP))
+			    sepcount++;
+		    }
 		}
 		if (sepcount) {
 		    /* Yes, so allocate enough space to quote it. */
 		    char *newstr, *nptr;
 		    newstr = zhalloc(strlen(*aptr)+sepcount+1);
 		    /* Go through string quoting separators */
+		    MB_METACHARINIT();
 		    for (t = *aptr, nptr = newstr; *t; ) {
-			if (*t == Meta) {
-			    if (isep(t[1] ^ 32))
-				*nptr++ = '\\';
-			    *nptr++ = *t++;
-			} else if (isep(*t) || *t == '\\')
+			if (*t == '\\') {
 			    *nptr++ = '\\';
-			*nptr++ = *t++;
+			    *nptr++ = *t++;
+			} else {
+			    clen = MB_METACHARLENCONV(t, &c);
+			    if (MB_ZISTYPE(c, ISEP))
+				*nptr++ = '\\';
+			    while (clen--)
+				*nptr++ = *t++;
+			}
 		    }
 		    *nptr = '\0';
 		    /* Stick this into the array of words to join up */
Index: Test/D04parameter.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D04parameter.ztst,v
retrieving revision 1.17
diff -u -r1.17 D04parameter.ztst
--- Test/D04parameter.ztst	17 Feb 2006 18:29:57 -0000	1.17
+++ Test/D04parameter.ztst	24 Jul 2006 21:41:34 -0000
@@ -725,6 +725,29 @@
 >7
 >8
 
+# Tests a long-standing bug with joining on metafied characters in IFS
+  (array=(one two three)
+  IFS=$'\0'
+  foo="$array"
+  for (( i = 1; i <= ${#foo}; i++ )); do
+    char=${foo[i]}
+    print $(( #char ))
+  done)
+0:Joining with NULL character from IFS
+>111
+>110
+>101
+>0
+>116
+>119
+>111
+>0
+>116
+>104
+>114
+>101
+>101
+
   unset SHLVL
   (( SHLVL++ ))
   print $SHLVL
Index: Test/D07multibyte.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D07multibyte.ztst,v
retrieving revision 1.6
diff -u -r1.6 D07multibyte.ztst
--- Test/D07multibyte.ztst	11 Jul 2006 15:36:38 -0000	1.6
+++ Test/D07multibyte.ztst	24 Jul 2006 21:41:34 -0000
@@ -174,3 +174,57 @@
 1:POSIX_IDENTIFIERS option
 >3
 ?(eval):1: command not found: hähä=3
+
+  foo="Ølaf«Ødd«øpénëd«ån«àpple"
+  print -l ${(s.«.)foo}         
+  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
+  print -l ${=ioh}
+  print ${(w)#ioh}
+0:Splitting with multibyte characters
+>Ølaf
+>Ødd
+>øpénëd
+>ån
+>àpple
+>Ἐν
+>ἀρχῇ
+>ἦν
+>ὁ
+>λόγος,
+>καὶ
+>ὁ
+>λόγος
+>ἦν
+>πρὸς
+>τὸν
+>θεόν,
+>καὶ
+>θεὸς
+>ἦν
+>ὁ
+>λόγος.
+>17
+
+  read -d £ one
+  read -d £ two
+  print $one
+  print $two
+0:read with multibyte delimiter
+<first£second£
+>first
+>second
+
+  (IFS=«
+  read -d » -A array
+  print -l $array)
+0:read -A with multibyte IFS
+<dominus«illuminatio«mea»ignored
+>dominus
+>illuminatio
+>mea
+
+  read -k2 -u0 twochars
+  print $twochars
+0:read multibyte characters
+<«»ignored
+>«»

-- 
Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/



Messages sorted by: Reverse Date, Date, Thread, Author