Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: multibyte odds and ends



A few minor things:

- fix bslashquote().  The only issue was a test for printability when
  using $'...'.  (I don't know where I got the idea that form of
  quoting was something to do with POSIX, it isn't.)
- remove aliases iascii, iupper, iprint, ilower which simply invoked
  standard ctype macros and instead use ctype directly for clarity
- fix return value from getzlequery(); it was only ever used in yes/no
  mode so a status return makes more sense than a character.

The only other outstanding syntactic issue I noticed this time through
was with bangchar, hatchar and hashchar which can be redefined by
setting HISTCHARS.  These are always compared to single bytes, and this
happens at quite a low level of input where we *really* don't want to
start handling multibyte characters since it effectively means a rewrite
of the lexical analyser.  They are in any case quite nasty hacks which
it wouldn't be much fun to handle outside the portable character set.
My suggestion is that their use be limited in some way:

- Make sure they are ASCII characters?
- Allow non-ASCII characters but make sure they are complete (this will
only work with single-byte extensions to ASCII e.g. ISO-8859-1)?
- Keep the current code and simply document that a single byte will be
compared, with possibly unexpected effects (possibly issue a warning)?

Does anybody have any preferences?  Does anybody ever redefine HISTCHARS?

Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.129
diff -u -r1.129 utils.c
--- Src/utils.c	25 Jul 2006 09:25:27 -0000	1.129
+++ Src/utils.c	1 Aug 2006 20:28:47 -0000
@@ -2835,7 +2835,7 @@
     if (len == 0) {
 	/* NULL is special */
 	return zistype(0, itype);
-    } else if (len == 1 && iascii(*outstr)) {
+    } else if (len == 1 && isascii(*outstr)) {
 	return zistype(*outstr, itype);
     } else {
 	switch (itype) {
@@ -2897,7 +2897,7 @@
 		/* in this case non-ASCII characters can't match */
 		if (chr > 127 || !zistype(chr,itype))
 		    break;
-	    } else if (len == 1 && iascii(*ptr)) {
+	    } else if (len == 1 && isascii(*ptr)) {
 		/* ASCII: can't be metafied, use standard test */
 		if (!zistype(*ptr,itype))
 		    break;
@@ -4017,7 +4017,7 @@
  * The last argument should be zero if this is to be used outside a string, *
  * one if it is to be quoted for the inside of a single quoted string,      *
  * two if it is for the inside of a double quoted string, and               *
- * three if it is for the inside of a posix quoted string.                  *
+ * three if it is for the inside of a $'...' quoted string.                 *
  * The string may be metafied and contain tokens.                           */
 
 /**/
@@ -4031,127 +4031,153 @@
 
     tt = v = buf;
     u = s;
-    for (; *u; u++) {
-	if (e && *e == u)
-	    *e = v, sf = 1;
-	if (instring == 3) {
-	  int c = *u;
-	  if (c == Meta) {
-	    c = *++u ^ 32;
-	  }
-	  c &= 0xff;
-	  if(isprint(c)) {
-	    switch (c) {
-	    case '\\':
-	    case '\'':
-	      *v++ = '\\';
-	      *v++ = c;
-	      break;
+    if (instring == 3) {
+	/*
+	 * As we test for printability here we need to be able
+	 * to look for multibyte characters.
+	 */
+	convchar_t cc;
+	MB_METACHARINIT();
+	while (*u) {
+	    const char *uend = u + MB_METACHARLENCONV(u, &cc);
+
+	    if (e && !sf && *e <= u) {
+		*e = v;
+		sf = 1;
+	    }
+	    if (
+#ifdef MULTIBYTE_SUPPORT
+		cc != WEOF && 
+#endif
+		MB_ISPRINT(cc)) {
+		switch (cc) {
+		case ZWC('\\'):
+		case ZWC('\''):
+		    *v++ = '\\';
+		    break;
 
-	    default:
-	      if(imeta(c)) {
-		*v++ = Meta;
-		*v++ = c ^ 32;
-	      }
-	      else {
-		if (isset(BANGHIST) && c == bangchar) {
-		  *v++ = '\\';
+		default:
+		    if (isset(BANGHIST) && cc == (wchar_t)bangchar)
+			*v++ = '\\';
+		    break;
 		}
-		*v++ = c;
-	      }
-	      break;
-	    }
-	  }
-	  else {
-	    switch (c) {
-	    case '\0':
-	      *v++ = '\\';
-	      *v++ = '0';
-	      if ('0' <= u[1] && u[1] <= '7') {
-		*v++ = '0';
-		*v++ = '0';
-	      }
-	      break;
-
-	    case '\007': *v++ = '\\'; *v++ = 'a'; break;
-	    case '\b': *v++ = '\\'; *v++ = 'b'; break;
-	    case '\f': *v++ = '\\'; *v++ = 'f'; break;
-	    case '\n': *v++ = '\\'; *v++ = 'n'; break;
-	    case '\r': *v++ = '\\'; *v++ = 'r'; break;
-	    case '\t': *v++ = '\\'; *v++ = 't'; break;
-	    case '\v': *v++ = '\\'; *v++ = 'v'; break;
+		while (u < uend)
+		    *v++ = *u++;
+	    } else {
+		/* Not printable */
+		for (; u < uend; u++) {
+		    /*
+		     * Just do this byte by byte; there's no great
+		     * advantage in being clever with multibyte
+		     * characters if we don't think they're printable.
+		     */
+		    int c;
+		    if (*u == Meta)
+			c = STOUC(*++u ^ 32);
+		    else
+			c = STOUC(*u);
+		    switch (c) {
+		    case '\0':
+			*v++ = '\\';
+			*v++ = '0';
+			if ('0' <= u[1] && u[1] <= '7') {
+			    *v++ = '0';
+			    *v++ = '0';
+			}
+			break;
 
-	    default:
-	      *v++ = '\\';
-	      *v++ = '0' + ((c >> 6) & 7);
-	      *v++ = '0' + ((c >> 3) & 7);
-	      *v++ = '0' + (c & 7);
-	      break;
+		    case '\007': *v++ = '\\'; *v++ = 'a'; break;
+		    case '\b': *v++ = '\\'; *v++ = 'b'; break;
+		    case '\f': *v++ = '\\'; *v++ = 'f'; break;
+		    case '\n': *v++ = '\\'; *v++ = 'n'; break;
+		    case '\r': *v++ = '\\'; *v++ = 'r'; break;
+		    case '\t': *v++ = '\\'; *v++ = 't'; break;
+		    case '\v': *v++ = '\\'; *v++ = 'v'; break;
+
+		    default:
+			*v++ = '\\';
+			*v++ = '0' + ((c >> 6) & 7);
+			*v++ = '0' + ((c >> 3) & 7);
+			*v++ = '0' + (c & 7);
+			break;
+		    }
+		}
 	    }
-	  }
-	  continue;
 	}
-	else if (*u == Tick || *u == Qtick) {
-	    char c = *u++;
+    }
+    else
+    {
+	/*
+	 * Here the only special characters are syntactic, so
+	 * we can go through bytewise.
+	 */
+	for (; *u; u++) {
+	    if (e && *e == u)
+		*e = v, sf = 1;
+	    if (*u == Tick || *u == Qtick) {
+		char c = *u++;
+
+		*v++ = c;
+		while (*u && *u != c)
+		    *v++ = *u++;
+		*v++ = c;
+		if (!*u)
+		    u--;
+		continue;
+	    }
+	    else if ((*u == String || *u == Qstring) &&
+		     (u[1] == Inpar || u[1] == Inbrack || u[1] == Inbrace)) {
+		char c = (u[1] == Inpar ? Outpar : (u[1] == Inbrace ?
+						    Outbrace : Outbrack));
+		char beg = *u;
+		int level = 0;
 
-	    *v++ = c;
-	    while (*u && *u != c)
 		*v++ = *u++;
-	    *v++ = c;
-	    if (!*u)
-		u--;
-	    continue;
-	}
-	else if ((*u == String || *u == Qstring) &&
-		 (u[1] == Inpar || u[1] == Inbrack || u[1] == Inbrace)) {
-	    char c = (u[1] == Inpar ? Outpar : (u[1] == Inbrace ?
-						Outbrace : Outbrack));
-	    char beg = *u;
-	    int level = 0;
-
-	    *v++ = *u++;
-	    *v++ = *u++;
-	    while (*u && (*u != c || level)) {
-		if (*u == beg)
-		    level++;
-		else if (*u == c)
-		    level--;
 		*v++ = *u++;
-	    }
-	    if (*u)
-		*v++ = *u;
-	    else
-		u--;
-	    continue;
-	}
-	else if (ispecial(*u) &&
-		 ((*u != '=' && *u != '~') ||
-		  u == s ||
-		  (isset(MAGICEQUALSUBST) && (u[-1] == '=' || u[-1] == ':')) ||
-		  (*u == '~' && isset(EXTENDEDGLOB))) &&
-	    (!instring ||
-	     (isset(BANGHIST) && *u == (char)bangchar && instring != 1) ||
-	     (instring == 2 &&
-	      (*u == '$' || *u == '`' || *u == '\"' || *u == '\\')) ||
-	     (instring == 1 && *u == '\''))) {
-	    if (*u == '\n' || (instring == 1 && *u == '\'')) {
-		if (unset(RCQUOTES)) {
-		    *v++ = '\'';
-		    if (*u == '\'')
-			*v++ = '\\';
+		while (*u && (*u != c || level)) {
+		    if (*u == beg)
+			level++;
+		    else if (*u == c)
+			level--;
+		    *v++ = *u++;
+		}
+		if (*u)
 		    *v++ = *u;
-		    *v++ = '\'';
-		} else if (*u == '\n')
-		    *v++ = '"', *v++ = '\n', *v++ = '"';
 		else
-		    *v++ = '\'', *v++ = '\'';
+		    u--;
 		continue;
-	    } else
-		*v++ = '\\';
+	    }
+	    else if (ispecial(*u) &&
+		     ((*u != '=' && *u != '~') ||
+		      u == s ||
+		      (isset(MAGICEQUALSUBST) &&
+		       (u[-1] == '=' || u[-1] == ':')) ||
+		      (*u == '~' && isset(EXTENDEDGLOB))) &&
+		     (!instring ||
+		      (isset(BANGHIST) && *u == (char)bangchar &&
+		       instring != 1) ||
+		      (instring == 2 &&
+		       (*u == '$' || *u == '`' || *u == '\"' || *u == '\\')) ||
+		      (instring == 1 && *u == '\''))) {
+		if (*u == '\n' || (instring == 1 && *u == '\'')) {
+		    if (unset(RCQUOTES)) {
+			*v++ = '\'';
+			if (*u == '\'')
+			    *v++ = '\\';
+			*v++ = *u;
+			*v++ = '\'';
+		    } else if (*u == '\n')
+			*v++ = '"', *v++ = '\n', *v++ = '"';
+		    else
+			*v++ = '\'', *v++ = '\'';
+		    continue;
+		} else
+		    *v++ = '\\';
+	    }
+	    if(*u == Meta)
+		*v++ = *u++;
+	    *v++ = *u;
 	}
-	if(*u == Meta)
-	    *v++ = *u++;
-	*v++ = *u;
     }
     *v = '\0';
 
Index: Src/ztype.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/ztype.h,v
retrieving revision 1.5
diff -u -r1.5 ztype.h
--- Src/ztype.h	24 Jul 2006 22:00:21 -0000	1.5
+++ Src/ztype.h	1 Aug 2006 20:28:47 -0000
@@ -61,11 +61,8 @@
 
 #ifdef MULTIBYTE_SUPPORT
 #define MB_ZISTYPE(X,Y) wcsitype((X),(Y))
+#define MB_ISPRINT(X)	iswprint(X)
 #else
 #define MB_ZISTYPE(X,Y)	zistype((X),(Y))
+#define MB_ISPRINT(X)	isprint(X)
 #endif
-
-#define iascii(X) isascii(STOUC(X))
-#define ilower(X) islower(STOUC(X))
-#define iprint(X) isprint(STOUC(X))
-#define iupper(X) isupper(STOUC(X))
Index: Src/Zle/compresult.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/compresult.c,v
retrieving revision 1.62
diff -u -r1.62 compresult.c
--- Src/Zle/compresult.c	7 Mar 2006 21:31:43 -0000	1.62
+++ Src/Zle/compresult.c	1 Aug 2006 20:28:48 -0000
@@ -1861,7 +1861,7 @@
 		     listdat.nlines));
 	qup = ((l + columns - 1) / columns) - 1;
 	fflush(shout);
-	if (getzlequery(1) != 'y') {
+	if (!getzlequery()) {
 	    if (clearflag) {
 		putc('\r', shout);
 		tcmultout(TCUP, TCMULTUP, qup);
Index: Src/Zle/zle.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle.h,v
retrieving revision 1.31
diff -u -r1.31 zle.h
--- Src/Zle/zle.h	24 Jul 2006 22:00:21 -0000	1.31
+++ Src/Zle/zle.h	1 Aug 2006 20:28:48 -0000
@@ -125,9 +125,9 @@
 #define ZC_icntrl icntrl
 #define ZC_idigit idigit
 #define ZC_iident iident
-#define ZC_ilower ilower
+#define ZC_ilower islower
 #define ZC_inblank inblank
-#define ZC_iupper iupper
+#define ZC_iupper isupper
 #define ZC_iword iword
 
 #define ZC_tolower tulower
Index: Src/Zle/zle_tricky.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_tricky.c,v
retrieving revision 1.68
diff -u -r1.68 zle_tricky.c
--- Src/Zle/zle_tricky.c	10 Jul 2006 13:08:24 -0000	1.68
+++ Src/Zle/zle_tricky.c	1 Aug 2006 20:28:50 -0000
@@ -2298,7 +2298,7 @@
 	     fprintf(shout, "zsh: do you wish to see all %d lines? ", nlines));
 	qup = ((l + columns - 1) / columns) - 1;
 	fflush(shout);
-	if (getzlequery(1) != 'y') {
+	if (!getzlequery()) {
 	    if (clearflag) {
 		putc('\r', shout);
 		tcmultout(TCUP, TCMULTUP, qup);
Index: Src/Zle/zle_utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_utils.c,v
retrieving revision 1.37
diff -u -r1.37 zle_utils.c
--- Src/Zle/zle_utils.c	13 Jan 2006 17:13:51 -0000	1.37
+++ Src/Zle/zle_utils.c	1 Aug 2006 20:28:50 -0000
@@ -653,50 +653,42 @@
 }
 
 /*
- * Query the user, and return a single character response.  The question
- * is assumed to have been printed already, and the cursor is left
- * immediately after the response echoed.  (Might cause a problem if
- * this takes it onto the next line.)  If yesno is non-zero: <Tab> is
- * interpreted as 'y'; any other control character is interpreted as
- * 'n'.  If there are any characters in the buffer, this is taken as a
- * negative response, and no characters are read.  Case is folded.
- *
- * TBD: this may need extending to return a wchar_t or possibly
- * a wint_t.
+ * Query the user, and return 1 for yes, 0 for no.  The question is assumed to
+ * have been printed already, and the cursor is left immediately after the
+ * response echoed.  (Might cause a problem if this takes it onto the next
+ * line.)  <Tab> is interpreted as 'y'; any other control character is
+ * interpreted as 'n'.  If there are any characters in the buffer, this is
+ * taken as a negative response, and no characters are read.  Case is folded.
  */
 
 /**/
 mod_export int
-getzlequery(int yesno)
+getzlequery(void)
 {
     ZLE_INT_T c;
 #ifdef FIONREAD
     int val;
 
-    if (yesno) {
-	/* check for typeahead, which is treated as a negative response */
-	ioctl(SHTTY, FIONREAD, (char *)&val);
-	if (val) {
-	    putc('n', shout);
-	    return 'n';
-	}
+    /* check for typeahead, which is treated as a negative response */
+    ioctl(SHTTY, FIONREAD, (char *)&val);
+    if (val) {
+	putc('n', shout);
+	return 0;
     }
 #endif
 
     /* get a character from the tty and interpret it */
     c = getfullchar(0);
-    if (yesno) {
-	if (c == ZWC('\t'))
-	    c = ZWC('y');
-	else if (ZC_icntrl(c) || c == ZLEEOF)
-	    c = ZWC('n');
-	else
-	    c = ZC_tolower(c);
-    }
+    if (c == ZWC('\t'))
+	c = ZWC('y');
+    else if (ZC_icntrl(c) || c == ZLEEOF)
+	c = ZWC('n');
+    else
+	c = ZC_tolower(c);
     /* echo response and return */
     if (c != ZWC('\n'))
 	zwcputc(c);
-    return c;
+    return c == ZWC('y');
 }
 
 /* Format a string, keybinding style. */

-- 
Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/



Messages sorted by: Reverse Date, Date, Thread, Author