Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: Displaying invalid characters



On Thu, 18 Mar 2010 15:44:24 +0000
Peter Stephenson <pws@xxxxxxx> wrote:
> All that's really on offer with the wrong character
> set is it doesn't hang...

Up to now the shell has given up processing a line when it encounters an
invalid character, i.e. a multibyte sequence that doesn't convert to a
wide character. Although it can't show the character correctly
(it simply doesn't know enough), it should be able to show the
character specially in a similar manner to unprintable characters
and continue looking at the line.

All we need to fix this efficiently is a range of 256 entries in wchar_t
that's guaranteed to be invalid.  I find from the Linux Unicode manual
entry that if __STDC_ISO_10646__ is defined, so that wchar_t is a
Unicode code point, the immediate range starting at 0xE000 is private to
the specific application, so we can use that.  So invalid characters
now appear highlighted as two hex digits in angle brackets and the rest
of the line is properly processed (unless you get seriously unlucky with
clashes between the two character sets, which is beyond our power to
fix---of course, in the reverse of the case Vincent had, the chunks of
UTF-8 characters are often valid ISO-8859-1, but that's relatively
benign).

It doesn't cover all systems, but as you'll see this was a really rather
easy change, so it's worth doing.  It's likely this doesn't work
properly in completion, which I haven't touched.

If there are other systems that don't define __STDC_ISO_10646__ but are
known to have similar ranges in wchar_t I'm happy to add the appropriate
definitions.

Index: Doc/Zsh/zle.yo
===================================================================
RCS file: /cvsroot/zsh/zsh/Doc/Zsh/zle.yo,v
retrieving revision 1.81
diff -p -u -r1.81 zle.yo
--- Doc/Zsh/zle.yo	5 Dec 2009 19:38:07 -0000	1.81
+++ Doc/Zsh/zle.yo	20 Mar 2010 00:21:06 -0000
@@ -2286,6 +2286,20 @@ angle brackets.  The number is the code 
 character set; this may or may not be Unicode, depending on the operating
 system.
 )
+item(Invalid multibyte characters)(
+If the tt(MULTIBYTE) option is in effect, any sequence of one or more
+bytes that does not form a valid character in the current character
+set is treated as a series of bytes each shown as a special character.
+This case can be distinguished from other unprintable characters
+as the bytes are represented as two hexadecimal digits between angle
+brackets, as distinct from the four or eight digits that are used for
+unprintable characters that are nonetheless valid in the current
+character set.
+
+Not all systems support this: for it to work, the system's representation of
+wide characters must be code values from the Universal Character Set,
+as defined by ISO 10646 (also known as Unicode).
+)
 enditem()
 
 If tt(zle_highlight) is not set or no value applies to a particular
Index: Src/Zle/zle.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle.h,v
retrieving revision 1.41
diff -p -u -r1.41 zle.h
--- Src/Zle/zle.h	24 Apr 2009 09:00:38 -0000	1.41
+++ Src/Zle/zle.h	20 Mar 2010 00:21:06 -0000
@@ -419,6 +419,20 @@ typedef struct {
 typedef REFRESH_ELEMENT *REFRESH_STRING;
 
 
+#if defined(MULTIBYTE_SUPPORT) && defined(__STDC_ISO_10646__)
+#define ZSH_INVALID_WCHAR_BASE	(0xe000U)
+#define ZSH_INVALID_WCHAR_TEST(x)			\
+    ((unsigned)(x) >= ZSH_INVALID_WCHAR_BASE &&		\
+     (unsigned)(x) <= (ZSH_INVALID_WCHAR_BASE + 255u))
+#define ZSH_INVALID_WCHAR_TO_CHAR(x)			\
+    ((char)((unsigned)(x) - ZSH_INVALID_WCHAR_BASE))
+#define ZSH_INVALID_WCHAR_TO_INT(x)			\
+    ((int)((unsigned)(x) - ZSH_INVALID_WCHAR_BASE))
+#define ZSH_CHAR_TO_INVALID_WCHAR(x)		\
+    ((wchar_t)(STOUC(x) + ZSH_INVALID_WCHAR_BASE))
+#endif
+
+
 #ifdef DEBUG
 #define METACHECK()		\
 	DPUTS(zlemetaline == NULL, "line not metafied")
Index: Src/Zle/zle_refresh.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_refresh.c,v
retrieving revision 1.77
diff -p -u -r1.77 zle_refresh.c
--- Src/Zle/zle_refresh.c	5 Dec 2009 19:38:07 -0000	1.77
+++ Src/Zle/zle_refresh.c	20 Mar 2010 00:21:07 -0000
@@ -1263,7 +1263,11 @@ zrefresh(void)
 	    }
 	}
 #ifdef MULTIBYTE_SUPPORT
-	else if (iswprint(*t) && (width = WCWIDTH(*t)) > 0) {
+	else if (
+#ifdef __STDC_ISO_10646__
+		 !ZSH_INVALID_WCHAR_TEST(*t) &&
+#endif
+		 iswprint(*t) && (width = WCWIDTH(*t)) > 0) {
 	    int ichars;
 	    if (width > rpms.sen - rpms.s) {
 		int started = 0;
@@ -1367,6 +1371,12 @@ zrefresh(void)
 	    wchar_t wc;
 	    int started = 0;
 
+#ifdef __STDC_ISO_10646__
+	    if (ZSH_INVALID_WCHAR_TEST(*t)) {
+		int c = ZSH_INVALID_WCHAR_TO_INT(*t);
+		sprintf(dispchars, "<%.02x>", c);
+	    } else
+#endif
 	    if ((unsigned)*t > 0xffffU) {
 		sprintf(dispchars, "<%.08x>", (unsigned)*t);
 	    } else {
Index: Src/Zle/zle_utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_utils.c,v
retrieving revision 1.55
diff -p -u -r1.55 zle_utils.c
--- Src/Zle/zle_utils.c	3 Jan 2009 18:12:15 -0000	1.55
+++ Src/Zle/zle_utils.c	20 Mar 2010 00:21:07 -0000
@@ -120,11 +120,19 @@ zlecharasstring(ZLE_CHAR_T inchar, char 
     size_t ret;
     char *ptr;
 
-    ret = wctomb(buf, inchar);
-    if (ret <= 0) {
-	/* Ick. */
-	buf[0] = '?';
-	return 1;
+#ifdef __STDC_ISO_10646__
+    if (ZSH_INVALID_WCHAR_TEST(inchar)) {
+	buf[0] = ZSH_INVALID_WCHAR_TO_CHAR(inchar);
+	ret = 1;
+    } else
+#endif
+    {
+	ret = wctomb(buf, inchar);
+	if (ret <= 0) {
+	    /* Ick. */
+	    buf[0] = '?';
+	    return 1;
+	}
     }
     ptr = buf + ret - 1;
     for (;;) {
@@ -196,13 +204,20 @@ zlelineasstring(ZLE_STRING_T instr, int 
     for (i=0; i < inll; i++, incs--) {
 	if (incs == 0)
 	    outcs = mb_len;
-	j = wcrtomb(s + mb_len, instr[i], &mbs);
-	if (j == -1) {
-	    /* invalid char; what to do? */
-	    s[mb_len++] = ZWC('?');
-	    memset(&mbs, 0, sizeof(mbs));
-	} else {
-	    mb_len += j;
+#ifdef __STDC_ISO_10646__
+	if (ZSH_INVALID_WCHAR_TEST(instr[i])) {
+	    s[mb_len++] = ZSH_INVALID_WCHAR_TO_CHAR(instr[i]);
+	} else
+#endif
+	{
+	    j = wcrtomb(s + mb_len, instr[i], &mbs);
+	    if (j == -1) {
+		/* invalid char */
+		s[mb_len++] = ZWC('?');
+		memset(&mbs, 0, sizeof(mbs));
+	    } else {
+		mb_len += j;
+	    }
 	}
     }
     if (incs == 0)
@@ -332,6 +347,13 @@ stringaszleline(char *instr, int incs, i
 	while (ll > 0) {
 	    size_t cnt = mbrtowc(outptr, inptr, ll, &mbs);
 
+#ifdef __STDC_ISO_10646__
+	    if (cnt == MB_INCOMPLETE || cnt == MB_INVALID) {
+		/* Use private encoding for invalid single byte */
+		*outptr = ZSH_CHAR_TO_INVALID_WCHAR(*inptr);
+		cnt = 1;
+	    }
+#else
 	    /*
 	     * At this point we don't handle either incomplete (-2) or
 	     * invalid (-1) multibyte sequences.  Use the current length
@@ -339,6 +361,7 @@ stringaszleline(char *instr, int incs, i
 	     */
 	    if (cnt == MB_INCOMPLETE || cnt == MB_INVALID)
 		break;
+#endif
 
 	    if (cnt == 0) {
 		/* Converting '\0' returns 0, but a '\0' is a real


-- 
Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/



Messages sorted by: Reverse Date, Date, Thread, Author