Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: multibyte delimiters for substitutions and parameter flags



This is supposed to fix multibyte delimiters for substitutions in
modifiers when used in globbing or parameters (not yet history,
which is separate code), and for delimiters in the arguments of
parameter flags and similar.

Index: Src/glob.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/glob.c,v
retrieving revision 1.54
diff -u -r1.54 glob.c
--- Src/glob.c	1 Nov 2006 12:25:22 -0000	1.54
+++ Src/glob.c	2 Nov 2006 18:33:58 -0000
@@ -1243,9 +1243,10 @@
 		    else {
 			/* ... or a user name */
 			char sav, *tt;
+			int arglen;
 
 			/* Find matching delimiters */
-			tt = get_strarg(s);
+			tt = get_strarg(s, &arglen);
 			if (!*tt) {
 			    zerr("missing end of name");
 			    data = 0;
@@ -1255,7 +1256,7 @@
 			    sav = *tt;
 			    *tt = '\0';
 
-			    if ((pw = getpwnam(s + 1)))
+			    if ((pw = getpwnam(s + arglen)))
 				data = pw->pw_uid;
 			    else {
 				zerr("unknown user");
@@ -1268,7 +1269,7 @@
 			    data = 0;
 #endif /* !USE_GETPWNAM */
 			    if (sav)
-				s = tt + 1;
+				s = tt + arglen;
 			    else
 				s = tt;
 			}
@@ -1283,8 +1284,9 @@
 		    else {
 			/* ...or a delimited group name. */
 			char sav, *tt;
+			int arglen;
 
-			tt = get_strarg(s);
+			tt = get_strarg(s, &arglen);
 			if (!*tt) {
 			    zerr("missing end of name");
 			    data = 0;
@@ -1294,7 +1296,7 @@
 			    sav = *tt;
 			    *tt = '\0';
 
-			    if ((gr = getgrnam(s + 1)))
+			    if ((gr = getgrnam(s + arglen)))
 				data = gr->gr_gid;
 			    else {
 				zerr("unknown group");
@@ -1307,7 +1309,7 @@
 			    data = 0;
 #endif /* !USE_GETGRNAM */
 			    if (sav)
-				s = tt + 1;
+				s = tt + arglen;
 			    else
 				s = tt;
 			}
@@ -1438,8 +1440,7 @@
 			    tt = NULL;
 			}
 		    } else {
-			plus = 1;
-			tt = get_strarg(s);
+			tt = get_strarg(s, &plus);
 			if (!*tt)
 			{
 			    zerr("missing end of string");
Index: Src/params.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/params.c,v
retrieving revision 1.120
diff -u -r1.120 params.c
--- Src/params.c	11 Sep 2006 11:09:15 -0000	1.120
+++ Src/params.c	2 Nov 2006 18:34:00 -0000
@@ -947,7 +947,7 @@
        int *prevcharlen, int *nextcharlen)
 {
     int hasbeg = 0, word = 0, rev = 0, ind = 0, down = 0, l, i, ishash;
-    int keymatch = 0, needtok = 0;
+    int keymatch = 0, needtok = 0, arglen;
     char *s = *str, *sep = NULL, *t, sav, *d, **ta, **p, *tt, c;
     zlong num = 1, beg = 0, r = 0;
     Patprog pprog = NULL;
@@ -1004,28 +1004,28 @@
 		 * special interpretation by getindex() of `*' or `@'. */
 		break;
 	    case 'n':
-		t = get_strarg(++s);
+		t = get_strarg(++s, &arglen);
 		if (!*t)
 		    goto flagerr;
 		sav = *t;
 		*t = '\0';
-		num = mathevalarg(s + 1, &d);
+		num = mathevalarg(s + arglen, &d);
 		if (!num)
 		    num = 1;
 		*t = sav;
-		s = t;
+		s = t + arglen - 1;
 		break;
 	    case 'b':
 		hasbeg = 1;
-		t = get_strarg(++s);
+		t = get_strarg(++s, &arglen);
 		if (!*t)
 		    goto flagerr;
 		sav = *t;
 		*t = '\0';
-		if ((beg = mathevalarg(s + 1, &d)) > 0)
+		if ((beg = mathevalarg(s + arglen, &d)) > 0)
 		    beg--;
 		*t = sav;
-		s = t;
+		s = t + arglen - 1;
 		break;
 	    case 'p':
 		escapes = 1;
@@ -1033,15 +1033,16 @@
 	    case 's':
 		/* This gives the string that separates words *
 		 * (for use with the `w' flag).               */
-		t = get_strarg(++s);
+		t = get_strarg(++s, &arglen);
 		if (!*t)
 		    goto flagerr;
 		sav = *t;
 		*t = '\0';
-		sep = escapes ? getkeystring(s + 1, &waste, GETKEYS_SEP, NULL)
-		    : dupstring(s + 1);
+		s += arglen;
+		sep = escapes ? getkeystring(s, &waste, GETKEYS_SEP, NULL)
+		    : dupstring(s);
 		*t = sav;
-		s = t;
+		s = t + arglen - 1;
 		break;
 	    default:
 	      flagerr:
Index: Src/subst.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/subst.c,v
retrieving revision 1.65
diff -u -r1.65 subst.c
--- Src/subst.c	1 Nov 2006 12:25:22 -0000	1.65
+++ Src/subst.c	2 Nov 2006 18:34:01 -0000
@@ -1137,62 +1137,113 @@
     return ret;
 }
 
+
+/*
+ * Look for a delimited portion of a string.  The first (possibly
+ * multibyte) character at s is the delimiter.  Various forms
+ * of brackets are treated separately, as documented.
+ *
+ * Returns a pointer to the final delimiter.  Sets *lenp to the
+ * length of the final delimiter; a NUL causes *lenp to be set
+ * to zero since we shouldn't advance past it.  (The string is
+ * tokenized, so a NUL is a real end of string.)
+ */
+
 /**/
 char *
-get_strarg(char *s)
+get_strarg(char *s, int *lenp)
 {
-    char t = *s++;
-
-    if (!t)
-	return s - 1;
+    convchar_t del;
+    int len;
+    char tok = 0;
+
+    MB_METACHARINIT();
+    len = MB_METACHARLENCONV(s, &del);
+    if (!len) {
+	*lenp = 0;
+	return s;
+    }
 
-    switch (t) {
-    case '(':
-	t = ')';
+#ifdef MULTIBYTE_SUPPORT
+    if (del == WEOF)
+	del = (wint_t)((*s == Meta) ? s[1] ^ 32 : *s);
+#endif
+    s += len;
+    switch (del) {
+    case ZWC('('):
+	del = ZWC(')');
 	break;
     case '[':
-	t = ']';
+	del = ZWC(']');
 	break;
     case '{':
-	t = '}';
+	del = ZWC('}');
 	break;
     case '<':
-	t = '>';
+	del = ZWC('>');
 	break;
     case Inpar:
-	t = Outpar;
+	tok = Outpar;
 	break;
     case Inang:
-	t = Outang;
+	tok = Outang;
 	break;
     case Inbrace:
-	t = Outbrace;
+	tok = Outbrace;
 	break;
     case Inbrack:
-	t = Outbrack;
+	tok = Outbrack;
 	break;
     }
 
-    while (*s && *s != t)
-	s++;
+    if (tok) {
+	/*
+	 * Looking for a matching token; we want the literal byte,
+	 * not a decoded multibyte character, so search specially.
+	 */
+	while (*s && *s != tok)
+	    s++;
+    } else {
+	convchar_t del2;
+	len = 0;
+	while (*s) {
+	    len = MB_METACHARLENCONV(s, &del2);
+#ifdef MULTIBYTE_SUPPORT
+	    if (del2 == WEOF)
+		del2 = (wint_t)((*s == Meta) ? s[1] ^ 32 : *s);
+#endif
+	    if (del == del2)
+		break;
+	    s += len;
+	}
+    }
 
+    *lenp = len;
     return s;
 }
 
+/*
+ * Get an integer argument; update *s to the end of the
+ * final delimiter.  *delmatchp is set to 1 if we have matching
+ * delimiters and there was no error in the evaluation, else 0.
+ */
+
 /**/
 static int
-get_intarg(char **s)
+get_intarg(char **s, int *delmatchp)
 {
-    char *t = get_strarg(*s + 1);
+    int arglen;
+    char *t = get_strarg(*s, &arglen);
     char *p, sav;
     zlong ret;
 
+    *delmatchp = 0;
     if (!*t)
 	return -1;
     sav = *t;
     *t = '\0';
-    p = dupstring(*s + 2);
-    *s = t;
+    p = dupstring(*s + arglen);
+    *s = t + arglen;
     *t = sav;
     if (parsestr(p))
 	return -1;
@@ -1204,6 +1255,7 @@
 	return -1;
     if (ret < 0)
 	ret = -ret;
+    *delmatchp = 1;
     return ret < 0 ? -ret : ret;
 }
 
@@ -1540,8 +1592,8 @@
 	    int escapes = 0;
 	    int klen;
 #define UNTOK(C)  (itok(C) ? ztokens[(C) - Pound] : (C))
-#define UNTOK_AND_ESCAPE(X) {\
-		untokenize(X = dupstring(s + 1));\
+#define UNTOK_AND_ESCAPE(X, S) {\
+		untokenize(X = dupstring(S));\
 		if (escapes) {\
 		    X = getkeystring(X, &klen, GETKEYS_SEP, NULL);\
 		    X = metafy(X, klen, META_HREALLOC);\
@@ -1549,6 +1601,9 @@
 	    }
 
 	    for (s++; (c = *s) != ')' && c != Outpar; s++, tt = 0) {
+		int arglen;	/* length of modifier argument */
+		int delmatch;	/* integer delimiters matched OK */
+
 		switch (c) {
 		case ')':
 		case Outpar:
@@ -1578,9 +1633,11 @@
 		    flags |= SUB_SUBSTR;
 		    break;
 		case 'I':
-		    flnum = get_intarg(&s);
+		    s++;
+		    flnum = get_intarg(&s, &delmatch);
 		    if (flnum < 0)
 			goto flagerr;
+		    s--;
 		    break;
 
 		case 'L':
@@ -1658,16 +1715,16 @@
 		    tt = 1;
 		/* fall through */
 		case 'j':
-		    t = get_strarg(++s);
+		    t = get_strarg(++s, &arglen);
 		    if (*t) {
 			sav = *t;
 			*t = '\0';
 			if (tt)
-			    UNTOK_AND_ESCAPE(spsep)
+			    UNTOK_AND_ESCAPE(spsep, s + arglen)
 			else
-			    UNTOK_AND_ESCAPE(sep)
+			    UNTOK_AND_ESCAPE(sep, s + arglen)
 			*t = sav;
-			s = t;
+			s = t + arglen - 1;
 		    } else
 			goto flagerr;
 		    break;
@@ -1676,43 +1733,43 @@
 		    tt = 1;
 		/* fall through */
 		case 'r':
-		    sav = s[1];
-		    num = get_intarg(&s);
+		    s++;
+		    num = get_intarg(&s, &delmatch);
 		    if (num < 0)
 			goto flagerr;
 		    if (tt)
 			prenum = num;
 		    else
 			postnum = num;
-		    if (UNTOK(s[1]) != UNTOK(sav))
+		    if (!delmatch)
 			break;
-		    t = get_strarg(++s);
+		    t = get_strarg(s, &arglen);
 		    if (!*t)
 			goto flagerr;
 		    sav = *t;
 		    *t = '\0';
 		    if (tt)
-			UNTOK_AND_ESCAPE(premul)
+			UNTOK_AND_ESCAPE(premul, s + arglen)
 		    else
-			UNTOK_AND_ESCAPE(postmul)
+			UNTOK_AND_ESCAPE(postmul, s + arglen)
 		    *t = sav;
 		    sav = *s;
-		    s = t + 1;
+		    s = t + arglen;
 		    if (UNTOK(*s) != UNTOK(sav)) {
 			s--;
 			break;
 		    }
-		    t = get_strarg(s);
+		    t = get_strarg(s, &arglen);
 		    if (!*t)
 			goto flagerr;
 		    sav = *t;
 		    *t = '\0';
 		    if (tt)
-			UNTOK_AND_ESCAPE(preone)
+			UNTOK_AND_ESCAPE(preone, s + arglen)
 		    else
-			UNTOK_AND_ESCAPE(postone)
+			UNTOK_AND_ESCAPE(postone, s + arglen)
 		    *t = sav;
-		    s = t;
+		    s = t + arglen - 1;
 		    break;
 
 		case 'm':
@@ -3251,9 +3308,10 @@
 void
 modify(char **str, char **ptr)
 {
-    char *ptr1, *ptr2, *ptr3, del, *lptr, c, *test, *sep, *t, *tt, tc, *e;
-    char *copy, *all, *tmp, sav;
-    int gbal, wall, rec, al, nl;
+    char *ptr1, *ptr2, *ptr3, *lptr, c, *test, *sep, *t, *tt, tc, *e;
+    char *copy, *all, *tmp, sav, sav1, *ptr1end;
+    int gbal, wall, rec, al, nl, charlen, delmatch;
+    convchar_t del;
 
     test = NULL;
 
@@ -3282,20 +3340,48 @@
 		break;
 
 	    case 's':
-		/* TODO: multibyte delimiter */
 		c = **ptr;
 		(*ptr)++;
 		ptr1 = *ptr;
-		del = *ptr1++;
-		for (ptr2 = ptr1; *ptr2 != del && *ptr2; ptr2++);
+		MB_METACHARINIT();
+		charlen = MB_METACHARLENCONV(ptr1, &del);
+#ifdef MULTIBYTE_SUPPORT
+		if (del == WEOF)
+		    del = (wint_t)((*ptr1 == Meta) ? ptr1[1] ^ 32 : *ptr1);
+#endif
+		ptr1 += charlen;
+		for (ptr2 = ptr1, charlen = 0; *ptr2; ptr2 += charlen) {
+		    convchar_t del2;
+		    charlen = MB_METACHARLENCONV(ptr2, &del2);
+#ifdef MULTIBYTE_SUPPORT
+		    if (del2 == WEOF)
+			del2 = (wint_t)((*ptr2 == Meta) ?
+					ptr2[1] ^ 32 : *ptr2);
+#endif
+		    if (del2 == del)
+			break;
+		}
 		if (!*ptr2) {
 		    zerr("bad substitution");
 		    return;
 		}
-		*ptr2++ = '\0';
-		for (ptr3 = ptr2; *ptr3 != del && *ptr3; ptr3++);
-		if ((sav = *ptr3))
-		    *ptr3++ = '\0';
+		ptr1end = ptr2;
+		ptr2 += charlen;
+		sav1 = *ptr1end;
+		*ptr1end = '\0';
+		for (ptr3 = ptr2, charlen = 0; *ptr3; ptr3 += charlen) {
+		    convchar_t del3;
+		    charlen = MB_METACHARLENCONV(ptr3, &del3);
+#ifdef MULTIBYTE_SUPPORT
+		    if (del3 == WEOF)
+			del3 = (wint_t)((*ptr3 == Meta) ?
+					ptr3[1] ^ 32 : *ptr3);
+#endif
+		    if (del3 == del)
+			break;
+		}
+		sav = *ptr3;
+		*ptr3 = '\0';
 		if (*ptr1) {
 		    zsfree(hsubl);
 		    hsubl = ztrdup(ptr1);
@@ -3313,10 +3399,9 @@
 		for (tt = hsubr = ztrdup(ptr2); *tt; tt++)
 		    if (inull(*tt) && *tt != Bnullkeep)
 			chuck(tt--);
-		ptr2[-1] = del;
-		if (sav)
-		    ptr3[-1] = sav;
-		*ptr = ptr3 - 1;
+		*ptr1end = sav1;
+		*ptr3 = sav;
+		*ptr = ptr3 + charlen - 1;
 		break;
 
 	    case '&':
@@ -3335,13 +3420,13 @@
 	    case 'W':
 		wall = 1;
 		(*ptr)++;
-		ptr1 = get_strarg(ptr2 = *ptr);
+		ptr1 = get_strarg(ptr2 = *ptr, &charlen);
 		if ((sav = *ptr1))
 		    *ptr1 = '\0';
-		sep = dupstring(ptr2 + 1);
+		sep = dupstring(ptr2 + charlen);
 		if (sav)
 		    *ptr1 = sav;
-		*ptr = ptr1 + 1;
+		*ptr = ptr1 + charlen;
 		c = '\0';
 		break;
 
@@ -3350,8 +3435,8 @@
 		(*ptr)++;
 		break;
 	    case 'F':
-		rec = get_intarg(ptr);
 		(*ptr)++;
+		rec = get_intarg(ptr, &delmatch);
 		break;
 	    default:
 		*ptr = lptr;
Index: Test/D04parameter.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D04parameter.ztst,v
retrieving revision 1.21
diff -u -r1.21 D04parameter.ztst
--- Test/D04parameter.ztst	13 Sep 2006 20:55:30 -0000	1.21
+++ Test/D04parameter.ztst	2 Nov 2006 18:34:01 -0000
@@ -867,3 +867,17 @@
 >andsomekept
 >andsomekept
 
+  file=/one/two/three/four
+  print ${file:fh}
+  print ${file:F.1.h}
+  print ${file:F+2+h}
+  print ${file:F(3)h}
+  print ${file:F<4>h}
+  print ${file:F{5}h}
+0:Modifiers with repetition
+>/
+>/one/two/three
+>/one/two
+>/one
+>/
+>/
Index: Test/D07multibyte.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D07multibyte.ztst,v
retrieving revision 1.10
diff -u -r1.10 D07multibyte.ztst
--- Test/D07multibyte.ztst	13 Sep 2006 20:55:30 -0000	1.10
+++ Test/D07multibyte.ztst	2 Nov 2006 18:34:01 -0000
@@ -297,3 +297,17 @@
 >«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ
 >ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ
 # er... yeah, that looks right...
+
+  foo=picobarn
+  print ${foo:s£bar£rod£:s¥rod¥stick¥}
+0:Delimiters in modifiers
+>picostickn
+
+# TODO: if we get paired multibyte bracket delimiters to work
+# (as Emacs does, the smug so-and-so), the following should change.
+  foo=bar
+  print ${(r£5£¥X¥)foo}
+  print ${(l«10«»Y»£HI£)foo}
+0:Delimiters in parameter flags
+>barXX
+>YYYYYHIbar

-- 
Peter Stephenson <pws@xxxxxxx>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


To access the latest news from CSR copy this link into a web browser:  http://www.csr.com/email_sig.php



Messages sorted by: Reverse Date, Date, Thread, Author