Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

Re: shwordsplit: final non-whitespace IFS character problem



On Fri, 4 Aug 2017 04:03:19 +0200
Martijn Dekker <martijn@xxxxxxxx> wrote:
> In field/word splitting, a final non-whitespace IFS delimiter character
> is counted as an empty field.

Hope this is good enough.  I've taken account of the fact that when
splitting "foo:bar::" one empty string is kept as it's not final.

As far as I can see from bash, in the case of white space, terminating
white space is also stripped (all of it, since the main difference here
is it combines to make a single delimiter), but it's quite hard to
generate a case to be sure, with other shells making it pretty difficult
to get the effect of splitting without removing empty words.  So now

% foo="one    two   three   "
% print -l "${=foo}"
one
two
three

% (setopt posixstrings; print -l "${=foo}"; )
one
two
three
% 

which seems to agree with the following in bash:

$ var="one   two   three    "
$ fn() { typeset f; for f in "$@"; do echo $f; done; }
$ fn $var
one
two
three
$ 

but it's possible the space is being removed for some completely
different reason.

It would obviously be insane to make this the default behaviour.  I hope
the description of the option is suitably off-putting.

pws

diff --git a/Doc/Zsh/options.yo b/Doc/Zsh/options.yo
index 70092d6..c0f07d7 100644
--- a/Doc/Zsh/options.yo
+++ b/Doc/Zsh/options.yo
@@ -2193,16 +2193,16 @@ cindex(discarding embedded nulls in $'...')
 cindex(embedded nulls, in $'...')
 cindex(nulls, embedded in $'...')
 item(tt(POSIX_STRINGS) <K> <S>)(
-This option affects processing of quoted strings.  Currently it only
-affects the behaviour of null characters, i.e. character 0 in the
-portable character set corresponding to US ASCII.
+This option affects processing of quoted strings, and also
+splitting of strings.
 
-When this option is not set, null characters embedded within strings
-of the form tt($')var(...)tt(') are treated as ordinary characters. The
-entire string is maintained within the shell and output to files where
-necessary, although owing to restrictions of the library interface
-the string is truncated at the null character in file names, environment
-variables, or in arguments to external programs.
+When this option is not set, null characters (character 0 in the
+portable character set corresponding to US ASCII) that are embedded
+within strings of the form tt($')var(...)tt(') are treated as ordinary
+characters. The entire string is maintained within the shell and output
+to files where necessary, although owing to restrictions of the library
+interface the string is truncated at the null character in file names,
+environment variables, or in arguments to external programs.
 
 When this option is set, the tt($')var(...)tt(') expression is truncated at
 the null character.  Note that remaining parts of the same string
@@ -2211,6 +2211,19 @@ beyond the termination of the quotes are not truncated.
 For example, the command line argument tt(a$'b\0c'd) is treated with
 the option off as the characters tt(a), tt(b), null, tt(c), tt(d),
 and with the option on as the characters tt(a), tt(b), tt(d).
+
+Furthermore, when the option is set, a trailing separator followed by an
+empty string does not cause extra fields to be output when the string
+is split.  For example,
+
+example(var="foo bar "
+print -l "${=var}")
+
+outputs a blank line at the end if tt(POSIXSTRINGS) is not set, but
+no blank line if the option is set.  Note the quotation marks as
+empty elements would in any case be removed without their presence.
+If the separator is not white space, only the final separator is
+ignored in this fashion.
 )
 pindex(POSIX_TRAPS)
 pindex(NO_POSIX_TRAPS)
diff --git a/Src/utils.c b/Src/utils.c
index 5055d69..5493317 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -3500,12 +3500,12 @@ skipwsep(char **s)
 mod_export char **
 spacesplit(char *s, int allownull, int heap, int quote)
 {
-    char *t, **ret, **ptr;
+    char *t, **ret, **ptr, **eptr;
     int l = sizeof(*ret) * (wordcount(s, NULL, -!allownull) + 1);
     char *(*dup)(const char *) = (heap ? dupstring : ztrdup);
 
     /* ### TODO: s/calloc/alloc/ */
-    ptr = ret = (char **) (heap ? hcalloc(l) : zshcalloc(l));
+    eptr = ptr = ret = (char **) (heap ? hcalloc(l) : zshcalloc(l));
 
     if (quote) {
 	/*
@@ -3537,6 +3537,7 @@ spacesplit(char *s, int allownull, int heap, int quote)
 	if (s > t || allownull) {
 	    *ptr = (char *) (heap ? zhalloc((s - t) + 1) :
 		                     zalloc((s - t) + 1));
+	    eptr = ptr;
 	    ztrncpy(*ptr++, t, s - t);
 	} else
 	    *ptr++ = dup(nulstring);
@@ -3545,6 +3546,20 @@ spacesplit(char *s, int allownull, int heap, int quote)
     }
     if (!allownull && t != s)
 	*ptr++ = dup("");
+    if (isset(POSIXSTRINGS) && ptr != eptr + 1) {
+	/*
+	 * Trailing separators do not generate extra fields in POSIX.
+	 * Note this is only the final separator --- if the
+	 * immediately preceding field was null it is still counted.
+	 * So just back up one.
+	 */
+	--ptr;
+	if (!heap) {
+	    char **ret2 = realloc(ret, sizeof(*ret) * (ptr+1-ret));
+	    ptr -= ret-ret2;
+	    ret = ret2;
+	}
+    }
     *ptr = NULL;
     return ret;
 }
diff --git a/Test/E01options.ztst b/Test/E01options.ztst
index f01d835..b394e7c 100644
--- a/Test/E01options.ztst
+++ b/Test/E01options.ztst
@@ -1339,3 +1339,44 @@
 ?(anon):4: `break' active at end of function scope
 ?(anon):4: `break' active at end of function scope
 ?(anon):4: `break' active at end of function scope
+
+  for opt in POSIX_STRINGS NO_POSIX_STRINGS; do
+    var="foo bar "
+    (setopt $opt; print -l X "${=var}" Y)
+    var="foo2::bar2:"
+    (setopt $opt; IFS=:; print -l X "${=var}" Y)
+    var="foo3:bar3::"
+    (setopt $opt; IFS=:; print -l X "${=var}" Y)
+  done
+0:POSIX_STRINGS effect on final delimiters
+>X
+>foo
+>bar
+>Y
+>X
+>foo2
+>
+>bar2
+>Y
+>X
+>foo3
+>bar3
+>
+>Y
+>X
+>foo
+>bar
+>
+>Y
+>X
+>foo2
+>
+>bar2
+>
+>Y
+>X
+>foo3
+>bar3
+>
+>
+>Y



Messages sorted by: Reverse Date, Date, Thread, Author