Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

Re: Incorrect sorting of Polish characters



On Mon, 18 Jul 2016 10:33:29 +0100
Peter Stephenson <p.stephenson@xxxxxxxxxxx> wrote:
> On Sat, 16 Jul 2016 13:07:18 -0700
> Bart Schaefer <schaefer@xxxxxxxxxxxxxxxx> wrote:
> > On Jul 16,  7:17pm, M. Bartoszkiewicz wrote:
> > } I have noticed that some Polish characters
> > } are sorted incorrectly in glob expansion (but
> > } correctly in other contexts).
> 
> A simple-minded change to pass strcoll() unmetafied versions of the
> strings does seem to fix the problem, so it looks like this is the
> case.  However, that's not the right fix as we only want to unmetafy
> once per input string, not once per comparison, and below the call to
> qsort() there's quite a lot of internal string handling.  An equally
> simple-minded fix around the call to qsort() (saving and restoring the
> strings) didn't seem to work.  So this needs a bit more thought.

Adding an unmetafied entry to the glob match that only gets used for
sorting seems to do the trick.  I think an additional single pass
through the array of matches isn't a big deal.  Possibly the sort code
needs a check through to confirm it really is unmeta-friendly for
globbing as there are different ways in.  Any other suggestions?

pws

diff --git a/Src/glob.c b/Src/glob.c
index 2051016..146b4db 100644
--- a/Src/glob.c
+++ b/Src/glob.c
@@ -41,7 +41,10 @@
 typedef struct gmatch *Gmatch;
 
 struct gmatch {
+    /* Metafied file name */
     char *name;
+    /* Unmetafied file name; embedded nulls can't occur in file names */
+    char *uname;
     /*
      * Array of sort strings:  one for each GS_EXEC sort type in
      * the glob qualifiers.
@@ -911,7 +914,8 @@ gmatchcmp(Gmatch a, Gmatch b)
     for (i = gf_nsorts, s = gf_sortlist; i; i--, s++) {
 	switch (s->tp & ~GS_DESC) {
 	case GS_NAME:
-	    r = zstrcmp(b->name, a->name, gf_numsort ? SORTIT_NUMERICALLY : 0);
+	    r = zstrcmp(b->uname, a->uname,
+			gf_numsort ? SORTIT_NUMERICALLY : 0);
 	    break;
 	case GS_DEPTH:
 	    {
@@ -1859,6 +1863,7 @@ zglob(LinkList list, LinkNode np, int nountok)
 	int nexecs = 0;
 	struct globsort *sortp;
 	struct globsort *lastsortp = gf_sortlist + gf_nsorts;
+	Gmatch gmptr;
 
 	/* First find out if there are any GS_EXECs, counting them. */
 	for (sortp = gf_sortlist; sortp < lastsortp; sortp++)
@@ -1910,6 +1915,29 @@ zglob(LinkList list, LinkNode np, int nountok)
 	    }
 	}
 
+	/*
+	 * Where necessary, create unmetafied version of names
+	 * for comparison.  If no Meta characters just point
+	 * to original string.  All on heap.
+	 */
+	for (gmptr = matchbuf; gmptr < matchptr; gmptr++)
+	{
+	    char *nptr;
+	    for (nptr = gmptr->name; *nptr; nptr++)
+	    {
+		if (*nptr == Meta)
+		    break;
+	    }
+	    if (*nptr == Meta)
+	    {
+		int dummy;
+		gmptr->uname = dupstring(gmptr->name);
+		unmetafy(gmptr->uname, &dummy);
+	    } else {
+		gmptr->uname = gmptr->name;
+	    }
+	}
+
 	/* Sort arguments in to lexical (and possibly numeric) order. *
 	 * This is reversed to facilitate insertion into the list.    */
 	qsort((void *) & matchbuf[0], matchct, sizeof(struct gmatch),
diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst
index dedf241..1b1d042 100644
--- a/Test/D07multibyte.ztst
+++ b/Test/D07multibyte.ztst
@@ -562,3 +562,20 @@
   }
   : $functions)
 0:Multibtye handled of functions parameter
+
+  if [[ -n ${$(locale -a 2>/dev/null)[(R)pl_PL.utf8]} ]]; then
+  (
+    export LC_ALL=pl_PL.UTF-8
+    local -a names=(a b c d e f $'\u0105' $'\u0107' $'\u0119')
+    print -o $names
+    mkdir -p plchars
+    cd plchars
+    touch $names
+    print ?
+  )
+  else
+    ZTST_skip="No Polish UTF-8 local found, skipping sort test"
+  fi
+0:Sorting of metafied Polish characters
+>a ą b c ć d e ę f
+>a ą b c ć d e ę f



Messages sorted by: Reverse Date, Date, Thread, Author