Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

Re: [PATCH v3] regexp-replace and ^, word boundary or look-behind operators (and more).



2021-05-05 12:45:21 +0100, Stephane Chazelas:
> 2021-04-30 16:13:34 -0700, Bart Schaefer:
> [...]
> > I went back and looked at the patch again.
> 
> Thanks. Here's a third version with further improvements
> addressing some of the comments here.
[...]

That v3 patch had (at least) a couple of bugs:
- in ERE mode, replacement was not inserted properly when
  pattern matched an empty string not at the start of the
  subject (like in regexp-replace var '\>' new)

- it would run in an infinite loop when there's no match in ERE
  mode.

I see Bart ended up committing the v2 version of my patch (from
48747) a few months later in:

commit bb61da36aaeeaa70413cdf5bc66d7a71194f93e5
Author:     Stephane Chazelas <stephane.chazelas@xxxxxxxxx>
AuthorDate: Mon Sep 6 14:43:01 2021 -0700
Commit:     Bart Schaefer <schaefer@xxxxxxxxx>
CommitDate: Mon Sep 6 14:43:01 2021 -0700

    45180: clarify doc for POSIX EREs, fix an issue with PCRE when the replacement was empty or generated more than one element

That one didn't have the second problem but had the first and
also failed to add the replacement in:

regexp-replace var '^' replacement

for instance when $var is initially empty.

So here's a v4 that should address that, some of the objections
to v2 and uses namerefs to replace that illegible usage
of positional parameters for local variables (that's a diff
against current HEAD, not pre-v2).

I went for the:

typeset -g -- "$1"
typeset -nu -- var=$1

suggested by Bart to avoid possible clashes with local variable
names. That might have side effects if called as regexp-replace
'a[2]' re?

diff --git a/Functions/Misc/regexp-replace b/Functions/Misc/regexp-replace
index d4408f0f7..0e3deed4f 100644
--- a/Functions/Misc/regexp-replace
+++ b/Functions/Misc/regexp-replace
@@ -1,91 +1,95 @@
-# Replace all occurrences of a regular expression in a variable.  The
-# variable is modified directly.  Respects the setting of the
-# option RE_MATCH_PCRE.
+# Replace all occurrences of a regular expression in a scalar variable.
+# The variable is modified directly.  Respects the setting of the option
+# RE_MATCH_PCRE, but otherwise sets the zsh emulation mode.
 #
-# First argument: *name* (not contents) of variable.
-# Second argument: regular expression
-# Third argument: replacement string.  This can contain all forms of
-# $ and backtick substitutions; in particular, $MATCH will be replaced
-# by the portion of the string matched by the regular expression.
-
-# we use positional parameters instead of variables to avoid
-# clashing with the user's variable. Make sure we start with 3 and only
-# 3 elements:
-argv=("$1" "$2" "$3")
-
-# $4 records whether pcre is enabled as that information would otherwise
-# be lost after emulate -L zsh
-4=0
-[[ -o re_match_pcre ]] && 4=1
+# Arguments:
+#
+# 1. *name* (not contents) of variable or more generally any lvalue;
+#    expected to be scalar.
+#
+# 2. regular expression
+#
+# 3. replacement string.  This can contain all forms of
+#    $ and backtick substitutions; in particular, $MATCH will be
+#    replaced by the portion of the string matched by the regular
+#    expression. Parsing errors are fatal to the shell process.
+
+if (( $# < 2 || $# > 3 )); then
+  setopt localoptions functionargzero
+  print -ru2 "Usage: $0 <varname> <regexp> [<replacement>]"
+  return 2
+fi
 
-emulate -L zsh
+# ensure variable exists in the caller's scope before referencing it
+# to make sure we don't end up referencing one of our own.
+typeset -g -- "$1" || return 2
+typeset -nu -- var=$1 || return 2
 
+local -i use_pcre=0
+[[ -o re_match_pcre ]] && use_pcre=1
 
-local MATCH MBEGIN MEND
+emulate -L zsh
+
+local regexp=$2 replacement=$3 result MATCH MBEGIN MEND
 local -a match mbegin mend
 
-if (( $4 )); then
+if (( use_pcre )); then
   # if using pcre, we're using pcre_match and a running offset
   # That's needed for ^, \A, \b, and look-behind operators to work
   # properly.
 
   zmodload zsh/pcre || return 2
-  pcre_compile -- "$2" && pcre_study || return 2
+  pcre_compile -- "$regexp" && pcre_study || return 2
+
+  local -i offset=0 start stop
+  local new ZPCRE_OP
+  local -a finds
 
-  # $4 is the current *byte* offset, $5, $6 reserved for later use
-  4=0 6=
+  while pcre_match -b -n $offset -- "$var"; do
+    # we need to perform the evaluation in a scalar assignment so that
+    # if it generates an array, the elements are converted to string (by
+    # joining with the first character of $IFS as usual)
+    new=${(Xe)replacement}
 
-  local ZPCRE_OP
-  while pcre_match -b -n $4 -- "${(P)1}"; do
-    # append offsets and computed replacement to the array
-    # we need to perform the evaluation in a scalar assignment so that if
-    # it generates an array, the elements are converted to string (by
-    # joining with the first character of $IFS as usual)
-    5=${(e)3}
-    argv+=(${(s: :)ZPCRE_OP} "$5")
+    finds+=( ${(s[ ])ZPCRE_OP} "$new" )
 
     # for 0-width matches, increase offset by 1 to avoid
     # infinite loop
-    4=$((argv[-2] + (argv[-3] == argv[-2])))
+    (( offset = finds[-2] + (finds[-3] == finds[-2]) ))
   done
 
-  (($# > 6)) || return # no match
+  (( $#finds )) || return # no match
 
-  set +o multibyte
+  unsetopt multibyte
 
-  # $5 contains the result, $6 the current offset
-  5= 6=1
-  for 2 3 4 in "$@[7,-1]"; do
-    5+=${(P)1[$6,$2]}$4
-    6=$(($3 + 1))
+  offset=1
+  for start stop new in "$finds[@]"; do
+    result+=${var[offset,start]}$new
+    (( offset = stop + 1 ))
   done
-  5+=${(P)1[$6,-1]}
-else
+  result+=${var[offset,-1]}
+
+else # no PCRE
+
   # in ERE, we can't use an offset so ^, (and \<, \b, \B, [[:<:]] where
   # available) won't work properly.
-
-  # $4 is the string to be matched
-  4=${(P)1}
-
-  while [[ -n $4 ]]; do
-    if [[ $4 =~ $2 ]]; then
-      # append initial part and substituted match
-      5+=${4[1,MBEGIN-1]}${(e)3}
-      # truncate remaining string
-      if ((MEND < MBEGIN)); then
-        # zero-width match, skip one character for the next match
-        ((MEND++))
-	5+=${4[1]}
-      fi
-      4=${4[MEND+1,-1]}
-      # indicate we did something
-      6=1
-    else
-      break
+  local subject=$var
+  local -i ok
+  while [[ $subject =~ $regexp ]]; do
+    # copy the text before the match, then the evaluated replacement
+    result+=$subject[1,MBEGIN-1]${(Xe)replacement}
+    # drop everything up to the end of the match from the subject
+    if (( MEND < MBEGIN )); then
+      # zero-width match: carry one character over verbatim to make progress
+      (( MEND++ ))
+      result+=$subject[MBEGIN]
     fi
+    subject=$subject[MEND+1,-1]
+    ok=1
+    [[ -z $subject ]] && break # subject exhausted: stop, else empty matches loop forever
   done
-  [[ -n $6 ]] || return # no match
-  5+=$4
+  (( ok )) || return
+  result+=$subject
 fi
 
-eval $1=\$5
+var=$result




Messages sorted by: Reverse Date, Date, Thread, Author