Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: New options for the PCRE module



Hi all,

I thought the PCRE module could use a little enhancement, so I added a few things that may be useful to some.

Let's see if I can get through all of this before the coffee wears off...

1. A new '-s' option to pcre_compile. This is the frequently set PCRE_DOTALL option, allowing the dot character to match a newline as well.

2. For pcre_match, a '-n offset' option for starting the search at the given byte offset in the match string, and a '-b' option for setting the variable ZPCRE_OP to the pair of byte offsets delimiting the entire successful pattern match. For example, if a pattern matches with the '-b' option set, a ZPCRE_OP set to the string "32 45" indicates that the entire match started at byte position 32 and ended at byte position 44. In other words, PCRE reports the zero-based byte range from 32 up to, but not including, 45.

All of this is to enable the 'find all' functionality. For example, if I want all of the non-overlapping matches within a string, I can now do:

accum=()

pcre_match -b -- $match_string

while [[ -n $ZPCRE_OP ]] do
   b=($=ZPCRE_OP)
   accum+=$MATCH
   pcre_match -b -n $(( b[2] )) -- $match_string
done
print -l $accum

To be on the safe side regarding multi-byte characters: I'm assuming that the returned offsets are only meant to be passed back to pcre_match, not used to index into the match string, because the offsets are counted in bytes, not characters.

3. A needed correction: all of the module's external variables are now unset on each match attempt, so that a failed match will be obvious.

Could someone please point me to the doc files that would need updating (for the zshmodules man page)? Alternatively, if someone here has that part automated, I can send them whatever targeted write-up they want.


--- pcre.c	2007-07-09 02:30:42.000000000 -0700
+++ pcre-new.c	2009-02-26 22:10:46.000000000 -0800
@@ -82,6 +82,7 @@
     if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
     if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
     if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
+    if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
     
     if (zpcre_utf8_enabled())
 	pcre_opts |= PCRE_UTF8;
@@ -137,20 +138,23 @@
 
 /**/
 static int
-zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, int matchedinarr)
+zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, 
+    int want_offset_pair, int matchedinarr)
 {
     char **captures, *match_all, **matches;
+    char offset_all[50];
     int capture_start = 1;
 
     if (matchedinarr)
 	capture_start = 0;
-    if (matchvar == NULL)
-	matchvar = "MATCH";
-    if (substravar == NULL)
-	substravar = "match";
-
+    
     /* captures[0] will be entire matched string, [1] first substring */
-    if(!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+    if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+	/* Set to the offsets of the complete match */
+	if (want_offset_pair) {
+	    sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
+	    setsparam("ZPCRE_OP", ztrdup(offset_all));
+	}
 	match_all = ztrdup(captures[0]);
 	setsparam(matchvar, match_all);
 	matches = zarrdup(&captures[capture_start]);
@@ -163,12 +167,30 @@
 
 /**/
 static int
+getposint(char *instr, char *nam)
+{
+    char *eptr;
+    int ret;
+
+    ret = (int)zstrtol(instr, &eptr, 10);
+    if (*eptr || ret < 0) {
+	zwarnnam(nam, "integer expected: %s", instr);
+	return -1;
+    }
+
+    return ret;
+}
+
+/**/
+static int
 bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 {
     int ret, capcount, *ovec, ovecsize, c;
     char *matched_portion = NULL;
     char *receptacle = NULL;
     int return_value = 1;
+    int offset_start = 0;
+    int want_offset_pair = 0;
 
     if (pcre_pattern == NULL) {
 	zwarnnam(nam, "no pattern has been compiled");
@@ -181,6 +203,12 @@
     if(OPT_HASARG(ops,c='v')) {
 	matched_portion = OPT_ARG(ops,c);
     }
+    if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search */
+	offset_start = getposint(OPT_ARG(ops,c), nam);
+    }
+    /* For the entire match, also 'return' the offset positions (in ZPCRE_OP), in addition to the matched string */
+    if(OPT_ISSET(ops,'b')) want_offset_pair = 1; 
+    
     if(!*args) {
 	zwarnnam(nam, "not enough arguments");
     }
@@ -194,12 +222,22 @@
     ovecsize = (capcount+1)*3;
     ovec = zalloc(ovecsize*sizeof(int));
     
-    ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), 0, 0, ovec, ovecsize);
+    ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), offset_start, 0, ovec, ovecsize);
+
+    if (matched_portion == NULL)
+	matched_portion = "MATCH";
+    if (receptacle == NULL)
+	receptacle = "match";
+
+    /* Reset the external variables */
+    unsetparam(matched_portion);
+    unsetparam(receptacle);
+    unsetparam("ZPCRE_OP");
     
     if (ret==0) return_value = 0;
     else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, 0);
+	zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0);
 	return_value = 0;
     }
     else {
@@ -258,7 +296,7 @@
 		    break;
 		}
                 else if (r>0) {
-		    zpcre_get_substrings(lhstr, ov, r, NULL, avar, isset(BASHREMATCH));
+		    zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH));
 		    return_value = 1;
 		    break;
 		}
@@ -289,8 +327,8 @@
 #endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
 
 static struct builtin bintab[] = {
-    BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimx",  NULL),
-    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "a:v:",    NULL),
+    BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs",  NULL),
+    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "a:v:n:b",    NULL),
     BUILTIN("pcre_study",   0, bin_pcre_study,   0, 0, 0, NULL,    NULL)
 };
 


Messages sorted by: Reverse Date, Date, Thread, Author