[Raw Msg Headers][Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Regex in sift statements
Hello!
Felix Lee writes:
> I think it's a side effect of the token-matching semantics of sift.
> [A-Z] matches a 1-char token, not a char within a token.
>
> personally, I think the token-matching semantics was a bad idea. It's
> not quite flexible enough and it's easy to make mistakes. I'd prefer
> normal regexps augmented with metachars that match whole tokens.
I've done some work to replace ZMailer's original token-based matching
algorithm with GNU's regex routines. Patches to the source files (against
ZMailer version 2.96mea) are given below. You have to replace the files
"libsh/regexp.[hc]" with the files "regex.[hc]" from GNU regex version 0.12.
Also change your "Config" file to compile the GNU regex routines correctly.
E.g. for Solaris 2.3 set
DEFS= -DSTDC_HEADERS=1 -DHAVE_STRING_H=1 -DHAVE_ALLOCA_H=1
This works fine for us, but be cautious: some of the regular expressions in
"sift" statements in the original .cf files rely heavily on the old token-based
matching algorithm! You have to rewrite them for the new semantics!
Greetings
Thomas Knott
===================================================================
--- libsh/Makefile.in 1994/11/11 14:05:05
***************
*** 26,36 ****
OBJS = sslwalker.o optimizer.o interpret.o listutils.o builtins.o \
io.o expand.o mail.o path.o prompt.o test.o trap.o variables.o \
! execute.o jobcontrol.o regexp.o zsh.o listtrees.o
SOURCE = sslwalker.c optimizer.c interpret.c listutils.c builtins.c \
io.c expand.c mail.c path.c prompt.c test.c trap.c variables.c \
! execute.c jobcontrol.c regexp.c zsh.c listtrees.c
# keeping zsh.o after the library is important due to the 'rm -f *.o' below...
--- 26,36 ----
OBJS = sslwalker.o optimizer.o interpret.o listutils.o builtins.o \
io.o expand.o mail.o path.o prompt.o test.o trap.o variables.o \
! execute.o jobcontrol.o regex.o zsh.o listtrees.o
SOURCE = sslwalker.c optimizer.c interpret.c listutils.c builtins.c \
io.c expand.c mail.c path.c prompt.c test.c trap.c variables.c \
! execute.c jobcontrol.c regex.c zsh.c listtrees.c
# keeping zsh.o after the library is important due to the 'rm -f *.o' below...
===================================================================
--- libsh/interpret.c 1994/11/11 14:05:05
***************
*** 22,29 ****
#include <sys/wait.h>
#endif /* USE_UNIONWAIT */
#ifdef MAILER
- #include "regexp.h"
#include "sift.h"
#endif /* MAILER */
int magic_number = 2; /* check id for precompiled script files */
--- 22,29 ----
#include <sys/wait.h>
#endif /* USE_UNIONWAIT */
#ifdef MAILER
#include "sift.h"
+ #define DEBUG
#endif /* MAILER */
int magic_number = 2; /* check id for precompiled script files */
***************
*** 40,46 ****
--- 40,51 ----
#ifdef MAILER
extern int D_assign;
extern int funclevel;
+
+ #ifdef DEBUG
+ extern int D_compare, D_matched;
+ #endif /* DEBUG */
#endif /* MAILER */
+
extern FILE *runiofp;
extern char globchars[];
extern int execute(), runio();
***************
*** 108,113 ****
--- 113,250 ----
}
}
+ /* Strip one enclosing pair of matching quotes; result is strnsave()d. */
+ STATIC char *
+ dequote (str)
+     char *str;
+ {
+     int len;
+     char *sp;
+
+     len = strlen(str);
+     sp = str;
+
+     /* len >= 2 guards "" (no last char) and a lone quote (no partner) */
+     if (len >= 2 && (*sp == '"' || *sp == '\'') && sp[len-1] == *sp) {
+         sp++;
+         len -= 2;
+     }
+
+     return (strnsave(sp, len));
+ }
+
+ STATIC void
+ free_regexp (prog)  /* free prog and its compiled pattern; NULL is ignored */
+     regexp *prog;
+ {
+     if (prog == NULL)
+         return;
+     regfree(&prog->re);
+     free(prog);
+ }
+
+ STATIC u_char *
+ regsub(prog, n)  /* saved text of sub-match n (0 = whole match), or NULL */
+     regexp *prog;
+     int n;
+ {
+     /* match stays NULL after reg_comp() until setsubexps() attaches one */
+     if (prog == NULL || prog->match == NULL || n < 0 || n > prog->re.re_nsub)
+         return (NULL);
+     return ((u_char *) prog->match[n]);
+ }
+
+ /* Compile str into a new regexp; on error report to stderr, return NULL. */
+ STATIC regexp *
+ reg_comp (str)
+     char *str;
+ {
+     char *reg_stat;
+     regexp *prog;
+
+     prog = (regexp *) emalloc(sizeof(regexp));
+     if (prog == NULL) {
+         fprintf(stderr, "%s: regexp %s: No space\n", progname, str);
+         return (NULL);
+     }
+     bzero(prog, sizeof(regexp));
+
+     /* borrowed, not copied: str must outlive prog (used in debug output) */
+     prog->pattern = str;
+     reg_stat = (char *) re_compile_pattern(str, strlen(str), &prog->re);
+     if (reg_stat != NULL) {
+         fprintf(stderr, "%s: regexp %s: %s\n", progname, str, reg_stat);
+         /* the compiler may have allocated its buffer before failing */
+         free_regexp(prog);
+         return (NULL);
+     }
+
+     return (prog);
+ }
+
+ /* Match str (NULL counts as "") against prog, saving the (sub)matches in
+  * prog->match[].  Returns 1 on a match, 0 on no match or a regexec() error. */
+ STATIC int
+ reg_exec (prog, str)
+     regexp *prog;
+     char *str;
+ {
+     int i, nsub, matched;
+     char *subj;
+     regmatch_t *pmatch;
+
+     if (prog == NULL) {
+         fprintf(stderr, "%s: regexp: NULL program\n", progname);
+         return (0);
+     }
+
+ #ifdef DEBUG
+     if (D_compare) {
+         fprintf(stderr,
+             "%*scomparing %s and ", 4*funclevel, " ", prog->pattern);
+         if (str != NULL)
+             fprintf(stderr, "'%s'\n", str);
+         else
+             fprintf(stderr, "(nil)\n");
+     }
+ #endif /* DEBUG */
+
+     /* regexec() needs a real string; treat a missing one as empty */
+     subj = (str != NULL) ? str : "";
+     nsub = prog->re.re_nsub;
+ #ifndef USE_ALLOCA
+     pmatch = (regmatch_t *) emalloc((nsub+1)*sizeof(regmatch_t));
+ #else
+     pmatch = (regmatch_t *) alloca((nsub+1)*sizeof(regmatch_t));
+ #endif
+
+     /* only 0 is a match; REG_NOMATCH and errors leave pmatch[] unset */
+     matched = (regexec(&prog->re, subj, nsub+1, pmatch, 0) == 0);
+
+     for (i = 0; matched && prog->match != NULL && i <= nsub; i++) {
+         if (pmatch[i].rm_so < 0)  /* group took no part in the match */
+             prog->match[i] = strnsave(subj, 0);
+         else
+             prog->match[i] = strnsave(subj + pmatch[i].rm_so,
+                 pmatch[i].rm_eo - pmatch[i].rm_so);
+     }
+
+ #ifdef DEBUG
+     if (matched && D_matched) {
+         fprintf(stderr,
+             "%*smatched %s and ", 4*funclevel, " ", prog->pattern);
+         if (str != NULL)
+             fprintf(stderr, "'%s'\n", str);
+         else
+             fprintf(stderr, "(nil)\n");
+     }
+ #endif /* DEBUG */
+ #ifndef USE_ALLOCA
+     free(pmatch);
+ #endif
+     return (matched);
+ }
+
/*
* Return the next available filedescriptor, simulating kernel lookup.
***************
*** 713,719 ****
idx = sfdp->tabledesc->rearray_idx;
rep = repstart;
while (rep - repstart < idx && *rep != NULL)
! free((char *)*rep++);
free((char *)sfdp->tabledesc->rearray);
}
#endif /* MAILER */
--- 850,856 ----
idx = sfdp->tabledesc->rearray_idx;
rep = repstart;
while (rep - repstart < idx && *rep != NULL)
! free_regexp(*rep++);
free((char *)sfdp->tabledesc->rearray);
}
#endif /* MAILER */
***************
*** 874,903 ****
}
void
! setsubexps(sepp, re)
struct si_retab **sepp;
! regexp *re;
{
register struct si_retab *sep, *psep;
register int i;
for (sep = *sepp, psep = NULL; sep != NULL; psep = sep, sep = sep->next)
! if (sep->rep == re)
break;
if (sep == NULL) {
sep = (struct si_retab *)tmalloc(sizeof (struct si_retab));
- for (i=0; i < (sizeof sep->startp)/(sizeof sep->startp[0]);++i)
- sep->startp[i] = sep->endp[i] = NULL;
bzero((char *)sep, sizeof (struct si_retab));
! sep->rep = re;
sep->next = *sepp;
} else if (psep != NULL) {
psep->next = sep->next;
sep->next = *sepp;
}
*sepp = sep;
! re->startp = sep->startp;
! re->endp = sep->endp;
}
#if 0
--- 1011,1042 ----
}
void
! setsubexps(sepp, prog) /* put prog's entry at the head of *sepp, creating it if absent */
struct si_retab **sepp;
! regexp *prog;
{
register struct si_retab *sep, *psep;
register int i;
for (sep = *sepp, psep = NULL; sep != NULL; psep = sep, sep = sep->next)
! if (sep->rep == prog) /* an entry for this regexp already exists */
break;
if (sep == NULL) {
sep = (struct si_retab *)tmalloc(sizeof (struct si_retab));
bzero((char *)sep, sizeof (struct si_retab));
! if (prog != NULL)
! sep->match = (char **) tmalloc((prog->re.re_nsub+1)*sizeof(char *)); /* slot 0 plus one per group; NOTE(review): slots are not initialized here */
! else
! sep->match = NULL;
! sep->rep = prog;
sep->next = *sepp;
} else if (psep != NULL) {
psep->next = sep->next;
sep->next = *sepp;
}
*sepp = sep;
! if (prog != NULL)
! prog->match = sep->match; /* reg_exec() saves its (sub)matches through this pointer */
}
#if 0
***************
*** 919,950 ****
short varindex; /* index of active loop variable */
};
- static
- struct token *
- scanstring(s)
- u_char *s;
- {
- u_char *cp, *bp, *buf;
- int len;
- struct token *t;
-
- t = HDR_SCANNER(s);
- if (t != NULL && t->t_next == NULL && t->t_type == String) {
- /* we need to de-quote the quoted-string */
- len = TOKENLEN(t);
- bp = buf = (u_char *)tmalloc(len+1);
- for (cp = t->t_pname; cp - t->t_pname < len ; ++cp) {
- if (*cp == '\\' && cp - t->t_pname < len-1)
- *bp++ = *++cp;
- else
- *bp++ = *cp;
- }
- *bp = '\0';
- t = HDR_SCANNER(buf);
- }
- return t;
- }
-
/*
* Interpret Shell pseudo-code generated from S/SL description.
*
--- 1058,1063 ----
***************
*** 1039,1044 ****
--- 1152,1165 ----
d = NULL;
argi1 = 0; */
+ /* Initialize syntax for regular expressions */
+ (void) re_set_syntax( RE_CONTEXT_INDEP_ANCHORS |
+ RE_CONTEXT_INDEP_OPS |
+ RE_CONTEXT_INVALID_OPS |
+ RE_NO_BK_PARENS |
+ RE_NO_BK_VBAR |
+ RE_DOT_NOT_NULL);
+
/* funcall tracing could be done here */
/* if (caller != NULL) grindef("ARGV = ", caller->argv); */
if (isset('R'))
***************
*** 1915,1931 ****
progname, nsift-1);
abort();
}
! sift[nsift].tlist = NULL;
sift[nsift].label = pc+1 - code;
sift[nsift].subexps = NULL;
v_accessed = NULL;
break;
case sSiftBody:
! /* we don't *need* to free tokens because they are
allocated off our MEM_SHCMD memory stack */
! if (sift[nsift].tlist)
! freeTokens(sift[nsift].tlist, MEM_SHCMD);
! sift[nsift].tlist = NULL;
if (command->buffer != NULL) {
if (cdr(command->buffer))
d = s_catstring(command->buffer);
--- 2036,2050 ----
progname, nsift-1);
abort();
}
! sift[nsift].str = NULL;
sift[nsift].label = pc+1 - code;
sift[nsift].subexps = NULL;
v_accessed = NULL;
break;
case sSiftBody:
! /* we don't *need* to free string because they are
allocated off our MEM_SHCMD memory stack */
! sift[nsift].str = NULL;
if (command->buffer != NULL) {
if (cdr(command->buffer))
d = s_catstring(command->buffer);
***************
*** 1933,1939 ****
d = command->buffer;
if (STRING(d)) {
arg1 = d->string;
! sift[nsift].tlist = scanstring(arg1);
}
}
sift[nsift].accessed = v_accessed; /* nop 2nd time */
--- 2052,2058 ----
d = command->buffer;
if (STRING(d)) {
arg1 = d->string;
! sift[nsift].str = dequote(arg1);
}
}
sift[nsift].accessed = v_accessed; /* nop 2nd time */
***************
*** 1961,1967 ****
} else
d = command->buffer;
if (STRING(d))
! re = regcomp((char *)d->string);
}
if (re == NULL)
break;
--- 2080,2086 ----
} else
d = command->buffer;
if (STRING(d))
! re = reg_comp((char *)d->string);
}
if (re == NULL)
break;
***************
*** 1999,2006 ****
break;
case sSiftPop:
/* see comment above about freeing tokens */
- if (sift[nsift].tlist)
- freeTokens(sift[nsift].tlist, MEM_SHCMD);
for (v_accessed = sift[nsift].accessed;
v_accessed != NULL;
v_accessed = sift[nsift].accessed) {
--- 2118,2123 ----
***************
*** 2029,2038 ****
}
break;
case sJumpIfRegmatch:
! if (sift[nsift].tlist == NULL)
! sift[nsift].tlist = makeToken((u_char *)"", 0);
setsubexps(&sift[nsift].subexps, re);
! if (!regexec(re, sift[nsift].tlist))
pc = code + argi1 - 1;
break;
#endif /* MAILER */
--- 2146,2155 ----
}
break;
case sJumpIfRegmatch:
! if (sift[nsift].str == NULL)
! sift[nsift].str = strsave("\0");
setsubexps(&sift[nsift].subexps, re);
! if (!reg_exec(re, sift[nsift].str))
pc = code + argi1 - 1;
break;
#endif /* MAILER */
***************
*** 2161,2167 ****
if (cdp->functions == NULL) {
if (cdp->rearray != NULL) {
while (cdp->rearray_idx >= 0)
! free((char *)cdp->rearray[cdp->rearray_idx--]);
free((char *)cdp->rearray);
}
free((char *)cdp->table);
--- 2278,2284 ----
if (cdp->functions == NULL) {
if (cdp->rearray != NULL) {
while (cdp->rearray_idx >= 0)
! free_regexp(cdp->rearray[cdp->rearray_idx--]);
free((char *)cdp->rearray);
}
free((char *)cdp->table);
===================================================================
--- libsh/main.c 1994/11/11 14:05:06
***************
*** 9,15 ****
int D_alloc = 0;
#ifdef MAILER
! int D_regnarrate = 0, D_compare = 0, D_matched = 0, D_functions = 0;
int D_assign = 0;
int funclevel = 0;
--- 9,15 ----
int D_alloc = 0;
#ifdef MAILER
! int D_compare = 0, D_matched = 0, D_functions = 0;
int D_assign = 0;
int funclevel = 0;
===================================================================
--- libsh/sh.h 1994/11/11 14:05:07
***************
*** 8,14 ****
#include "sh.sst.h"
#include <sys/types.h>
#ifdef MAILER
! #include "regexp.h"
#endif /* MAILER */
#include "interpret.h"
#include "listutils.h"
--- 8,14 ----
#include "sh.sst.h"
#include <sys/types.h>
#ifdef MAILER
! #include "sift.h"
#endif /* MAILER */
#include "interpret.h"
#include "listutils.h"
***************
*** 67,73 ****
unsigned char *eotable;
struct sslfuncdef *functions;
#ifdef MAILER
! regexp **rearray; /* array of regexp's in this table */
int rearray_idx; /* current index */
int rearray_size; /* size of rearray in elements */
#endif /* MAILER */
--- 67,73 ----
unsigned char *eotable;
struct sslfuncdef *functions;
#ifdef MAILER
! regexp **rearray; /* array of regex's in this table */
int rearray_idx; /* current index */
int rearray_size; /* size of rearray in elements */
#endif /* MAILER */
===================================================================
--- libsh/sift.h 1994/11/11 14:05:07
***************
*** 6,26 ****
#ifndef Z_SIFT_H
#define Z_SIFT_H
#include "token.h"
struct vaccess { /* in a list of this structure */
struct vaccess *next;
struct conscell *l; /* points at variable name in :env */
};
struct si_retab {
struct si_retab *next;
regexp *rep;
! struct token *startp[NSUBEXP];
! struct token *endp[NSUBEXP];
};
struct siftinfo {
! struct token *tlist; /* token list for sift expression */
struct vaccess *accessed; /* variables dependencies of expr. */
int label; /* label to go to when reevaluating */
regexp *program; /* compiled regular expression stack */
--- 6,32 ----
#ifndef Z_SIFT_H
#define Z_SIFT_H
#include "token.h"
+ #include "regex.h"
struct vaccess { /* in a list of this structure */
struct vaccess *next;
struct conscell *l; /* points at variable name in :env */
};
+ typedef struct regexp {
+     regex_t re;       /* compiled pattern, filled in by re_compile_pattern() */
+     char *pattern;    /* pattern text: the pointer handed to reg_comp(), not a copy */
+     char **match;     /* (sub)match strings: attached by setsubexps(), filled by reg_exec() */
+ } regexp;
+
struct si_retab {
struct si_retab *next;
regexp *rep;
! char **match; /* array that setsubexps() allocates and points rep->match at */
};
struct siftinfo {
! char *str; /* string for sift expression */
struct vaccess *accessed; /* variables dependencies of expr. */
int label; /* label to go to when reevaluating */
regexp *program; /* compiled regular expression stack */
===================================================================
--- router/functions.c 1994/11/11 14:13:25
***************
*** 132,138 ****
int D_bind = 0;
int D_resolv = 0;
int D_alloc = 0;
- int D_regnarrate = 0;
static struct debugind {
char *name;
--- 132,137 ----
***************
*** 146,152 ****
{ "compare", &D_compare },
{ "matched", &D_matched },
{ "assign", &D_assign },
- { "regexp", &D_regnarrate },
{ "final", &D_final },
{ "db", &D_db },
{ "bind", &D_bind },
--- 145,150 ----