[Raw Msg Headers][Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Regex in sift statements



Hello!

Felix Lee writes:
> I think it's a side effect of the token-matching semantics of sift.
> [A-Z] matches a 1-char token, not a char within a token.
> 
> personally, I think the token-matching semantics was a bad idea.  It's
> not quite flexible enough and it's easy to make mistakes.  I'd prefer
> normal regexps augmented with metachars that match whole tokens.

I've done some work to replace ZMailer's original token-based matching
algorithm with GNU's regex routines. Patches to the source files (against
ZMailer version 2.96mea) are included below. You have to replace the files
"libsh/regexp.[hc]" with the files "regex.[hc]" from GNU regex version 0.12.
Also change your "Config" file so that the GNU regex routines compile correctly.
E.g. for Solaris 2.3 set
	DEFS=	-DSTDC_HEADERS=1 -DHAVE_STRING_H=1 -DHAVE_ALLOCA_H=1

This works fine for us, but be cautious: some "sift" regular expressions
in the original .cf files rely heavily on the old token-based matching
algorithm! You have to change them to use the new semantics correctly!

Greetings
Thomas Knott

===================================================================
--- libsh/Makefile.in	1994/11/11 14:05:05
***************
*** 26,36 ****
  
  OBJS	= sslwalker.o optimizer.o interpret.o listutils.o builtins.o \
  	io.o expand.o mail.o path.o prompt.o test.o trap.o variables.o \
! 	execute.o jobcontrol.o regexp.o zsh.o listtrees.o
  
  SOURCE	= sslwalker.c optimizer.c interpret.c listutils.c builtins.c \
  	io.c expand.c mail.c path.c prompt.c test.c trap.c variables.c \
! 	execute.c jobcontrol.c regexp.c zsh.c listtrees.c
  
  
  # keeping zsh.o after the library is important due to the 'rm -f *.o' below...
--- 26,36 ----
  
  OBJS	= sslwalker.o optimizer.o interpret.o listutils.o builtins.o \
  	io.o expand.o mail.o path.o prompt.o test.o trap.o variables.o \
! 	execute.o jobcontrol.o regex.o zsh.o listtrees.o
  
  SOURCE	= sslwalker.c optimizer.c interpret.c listutils.c builtins.c \
  	io.c expand.c mail.c path.c prompt.c test.c trap.c variables.c \
! 	execute.c jobcontrol.c regex.c zsh.c listtrees.c
  
  
  # keeping zsh.o after the library is important due to the 'rm -f *.o' below...
===================================================================
--- libsh/interpret.c	1994/11/11 14:05:05
***************
*** 22,29 ****
  #include <sys/wait.h>
  #endif	/* USE_UNIONWAIT */
  #ifdef	MAILER
- #include "regexp.h"
  #include "sift.h"
  #endif	/* MAILER */
  
  int magic_number = 2;		/* check id for precompiled script files */
--- 22,29 ----
  #include <sys/wait.h>
  #endif	/* USE_UNIONWAIT */
  #ifdef	MAILER
  #include "sift.h"
+ #define DEBUG
  #endif	/* MAILER */
  
  int magic_number = 2;		/* check id for precompiled script files */
***************
*** 40,46 ****
--- 40,51 ----
  #ifdef	MAILER
  extern int D_assign;
  extern int funclevel;
+ 
+ #ifdef DEBUG
+ extern int D_compare, D_matched;
+ #endif	/* DEBUG */
  #endif	/* MAILER */
+ 
  extern FILE *runiofp;
  extern char globchars[];
  extern int execute(), runio();
***************
*** 108,113 ****
--- 113,250 ----
  	}
  }
  
+ STATIC char *
+ dequote (str)
+ 	char *str;
+ {
+ 	int	len;
+ 	char	*sp, *ep, *res, *ptr;
+ 
+ 	len = strlen(str);
+ 	sp = str;
+ 	ep = str + len-1;
+ 
+ 	if ((*sp == '"' && *ep == '"') || (*sp == '\'' && *ep == '\'')) {
+ 		sp++;
+ 		ep--;
+ 		len -= 2;
+ 	}
+ 
+ 	return (strnsave(sp, len));
+ }
+ 
+ STATIC void
+ free_regexp (prog)
+ 	regexp *prog;
+ {
+ 	if (prog != NULL) {
+ 		regfree(&prog->re);
+ 		free(prog);
+ 	}
+ }
+ 
+ STATIC u_char *
+ regsub(prog, n)
+ 	regexp *prog;
+ 	int n;
+ {
+ 	if (prog == NULL || n < 0 || n > prog->re.re_nsub)
+ 		return (NULL);
+ 
+ 	return (prog->match[n]);
+ }
+ 
+ STATIC regexp *
+ reg_comp (str)
+ 	char	*str;
+ {
+ 	char	*reg_stat;
+ 	regexp	*prog;
+ 
+ 	prog = (regexp *) emalloc(sizeof(regexp));
+ 	if (prog == NULL) {
+ 		fprintf(stderr, "%s: regexp %s: No space\n",
+ 			progname, str);
+ 		return (NULL);
+ 	}
+ 	bzero(prog, sizeof(regexp));
+ 
+ 	prog->pattern = str;
+ 
+ 	reg_stat = (char *) re_compile_pattern(str, strlen(str), &prog->re);
+ 	if (reg_stat != NULL) {
+ 		fprintf(stderr, "%s: regexp %s: %s\n", progname, str, reg_stat);
+ 
+ 		free(prog);
+ 		return (NULL);
+ 	}
+ 
+ 	return (prog);
+ }
+ 
+ STATIC int
+ reg_exec (prog, str)
+ 	regexp	*prog;
+ 	char	*str;
+ {
+ 	int		i;
+ 	int		re_stat;
+ 	regmatch_t	*pmatch;
+ 
+ 	if (prog == NULL) {
+ 		fprintf(stderr, "%s: regexp: NULL program\n", progname);
+ 		return (0);
+ 	}
+ 
+ #ifdef  DEBUG
+ 	if (D_compare) {
+ 		fprintf(stderr,
+ 			"%*scomparing %s and ", 4*funclevel, " ", prog->pattern);
+ 		if (str != NULL)
+ 			fprintf(stderr, "'%s'\n", str);
+ 		else
+ 			fprintf(stderr, "(nil)\n");
+ 	}
+ #endif  /* DEBUG */
+ 
+ #ifndef USE_ALLOCA
+ 	pmatch = (regmatch_t *)
+ 			emalloc((prog->re.re_nsub+1)*sizeof(regmatch_t));
+ #else
+ 	pmatch = (regmatch_t *)
+ 			alloca((prog->re.re_nsub+1)*sizeof(regmatch_t));
+ #endif
+ 
+ 	re_stat = regexec(&prog->re, str, prog->re.re_nsub+1, pmatch, 0);
+ 	if (re_stat == REG_NOMATCH) {
+ #ifndef USE_ALLOCA
+ 		free(pmatch);
+ #endif
+ 		return 0;
+ 	}
+ 
+ 	for (i=0; i<=prog->re.re_nsub; i++)
+ 		prog->match[i] = strnsave(str + pmatch[i].rm_so,
+ 				pmatch[i].rm_eo - pmatch[i].rm_so);
+ 
+ #ifdef  DEBUG
+ 	if (D_matched) {
+ 		fprintf(stderr,
+ 			"%*smatched %s and ", 4*funclevel, " ", prog->pattern);
+ 		if (str != NULL)
+ 			fprintf(stderr, "'%s'\n", str);
+ 		else
+ 			fprintf(stderr, "(nil)\n");
+ 	}
+ #endif  /* DEBUG */
+ 
+ #ifndef USE_ALLOCA
+ 	free(pmatch);
+ #endif
+ 
+ 	return 1;
+ }
+ 
  
  /*
   * Return the next available filedescriptor, simulating kernel lookup.
***************
*** 713,719 ****
  				idx = sfdp->tabledesc->rearray_idx;
  				rep = repstart;
  				while (rep - repstart < idx && *rep != NULL)
! 					free((char *)*rep++);
  				free((char *)sfdp->tabledesc->rearray);
  			}
  #endif	/* MAILER */
--- 850,856 ----
  				idx = sfdp->tabledesc->rearray_idx;
  				rep = repstart;
  				while (rep - repstart < idx && *rep != NULL)
! 					free_regexp(*rep++);
  				free((char *)sfdp->tabledesc->rearray);
  			}
  #endif	/* MAILER */
***************
*** 874,903 ****
  }
  
  void
! setsubexps(sepp, re)
  	struct si_retab **sepp;
! 	regexp *re;
  {
  	register struct si_retab *sep, *psep;
  	register int i;
  
  	for (sep = *sepp, psep = NULL; sep != NULL; psep = sep, sep = sep->next)
! 		if (sep->rep == re)
  			break;
  	if (sep == NULL) {
  		sep = (struct si_retab *)tmalloc(sizeof (struct si_retab));
- 		for (i=0; i < (sizeof sep->startp)/(sizeof sep->startp[0]);++i)
- 			sep->startp[i] = sep->endp[i] = NULL;
  		bzero((char *)sep, sizeof (struct si_retab));
! 		sep->rep = re;
  		sep->next = *sepp;
  	} else if (psep != NULL) {
  		psep->next = sep->next;
  		sep->next = *sepp;
  	}
  	*sepp = sep;
! 	re->startp = sep->startp;
! 	re->endp = sep->endp;
  }
  
  #if 0
--- 1011,1042 ----
  }
  
  void
! setsubexps(sepp, prog)
  	struct si_retab **sepp;
! 	regexp		*prog;
  {
  	register struct si_retab *sep, *psep;
  	register int i;
  
  	for (sep = *sepp, psep = NULL; sep != NULL; psep = sep, sep = sep->next)
! 		if (sep->rep == prog)
  			break;
  	if (sep == NULL) {
  		sep = (struct si_retab *)tmalloc(sizeof (struct si_retab));
  		bzero((char *)sep, sizeof (struct si_retab));
! 		if (prog != NULL)
! 			sep->match = (char **) tmalloc((prog->re.re_nsub+1)*sizeof(char *));
! 		else
! 			sep->match = NULL;
! 		sep->rep = prog;
  		sep->next = *sepp;
  	} else if (psep != NULL) {
  		psep->next = sep->next;
  		sep->next = *sepp;
  	}
  	*sepp = sep;
! 	if (prog != NULL)
! 		prog->match = sep->match;
  }
  
  #if 0
***************
*** 919,950 ****
  	short	varindex;	/* index of active loop variable */
  };
  
- static
- struct token *
- scanstring(s)
- 	u_char *s;
- {
- 	u_char *cp, *bp, *buf;
- 	int len;
- 	struct token *t;
- 
- 	t = HDR_SCANNER(s);
- 	if (t != NULL && t->t_next == NULL && t->t_type == String) {
- 		/* we need to de-quote the quoted-string */
- 		len = TOKENLEN(t);
- 		bp = buf = (u_char *)tmalloc(len+1);
- 		for (cp = t->t_pname; cp - t->t_pname < len ; ++cp) {
- 			if (*cp == '\\' && cp - t->t_pname < len-1)
- 				*bp++ = *++cp;
- 			else
- 				*bp++ = *cp;
- 		}
- 		*bp = '\0';
- 		t = HDR_SCANNER(buf);
- 	}
- 	return t;
- }
- 
  /*
   * Interpret Shell pseudo-code generated from S/SL description.
   *
--- 1058,1063 ----
***************
*** 1039,1044 ****
--- 1152,1165 ----
  	   d = NULL;
  	   argi1 = 0; */
  
+ 	/* Initialize syntax for regular expressions */
+ 	(void) re_set_syntax( RE_CONTEXT_INDEP_ANCHORS |
+ 			RE_CONTEXT_INDEP_OPS |
+ 			RE_CONTEXT_INVALID_OPS |
+ 			RE_NO_BK_PARENS |
+ 			RE_NO_BK_VBAR |
+ 			RE_DOT_NOT_NULL);
+ 
  	/* funcall tracing could be done here */
  	/* if (caller != NULL) grindef("ARGV = ", caller->argv); */
  	if (isset('R'))
***************
*** 1915,1931 ****
  					progname, nsift-1);
  				abort();
  			}
! 			sift[nsift].tlist = NULL;
  			sift[nsift].label = pc+1 - code;
  			sift[nsift].subexps = NULL;
  			v_accessed = NULL;
  			break;
  		case sSiftBody:
! 			/* we don't *need* to free tokens because they are
  			   allocated off our MEM_SHCMD memory stack */
! 			if (sift[nsift].tlist)
! 				freeTokens(sift[nsift].tlist, MEM_SHCMD);
! 			sift[nsift].tlist = NULL;
  			if (command->buffer != NULL) {
  				if (cdr(command->buffer))
  					d = s_catstring(command->buffer);
--- 2036,2050 ----
  					progname, nsift-1);
  				abort();
  			}
! 			sift[nsift].str = NULL;
  			sift[nsift].label = pc+1 - code;
  			sift[nsift].subexps = NULL;
  			v_accessed = NULL;
  			break;
  		case sSiftBody:
! 			/* we don't *need* to free string because they are
  			   allocated off our MEM_SHCMD memory stack */
! 			sift[nsift].str = NULL;
  			if (command->buffer != NULL) {
  				if (cdr(command->buffer))
  					d = s_catstring(command->buffer);
***************
*** 1933,1939 ****
  					d = command->buffer;
  				if (STRING(d)) {
  					arg1 = d->string;
! 					sift[nsift].tlist = scanstring(arg1);
  				}
  			}
  			sift[nsift].accessed = v_accessed;  /* nop 2nd time */
--- 2052,2058 ----
  					d = command->buffer;
  				if (STRING(d)) {
  					arg1 = d->string;
! 					sift[nsift].str = dequote(arg1);
  				}
  			}
  			sift[nsift].accessed = v_accessed;  /* nop 2nd time */
***************
*** 1961,1967 ****
  				} else
  					d = command->buffer;
  				if (STRING(d))
! 					re = regcomp((char *)d->string);
  			}
  			if (re == NULL)
  				break;
--- 2080,2086 ----
  				} else
  					d = command->buffer;
  				if (STRING(d))
! 					re = reg_comp((char *)d->string);
  			}
  			if (re == NULL)
  				break;
***************
*** 1999,2006 ****
  			break;
  		case sSiftPop:
  			/* see comment above about freeing tokens */
- 			if (sift[nsift].tlist)
- 				freeTokens(sift[nsift].tlist, MEM_SHCMD);
  			for (v_accessed = sift[nsift].accessed;
  			     v_accessed != NULL;
  			     v_accessed = sift[nsift].accessed) {
--- 2118,2123 ----
***************
*** 2029,2038 ****
  			}
  			break;
  		case sJumpIfRegmatch:
! 			if (sift[nsift].tlist == NULL)
! 				sift[nsift].tlist = makeToken((u_char *)"", 0);
  			setsubexps(&sift[nsift].subexps, re);
! 			if (!regexec(re, sift[nsift].tlist))
  				pc = code + argi1 - 1;
  			break;
  #endif	/* MAILER */
--- 2146,2155 ----
  			}
  			break;
  		case sJumpIfRegmatch:
! 			if (sift[nsift].str == NULL)
! 				sift[nsift].str = strsave("\0");
  			setsubexps(&sift[nsift].subexps, re);
! 			if (!reg_exec(re, sift[nsift].str))
  				pc = code + argi1 - 1;
  			break;
  #endif	/* MAILER */
***************
*** 2161,2167 ****
  	if (cdp->functions == NULL) {
  		if (cdp->rearray != NULL) {
  			while (cdp->rearray_idx >= 0)
! 				free((char *)cdp->rearray[cdp->rearray_idx--]);
  			free((char *)cdp->rearray);
  		}
  		free((char *)cdp->table);
--- 2278,2284 ----
  	if (cdp->functions == NULL) {
  		if (cdp->rearray != NULL) {
  			while (cdp->rearray_idx >= 0)
! 				free_regexp(cdp->rearray[cdp->rearray_idx--]);
  			free((char *)cdp->rearray);
  		}
  		free((char *)cdp->table);
===================================================================
--- libsh/main.c	1994/11/11 14:05:06
***************
*** 9,15 ****
  int D_alloc = 0;
  
  #ifdef	MAILER
! int D_regnarrate = 0, D_compare = 0, D_matched = 0, D_functions = 0;
  int D_assign = 0;
  int funclevel = 0;
  
--- 9,15 ----
  int D_alloc = 0;
  
  #ifdef	MAILER
! int D_compare = 0, D_matched = 0, D_functions = 0;
  int D_assign = 0;
  int funclevel = 0;
  
===================================================================
--- libsh/sh.h	1994/11/11 14:05:07
***************
*** 8,14 ****
  #include "sh.sst.h"
  #include <sys/types.h>
  #ifdef	MAILER
! #include "regexp.h"
  #endif	/* MAILER */
  #include "interpret.h"
  #include "listutils.h"
--- 8,14 ----
  #include "sh.sst.h"
  #include <sys/types.h>
  #ifdef	MAILER
! #include "sift.h"
  #endif	/* MAILER */
  #include "interpret.h"
  #include "listutils.h"
***************
*** 67,73 ****
  	unsigned char	*eotable;
  	struct sslfuncdef *functions;
  #ifdef	MAILER
! 	regexp		**rearray;	/* array of regexp's in this table */
  	int		rearray_idx;	/* current index */
  	int		rearray_size;	/* size of rearray in elements */
  #endif	/* MAILER */
--- 67,73 ----
  	unsigned char	*eotable;
  	struct sslfuncdef *functions;
  #ifdef	MAILER
! 	regexp		**rearray;	/* array of regex's in this table */
  	int		rearray_idx;	/* current index */
  	int		rearray_size;	/* size of rearray in elements */
  #endif	/* MAILER */
===================================================================
--- libsh/sift.h	1994/11/11 14:05:07
***************
*** 6,26 ****
  #ifndef	Z_SIFT_H
  #define	Z_SIFT_H
  #include "token.h"
  
  struct vaccess {	/* in a list of this structure */
  	struct vaccess	*next;
  	struct conscell	*l;		/* points at variable name in :env */
  };
  
  struct si_retab {
  	struct si_retab	*next;
  	regexp		*rep;
! 	struct token	*startp[NSUBEXP];
! 	struct token	*endp[NSUBEXP];
  };
  
  struct siftinfo {
! 	struct token	*tlist;		/* token list for sift expression */
  	struct vaccess	*accessed;	/* variables dependencies of expr. */
  	int		label;		/* label to go to when reevaluating */
  	regexp		*program;	/* compiled regular expression stack */
--- 6,32 ----
  #ifndef	Z_SIFT_H
  #define	Z_SIFT_H
  #include "token.h"
+ #include "regex.h"
  
  struct vaccess {	/* in a list of this structure */
  	struct vaccess	*next;
  	struct conscell	*l;		/* points at variable name in :env */
  };
  
+ typedef struct regexp {
+ 	regex_t		re;
+ 	char		*pattern;
+ 	char		**match;
+ } regexp;
+ 
  struct si_retab {
  	struct si_retab	*next;
  	regexp		*rep;
! 	char		**match;
  };
  
  struct siftinfo {
! 	char		*str;		/* string for sift expression */
  	struct vaccess	*accessed;	/* variables dependencies of expr. */
  	int		label;		/* label to go to when reevaluating */
  	regexp		*program;	/* compiled regular expression stack */
===================================================================
--- router/functions.c	1994/11/11 14:13:25
***************
*** 132,138 ****
  int		D_bind = 0;
  int		D_resolv = 0;
  int		D_alloc = 0;
- int		D_regnarrate = 0;
  
  static struct debugind {
  	char	*name;
--- 132,137 ----
***************
*** 146,152 ****
  	{	"compare",		&D_compare	},
  	{	"matched",		&D_matched	},
  	{	"assign",		&D_assign	},
- 	{	"regexp",		&D_regnarrate	},
  	{	"final",		&D_final	},
  	{	"db",			&D_db		},
  	{	"bind",			&D_bind		},
--- 145,150 ----