b.c.diff   [plain text]


--- b.c.orig	2008-11-12 11:59:26.000000000 -0800
+++ b.c	2008-11-12 11:58:53.000000000 -0800
@@ -32,6 +32,7 @@
 #include <stdlib.h>
 #include "awk.h"
 #include "ytab.h"
+#include <fcntl.h>
 
 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
 				/* NCHARS is 2**n */
@@ -65,6 +66,13 @@
 static uschar	*rlxstr;
 static uschar	*prestr;	/* current position in current re */
 static uschar	*lastre;	/* origin of last re */
+static uschar	*lastatom;	/* origin of last Atom */
+static uschar	*starttok;
+static char 	*basestr;	/* starts with original, replaced during
+				   repetition processing */
+static char 	*firstbasestr;
+
+static FILE * replogfile = 0;
 
 static	int setcnt;
 static	int poscnt;
@@ -124,6 +132,13 @@
 	Node *p, *p1;
 	fa *f;
 
+	firstbasestr = (char *)s;
+	basestr = firstbasestr;
+	if (replogfile==0) {
+		/*	disabled
+		replogfile = fopen("/tmp/repeatlog", "a");
+		*/
+	}
 	p = reparse(s);
 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
 		/* put ALL STAR in front of reg.  exp. */
@@ -145,6 +160,14 @@
 	f->initstat = makeinit(f, anchor);
 	f->anchor = anchor;
 	f->restr = (uschar *) tostring(s);
+	if (replogfile) {
+		fflush(replogfile);
+		fclose(replogfile);
+		replogfile=0;
+	}
+	if (firstbasestr != basestr) {
+		if (basestr) free(basestr);
+	}
 	return f;
 }
 
@@ -628,9 +651,11 @@
 Node *primary(void)
 {
 	Node *np;
+	int savelastatom;
 
 	switch (rtok) {
 	case CHAR:
+		lastatom = starttok;
 		np = op2(CHAR, NIL, itonp(rlxval));
 		rtok = relex();
 		return (unary(np));
@@ -638,16 +663,25 @@
 		rtok = relex();
 		return (unary(op2(ALL, NIL, NIL)));
 	case EMPTYRE:
+		if (replogfile) {
+			fprintf(replogfile,
+				"returned EMPTYRE from primary\n");
+			fflush(replogfile);
+		}
 		rtok = relex();
-		return (unary(op2(ALL, NIL, NIL)));
+
+		return (unary(op2(EMPTYRE, NIL, NIL)));
 	case DOT:
+		lastatom = starttok;
 		rtok = relex();
 		return (unary(op2(DOT, NIL, NIL)));
 	case CCL:
+		lastatom = starttok;
 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
 		rtok = relex();
 		return (unary(np));
 	case NCCL:
+		lastatom = starttok;
 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
 		rtok = relex();
 		return (unary(np));
@@ -658,6 +692,8 @@
 		rtok = relex();
 		return (unary(op2(CHAR, NIL, NIL)));
 	case '(':
+		lastatom = starttok;
+		savelastatom = (char *)starttok-basestr; /* Retain over recursion */
 		rtok = relex();
 		if (rtok == ')') {	/* special pleading for () */
 			rtok = relex();
@@ -665,6 +701,7 @@
 		}
 		np = regexp();
 		if (rtok == ')') {
+			lastatom = basestr+savelastatom; /* Restore */
 			rtok = relex();
 			return (unary(np));
 		}
@@ -679,8 +716,17 @@
 Node *concat(Node *np)
 {
 	switch (rtok) {
-	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
+	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
 		return (concat(op2(CAT, np, primary())));
+	case EMPTYRE:
+		if (replogfile) {
+			fprintf(replogfile,
+				"returned EMPTYRE to concat\n");
+			fflush(replogfile);
+		}
+		rtok = relex();
+		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
+				primary())));
 	}
 	return (np);
 }
@@ -731,6 +777,9 @@
  * to nelson beebe for the suggestion; let's see if it works everywhere.
  */
 
+#if defined(__APPLE__)
+#define HAS_ISBLANK
+#endif
 #ifndef HAS_ISBLANK
 
 int (isblank)(int c)
@@ -760,6 +809,143 @@
 	{ NULL,		0,	NULL },
 };
 
+#define REPEAT_SIMPLE		0
+#define REPEAT_PLUS_APPENDED	1
+#define REPEAT_WITH_Q		2
+#define REPEAT_ZERO		3
+
+int replace_repeat(uschar * reptok, int reptoklen, uschar * atom, int atomlen, 
+			int firstnum, int secondnum, int special_case)
+{
+	int i, j;
+	uschar *buf = 0;
+	int ret = 1;
+	int init_q = (firstnum==0);		/* first added char will be ? */
+	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */ 
+	int prefix_length = (char *) reptok-basestr;	/* prefix includes first rep	*/ 
+	int suffix_length = strlen(reptok) - reptoklen;	/* string after rep specifier	*/
+	int size = prefix_length +  suffix_length;
+
+	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
+		size += atomlen*(firstnum-1);
+	}
+
+	/* Adjust size of buffer for special cases */
+	if (special_case == REPEAT_PLUS_APPENDED) {
+		size++;		/* for the final + */
+	} else if (special_case == REPEAT_WITH_Q) {
+		size += init_q + (atomlen+1)* n_q_reps;
+	} else if (special_case == REPEAT_ZERO) {
+		size += 2;	/* just a null ERE: () */
+	}
+	if ((buf = (uschar *) malloc(size+1)) == NULL)
+		FATAL("out of space in reg expr %.10s..", lastre);
+	if (replogfile) {
+		fprintf(replogfile, "re before: len=%d,%s\n"
+				    "         : init_q=%d,n_q_reps=%d\n",
+				strlen(basestr),basestr,
+				init_q,n_q_reps);
+		fprintf(replogfile, "re prefix_length=%d,atomlen=%d\n",
+				prefix_length,atomlen);
+/*
+		fprintf(replogfile, " new buf size: %d, atom=%s, atomlen=%d\n",
+				size, atom, atomlen);
+*/
+		fflush(replogfile);
+	}
+	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/ 
+	j = prefix_length;
+	if (special_case == REPEAT_ZERO) {
+		j -= atomlen;
+		buf[j++] = '(';
+		buf[j++] = ')';
+	}
+	for (i=1; i < firstnum; i++) {		/* copy x reps 	*/
+		memcpy(&buf[j], atom, atomlen);
+		j += atomlen;
+	}
+	if (special_case == REPEAT_PLUS_APPENDED) {
+		buf[j++] = '+';
+	} else if (special_case == REPEAT_WITH_Q) {
+		if (init_q) buf[j++] = '?';
+		for (i=0; i < n_q_reps; i++) {	/* copy x? reps */
+			memcpy(&buf[j], atom, atomlen);
+			j += atomlen;
+			buf[j++] = '?';
+		}
+	}
+	memcpy(&buf[j], reptok+reptoklen, suffix_length);
+	if (special_case == REPEAT_ZERO) {
+		buf[j+suffix_length] = '\0';
+	} else {
+		buf[size] = '\0';
+	}
+	if (replogfile) {
+		fprintf(replogfile, "re after : len=%d,%s\n",strlen(buf),buf);
+		fflush(replogfile);
+	}
+	/* free old basestr */
+	if (firstbasestr != basestr) {
+		if (basestr) free(basestr);
+	}
+	basestr = (char *)buf;
+	prestr  = buf + prefix_length;
+	if (special_case == REPEAT_ZERO) {
+		prestr  -= atomlen;
+		ret++;
+	}
+	return ret;
+}
+
+int repeat(uschar * reptok, int reptoklen, uschar * atom, int atomlen, 
+		int firstnum, int secondnum)
+{
+	/*
+	   In general, the repetition specifier or "bound" is replaced here
+	   by an equivalent ERE string, repeating the immediately previous atom
+	   and appending ? and + as needed. Note that the first copy of the
+	   atom is left in place, except in the special_case of a zero-repeat
+	   (i.e., {0}).
+	 */
+	int i, j;
+	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
+		if (firstnum < 2) {
+			/* 0 or 1: should be handled before you get here */
+			if (replogfile) {
+				fprintf(replogfile,
+					"{%d, %d}, shouldn't be here\n",
+					firstnum, secondnum);
+				fflush(replogfile);
+			}
+		} else {
+			return replace_repeat(reptok, reptoklen, atom, atomlen, 
+				firstnum, secondnum, REPEAT_PLUS_APPENDED);
+		}
+	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
+		if (firstnum == 0) {	/* {0} or {0,0} */
+			/* This case is unusual because the resulting 
+			   replacement string might actually be SMALLER than 
+			   the original ERE */
+			return replace_repeat(reptok, reptoklen, atom, atomlen, 
+					firstnum, secondnum, REPEAT_ZERO);
+		} else {		/* (firstnum >= 1) */
+			return replace_repeat(reptok, reptoklen, atom, atomlen, 
+					firstnum, secondnum, REPEAT_SIMPLE);
+		}
+	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
+		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
+		return replace_repeat(reptok, reptoklen, atom, atomlen, 
+					firstnum, secondnum, REPEAT_WITH_Q);
+	} else {	/* Error - shouldn't be here (n>m) */
+		if (replogfile) {
+			fprintf(replogfile,
+				"illegal ERE {%d,%d} shouldn't be here!\n",
+				firstnum,secondnum);
+			fflush(replogfile);
+		}
+	}
+	return 0;
+}
 
 int relex(void)		/* lexical analyzer for reparse */
 {
@@ -770,6 +956,11 @@
 	uschar *bp;
 	struct charclass *cc;
 	int i;
+	int num, m, commafound, digitfound;
+	uschar *startreptok;
+
+rescan:
+	starttok = prestr;
 
 	switch (c = *prestr++) {
 	case '|': return OR;
@@ -828,6 +1019,54 @@
 					}
 				} else
 					*bp++ = c;
+			} else if (Unix2003_compat && c == '['
+						   && *prestr == '.') {
+				char collate_char;
+				prestr++;
+				collate_char = *prestr++;
+				if (*prestr == '.' && prestr[1] == ']') {
+					prestr += 2;
+					/* Found it: map via locale TBD: for
+					   now, simply return this char.  This
+					   is sufficient to pass conformance
+					   test awk.ex 156
+					 */
+					if (*prestr == ']') {
+						prestr++;
+						rlxval = collate_char;
+						if (replogfile) {
+							fprintf(replogfile,
+								"[..] collate char=%c\n",
+								collate_char);
+							fflush(replogfile);
+						}
+						return CHAR;
+					}
+				}
+			} else if (Unix2003_compat && c == '['
+						   && *prestr == '=') {
+				char equiv_char;
+				prestr++;
+				equiv_char = *prestr++;
+				if (*prestr == '=' && prestr[1] == ']') {
+					prestr += 2;
+					/* Found it: map via locale TBD: for now
+					   simply return this char. This is 
+					   sufficient to pass conformance test
+					   awk.ex 156
+					 */
+					if (*prestr == ']') {
+						prestr++;
+						rlxval = equiv_char;
+						if (replogfile) {
+							fprintf(replogfile,
+								"[==] equiv char=%c\n",
+								equiv_char);
+							fflush(replogfile);
+						}
+						return CHAR;
+					}
+				}
 			} else if (c == '\0') {
 				FATAL("nonterminated character class %.20s", lastre);
 			} else if (bp == buf) {	/* 1st char is special */
@@ -835,6 +1074,12 @@
 			} else if (c == ']') {
 				*bp++ = 0;
 				rlxstr = (uschar *) tostring((char *) buf);
+				if (replogfile) {
+					fprintf(replogfile,
+					"detecting []: cflag=%d, len=%d,%s\n",
+						cflag,strlen(rlxstr),rlxstr);
+					fflush(replogfile);
+				}
 				if (cflag == 0)
 					return CCL;
 				else
@@ -842,6 +1087,75 @@
 			} else
 				*bp++ = c;
 		}
+		break;
+	case '{': 
+		if (Unix2003_compat && isdigit(*(prestr))) {
+			num = 0;	/* Process as a repetition */
+			n = -1; m = -1;
+			commafound = 0;
+			digitfound = 0;
+			startreptok = prestr-1;
+			/* Remember start of previous atom here ? */
+	 	} else {        	/* just a { char, not a repetition */
+			rlxval = c;
+			return CHAR;
+                }
+		for (; ; ) {
+			if ((c = *prestr++) == '}') {
+				if (commafound) {
+					if (digitfound) { /* {n,m} */
+						m = num;
+						if (m<n)
+							FATAL("illegal repetition expression: class %.20s",
+								lastre);
+						if ((n==0) && (m==1)) {
+							return QUEST;
+						}
+					} else {	/* {n,} */
+						if (n==0) return STAR;
+						if (n==1) return PLUS;
+					}
+				} else {
+					if (digitfound) { /* {n} same as {n,n} */
+						n = num;
+						m = num;
+					} else {	/* {} */
+						FATAL("illegal repetition expression: class %.20s",
+							lastre);
+					}
+				}
+				if (repeat(starttok, prestr-starttok, lastatom,
+					   startreptok - lastatom, n, m) > 0) {
+					if ((n==0) && (m==0)) {
+						return EMPTYRE;
+					}
+					/* must rescan input for next token */
+					goto rescan;
+				}
+				/* Failed to replace: eat up {...} characters
+				   and treat like just PLUS */
+				return PLUS;
+			} else if (c == '\0') {
+				FATAL("nonterminated character class %.20s",
+					lastre);
+			} else if (isdigit(c)) {
+				num = 10 * num + c - '0';
+				digitfound = 1;
+			} else if (c == ',') {
+				if (commafound)
+					FATAL("illegal repetition expression: class %.20s",
+						lastre);
+				/* looking for {n,} or {n,m} */
+				commafound = 1;
+				n = num;
+				digitfound = 0; /* reset */
+				num = 0;
+			} else {
+				FATAL("illegal repetition expression: class %.20s",
+					lastre);
+			}
+		}
+		break;
 	}
 }