~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
#include <my_global.h>
2
#include <m_string.h>
3
#include <m_ctype.h>
4
#ifdef __WIN__
5
#include  <limits.h>
6
#endif
7
8
#include "my_regex.h"
9
#include "utils.h"
10
#include "regex2.h"
11
12
#include "cclass.h"
13
#include "cname.h"
14
15
/*
16
 * parse structure, passed up and down to avoid global variables and
17
 * other clumsinesses
18
 */
19
struct parse {
20
	char *next;		/* next character in RE */
21
	char *end;		/* end of string (-> NUL normally) */
22
	int error;		/* has an error been seen? */
23
	sop *strip;		/* malloced strip */
24
	sopno ssize;		/* malloced strip size (allocated) */
25
	sopno slen;		/* malloced strip length (used) */
26
	int ncsalloc;		/* number of csets allocated */
27
	struct re_guts *g;
28
#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
29
	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
30
	sopno pend[NPAREN];	/* -> ) ([0] unused) */
31
	CHARSET_INFO *charset;	/* for ctype things  */
32
};
33
34
#include "regcomp.ih"
35
36
static char nuls[10];		/* place to point scanner in event of error */
37
38
struct cclass cclasses[CCLASS_LAST+1]= {
39
  { "alnum",	"","", _MY_U | _MY_L | _MY_NMR},
40
  { "alpha",	"","", _MY_U | _MY_L },
41
  { "blank",	"","", _MY_B },
42
  { "cntrl",	"","", _MY_CTR },
43
  { "digit",	"","", _MY_NMR },
44
  { "graph",	"","", _MY_PNT | _MY_U | _MY_L | _MY_NMR},
45
  { "lower",	"","", _MY_L },
46
  { "print",	"","", _MY_PNT | _MY_U | _MY_L | _MY_NMR | _MY_B },
47
  { "punct",	"","", _MY_PNT },
48
  { "space",	"","", _MY_SPC },
49
  { "upper",	"","", _MY_U },
50
  { "xdigit",	"","", _MY_X },
51
  { NULL,NULL,NULL, 0 }
52
};
53
54
/*
55
 * macros for use with parse structure
56
 * BEWARE:  these know that the parse structure is named `p' !!!
57
 */
58
#define	PEEK()	(*p->next)
59
#define	PEEK2()	(*(p->next+1))
60
#define	MORE()	(p->next < p->end)
61
#define	MORE2()	(p->next+1 < p->end)
62
#define	SEE(c)	(MORE() && PEEK() == (c))
63
#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
64
#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
65
#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
66
#define	NEXT()	(p->next++)
67
#define	NEXT2()	(p->next += 2)
68
#define	NEXTn(n)	(p->next += (n))
69
#define	GETNEXT()	(*p->next++)
70
#define	SETERROR(e)	seterr(p, (e))
71
#define	REQUIRE(co, e)	((co) || SETERROR(e))
72
#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
73
#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
74
#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
75
#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
76
#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
77
#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
78
#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
79
#define	HERE()		(p->slen)
80
#define	THERE()		(p->slen - 1)
81
#define	THERETHERE()	(p->slen - 2)
82
#define	DROP(n)	(p->slen -= (n))
83
84
#ifndef NDEBUG
85
static int never = 0;		/* for use in asserts; shuts lint up */
86
#else
87
#define	never	0		/* some <assert.h>s have bugs too */
88
#endif
89
90
/*
91
 - regcomp - interface for parser and compilation
92
 = extern int regcomp(regex_t *, const char *, int);
93
 = #define	REG_BASIC	0000
94
 = #define	REG_EXTENDED	0001
95
 = #define	REG_ICASE	0002
96
 = #define	REG_NOSUB	0004
97
 = #define	REG_NEWLINE	0010
98
 = #define	REG_NOSPEC	0020
99
 = #define	REG_PEND	0040
100
 = #define	REG_DUMP	0200
101
 */
102
int				/* 0 success, otherwise REG_something */
103
my_regcomp(preg, pattern, cflags, charset)
104
my_regex_t *preg;
105
const char *pattern;
106
int cflags;
107
CHARSET_INFO *charset;
108
{
109
	struct parse pa;
110
	register struct re_guts *g;
111
	register struct parse *p = &pa;
112
	register int i;
113
	register size_t len;
114
#ifdef REDEBUG
115
#	define	GOODFLAGS(f)	(f)
116
#else
117
#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
118
#endif
119
120
	my_regex_init(charset);	/* Init cclass if neaded */
121
	preg->charset=charset;
122
	cflags = GOODFLAGS(cflags);
123
	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
124
		return(REG_INVARG);
125
126
	if (cflags&REG_PEND) {
127
		if (preg->re_endp < pattern)
128
			return(REG_INVARG);
129
		len = preg->re_endp - pattern;
130
	} else
131
		len = strlen((char *)pattern);
132
133
	/* do the mallocs early so failure handling is easy */
134
	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
135
							(NC-1)*sizeof(cat_t));
136
	if (g == NULL)
137
		return(REG_ESPACE);
138
	p->ssize = (long) (len/(size_t)2*(size_t)3 + (size_t)1); /* ugh */
139
	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
140
	p->slen = 0;
141
	if (p->strip == NULL) {
142
		free((char *)g);
143
		return(REG_ESPACE);
144
	}
145
146
	/* set things up */
147
	p->g = g;
148
	p->next = (char *)pattern;	/* convenience; we do not modify it */
149
	p->end = p->next + len;
150
	p->error = 0;
151
	p->ncsalloc = 0;
152
	p->charset = preg->charset;
153
	for (i = 0; i < NPAREN; i++) {
154
		p->pbegin[i] = 0;
155
		p->pend[i] = 0;
156
	}
157
	g->csetsize = NC;
158
	g->sets = NULL;
159
	g->setbits = NULL;
160
	g->ncsets = 0;
161
	g->cflags = cflags;
162
	g->iflags = 0;
163
	g->nbol = 0;
164
	g->neol = 0;
165
	g->must = NULL;
166
	g->mlen = 0;
167
	g->nsub = 0;
168
	g->ncategories = 1;	/* category 0 is "everything else" */
169
	g->categories = &g->catspace[-(CHAR_MIN)];
170
	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
171
	g->backrefs = 0;
172
173
	/* do it */
174
	EMIT(OEND, 0);
175
	g->firststate = THERE();
176
	if (cflags&REG_EXTENDED)
177
		p_ere(p, OUT);
178
	else if (cflags&REG_NOSPEC)
179
		p_str(p);
180
	else
181
		p_bre(p, OUT, OUT);
182
	EMIT(OEND, 0);
183
	g->laststate = THERE();
184
185
	/* tidy up loose ends and fill things in */
186
	categorize(p, g);
187
	stripsnug(p, g);
188
	findmust(p, g);
189
	g->nplus = pluscount(p, g);
190
	g->magic = MAGIC2;
191
	preg->re_nsub = g->nsub;
192
	preg->re_g = g;
193
	preg->re_magic = MAGIC1;
194
#ifndef REDEBUG
195
	/* not debugging, so can't rely on the assert() in regexec() */
196
	if (g->iflags&BAD)
197
		SETERROR(REG_ASSERT);
198
#endif
199
200
	/* win or lose, we're done */
201
	if (p->error != 0)	/* lose */
202
		my_regfree(preg);
203
	return(p->error);
204
}
205
206
/*
207
 - p_ere - ERE parser top level, concatenation and alternation
208
 == static void p_ere(register struct parse *p, int stop);
209
 */
210
static void
211
p_ere(p, stop)
212
register struct parse *p;
213
int stop;			/* character this ERE should end at */
214
{
215
	register char c;
216
	register sopno prevback;
217
	register sopno prevfwd;
218
	register sopno conc;
219
	register int first = 1;		/* is this the first alternative? */
220
	for (;;) {
221
		/* do a bunch of concatenated expressions */
222
		conc = HERE();
223
		while (MORE() && (c = PEEK()) != '|' && c != stop)
224
			p_ere_exp(p);
225
		if(REQUIRE(HERE() != conc, REG_EMPTY)) {}/* require nonempty */
226
227
		if (!EAT('|'))
228
			break;		/* NOTE BREAK OUT */
229
230
		if (first) {
231
			INSERT(OCH_, conc);	/* offset is wrong */
232
			prevfwd = conc;
233
			prevback = conc;
234
			first = 0;
235
		}
236
		ASTERN(OOR1, prevback);
237
		prevback = THERE();
238
		AHEAD(prevfwd);			/* fix previous offset */
239
		prevfwd = HERE();
240
		EMIT(OOR2, 0);			/* offset is very wrong */
241
	}
242
243
	if (!first) {		/* tail-end fixups */
244
		AHEAD(prevfwd);
245
		ASTERN(O_CH, prevback);
246
	}
247
248
	assert(!MORE() || SEE(stop));
249
}
250
251
/*
252
 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
253
 == static void p_ere_exp(register struct parse *p);
254
 */
255
static void
256
p_ere_exp(p)
257
register struct parse *p;
258
{
259
	register char c;
260
	register sopno pos;
261
	register int count;
262
	register int count2;
263
	register sopno subno;
264
	int wascaret = 0;
265
266
	assert(MORE());		/* caller should have ensured this */
267
	c = GETNEXT();
268
269
	pos = HERE();
270
	switch (c) {
271
	case '(':
272
		if(REQUIRE(MORE(), REG_EPAREN)) {}
273
		p->g->nsub++;
274
		subno = (sopno) p->g->nsub;
275
		if (subno < NPAREN)
276
		        p->pbegin[subno] = HERE();
277
		EMIT(OLPAREN, subno);
278
		if (!SEE(')'))
279
			p_ere(p, ')');
280
		if (subno < NPAREN) {
281
			p->pend[subno] = HERE();
282
			assert(p->pend[subno] != 0);
283
		}
284
		EMIT(ORPAREN, subno);
285
		if(MUSTEAT(')', REG_EPAREN)) {}
286
		break;
287
#ifndef POSIX_MISTAKE
288
	case ')':		/* happens only if no current unmatched ( */
289
		/*
290
		 * You may ask, why the ifndef?  Because I didn't notice
291
		 * this until slightly too late for 1003.2, and none of the
292
		 * other 1003.2 regular-expression reviewers noticed it at
293
		 * all.  So an unmatched ) is legal POSIX, at least until
294
		 * we can get it fixed.
295
		 */
296
		SETERROR(REG_EPAREN);
297
		break;
298
#endif
299
	case '^':
300
		EMIT(OBOL, 0);
301
		p->g->iflags |= USEBOL;
302
		p->g->nbol++;
303
		wascaret = 1;
304
		break;
305
	case '$':
306
		EMIT(OEOL, 0);
307
		p->g->iflags |= USEEOL;
308
		p->g->neol++;
309
		break;
310
	case '|':
311
		SETERROR(REG_EMPTY);
312
		break;
313
	case '*':
314
	case '+':
315
	case '?':
316
		SETERROR(REG_BADRPT);
317
		break;
318
	case '.':
319
		if (p->g->cflags&REG_NEWLINE)
320
			nonnewline(p);
321
		else
322
			EMIT(OANY, 0);
323
		break;
324
	case '[':
325
		p_bracket(p);
326
		break;
327
	case '\\':
328
		if(REQUIRE(MORE(), REG_EESCAPE)) {}
329
		c = GETNEXT();
330
		ordinary(p, c);
331
		break;
332
	case '{':		/* okay as ordinary except if digit follows */
333
		if(REQUIRE(!MORE() || !my_isdigit(p->charset,PEEK()), REG_BADRPT)) {}
334
		/* FALLTHROUGH */
335
	default:
336
		ordinary(p, c);
337
		break;
338
	}
339
340
	if (!MORE())
341
		return;
342
	c = PEEK();
343
	/* we call { a repetition if followed by a digit */
344
	if (!( c == '*' || c == '+' || c == '?' ||
345
				(c == '{' && MORE2() && 
346
				 my_isdigit(p->charset,PEEK2())) ))
347
		return;		/* no repetition, we're done */
348
	NEXT();
349
350
	if(REQUIRE(!wascaret, REG_BADRPT)) {}
351
	switch (c) {
352
	case '*':	/* implemented as +? */
353
		/* this case does not require the (y|) trick, noKLUDGE */
354
		INSERT(OPLUS_, pos);
355
		ASTERN(O_PLUS, pos);
356
		INSERT(OQUEST_, pos);
357
		ASTERN(O_QUEST, pos);
358
		break;
359
	case '+':
360
		INSERT(OPLUS_, pos);
361
		ASTERN(O_PLUS, pos);
362
		break;
363
	case '?':
364
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
365
		INSERT(OCH_, pos);		/* offset slightly wrong */
366
		ASTERN(OOR1, pos);		/* this one's right */
367
		AHEAD(pos);			/* fix the OCH_ */
368
		EMIT(OOR2, 0);			/* offset very wrong... */
369
		AHEAD(THERE());			/* ...so fix it */
370
		ASTERN(O_CH, THERETHERE());
371
		break;
372
	case '{':
373
		count = p_count(p);
374
		if (EAT(',')) {
375
			if (my_isdigit(p->charset,PEEK())) {
376
				count2 = p_count(p);
377
				if(REQUIRE(count <= count2, REG_BADBR)) {}
378
			} else		/* single number with comma */
379
				count2 = RE_INFINITY;
380
		} else		/* just a single number */
381
			count2 = count;
382
		repeat(p, pos, count, count2);
383
		if (!EAT('}')) {	/* error heuristics */
384
			while (MORE() && PEEK() != '}')
385
				NEXT();
386
			if(REQUIRE(MORE(), REG_EBRACE)) {}
387
			SETERROR(REG_BADBR);
388
		}
389
		break;
390
	}
391
392
	if (!MORE())
393
		return;
394
	c = PEEK();
395
	if (!( c == '*' || c == '+' || c == '?' ||
396
				(c == '{' && MORE2() && 
397
				 my_isdigit(p->charset,PEEK2())) ) )
398
		return;
399
	SETERROR(REG_BADRPT);
400
}
401
402
/*
403
 - p_str - string (no metacharacters) "parser"
404
 == static void p_str(register struct parse *p);
405
 */
406
static void
407
p_str(p)
408
register struct parse *p;
409
{
410
	if(REQUIRE(MORE(), REG_EMPTY)) {}
411
	while (MORE())
412
		ordinary(p, GETNEXT());
413
}
414
415
/*
416
 - p_bre - BRE parser top level, anchoring and concatenation
417
 == static void p_bre(register struct parse *p, register int end1, \
418
 ==	register int end2);
419
 * Giving end1 as OUT essentially eliminates the end1/end2 check.
420
 *
421
 * This implementation is a bit of a kludge, in that a trailing $ is first
422
 * taken as an ordinary character and then revised to be an anchor.  The
423
 * only undesirable side effect is that '$' gets included as a character
424
 * category in such cases.  This is fairly harmless; not worth fixing.
425
 * The amount of lookahead needed to avoid this kludge is excessive.
426
 */
427
static void
428
p_bre(p, end1, end2)
429
register struct parse *p;
430
register int end1;		/* first terminating character */
431
register int end2;		/* second terminating character */
432
{
433
	register sopno start = HERE();
434
	register int first = 1;			/* first subexpression? */
435
	register int wasdollar = 0;
436
437
	if (EAT('^')) {
438
		EMIT(OBOL, 0);
439
		p->g->iflags |= USEBOL;
440
		p->g->nbol++;
441
	}
442
	while (MORE() && !SEETWO(end1, end2)) {
443
		wasdollar = p_simp_re(p, first);
444
		first = 0;
445
	}
446
	if (wasdollar) {	/* oops, that was a trailing anchor */
447
		DROP(1);
448
		EMIT(OEOL, 0);
449
		p->g->iflags |= USEEOL;
450
		p->g->neol++;
451
	}
452
453
	if(REQUIRE(HERE() != start, REG_EMPTY)) {}	/* require nonempty */
454
}
455
456
/*
457
 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
458
 == static int p_simp_re(register struct parse *p, int starordinary);
459
 */
460
static int			/* was the simple RE an unbackslashed $? */
461
p_simp_re(p, starordinary)
462
register struct parse *p;
463
int starordinary;		/* is a leading * an ordinary character? */
464
{
465
	register int c;
466
	register int count;
467
	register int count2;
468
	register sopno pos;
469
	register int i;
470
	register sopno subno;
471
#	define	BACKSL	(1<<CHAR_BIT)
472
473
	pos = HERE();		/* repetion op, if any, covers from here */
474
475
	assert(MORE());		/* caller should have ensured this */
476
	c = GETNEXT();
477
	if (c == '\\') {
478
		if(REQUIRE(MORE(), REG_EESCAPE)) {}
479
		c = BACKSL | (unsigned char)GETNEXT();
480
	}
481
	switch (c) {
482
	case '.':
483
		if (p->g->cflags&REG_NEWLINE)
484
			nonnewline(p);
485
		else
486
			EMIT(OANY, 0);
487
		break;
488
	case '[':
489
		p_bracket(p);
490
		break;
491
	case BACKSL|'{':
492
		SETERROR(REG_BADRPT);
493
		break;
494
	case BACKSL|'(':
495
		p->g->nsub++;
496
		subno = (sopno) p->g->nsub;
497
		if (subno < NPAREN)
498
			p->pbegin[subno] = HERE();
499
		EMIT(OLPAREN, subno);
500
		/* the MORE here is an error heuristic */
501
		if (MORE() && !SEETWO('\\', ')'))
502
			p_bre(p, '\\', ')');
503
		if (subno < NPAREN) {
504
			p->pend[subno] = HERE();
505
			assert(p->pend[subno] != 0);
506
		}
507
		EMIT(ORPAREN, subno);
508
		if(REQUIRE(EATTWO('\\', ')'), REG_EPAREN)) {}
509
		break;
510
	case BACKSL|')':	/* should not get here -- must be user */
511
	case BACKSL|'}':
512
		SETERROR(REG_EPAREN);
513
		break;
514
	case BACKSL|'1':
515
	case BACKSL|'2':
516
	case BACKSL|'3':
517
	case BACKSL|'4':
518
	case BACKSL|'5':
519
	case BACKSL|'6':
520
	case BACKSL|'7':
521
	case BACKSL|'8':
522
	case BACKSL|'9':
523
		i = (c&~BACKSL) - '0';
524
		assert(i < NPAREN);
525
		if (p->pend[i] != 0) {
526
			assert((uint) i <= p->g->nsub);
527
			EMIT(OBACK_, i);
528
			assert(p->pbegin[i] != 0);
529
			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
530
			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
531
			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
532
			EMIT(O_BACK, i);
533
		} else
534
			SETERROR(REG_ESUBREG);
535
		p->g->backrefs = 1;
536
		break;
537
	case '*':
538
		if(REQUIRE(starordinary, REG_BADRPT)) {}
539
		/* FALLTHROUGH */
540
	default:
541
		ordinary(p, c &~ BACKSL);
542
		break;
543
	}
544
545
	if (EAT('*')) {		/* implemented as +? */
546
		/* this case does not require the (y|) trick, noKLUDGE */
547
		INSERT(OPLUS_, pos);
548
		ASTERN(O_PLUS, pos);
549
		INSERT(OQUEST_, pos);
550
		ASTERN(O_QUEST, pos);
551
	} else if (EATTWO('\\', '{')) {
552
		count = p_count(p);
553
		if (EAT(',')) {
554
			if (MORE() && my_isdigit(p->charset,PEEK())) {
555
				count2 = p_count(p);
556
				if(REQUIRE(count <= count2, REG_BADBR)) {}
557
			} else		/* single number with comma */
558
				count2 = RE_INFINITY;
559
		} else		/* just a single number */
560
			count2 = count;
561
		repeat(p, pos, count, count2);
562
		if (!EATTWO('\\', '}')) {	/* error heuristics */
563
			while (MORE() && !SEETWO('\\', '}'))
564
				NEXT();
565
			if(REQUIRE(MORE(), REG_EBRACE)) {}
566
			SETERROR(REG_BADBR);
567
		}
568
	} else if (c == (unsigned char)'$')	/* $ (but not \$) ends it */
569
		return(1);
570
571
	return(0);
572
}
573
574
/*
575
 - p_count - parse a repetition count
576
 == static int p_count(register struct parse *p);
577
 */
578
static int			/* the value */
579
p_count(p)
580
register struct parse *p;
581
{
582
	register int count = 0;
583
	register int ndigits = 0;
584
585
	while (MORE() && my_isdigit(p->charset,PEEK()) && count <= DUPMAX) {
586
		count = count*10 + (GETNEXT() - '0');
587
		ndigits++;
588
	}
589
590
	if(REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR)) {}
591
	return(count);
592
}
593
594
/*
595
 - p_bracket - parse a bracketed character list
596
 == static void p_bracket(register struct parse *p);
597
 *
598
 * Note a significant property of this code:  if the allocset() did SETERROR,
599
 * no set operations are done.
600
 */
601
static void
602
p_bracket(p)
603
register struct parse *p;
604
{
605
	register cset *cs = allocset(p);
606
	register int invert = 0;
607
608
	/* Dept of Truly Sickening Special-Case Kludges */
609
	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
610
		EMIT(OBOW, 0);
611
		NEXTn(6);
612
		return;
613
	}
614
	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
615
		EMIT(OEOW, 0);
616
		NEXTn(6);
617
		return;
618
	}
619
620
	if (EAT('^'))
621
		invert++;	/* make note to invert set at end */
622
	if (EAT(']'))
623
		CHadd(cs, ']');
624
	else if (EAT('-'))
625
		CHadd(cs, '-');
626
	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
627
		p_b_term(p, cs);
628
	if (EAT('-'))
629
		CHadd(cs, '-');
630
	if(MUSTEAT(']', REG_EBRACK)) {}
631
632
	if (p->error != 0)	/* don't mess things up further */
633
		return;
634
635
	if (p->g->cflags&REG_ICASE) {
636
		register int i;
637
		register int ci;
638
639
		for (i = p->g->csetsize - 1; i >= 0; i--)
640
			if (CHIN(cs, i) && my_isalpha(p->charset,i)) {
641
				ci = othercase(p->charset,i);
642
				if (ci != i)
643
					CHadd(cs, ci);
644
			}
645
		if (cs->multis != NULL)
646
			mccase(p, cs);
647
	}
648
	if (invert) {
649
		register int i;
650
651
		for (i = p->g->csetsize - 1; i >= 0; i--)
652
			if (CHIN(cs, i))
653
				CHsub(cs, i);
654
			else
655
				CHadd(cs, i);
656
		if (p->g->cflags&REG_NEWLINE)
657
			CHsub(cs, '\n');
658
		if (cs->multis != NULL)
659
			mcinvert(p, cs);
660
	}
661
662
	assert(cs->multis == NULL);		/* xxx */
663
664
	if (nch(p, cs) == 1) {		/* optimize singleton sets */
665
		ordinary(p, firstch(p, cs));
666
		freeset(p, cs);
667
	} else
668
		EMIT(OANYOF, freezeset(p, cs));
669
}
670
671
/*
672
 - p_b_term - parse one term of a bracketed character list
673
 == static void p_b_term(register struct parse *p, register cset *cs);
674
 */
675
static void
676
p_b_term(p, cs)
677
register struct parse *p;
678
register cset *cs;
679
{
680
	register char c;
681
	register char start, finish;
682
	register int i;
683
684
	/* classify what we've got */
685
	switch ((MORE()) ? PEEK() : '\0') {
686
	case '[':
687
		c = (MORE2()) ? PEEK2() : '\0';
688
		break;
689
	case '-':
690
		SETERROR(REG_ERANGE);
691
		return;			/* NOTE RETURN */
692
		break;
693
	default:
694
		c = '\0';
695
		break;
696
	}
697
698
	switch (c) {
699
	case ':':		/* character class */
700
		NEXT2();
701
		if(REQUIRE(MORE(), REG_EBRACK)) {}
702
		c = PEEK();
703
		if(REQUIRE(c != '-' && c != ']', REG_ECTYPE)) {}
704
		p_b_cclass(p, cs);
705
		if(REQUIRE(MORE(), REG_EBRACK)) {}
706
		if(REQUIRE(EATTWO(':', ']'), REG_ECTYPE)) {}
707
		break;
708
	case '=':		/* equivalence class */
709
		NEXT2();
710
		if(REQUIRE(MORE(), REG_EBRACK)) {}
711
		c = PEEK();
712
		if(REQUIRE(c != '-' && c != ']', REG_ECOLLATE)) {}
713
		p_b_eclass(p, cs);
714
		if(REQUIRE(MORE(), REG_EBRACK)) {}
715
		if(REQUIRE(EATTWO('=', ']'), REG_ECOLLATE)) {}
716
		break;
717
	default:		/* symbol, ordinary character, or range */
718
/* xxx revision needed for multichar stuff */
719
		start = p_b_symbol(p);
720
		if (SEE('-') && MORE2() && PEEK2() != ']') {
721
			/* range */
722
			NEXT();
723
			if (EAT('-'))
724
				finish = '-';
725
			else
726
				finish = p_b_symbol(p);
727
		} else
728
			finish = start;
729
/* xxx what about signed chars here... */
730
		if(REQUIRE(start <= finish, REG_ERANGE)) {}
731
		for (i = start; i <= finish; i++)
732
			CHadd(cs, i);
733
		break;
734
	}
735
}
736
737
/*
738
 - p_b_cclass - parse a character-class name and deal with it
739
 == static void p_b_cclass(register struct parse *p, register cset *cs);
740
 */
741
static void
742
p_b_cclass(p, cs)
743
register struct parse *p;
744
register cset *cs;
745
{
746
	register char *sp = p->next;
747
	register struct cclass *cp;
748
	register size_t len;
749
	
750
	while (MORE() && my_isalpha(p->charset,PEEK()))
751
		NEXT();
752
	len = p->next - sp;
753
	for (cp = cclasses; cp->name != NULL; cp++)
754
		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
755
			break;
756
	if (cp->name == NULL) {
757
		/* oops, didn't find it */
758
		SETERROR(REG_ECTYPE);
759
		return;
760
	}
761
762
#ifndef USE_ORIG_REGEX_CODE
763
	{
764
		register size_t i;
765
		for (i=1 ; i<256 ; i++)
766
			if (p->charset->ctype[i+1] & cp->mask)
767
				CHadd(cs, i);
768
	}
769
#else	
770
	{
771
		register char *u = (char*) cp->chars;
772
		register char c;
773
		
774
		while ((c = *u++) != '\0')
775
			CHadd(cs, c);
776
		
777
		for (u = (char*) cp->multis; *u != '\0'; u += strlen(u) + 1)
778
			MCadd(p, cs, u);
779
	}
780
#endif
781
782
}
783
784
/*
785
 - p_b_eclass - parse an equivalence-class name and deal with it
786
 == static void p_b_eclass(register struct parse *p, register cset *cs);
787
 *
788
 * This implementation is incomplete. xxx
789
 */
790
static void
791
p_b_eclass(p, cs)
792
register struct parse *p;
793
register cset *cs;
794
{
795
	register char c;
796
797
	c = p_b_coll_elem(p, '=');
798
	CHadd(cs, c);
799
}
800
801
/*
802
 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
803
 == static char p_b_symbol(register struct parse *p);
804
 */
805
static char			/* value of symbol */
806
p_b_symbol(p)
807
register struct parse *p;
808
{
809
	register char value;
810
811
	if(REQUIRE(MORE(), REG_EBRACK)) {}
812
	if (!EATTWO('[', '.'))
813
		return(GETNEXT());
814
815
	/* collating symbol */
816
	value = p_b_coll_elem(p, '.');
817
	if(REQUIRE(EATTWO('.', ']'), REG_ECOLLATE)) {}
818
	return(value);
819
}
820
821
/*
822
 - p_b_coll_elem - parse a collating-element name and look it up
823
 == static char p_b_coll_elem(register struct parse *p, int endc);
824
 */
825
static char			/* value of collating element */
826
p_b_coll_elem(p, endc)
827
register struct parse *p;
828
int endc;			/* name ended by endc,']' */
829
{
830
	register char *sp = p->next;
831
	register struct cname *cp;
832
#ifdef _WIN64
833
	register __int64 len;
834
#else
835
	register int len;
836
#endif
837
	while (MORE() && !SEETWO(endc, ']'))
838
		NEXT();
839
	if (!MORE()) {
840
		SETERROR(REG_EBRACK);
841
		return(0);
842
	}
843
	len = p->next - sp;
844
	for (cp = cnames; cp->name != NULL; cp++)
845
		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
846
			return(cp->code);	/* known name */
847
	if (len == 1)
848
		return(*sp);	/* single character */
849
	SETERROR(REG_ECOLLATE);			/* neither */
850
	return(0);
851
}
852
853
/*
854
 - othercase - return the case counterpart of an alphabetic
855
 == static char othercase(int ch);
856
 */
857
static char			/* if no counterpart, return ch */
858
othercase(charset,ch)
859
CHARSET_INFO *charset;
860
int ch;
861
{
862
	/*
863
	  In MySQL some multi-byte character sets
864
	  have 'ctype' array but don't have 'to_lower'
865
	  and 'to_upper' arrays. In this case we handle
866
	  only basic latin letters a..z and A..Z.
867
	  
868
	  If 'to_lower' and 'to_upper' arrays are empty in a character set,
869
	  then my_isalpha(cs, ch) should never return TRUE for characters
870
	  other than basic latin letters. Otherwise it should be
871
	  considered as a mistake in character set definition.
872
	*/
873
	assert(my_isalpha(charset,ch));
874
	if (my_isupper(charset,ch))
875
	{
876
		return(charset->to_lower ? my_tolower(charset,ch) :
877
		                          ch - 'A' + 'a');
878
	}
879
	else if (my_islower(charset,ch))
880
	{
881
		return(charset->to_upper ? my_toupper(charset,ch) :
882
		                          ch - 'a' + 'A');
883
	}
884
	else			/* peculiar, but could happen */
885
		return(ch);
886
}
887
888
/*
889
 - bothcases - emit a dualcase version of a two-case character
890
 == static void bothcases(register struct parse *p, int ch);
891
 *
892
 * Boy, is this implementation ever a kludge...
893
 */
894
static void
895
bothcases(p, ch)
896
register struct parse *p;
897
int ch;
898
{
899
	register char *oldnext = p->next;
900
	register char *oldend = p->end;
901
	char bracket[3];
902
903
	assert(othercase(p->charset, ch) != ch); /* p_bracket() would recurse */
904
	p->next = bracket;
905
	p->end = bracket+2;
906
	bracket[0] = ch;
907
	bracket[1] = ']';
908
	bracket[2] = '\0';
909
	p_bracket(p);
910
	assert(p->next == bracket+2);
911
	p->next = oldnext;
912
	p->end = oldend;
913
}
914
915
/*
916
 - ordinary - emit an ordinary character
917
 == static void ordinary(register struct parse *p, register int ch);
918
 */
919
static void
920
ordinary(p, ch)
921
register struct parse *p;
922
register int ch;
923
{
924
	register cat_t *cap = p->g->categories;
925
926
	if ((p->g->cflags&REG_ICASE) && my_isalpha(p->charset,ch) && 
927
	     othercase(p->charset,ch) != ch)
928
		bothcases(p, ch);
929
	else {
930
		EMIT(OCHAR, (unsigned char)ch);
931
		if (cap[ch] == 0)
932
			cap[ch] = p->g->ncategories++;
933
	}
934
}
935
936
/*
937
 - nonnewline - emit REG_NEWLINE version of OANY
938
 == static void nonnewline(register struct parse *p);
939
 *
940
 * Boy, is this implementation ever a kludge...
941
 */
942
static void
943
nonnewline(p)
944
register struct parse *p;
945
{
946
	register char *oldnext = p->next;
947
	register char *oldend = p->end;
948
	char bracket[4];
949
950
	p->next = bracket;
951
	p->end = bracket+3;
952
	bracket[0] = '^';
953
	bracket[1] = '\n';
954
	bracket[2] = ']';
955
	bracket[3] = '\0';
956
	p_bracket(p);
957
	assert(p->next == bracket+3);
958
	p->next = oldnext;
959
	p->end = oldend;
960
}
961
962
/*
963
 - repeat - generate code for a bounded repetition, recursively if needed
964
 == static void repeat(register struct parse *p, sopno start, int from, int to);
965
 */
966
static void
967
repeat(p, start, from, to)
968
register struct parse *p;
969
sopno start;			/* operand from here to end of strip */
970
int from;			/* repeated from this number */
971
int to;				/* to this number of times (maybe RE_INFINITY) */
972
{
973
	register sopno finish = HERE();
974
#	define	N	2
975
#	define	INF	3
976
#	define	REP(f, t)	((f)*8 + (t))
977
#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == RE_INFINITY) ? INF : N)
978
	register sopno copy;
979
980
	if (p->error != 0)	/* head off possible runaway recursion */
981
		return;
982
983
	assert(from <= to);
984
985
	switch (REP(MAP(from), MAP(to))) {
986
	case REP(0, 0):			/* must be user doing this */
987
		DROP(finish-start);	/* drop the operand */
988
		break;
989
	case REP(0, 1):			/* as x{1,1}? */
990
	case REP(0, N):			/* as x{1,n}? */
991
	case REP(0, INF):		/* as x{1,}? */
992
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
993
		INSERT(OCH_, start);		/* offset is wrong... */
994
		repeat(p, start+1, 1, to);
995
		ASTERN(OOR1, start);
996
		AHEAD(start);			/* ... fix it */
997
		EMIT(OOR2, 0);
998
		AHEAD(THERE());
999
		ASTERN(O_CH, THERETHERE());
1000
		break;
1001
	case REP(1, 1):			/* trivial case */
1002
		/* done */
1003
		break;
1004
	case REP(1, N):			/* as x?x{1,n-1} */
1005
		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1006
		INSERT(OCH_, start);
1007
		ASTERN(OOR1, start);
1008
		AHEAD(start);
1009
		EMIT(OOR2, 0);			/* offset very wrong... */
1010
		AHEAD(THERE());			/* ...so fix it */
1011
		ASTERN(O_CH, THERETHERE());
1012
		copy = dupl(p, start+1, finish+1);
1013
		assert(copy == finish+4);
1014
		repeat(p, copy, 1, to-1);
1015
		break;
1016
	case REP(1, INF):		/* as x+ */
1017
		INSERT(OPLUS_, start);
1018
		ASTERN(O_PLUS, start);
1019
		break;
1020
	case REP(N, N):			/* as xx{m-1,n-1} */
1021
		copy = dupl(p, start, finish);
1022
		repeat(p, copy, from-1, to-1);
1023
		break;
1024
	case REP(N, INF):		/* as xx{n-1,INF} */
1025
		copy = dupl(p, start, finish);
1026
		repeat(p, copy, from-1, to);
1027
		break;
1028
	default:			/* "can't happen" */
1029
		SETERROR(REG_ASSERT);	/* just in case */
1030
		break;
1031
	}
1032
}
1033
1034
/*
1035
 - seterr - set an error condition
1036
 == static int seterr(register struct parse *p, int e);
1037
 */
1038
static int			/* useless but makes type checking happy */
1039
seterr(p, e)
1040
register struct parse *p;
1041
int e;
1042
{
1043
	if (p->error == 0)	/* keep earliest error condition */
1044
		p->error = e;
1045
	p->next = nuls;		/* try to bring things to a halt */
1046
	p->end = nuls;
1047
	return(0);		/* make the return value well-defined */
1048
}
1049
1050
/*
1051
 - allocset - allocate a set of characters for []
1052
 == static cset *allocset(register struct parse *p);
1053
 */
1054
static cset *
1055
allocset(p)
1056
register struct parse *p;
1057
{
1058
	register int no = p->g->ncsets++;
1059
	register size_t nc;
1060
	register size_t nbytes;
1061
	register cset *cs;
1062
	register size_t css = (size_t)p->g->csetsize;
1063
	register int i;
1064
1065
	if (no >= p->ncsalloc) {	/* need another column of space */
1066
		p->ncsalloc += CHAR_BIT;
1067
		nc = p->ncsalloc;
1068
		assert(nc % CHAR_BIT == 0);
1069
		nbytes = nc / CHAR_BIT * css;
1070
		if (p->g->sets == NULL)
1071
			p->g->sets = (cset *)malloc(nc * sizeof(cset));
1072
		else
1073
			p->g->sets = (cset *)realloc((char *)p->g->sets,
1074
							nc * sizeof(cset));
1075
		if (p->g->setbits == NULL)
1076
			p->g->setbits = (uch *)malloc(nbytes);
1077
		else {
1078
			p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1079
								nbytes);
1080
			/* xxx this isn't right if setbits is now NULL */
1081
			for (i = 0; i < no; i++)
1082
				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1083
		}
1084
		if (p->g->sets != NULL && p->g->setbits != NULL)
1085
			(void) memset((char *)p->g->setbits + (nbytes - css),
1086
								0, css);
1087
		else {
1088
			no = 0;
1089
			SETERROR(REG_ESPACE);
1090
			/* caller's responsibility not to do set ops */
1091
		}
1092
	}
1093
1094
	assert(p->g->sets != NULL);	/* xxx */
1095
	cs = &p->g->sets[no];
1096
	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1097
	cs->mask = 1 << ((no) % CHAR_BIT);
1098
	cs->hash = 0;
1099
	cs->smultis = 0;
1100
	cs->multis = NULL;
1101
1102
	return(cs);
1103
}
1104
1105
/*
1106
 - freeset - free a now-unused set
1107
 == static void freeset(register struct parse *p, register cset *cs);
1108
 */
1109
static void
1110
freeset(p, cs)
1111
register struct parse *p;
1112
register cset *cs;
1113
{
1114
	register size_t i;
1115
	register cset *top = &p->g->sets[p->g->ncsets];
1116
	register size_t css = (size_t)p->g->csetsize;
1117
1118
	for (i = 0; i < css; i++)
1119
	  CHsub(cs, i);
1120
	if (cs == top-1)	/* recover only the easy case */
1121
	  p->g->ncsets--;
1122
}
1123
1124
/*
1125
 - freezeset - final processing on a set of characters
1126
 == static int freezeset(register struct parse *p, register cset *cs);
1127
 *
1128
 * The main task here is merging identical sets.  This is usually a waste
1129
 * of time (although the hash code minimizes the overhead), but can win
1130
 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1131
 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1132
 * the same value!
1133
 */
1134
static int			/* set number */
1135
freezeset(p, cs)
1136
register struct parse *p;
1137
register cset *cs;
1138
{
1139
	register uch h = cs->hash;
1140
	register size_t i;
1141
	register cset *top = &p->g->sets[p->g->ncsets];
1142
	register cset *cs2;
1143
	register size_t css = (size_t)p->g->csetsize;
1144
1145
	/* look for an earlier one which is the same */
1146
	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1147
		if (cs2->hash == h && cs2 != cs) {
1148
			/* maybe */
1149
			for (i = 0; i < css; i++)
1150
				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1151
					break;		/* no */
1152
			if (i == css)
1153
				break;			/* yes */
1154
		}
1155
1156
	if (cs2 < top) {	/* found one */
1157
		freeset(p, cs);
1158
		cs = cs2;
1159
	}
1160
1161
	return((int)(cs - p->g->sets));
1162
}
1163
1164
/*
1165
 - firstch - return first character in a set (which must have at least one)
1166
 == static int firstch(register struct parse *p, register cset *cs);
1167
 */
1168
static int			/* character; there is no "none" value */
1169
firstch(p, cs)
1170
register struct parse *p;
1171
register cset *cs;
1172
{
1173
	register size_t i;
1174
	register size_t css = (size_t)p->g->csetsize;
1175
1176
	for (i = 0; i < css; i++)
1177
		if (CHIN(cs, i))
1178
			return((char)i);
1179
	assert(never);
1180
	return(0);		/* arbitrary */
1181
}
1182
1183
/*
1184
 - nch - number of characters in a set
1185
 == static int nch(register struct parse *p, register cset *cs);
1186
 */
1187
static int
1188
nch(p, cs)
1189
register struct parse *p;
1190
register cset *cs;
1191
{
1192
	register size_t i;
1193
	register size_t css = (size_t)p->g->csetsize;
1194
	register int n = 0;
1195
1196
	for (i = 0; i < css; i++)
1197
		if (CHIN(cs, i))
1198
			n++;
1199
	return(n);
1200
}
1201
1202
#ifdef USE_ORIG_REGEX_CODE
1203
/*
1204
 - mcadd - add a collating element to a cset
1205
 == static void mcadd(register struct parse *p, register cset *cs, \
1206
 ==	register char *cp);
1207
 */
1208
static void
1209
mcadd(p, cs, cp)
1210
register struct parse *p;
1211
register cset *cs;
1212
register char *cp;
1213
{
1214
	register size_t oldend = cs->smultis;
1215
1216
	cs->smultis += strlen(cp) + 1;
1217
	if (cs->multis == NULL)
1218
		cs->multis = malloc(cs->smultis);
1219
	else
1220
		cs->multis = realloc(cs->multis, cs->smultis);
1221
	if (cs->multis == NULL) {
1222
		SETERROR(REG_ESPACE);
1223
		return;
1224
	}
1225
1226
	(void) strcpy(cs->multis + oldend - 1, cp);
1227
	cs->multis[cs->smultis - 1] = '\0';
1228
}
1229
#endif
1230
1231
#ifdef NOT_USED
1232
/*
1233
 - mcsub - subtract a collating element from a cset
1234
 == static void mcsub(register cset *cs, register char *cp);
1235
 */
1236
static void
1237
mcsub(cs, cp)
1238
register cset *cs;
1239
register char *cp;
1240
{
1241
	register char *fp = mcfind(cs, cp);
1242
	register size_t len = strlen(fp);
1243
1244
	assert(fp != NULL);
1245
	(void) memmove(fp, fp + len + 1,
1246
				cs->smultis - (fp + len + 1 - cs->multis));
1247
	cs->smultis -= len;
1248
1249
	if (cs->smultis == 0) {
1250
		free(cs->multis);
1251
		cs->multis = NULL;
1252
		return;
1253
	}
1254
1255
	cs->multis = realloc(cs->multis, cs->smultis);
1256
	assert(cs->multis != NULL);
1257
}
1258
1259
/*
1260
 - mcin - is a collating element in a cset?
1261
 == static int mcin(register cset *cs, register char *cp);
1262
 */
1263
static int
1264
mcin(cs, cp)
1265
register cset *cs;
1266
register char *cp;
1267
{
1268
	return(mcfind(cs, cp) != NULL);
1269
}
1270
1271
/*
1272
 - mcfind - find a collating element in a cset
1273
 == static char *mcfind(register cset *cs, register char *cp);
1274
 */
1275
static char *
1276
mcfind(cs, cp)
1277
register cset *cs;
1278
register char *cp;
1279
{
1280
	register char *p;
1281
1282
	if (cs->multis == NULL)
1283
		return(NULL);
1284
	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1285
		if (strcmp(cp, p) == 0)
1286
			return(p);
1287
	return(NULL);
1288
}
1289
#endif
1290
1291
/*
1292
 - mcinvert - invert the list of collating elements in a cset
1293
 == static void mcinvert(register struct parse *p, register cset *cs);
1294
 *
1295
 * This would have to know the set of possibilities.  Implementation
1296
 * is deferred.
1297
 */
1298
static void
1299
mcinvert(p, cs)
1300
  register struct parse *p __attribute__((unused));
1301
  register cset *cs __attribute__((unused));
1302
{
1303
	assert(cs->multis == NULL);	/* xxx */
1304
}
1305
1306
/*
1307
 - mccase - add case counterparts of the list of collating elements in a cset
1308
 == static void mccase(register struct parse *p, register cset *cs);
1309
 *
1310
 * This would have to know the set of possibilities.  Implementation
1311
 * is deferred.
1312
 */
1313
static void
1314
mccase(p, cs)
1315
register struct parse *p __attribute__((unused));
1316
register cset *cs __attribute__((unused));
1317
{
1318
	assert(cs->multis == NULL);	/* xxx */
1319
}
1320
1321
/*
1322
 - isinsets - is this character in any sets?
1323
 == static int isinsets(register struct re_guts *g, int c);
1324
 */
1325
static int			/* predicate */
1326
isinsets(g, c)
1327
register struct re_guts *g;
1328
int c;
1329
{
1330
	register uch *col;
1331
	register int i;
1332
	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1333
	register unsigned uc = (unsigned char)c;
1334
1335
	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1336
		if (col[uc] != 0)
1337
			return(1);
1338
	return(0);
1339
}
1340
1341
/*
1342
 - samesets - are these two characters in exactly the same sets?
1343
 == static int samesets(register struct re_guts *g, int c1, int c2);
1344
 */
1345
static int			/* predicate */
1346
samesets(g, c1, c2)
1347
register struct re_guts *g;
1348
int c1;
1349
int c2;
1350
{
1351
	register uch *col;
1352
	register int i;
1353
	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1354
	register unsigned uc1 = (unsigned char)c1;
1355
	register unsigned uc2 = (unsigned char)c2;
1356
1357
	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1358
		if (col[uc1] != col[uc2])
1359
			return(0);
1360
	return(1);
1361
}
1362
1363
/*
1364
 - categorize - sort out character categories
1365
 == static void categorize(struct parse *p, register struct re_guts *g);
1366
 */
1367
static void
1368
categorize(p, g)
1369
struct parse *p;
1370
register struct re_guts *g;
1371
{
1372
	register cat_t *cats = g->categories;
1373
	register int c;
1374
	register int c2;
1375
	register cat_t cat;
1376
1377
	/* avoid making error situations worse */
1378
	if (p->error != 0)
1379
		return;
1380
1381
	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1382
		if (cats[c] == 0 && isinsets(g, c)) {
1383
			cat = g->ncategories++;
1384
			cats[c] = cat;
1385
			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1386
				if (cats[c2] == 0 && samesets(g, c, c2))
1387
					cats[c2] = cat;
1388
		}
1389
}
1390
1391
/*
1392
 - dupl - emit a duplicate of a bunch of sops
1393
 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1394
 */
1395
static sopno			/* start of duplicate */
1396
dupl(p, start, finish)
1397
register struct parse *p;
1398
sopno start;			/* from here */
1399
sopno finish;			/* to this less one */
1400
{
1401
	register sopno ret = HERE();
1402
	register sopno len = finish - start;
1403
1404
	assert(finish >= start);
1405
	if (len == 0)
1406
		return(ret);
1407
	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1408
	assert(p->ssize >= p->slen + len);
1409
	(void) memcpy((char *)(p->strip + p->slen),
1410
		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1411
	p->slen += len;
1412
	return(ret);
1413
}
1414
1415
/*
1416
 - doemit - emit a strip operator
1417
 == static void doemit(register struct parse *p, sop op, size_t opnd);
1418
 *
1419
 * It might seem better to implement this as a macro with a function as
1420
 * hard-case backup, but it's just too big and messy unless there are
1421
 * some changes to the data structures.  Maybe later.
1422
 */
1423
static void
1424
doemit(p, op, opnd)
1425
register struct parse *p;
1426
sop op;
1427
size_t opnd;
1428
{
1429
	/* avoid making error situations worse */
1430
	if (p->error != 0)
1431
		return;
1432
1433
	/* deal with oversize operands ("can't happen", more or less) */
1434
	assert(opnd < 1<<OPSHIFT);
1435
1436
	/* deal with undersized strip */
1437
	if (p->slen >= p->ssize)
1438
		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1439
	assert(p->slen < p->ssize);
1440
1441
	/* finally, it's all reduced to the easy case */
1442
	p->strip[p->slen++] = SOP(op, opnd);
1443
}
1444
1445
/*
1446
 - doinsert - insert a sop into the strip
1447
 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1448
 */
1449
static void
1450
doinsert(p, op, opnd, pos)
1451
register struct parse *p;
1452
sop op;
1453
size_t opnd;
1454
sopno pos;
1455
{
1456
	register sopno sn;
1457
	register sop s;
1458
	register int i;
1459
1460
	/* avoid making error situations worse */
1461
	if (p->error != 0)
1462
		return;
1463
1464
	sn = HERE();
1465
	EMIT(op, opnd);		/* do checks, ensure space */
1466
	assert(HERE() == sn+1);
1467
	s = p->strip[sn];
1468
1469
	/* adjust paren pointers */
1470
	assert(pos > 0);
1471
	for (i = 1; i < NPAREN; i++) {
1472
		if (p->pbegin[i] >= pos) {
1473
			p->pbegin[i]++;
1474
		}
1475
		if (p->pend[i] >= pos) {
1476
			p->pend[i]++;
1477
		}
1478
	}
1479
	{
1480
          int length=(HERE()-pos-1)*sizeof(sop);
1481
          bmove_upp((uchar *) &p->strip[pos+1]+length,
1482
                    (uchar *) &p->strip[pos]+length,
1483
                    length);
1484
        }
1485
#ifdef OLD_CODE
1486
        memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1487
                                                (HERE()-pos-1)*sizeof(sop));
1488
#endif
1489
	p->strip[pos] = s;
1490
}
1491
1492
/*
1493
 - dofwd - complete a forward reference
1494
 == static void dofwd(register struct parse *p, sopno pos, sop value);
1495
 */
1496
static void
1497
dofwd(p, pos, value)
1498
register struct parse *p;
1499
register sopno pos;
1500
sop value;
1501
{
1502
	/* avoid making error situations worse */
1503
	if (p->error != 0)
1504
		return;
1505
1506
	assert(value < 1<<OPSHIFT);
1507
	p->strip[pos] = OP(p->strip[pos]) | value;
1508
}
1509
1510
/*
1511
 - enlarge - enlarge the strip
1512
 == static void enlarge(register struct parse *p, sopno size);
1513
 */
1514
static void
1515
enlarge(p, size)
1516
register struct parse *p;
1517
register sopno size;
1518
{
1519
	register sop *sp;
1520
1521
	if (p->ssize >= size)
1522
		return;
1523
1524
	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1525
	if (sp == NULL) {
1526
		SETERROR(REG_ESPACE);
1527
		return;
1528
	}
1529
	p->strip = sp;
1530
	p->ssize = size;
1531
}
1532
1533
/*
1534
 - stripsnug - compact the strip
1535
 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1536
 */
1537
static void
1538
stripsnug(p, g)
1539
register struct parse *p;
1540
register struct re_guts *g;
1541
{
1542
	g->nstates = p->slen;
1543
	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1544
	if (g->strip == NULL) {
1545
		SETERROR(REG_ESPACE);
1546
		g->strip = p->strip;
1547
	}
1548
}
1549
1550
/*
1551
 - findmust - fill in must and mlen with longest mandatory literal string
1552
 == static void findmust(register struct parse *p, register struct re_guts *g);
1553
 *
1554
 * This algorithm could do fancy things like analyzing the operands of |
1555
 * for common subsequences.  Someday.  This code is simple and finds most
1556
 * of the interesting cases.
1557
 *
1558
 * Note that must and mlen got initialized during setup.
1559
 */
1560
static void
1561
findmust(p, g)
1562
struct parse *p;
1563
register struct re_guts *g;
1564
{
1565
	register sop *scan;
1566
	sop *start;
1567
	register sop *newstart;
1568
	register sopno newlen;
1569
	register sop s;
1570
	register char *cp;
1571
	register sopno i;
1572
	/* avoid making error situations worse */
1573
	if (p->error != 0)
1574
		return;
1575
1576
	/* find the longest OCHAR sequence in strip */
1577
	newlen = 0;
1578
	scan = g->strip + 1;
1579
	do {
1580
		s = *scan++;
1581
		switch (OP(s)) {
1582
		case OCHAR:		/* sequence member */
1583
			if (newlen == 0)		/* new sequence */
1584
				newstart = scan - 1;
1585
			newlen++;
1586
			break;
1587
		case OPLUS_:		/* things that don't break one */
1588
		case OLPAREN:
1589
		case ORPAREN:
1590
			break;
1591
		case OQUEST_:		/* things that must be skipped */
1592
		case OCH_:
1593
			scan--;
1594
			do {
1595
				scan += OPND(s);
1596
				s = *scan;
1597
				/* assert() interferes w debug printouts */
1598
				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1599
							OP(s) != OOR2) {
1600
					g->iflags |= BAD;
1601
					return;
1602
				}
1603
			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1604
			/* fallthrough */
1605
		default:		/* things that break a sequence */
1606
			if (newlen > g->mlen) {		/* ends one */
1607
				start = newstart;
1608
				g->mlen = newlen;
1609
			}
1610
			newlen = 0;
1611
			break;
1612
		}
1613
	} while (OP(s) != OEND);
1614
1615
	if (g->mlen == 0)		/* there isn't one */
1616
		return;
1617
1618
	/* turn it into a character string */
1619
	g->must = malloc((size_t)g->mlen + 1);
1620
	if (g->must == NULL) {		/* argh; just forget it */
1621
		g->mlen = 0;
1622
		return;
1623
	}
1624
	cp = g->must;
1625
	scan = start;
1626
	for (i = g->mlen; i > 0; i--) {
1627
		while (OP(s = *scan++) != OCHAR)
1628
			continue;
1629
		assert(cp < g->must + g->mlen);
1630
		*cp++ = (char)OPND(s);
1631
	}
1632
	assert(cp == g->must + g->mlen);
1633
	*cp++ = '\0';		/* just on general principles */
1634
}
1635
1636
/*
1637
 - pluscount - count + nesting
1638
 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1639
 */
1640
static sopno			/* nesting depth */
1641
pluscount(p, g)
1642
struct parse *p;
1643
register struct re_guts *g;
1644
{
1645
	register sop *scan;
1646
	register sop s;
1647
	register sopno plusnest = 0;
1648
	register sopno maxnest = 0;
1649
1650
	if (p->error != 0)
1651
		return(0);	/* there may not be an OEND */
1652
1653
	scan = g->strip + 1;
1654
	do {
1655
		s = *scan++;
1656
		switch (OP(s)) {
1657
		case OPLUS_:
1658
			plusnest++;
1659
			break;
1660
		case O_PLUS:
1661
			if (plusnest > maxnest)
1662
				maxnest = plusnest;
1663
			plusnest--;
1664
			break;
1665
		}
1666
	} while (OP(s) != OEND);
1667
	if (plusnest != 0)
1668
		g->iflags |= BAD;
1669
	return(maxnest);
1670
}