1
by brian
clean slate |
1 |
/*
|
2 |
*
|
|
3 |
* regexp.c - regular expression matching
|
|
4 |
*
|
|
5 |
* DESCRIPTION
|
|
6 |
*
|
|
7 |
* Underneath the reformatting and comment blocks which were added to
|
|
8 |
* make it consistent with the rest of the code, you will find a
|
|
9 |
* modified version of Henry Specer's regular expression library.
|
|
10 |
* Henry's functions were modified to provide the minimal regular
|
|
11 |
* expression matching, as required by P1003. Henry's code was
|
|
12 |
* copyrighted, and copy of the copyright message and restrictions
|
|
13 |
* are provided, verbatim, below:
|
|
14 |
*
|
|
15 |
* Copyright (c) 1986 by University of Toronto.
|
|
16 |
* Written by Henry Spencer. Not derived from licensed software.
|
|
17 |
*
|
|
18 |
* Permission is granted to anyone to use this software for any
|
|
19 |
* purpose on any computer system, and to redistribute it freely,
|
|
20 |
* subject to the following restrictions:
|
|
21 |
*
|
|
22 |
* 1. The author is not responsible for the consequences of use of
|
|
23 |
* this software, no matter how awful, even if they arise
|
|
24 |
* from defects in it.
|
|
25 |
*
|
|
26 |
* 2. The origin of this software must not be misrepresented, either
|
|
27 |
* by explicit claim or by omission.
|
|
28 |
*
|
|
29 |
* 3. Altered versions must be plainly marked as such, and must not
|
|
30 |
* be misrepresented as being the original software.
|
|
31 |
*
|
|
32 |
*
|
|
33 |
* This version modified by Ian Phillipps to return pointer to terminating
|
|
34 |
* NUL on substitution string. [ Temp mail address ex-igp@camcon.co.uk ]
|
|
35 |
*
|
|
36 |
* Altered by amylaar to support excompatible option and the
|
|
37 |
* operators \< and >\ . ( 7.Sep. 1991 )
|
|
38 |
*
|
|
39 |
* regsub altered by amylaar to take an additional parameter specifying
|
|
40 |
* maximum number of bytes that can be written to the memory region
|
|
41 |
* pointed to by dest
|
|
42 |
*
|
|
43 |
* Also altered by Fredrik Hubinette to handle the + operator and
|
|
44 |
* eight-bit chars. Mars 22 1996
|
|
45 |
*
|
|
46 |
*
|
|
47 |
* Beware that some of this code is subtly aware of the way operator
|
|
48 |
* precedence is structured in regular expressions. Serious changes in
|
|
49 |
* regular-expression syntax might require a total rethink.
|
|
50 |
*
|
|
51 |
* AUTHORS
|
|
52 |
*
|
|
53 |
* Mark H. Colburn, NAPS International (mark@jhereg.mn.org)
|
|
54 |
* Henry Spencer, University of Torronto (henry@utzoo.edu)
|
|
55 |
*
|
|
56 |
* Sponsored by The USENIX Association for public distribution.
|
|
57 |
*
|
|
58 |
*/
|
|
59 |
||
60 |
/* Headers */
|
|
61 |
#include "my_global.h" |
|
62 |
#include <ctype.h> |
|
63 |
#include "regexp.h" |
|
64 |
#ifdef __WIN__
|
|
65 |
#include <string.h> |
|
66 |
#else
|
|
67 |
#include "memory.h" |
|
68 |
#include "error.h" |
|
69 |
#endif
|
|
70 |
||
71 |
/*
|
|
72 |
* The "internal use only" fields in regexp.h are present to pass info from
|
|
73 |
* compile to execute that permits the execute phase to run lots faster on
|
|
74 |
* simple cases. They are:
|
|
75 |
*
|
|
76 |
* regstart char that must begin a match; '\0' if none obvious
|
|
77 |
* reganch is the match anchored (at beginning-of-line only)?
|
|
78 |
* regmust string (pointer into program) that match must include, or NULL
|
|
79 |
* regmlen length of regmust string
|
|
80 |
*
|
|
81 |
* Regstart and reganch permit very fast decisions on suitable starting points
|
|
82 |
* for a match, cutting down the work a lot. Regmust permits fast rejection
|
|
83 |
* of lines that cannot possibly match. The regmust tests are costly enough
|
|
84 |
* that regcomp() supplies a regmust only if the r.e. contains something
|
|
85 |
* potentially expensive (at present, the only such thing detected is * or +
|
|
86 |
* at the start of the r.e., which can involve a lot of backup). Regmlen is
|
|
87 |
* supplied because the test in regexec() needs it and regcomp() is computing
|
|
88 |
* it anyway.
|
|
89 |
*/
|
|
90 |
||
91 |
/*
|
|
92 |
* Structure for regexp "program". This is essentially a linear encoding
|
|
93 |
* of a nondeterministic finite-state machine (aka syntax charts or
|
|
94 |
* "railroad normal form" in parsing technology). Each node is an opcode
|
|
95 |
* plus a "nxt" pointer, possibly plus an operand. "Nxt" pointers of
|
|
96 |
* all nodes except BRANCH implement concatenation; a "nxt" pointer with
|
|
97 |
* a BRANCH on both ends of it is connecting two alternatives. (Here we
|
|
98 |
* have one of the subtle syntax dependencies: an individual BRANCH (as
|
|
99 |
* opposed to a collection of them) is never concatenated with anything
|
|
100 |
* because of operator precedence.) The operand of some types of node is
|
|
101 |
* a literal string; for others, it is a node leading into a sub-FSM. In
|
|
102 |
* particular, the operand of a BRANCH node is the first node of the branch.
|
|
103 |
* (NB this is *not* a tree structure: the tail of the branch connects
|
|
104 |
* to the thing following the set of BRANCHes.) The opcodes are:
|
|
105 |
*/
|
|
106 |
||
107 |
/* definition number opnd? meaning */
|
|
108 |
#define END 0 /* no End of program. */ |
|
109 |
#define BOL 1 /* no Match "" at beginning of line. */ |
|
110 |
#define EOL 2 /* no Match "" at end of line. */ |
|
111 |
#define ANY 3 /* no Match any one character. */ |
|
112 |
#define ANYOF 4 /* str Match any character in this string. */ |
|
113 |
#define ANYBUT 5 /* str Match any character not in this |
|
114 |
* string. */
|
|
115 |
#define BRANCH 6 /* node Match this alternative, or the |
|
116 |
* nxt... */
|
|
117 |
#define BACK 7 /* no Match "", "nxt" ptr points backward. */ |
|
118 |
#define EXACTLY 8 /* str Match this string. */ |
|
119 |
#define NOTHING 9 /* no Match empty string. */ |
|
120 |
#define STAR 10 /* node Match this (simple) thing 0 or more |
|
121 |
* times. */
|
|
122 |
#define WORDSTART 11 /* node matching a start of a word */ |
|
123 |
#define WORDEND 12 /* node matching an end of a word */ |
|
124 |
#define OPEN 20 /* no Mark this point in input as start of |
|
125 |
* #n. */
|
|
126 |
/* OPEN+1 is number 1, etc. */
|
|
127 |
#define CLOSE 30 /* no Analogous to OPEN. */ |
|
128 |
||
129 |
/*
|
|
130 |
* Opcode notes:
|
|
131 |
*
|
|
132 |
* BRANCH The set of branches constituting a single choice are hooked
|
|
133 |
* together with their "nxt" pointers, since precedence prevents
|
|
134 |
* anything being concatenated to any individual branch. The
|
|
135 |
* "nxt" pointer of the last BRANCH in a choice points to the
|
|
136 |
* thing following the whole choice. This is also where the
|
|
137 |
* final "nxt" pointer of each individual branch points; each
|
|
138 |
* branch starts with the operand node of a BRANCH node.
|
|
139 |
*
|
|
140 |
* BACK Normal "nxt" pointers all implicitly point forward; BACK
|
|
141 |
* exists to make loop structures possible.
|
|
142 |
*
|
|
143 |
* STAR complex '*', are implemented as circular BRANCH structures
|
|
144 |
* using BACK. Simple cases (one character per match) are
|
|
145 |
* implemented with STAR for speed and to minimize recursive
|
|
146 |
* plunges.
|
|
147 |
*
|
|
148 |
* OPEN,CLOSE ...are numbered at compile time.
|
|
149 |
*/
|
|
150 |
||
151 |
/*
|
|
152 |
* A node is one char of opcode followed by two chars of "nxt" pointer.
|
|
153 |
* "Nxt" pointers are stored as two 8-bit pieces, high order first. The
|
|
154 |
* value is a positive offset from the opcode of the node containing it.
|
|
155 |
* An operand, if any, simply follows the node. (Note that much of the
|
|
156 |
* code generation knows about this implicit relationship.)
|
|
157 |
*
|
|
158 |
* Using two bytes for the "nxt" pointer is vast overkill for most things,
|
|
159 |
* but allows patterns to get big without disasters.
|
|
160 |
*/
|
|
161 |
#define OP(p) (*(p))
|
|
162 |
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
|
|
163 |
#define OPERAND(p) ((p) + 3)
|
|
164 |
||
165 |
/*
|
|
166 |
* The first byte of the regexp internal "program" is actually this magic
|
|
167 |
* number; the start node begins in the second byte.
|
|
168 |
*/
|
|
169 |
#define MAGIC 0234
|
|
170 |
||
171 |
/*
|
|
172 |
* Utility definitions.
|
|
173 |
*/
|
|
174 |
||
175 |
#ifdef __WIN__
|
|
176 |
#define error(X,Y) fprintf(stderr, X, Y)
|
|
177 |
#endif
|
|
178 |
#define regerror(X) error("Regexp: %s\n",X);
|
|
179 |
#define SPECIAL 0x100
|
|
180 |
#define LBRAC ('('|SPECIAL)
|
|
181 |
#define RBRAC (')'|SPECIAL)
|
|
182 |
#define ASTERIX ('*'|SPECIAL)
|
|
183 |
#define PLUS ('+'|SPECIAL)
|
|
184 |
#define OR_OP ('|'|SPECIAL)
|
|
185 |
#define DOLLAR ('$'|SPECIAL)
|
|
186 |
#define DOT ('.'|SPECIAL)
|
|
187 |
#define CARET ('^'|SPECIAL)
|
|
188 |
#define LSQBRAC ('['|SPECIAL)
|
|
189 |
#define RSQBRAC (']'|SPECIAL)
|
|
190 |
#define LSHBRAC ('<'|SPECIAL)
|
|
191 |
#define RSHBRAC ('>'|SPECIAL)
|
|
192 |
#define FAIL(m) { regerror(m); return(NULL); }
|
|
193 |
#define ISMULT(c) ((c) == ASTERIX || (c)==PLUS)
|
|
194 |
#define META "^$.[()|*+\\"
|
|
195 |
#ifndef CHARBITS
|
|
196 |
#define CHARBITS 0xff
|
|
197 |
#define UCHARAT(p) ((int)*(unsigned char *)(p))
|
|
198 |
#else
|
|
199 |
#define UCHARAT(p) ((int)*(p)&CHARBITS)
|
|
200 |
#endif
|
|
201 |
#define ISWORDPART(c) ( isalnum(c) || (c) == '_' )
|
|
202 |
||
203 |
/*
|
|
204 |
* Flags to be passed up and down.
|
|
205 |
*/
|
|
206 |
#define HASWIDTH 01 /* Known never to match null string. */ |
|
207 |
#define SIMPLE 02 /* Simple enough to be STAR operand. */ |
|
208 |
#define SPSTART 04 /* Starts with * */ |
|
209 |
#define WORST 0 /* Worst case. */ |
|
210 |
#ifdef __WIN__
|
|
211 |
#define STRCHR(A,B) strchr(A,B)
|
|
212 |
#endif
|
|
213 |
||
214 |
/*
|
|
215 |
* Global work variables for regcomp().
|
|
216 |
*/
|
|
217 |
static short *regparse; /* Input-scan pointer. */ |
|
218 |
static int regnpar; /* () count. */ |
|
219 |
static char regdummy; |
|
220 |
static char *regcode; /* Code-emit pointer; ®dummy = don't. */ |
|
221 |
static long regsize; /* Code size. */ |
|
222 |
||
223 |
/*
|
|
224 |
* Forward declarations for regcomp()'s friends.
|
|
225 |
*/
|
|
226 |
#ifndef STATIC
|
|
227 |
#define STATIC static
|
|
228 |
#endif
|
|
229 |
STATIC char *reg(); |
|
230 |
STATIC char *regbranch(); |
|
231 |
STATIC char *regpiece(); |
|
232 |
STATIC char *regatom(); |
|
233 |
STATIC char *regnode(); |
|
234 |
STATIC char *regnext(); |
|
235 |
STATIC void regc(); |
|
236 |
STATIC void reginsert(); |
|
237 |
STATIC void regtail(); |
|
238 |
STATIC void regoptail(); |
|
239 |
||
240 |
/*
|
|
241 |
- regcomp - compile a regular expression into internal code
|
|
242 |
*
|
|
243 |
* We can't allocate space until we know how big the compiled form will be,
|
|
244 |
* but we can't compile it (and thus know how big it is) until we've got a
|
|
245 |
* place to put the code. So we cheat: we compile it twice, once with code
|
|
246 |
* generation turned off and size counting turned on, and once "for real".
|
|
247 |
* This also means that we don't allocate space until we are sure that the
|
|
248 |
* thing really will compile successfully, and we never have to move the
|
|
249 |
* code and thus invalidate pointers into it. (Note that it has to be in
|
|
250 |
* one piece because free() must be able to free it all.)
|
|
251 |
*
|
|
252 |
* Beware that the optimization-preparation code in here knows about some
|
|
253 |
* of the structure of the compiled regexp.
|
|
254 |
*/
|
|
255 |
regexp *regcomp(exp,excompat) |
|
256 |
char *exp; |
|
257 |
int excompat; /* \( \) operators like in unix ex */ |
|
258 |
{
|
|
259 |
register regexp *r; |
|
260 |
register char *scan; |
|
261 |
register char *longest; |
|
262 |
register int len; |
|
263 |
int flags; |
|
264 |
short *exp2,*dest,c; |
|
265 |
||
266 |
if (exp == (char *)NULL) |
|
267 |
FAIL("NULL argument"); |
|
268 |
||
269 |
exp2=(short*)malloc( (strlen(exp)+1) * (sizeof(short[8])/sizeof(char[8])) ); |
|
270 |
for ( scan=exp,dest=exp2;( c= UCHARAT(scan++)); ) { |
|
271 |
switch (c) { |
|
272 |
case '(': |
|
273 |
case ')': |
|
274 |
*dest++ = excompat ? c : c | SPECIAL; |
|
275 |
break; |
|
276 |
case '.': |
|
277 |
case '*': |
|
278 |
case '+': |
|
279 |
case '|': |
|
280 |
case '$': |
|
281 |
case '^': |
|
282 |
case '[': |
|
283 |
case ']': |
|
284 |
*dest++ = c | SPECIAL; |
|
285 |
break; |
|
286 |
case '\\': |
|
287 |
switch ( c = *scan++ ) { |
|
288 |
case '(': |
|
289 |
case ')': |
|
290 |
*dest++ = excompat ? c | SPECIAL : c; |
|
291 |
break; |
|
292 |
case '<': |
|
293 |
case '>': |
|
294 |
*dest++ = c | SPECIAL; |
|
295 |
break; |
|
296 |
case '{': |
|
297 |
case '}': |
|
298 |
FAIL("sorry, unimplemented operator"); |
|
299 |
case 'b': *dest++ = '\b'; break; |
|
300 |
case 't': *dest++ = '\t'; break; |
|
301 |
case 'r': *dest++ = '\r'; break; |
|
302 |
default: |
|
303 |
*dest++ = c; |
|
304 |
}
|
|
305 |
break; |
|
306 |
default: |
|
307 |
*dest++ = c; |
|
308 |
}
|
|
309 |
}
|
|
310 |
*dest=0; |
|
311 |
/* First pass: determine size, legality. */
|
|
312 |
regparse = exp2; |
|
313 |
regnpar = 1; |
|
314 |
regsize = 0L; |
|
315 |
regcode = ®dummy; |
|
316 |
regc(MAGIC); |
|
317 |
if (reg(0, &flags) == (char *)NULL) |
|
318 |
return ((regexp *)NULL); |
|
319 |
||
320 |
/* Small enough for pointer-storage convention? */
|
|
321 |
if (regsize >= 32767L) /* Probably could be 65535L. */ |
|
322 |
FAIL("regexp too big"); |
|
323 |
||
324 |
/* Allocate space. */
|
|
325 |
r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize); |
|
326 |
if (r == (regexp *) NULL) |
|
327 |
FAIL("out of space"); |
|
328 |
(void) bzero(r, sizeof(regexp) + (unsigned)regsize); |
|
329 |
||
330 |
/* Second pass: emit code. */
|
|
331 |
regparse = exp2; |
|
332 |
regnpar = 1; |
|
333 |
regcode = r->program; |
|
334 |
regc(MAGIC); |
|
335 |
if (reg(0, &flags) == NULL) |
|
336 |
return ((regexp *) NULL); |
|
337 |
||
338 |
/* Dig out information for optimizations. */
|
|
339 |
r->regstart = '\0'; /* Worst-case defaults. */ |
|
340 |
r->reganch = 0; |
|
341 |
r->regmust = NULL; |
|
342 |
r->regmlen = 0; |
|
343 |
scan = r->program + 1; /* First BRANCH. */ |
|
344 |
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ |
|
345 |
scan = OPERAND(scan); |
|
346 |
||
347 |
/* Starting-point info. */
|
|
348 |
if (OP(scan) == EXACTLY) |
|
349 |
r->regstart = *OPERAND(scan); |
|
350 |
else if (OP(scan) == BOL) |
|
351 |
r->reganch++; |
|
352 |
||
353 |
/*
|
|
354 |
* If there's something expensive in the r.e., find the longest
|
|
355 |
* literal string that must appear and make it the regmust. Resolve
|
|
356 |
* ties in favor of later strings, since the regstart check works
|
|
357 |
* with the beginning of the r.e. and avoiding duplication
|
|
358 |
* strengthens checking. Not a strong reason, but sufficient in the
|
|
359 |
* absence of others.
|
|
360 |
*/
|
|
361 |
if (flags & SPSTART) { |
|
362 |
longest = NULL; |
|
363 |
len = 0; |
|
364 |
for (; scan != NULL; scan = regnext(scan)) |
|
365 |
if (OP(scan) == EXACTLY && |
|
366 |
(int)strlen(OPERAND(scan)) >= len) { |
|
367 |
longest = OPERAND(scan); |
|
368 |
len = strlen(OPERAND(scan)); |
|
369 |
}
|
|
370 |
r->regmust = longest; |
|
371 |
r->regmlen = len; |
|
372 |
}
|
|
373 |
}
|
|
374 |
free((char*)exp2); |
|
375 |
return (r); |
|
376 |
}
|
|
377 |
||
378 |
/*
|
|
379 |
- reg - regular expression, i.e. main body or parenthesized thing
|
|
380 |
*
|
|
381 |
* Caller must absorb opening parenthesis.
|
|
382 |
*
|
|
383 |
* Combining parenthesis handling with the base level of regular expression
|
|
384 |
* is a trifle forced, but the need to tie the tails of the branches to what
|
|
385 |
* follows makes it hard to avoid.
|
|
386 |
*/
|
|
387 |
static char *reg(paren, flagp) |
|
388 |
int paren; /* Parenthesized? */ |
|
389 |
int *flagp; |
|
390 |
{
|
|
391 |
register char *ret; |
|
392 |
register char *br; |
|
393 |
register char *ender; |
|
394 |
register int parno=0; /* make gcc happy */ |
|
395 |
int flags; |
|
396 |
||
397 |
*flagp = HASWIDTH; /* Tentatively. */ |
|
398 |
||
399 |
/* Make an OPEN node, if parenthesized. */
|
|
400 |
if (paren) { |
|
401 |
if (regnpar >= NSUBEXP) |
|
402 |
FAIL("too many ()"); |
|
403 |
parno = regnpar; |
|
404 |
regnpar++; |
|
405 |
ret = regnode(OPEN + parno); |
|
406 |
} else |
|
407 |
ret = (char *)NULL; |
|
408 |
||
409 |
/* Pick up the branches, linking them together. */
|
|
410 |
br = regbranch(&flags); |
|
411 |
if (br == (char *)NULL) |
|
412 |
return ((char *)NULL); |
|
413 |
if (ret != (char *)NULL) |
|
414 |
regtail(ret, br); /* OPEN -> first. */ |
|
415 |
else
|
|
416 |
ret = br; |
|
417 |
if (!(flags & HASWIDTH)) |
|
418 |
*flagp &= ~HASWIDTH; |
|
419 |
*flagp |= flags & SPSTART; |
|
420 |
while (*regparse == OR_OP) { |
|
421 |
regparse++; |
|
422 |
br = regbranch(&flags); |
|
423 |
if (br == (char *)NULL) |
|
424 |
return ((char *)NULL); |
|
425 |
regtail(ret, br); /* BRANCH -> BRANCH. */ |
|
426 |
if (!(flags & HASWIDTH)) |
|
427 |
*flagp &= ~HASWIDTH; |
|
428 |
*flagp |= flags & SPSTART; |
|
429 |
}
|
|
430 |
||
431 |
/* Make a closing node, and hook it on the end. */
|
|
432 |
ender = regnode((paren) ? CLOSE + parno : END); |
|
433 |
regtail(ret, ender); |
|
434 |
||
435 |
/* Hook the tails of the branches to the closing node. */
|
|
436 |
for (br = ret; br != (char *)NULL; br = regnext(br)) |
|
437 |
regoptail(br, ender); |
|
438 |
||
439 |
/* Check for proper termination. */
|
|
440 |
if (paren && *regparse++ != RBRAC) { |
|
441 |
FAIL("unmatched ()"); |
|
442 |
} else if (!paren && *regparse != '\0') { |
|
443 |
if (*regparse == RBRAC) { |
|
444 |
FAIL("unmatched ()"); |
|
445 |
} else |
|
446 |
FAIL("junk on end");/* "Can't happen". */ |
|
447 |
/* NOTREACHED */
|
|
448 |
}
|
|
449 |
return (ret); |
|
450 |
}
|
|
451 |
||
452 |
/*
|
|
453 |
- regbranch - one alternative of an | operator
|
|
454 |
*
|
|
455 |
* Implements the concatenation operator.
|
|
456 |
*/
|
|
457 |
static char *regbranch(flagp) |
|
458 |
int *flagp; |
|
459 |
{
|
|
460 |
register char *ret; |
|
461 |
register char *chain; |
|
462 |
register char *latest; |
|
463 |
int flags; |
|
464 |
||
465 |
*flagp = WORST; /* Tentatively. */ |
|
466 |
||
467 |
ret = regnode(BRANCH); |
|
468 |
chain = (char *)NULL; |
|
469 |
while (*regparse != '\0' && *regparse != OR_OP && *regparse != RBRAC) { |
|
470 |
latest = regpiece(&flags); |
|
471 |
if (latest == (char *)NULL) |
|
472 |
return ((char *)NULL); |
|
473 |
*flagp |= flags & HASWIDTH; |
|
474 |
if (chain == (char *)NULL) /* First piece. */ |
|
475 |
*flagp |= flags & SPSTART; |
|
476 |
else
|
|
477 |
regtail(chain, latest); |
|
478 |
chain = latest; |
|
479 |
}
|
|
480 |
if (chain == (char *)NULL) /* Loop ran zero times. */ |
|
481 |
regnode(NOTHING); |
|
482 |
||
483 |
return (ret); |
|
484 |
}
|
|
485 |
||
486 |
/*
|
|
487 |
- regpiece - something followed by possible [*]
|
|
488 |
*
|
|
489 |
* Note that the branching code sequence used for * is somewhat optimized:
|
|
490 |
* they use the same NOTHING node as both the endmarker for their branch
|
|
491 |
* list and the body of the last branch. It might seem that this node could
|
|
492 |
* be dispensed with entirely, but the endmarker role is not redundant.
|
|
493 |
*/
|
|
494 |
static char *regpiece(flagp) |
|
495 |
int *flagp; |
|
496 |
{
|
|
497 |
register char *ret; |
|
498 |
register short op; |
|
499 |
/* register char *nxt; */
|
|
500 |
int flags; |
|
501 |
||
502 |
ret = regatom(&flags); |
|
503 |
if (ret == (char *)NULL) |
|
504 |
return ((char *)NULL); |
|
505 |
||
506 |
op = *regparse; |
|
507 |
if (!ISMULT(op)) { |
|
508 |
*flagp = flags; |
|
509 |
return (ret); |
|
510 |
}
|
|
511 |
if (!(flags & HASWIDTH)) |
|
512 |
FAIL("* or + operand could be empty"); |
|
513 |
*flagp = (WORST | SPSTART); |
|
514 |
||
515 |
if(op == ASTERIX) |
|
516 |
{
|
|
517 |
if (flags & SIMPLE) |
|
518 |
{
|
|
519 |
reginsert(STAR, ret); |
|
520 |
}
|
|
521 |
else
|
|
522 |
{
|
|
523 |
/* Emit x* as (x&|), where & means "self". */
|
|
524 |
reginsert(BRANCH, ret); /* Either x */ |
|
525 |
regoptail(ret, regnode(BACK)); /* and loop */ |
|
526 |
regoptail(ret, ret); /* back */ |
|
527 |
regtail(ret, regnode(BRANCH)); /* or */ |
|
528 |
regtail(ret, regnode(NOTHING)); /* null. */ |
|
529 |
}
|
|
530 |
}
|
|
531 |
else if(op == PLUS) |
|
532 |
{
|
|
533 |
/* Emit a+ as (a&) where & means "self" /Fredrik Hubinette */
|
|
534 |
char *tmp; |
|
535 |
tmp=regnode(BACK); |
|
536 |
reginsert(BRANCH, tmp); |
|
537 |
regtail(ret, tmp); |
|
538 |
regoptail(tmp, ret); |
|
539 |
regtail(ret, regnode(BRANCH)); |
|
540 |
regtail(ret, regnode(NOTHING)); |
|
541 |
}
|
|
542 |
||
543 |
regparse++; |
|
544 |
if (ISMULT(*regparse)) |
|
545 |
FAIL("nested * or +"); |
|
546 |
||
547 |
return (ret); |
|
548 |
}
|
|
549 |
||
550 |
||
551 |
/*
|
|
552 |
- regatom - the lowest level
|
|
553 |
*
|
|
554 |
* Optimization: gobbles an entire sequence of ordinary characters so that
|
|
555 |
* it can turn them into a single node, which is smaller to store and
|
|
556 |
* faster to run.
|
|
557 |
*/
|
|
558 |
static char *regatom(flagp) |
|
559 |
int *flagp; |
|
560 |
{
|
|
561 |
register char *ret; |
|
562 |
int flags; |
|
563 |
||
564 |
*flagp = WORST; /* Tentatively. */ |
|
565 |
||
566 |
switch (*regparse++) { |
|
567 |
case CARET: |
|
568 |
ret = regnode(BOL); |
|
569 |
break; |
|
570 |
case DOLLAR: |
|
571 |
ret = regnode(EOL); |
|
572 |
break; |
|
573 |
case DOT: |
|
574 |
ret = regnode(ANY); |
|
575 |
*flagp |= HASWIDTH | SIMPLE; |
|
576 |
break; |
|
577 |
case LSHBRAC: |
|
578 |
ret = regnode(WORDSTART); |
|
579 |
break; |
|
580 |
case RSHBRAC: |
|
581 |
ret = regnode(WORDEND); |
|
582 |
break; |
|
583 |
case LSQBRAC:{ |
|
584 |
register int class; |
|
585 |
register int classend; |
|
586 |
||
587 |
if (*regparse == CARET) { /* Complement of range. */ |
|
588 |
ret = regnode(ANYBUT); |
|
589 |
regparse++; |
|
590 |
} else |
|
591 |
ret = regnode(ANYOF); |
|
592 |
if (*regparse == RSQBRAC || *regparse == '-') |
|
593 |
regc(*regparse++); |
|
594 |
while (*regparse != '\0' && *regparse != RSQBRAC) { |
|
595 |
if (*regparse == '-') { |
|
596 |
regparse++; |
|
597 |
if (*regparse == RSQBRAC || *regparse == '\0') |
|
598 |
regc('-'); |
|
599 |
else { |
|
600 |
class = (CHARBITS & *(regparse - 2)) + 1; |
|
601 |
classend = (CHARBITS & *(regparse)); |
|
602 |
if (class > classend + 1) |
|
603 |
FAIL("invalid [] range"); |
|
604 |
for (; class <= classend; class++) |
|
605 |
regc(class); |
|
606 |
regparse++; |
|
607 |
}
|
|
608 |
} else |
|
609 |
regc(*regparse++); |
|
610 |
}
|
|
611 |
regc('\0'); |
|
612 |
if (*regparse != RSQBRAC) |
|
613 |
FAIL("unmatched []"); |
|
614 |
regparse++; |
|
615 |
*flagp |= HASWIDTH | SIMPLE; |
|
616 |
}
|
|
617 |
break; |
|
618 |
case LBRAC: |
|
619 |
ret = reg(1, &flags); |
|
620 |
if (ret == (char *)NULL) |
|
621 |
return ((char *)NULL); |
|
622 |
*flagp |= flags & (HASWIDTH | SPSTART); |
|
623 |
break; |
|
624 |
case '\0': |
|
625 |
case OR_OP: |
|
626 |
case RBRAC: |
|
627 |
FAIL("internal urp"); /* Supposed to be caught earlier. */ |
|
628 |
||
629 |
case ASTERIX: |
|
630 |
FAIL("* follows nothing\n"); |
|
631 |
||
632 |
default:{ |
|
633 |
register int len; |
|
634 |
register short ender; |
|
635 |
||
636 |
regparse--; |
|
637 |
for (len=0; regparse[len] && |
|
638 |
!(regparse[len]&SPECIAL) && regparse[len] != RSQBRAC; len++) ; |
|
639 |
if (len <= 0) |
|
640 |
{
|
|
641 |
FAIL("internal disaster"); |
|
642 |
}
|
|
643 |
ender = *(regparse + len); |
|
644 |
if (len > 1 && ISMULT(ender)) |
|
645 |
len--; /* Back off clear of * operand. */ |
|
646 |
*flagp |= HASWIDTH; |
|
647 |
if (len == 1) |
|
648 |
*flagp |= SIMPLE; |
|
649 |
ret = regnode(EXACTLY); |
|
650 |
while (len > 0) { |
|
651 |
regc(*regparse++); |
|
652 |
len--; |
|
653 |
}
|
|
654 |
regc('\0'); |
|
655 |
}
|
|
656 |
break; |
|
657 |
}
|
|
658 |
||
659 |
return (ret); |
|
660 |
}
|
|
661 |
||
662 |
/*
|
|
663 |
- regnode - emit a node
|
|
664 |
*/
|
|
665 |
static char *regnode(op) |
|
666 |
char op; |
|
667 |
{
|
|
668 |
register char *ret; |
|
669 |
register char *ptr; |
|
670 |
||
671 |
ret = regcode; |
|
672 |
if (ret == ®dummy) { |
|
673 |
regsize += 3; |
|
674 |
return (ret); |
|
675 |
}
|
|
676 |
ptr = ret; |
|
677 |
*ptr++ = op; |
|
678 |
*ptr++ = '\0'; /* Null "nxt" pointer. */ |
|
679 |
*ptr++ = '\0'; |
|
680 |
regcode = ptr; |
|
681 |
||
682 |
return (ret); |
|
683 |
}
|
|
684 |
||
685 |
/*
|
|
686 |
- regc - emit (if appropriate) a byte of code
|
|
687 |
*/
|
|
688 |
static void regc(b) |
|
689 |
char b; |
|
690 |
{
|
|
691 |
if (regcode != ®dummy) |
|
692 |
*regcode++ = b; |
|
693 |
else
|
|
694 |
regsize++; |
|
695 |
}
|
|
696 |
||
697 |
/*
|
|
698 |
- reginsert - insert an operator in front of already-emitted operand
|
|
699 |
*
|
|
700 |
* Means relocating the operand.
|
|
701 |
*/
|
|
702 |
static void reginsert(op, opnd) |
|
703 |
char op; |
|
704 |
char *opnd; |
|
705 |
{
|
|
706 |
register char *src; |
|
707 |
register char *dst; |
|
708 |
register char *place; |
|
709 |
||
710 |
if (regcode == ®dummy) { |
|
711 |
regsize += 3; |
|
712 |
return; |
|
713 |
}
|
|
714 |
src = regcode; |
|
715 |
regcode += 3; |
|
716 |
dst = regcode; |
|
717 |
while (src > opnd) |
|
718 |
*--dst = *--src; |
|
719 |
||
720 |
place = opnd; /* Op node, where operand used to be. */ |
|
721 |
*place++ = op; |
|
722 |
*place++ = '\0'; |
|
723 |
*place++ = '\0'; |
|
724 |
}
|
|
725 |
||
726 |
/*
|
|
727 |
- regtail - set the next-pointer at the end of a node chain
|
|
728 |
*/
|
|
729 |
static void regtail(p, val) |
|
730 |
char *p; |
|
731 |
char *val; |
|
732 |
{
|
|
733 |
register char *scan; |
|
734 |
register char *temp; |
|
735 |
register int offset; |
|
736 |
||
737 |
if (p == ®dummy) |
|
738 |
return; |
|
739 |
||
740 |
/* Find last node. */
|
|
741 |
scan = p; |
|
742 |
for (;;) { |
|
743 |
temp = regnext(scan); |
|
744 |
if (temp == (char *)NULL) |
|
745 |
break; |
|
746 |
scan = temp; |
|
747 |
}
|
|
748 |
||
749 |
if (OP(scan) == BACK) |
|
750 |
offset = scan - val; |
|
751 |
else
|
|
752 |
offset = val - scan; |
|
753 |
*(scan + 1) = (offset >> 8) & 0377; |
|
754 |
*(scan + 2) = offset & 0377; |
|
755 |
}
|
|
756 |
||
757 |
/*
|
|
758 |
- regoptail - regtail on operand of first argument; nop if operandless
|
|
759 |
*/
|
|
760 |
static void regoptail(p, val) |
|
761 |
char *p; |
|
762 |
char *val; |
|
763 |
{
|
|
764 |
/* "Operandless" and "op != BRANCH" are synonymous in practice. */
|
|
765 |
if (p == (char *)NULL || p == ®dummy || OP(p) != BRANCH) |
|
766 |
return; |
|
767 |
regtail(OPERAND(p), val); |
|
768 |
}
|
|
769 |
||
770 |
/*
|
|
771 |
* regexec and friends
|
|
772 |
*/
|
|
773 |
||
774 |
/*
|
|
775 |
* Global work variables for regexec().
|
|
776 |
*/
|
|
777 |
static char *reginput; /* String-input pointer. */ |
|
778 |
static char *regbol; /* Beginning of input, for ^ check. */ |
|
779 |
static char **regstartp; /* Pointer to startp array. */ |
|
780 |
static char **regendp; /* Ditto for endp. */ |
|
781 |
||
782 |
/*
|
|
783 |
* Forwards.
|
|
784 |
*/
|
|
785 |
STATIC int regtry(); |
|
786 |
STATIC int regmatch(); |
|
787 |
STATIC int regrepeat(); |
|
788 |
||
789 |
#ifdef DEBUG
|
|
790 |
int regnarrate = 0; |
|
791 |
void regdump(); |
|
792 |
STATIC char *regprop(); |
|
793 |
#endif
|
|
794 |
||
795 |
/*
|
|
796 |
- regexec - match a regexp against a string
|
|
797 |
*/
|
|
798 |
int regexec(prog, string) |
|
799 |
register regexp *prog; |
|
800 |
register char *string; |
|
801 |
{
|
|
802 |
register char *s; |
|
803 |
||
804 |
/* Be paranoid... */
|
|
805 |
if (prog == (regexp *)NULL || string == (char *)NULL) { |
|
806 |
regerror("NULL parameter"); |
|
807 |
return (0); |
|
808 |
}
|
|
809 |
/* Check validity of program. */
|
|
810 |
if (UCHARAT(prog->program) != MAGIC) { |
|
811 |
regerror("corrupted program"); |
|
812 |
return (0); |
|
813 |
}
|
|
814 |
/* If there is a "must appear" string, look for it. */
|
|
815 |
if (prog->regmust != (char *)NULL) { |
|
816 |
s = string; |
|
817 |
while ((s = STRCHR(s, prog->regmust[0])) != (char *)NULL) { |
|
818 |
if (strncmp(s, prog->regmust, prog->regmlen) == 0) |
|
819 |
break; /* Found it. */ |
|
820 |
s++; |
|
821 |
}
|
|
822 |
if (s == (char *)NULL) /* Not present. */ |
|
823 |
return (0); |
|
824 |
}
|
|
825 |
/* Mark beginning of line for ^ . */
|
|
826 |
regbol = string; |
|
827 |
||
828 |
/* Simplest case: anchored match need be tried only once. */
|
|
829 |
if (prog->reganch) |
|
830 |
return (regtry(prog, string)); |
|
831 |
||
832 |
/* Messy cases: unanchored match. */
|
|
833 |
s = string; |
|
834 |
if (prog->regstart != '\0') |
|
835 |
/* We know what char it must start with. */
|
|
836 |
while ((s = STRCHR(s, prog->regstart)) != (char *)NULL) { |
|
837 |
if (regtry(prog, s)) |
|
838 |
return (1); |
|
839 |
s++; |
|
840 |
}
|
|
841 |
else
|
|
842 |
/* We don't -- general case. */
|
|
843 |
do { |
|
844 |
if (regtry(prog, s)) |
|
845 |
return (1); |
|
846 |
} while (*s++ != '\0'); |
|
847 |
||
848 |
/* Failure. */
|
|
849 |
return (0); |
|
850 |
}
|
|
851 |
||
852 |
/*
|
|
853 |
- regtry - try match at specific point
|
|
854 |
*/
|
|
855 |
||
856 |
static int regtry(regexp *prog, char *string) |
|
857 |
{
|
|
858 |
register int i; |
|
859 |
register char **sp; |
|
860 |
register char **ep; |
|
861 |
||
862 |
reginput = string; |
|
863 |
regstartp = prog->startp; |
|
864 |
regendp = prog->endp; |
|
865 |
||
866 |
sp = prog->startp; |
|
867 |
ep = prog->endp; |
|
868 |
for (i = NSUBEXP; i > 0; i--) { |
|
869 |
*sp++ = (char *)NULL; |
|
870 |
*ep++ = (char *)NULL; |
|
871 |
}
|
|
872 |
if (regmatch(prog->program + 1)) { |
|
873 |
prog->startp[0] = string; |
|
874 |
prog->endp[0] = reginput; |
|
875 |
return (1); |
|
876 |
} else |
|
877 |
return (0); |
|
878 |
}
|
|
879 |
||
880 |
/*
|
|
881 |
- regmatch - main matching routine
|
|
882 |
*
|
|
883 |
* Conceptually the strategy is simple: check to see whether the current
|
|
884 |
* node matches, call self recursively to see whether the rest matches,
|
|
885 |
* and then act accordingly. In practice we make some effort to avoid
|
|
886 |
* recursion, in particular by going through "ordinary" nodes (that don't
|
|
887 |
* need to know whether the rest of the match failed) by a loop instead of
|
|
888 |
* by recursion.
|
|
889 |
*/
|
|
890 |
||
891 |
static int regmatch(char *prog) |
|
892 |
{
|
|
893 |
register char *scan; /* Current node. */ |
|
894 |
char *nxt; /* nxt node. */ |
|
895 |
||
896 |
scan = prog; |
|
897 |
#ifdef DEBUG
|
|
898 |
if (scan != (char *)NULL && regnarrate) |
|
899 |
fprintf(stderr, "%s(\n", regprop(scan)); |
|
900 |
#endif
|
|
901 |
while (scan != (char *)NULL) { |
|
902 |
#ifdef DEBUG
|
|
903 |
if (regnarrate) |
|
904 |
fprintf(stderr, "%s...\n", regprop(scan)); |
|
905 |
#endif
|
|
906 |
nxt = regnext(scan); |
|
907 |
||
908 |
switch (OP(scan)) { |
|
909 |
case BOL: |
|
910 |
if (reginput != regbol) |
|
911 |
return (0); |
|
912 |
break; |
|
913 |
case EOL: |
|
914 |
if (*reginput != '\0') |
|
915 |
return (0); |
|
916 |
break; |
|
917 |
case ANY: |
|
918 |
if (*reginput == '\0') |
|
919 |
return (0); |
|
920 |
reginput++; |
|
921 |
break; |
|
922 |
case WORDSTART: |
|
923 |
if (reginput == regbol) |
|
924 |
break; |
|
925 |
if (*reginput == '\0' || |
|
926 |
ISWORDPART( *(reginput-1) ) || !ISWORDPART( *reginput ) ) |
|
927 |
return (0); |
|
928 |
break; |
|
929 |
case WORDEND: |
|
930 |
if (*reginput == '\0') |
|
931 |
break; |
|
932 |
if ( reginput == regbol || |
|
933 |
!ISWORDPART( *(reginput-1) ) || ISWORDPART( *reginput ) ) |
|
934 |
return (0); |
|
935 |
break; |
|
936 |
case EXACTLY:{ |
|
937 |
register int len; |
|
938 |
register char *opnd; |
|
939 |
||
940 |
opnd = OPERAND(scan); |
|
941 |
/* Inline the first character, for speed. */
|
|
942 |
if (*opnd != *reginput) |
|
943 |
return (0); |
|
944 |
len = strlen(opnd); |
|
945 |
if (len > 1 && strncmp(opnd, reginput, len) != 0) |
|
946 |
return (0); |
|
947 |
reginput += len; |
|
948 |
}
|
|
949 |
break; |
|
950 |
case ANYOF: |
|
951 |
if (*reginput == '\0' || |
|
952 |
STRCHR(OPERAND(scan), *reginput) == (char *)NULL) |
|
953 |
return (0); |
|
954 |
reginput++; |
|
955 |
break; |
|
956 |
case ANYBUT: |
|
957 |
if (*reginput == '\0' || |
|
958 |
STRCHR(OPERAND(scan), *reginput) != (char *)NULL) |
|
959 |
return (0); |
|
960 |
reginput++; |
|
961 |
break; |
|
962 |
case NOTHING: |
|
963 |
break; |
|
964 |
case BACK: |
|
965 |
break; |
|
966 |
case OPEN + 1: |
|
967 |
case OPEN + 2: |
|
968 |
case OPEN + 3: |
|
969 |
case OPEN + 4: |
|
970 |
case OPEN + 5: |
|
971 |
case OPEN + 6: |
|
972 |
case OPEN + 7: |
|
973 |
case OPEN + 8: |
|
974 |
case OPEN + 9:{ |
|
975 |
register int no; |
|
976 |
register char *save; |
|
977 |
||
978 |
no = OP(scan) - OPEN; |
|
979 |
save = reginput; |
|
980 |
||
981 |
if (regmatch(nxt)) { |
|
982 |
/*
|
|
983 |
* Don't set startp if some later invocation of the same
|
|
984 |
* parentheses already has.
|
|
985 |
*/
|
|
986 |
if (regstartp[no] == (char *)NULL) |
|
987 |
regstartp[no] = save; |
|
988 |
return (1); |
|
989 |
} else |
|
990 |
return (0); |
|
991 |
}
|
|
992 |
||
993 |
case CLOSE + 1: |
|
994 |
case CLOSE + 2: |
|
995 |
case CLOSE + 3: |
|
996 |
case CLOSE + 4: |
|
997 |
case CLOSE + 5: |
|
998 |
case CLOSE + 6: |
|
999 |
case CLOSE + 7: |
|
1000 |
case CLOSE + 8: |
|
1001 |
case CLOSE + 9:{ |
|
1002 |
register int no; |
|
1003 |
register char *save; |
|
1004 |
||
1005 |
no = OP(scan) - CLOSE; |
|
1006 |
save = reginput; |
|
1007 |
||
1008 |
if (regmatch(nxt)) { |
|
1009 |
/*
|
|
1010 |
* Don't set endp if some later invocation of the same
|
|
1011 |
* parentheses already has.
|
|
1012 |
*/
|
|
1013 |
if (regendp[no] == (char *)NULL) |
|
1014 |
regendp[no] = save; |
|
1015 |
return (1); |
|
1016 |
} else |
|
1017 |
return (0); |
|
1018 |
}
|
|
1019 |
||
1020 |
case BRANCH:{ |
|
1021 |
register char *save; |
|
1022 |
||
1023 |
if (OP(nxt) != BRANCH) /* No choice. */ |
|
1024 |
nxt = OPERAND(scan); /* Avoid recursion. */ |
|
1025 |
else { |
|
1026 |
do { |
|
1027 |
save = reginput; |
|
1028 |
if (regmatch(OPERAND(scan))) |
|
1029 |
return (1); |
|
1030 |
reginput = save; |
|
1031 |
scan = regnext(scan); |
|
1032 |
} while (scan != (char *)NULL && OP(scan) == BRANCH); |
|
1033 |
return (0); |
|
1034 |
/* NOTREACHED */
|
|
1035 |
}
|
|
1036 |
}
|
|
1037 |
break; |
|
1038 |
case STAR:{ |
|
1039 |
register char nextch; |
|
1040 |
register int no; |
|
1041 |
register char *save; |
|
1042 |
register int minimum; |
|
1043 |
||
1044 |
/*
|
|
1045 |
* Lookahead to avoid useless match attempts when we know
|
|
1046 |
* what character comes next.
|
|
1047 |
*/
|
|
1048 |
nextch = '\0'; |
|
1049 |
if (OP(nxt) == EXACTLY) |
|
1050 |
nextch = *OPERAND(nxt); |
|
1051 |
minimum = (OP(scan) == STAR) ? 0 : 1; |
|
1052 |
save = reginput; |
|
1053 |
no = regrepeat(OPERAND(scan)); |
|
1054 |
while (no >= minimum) { |
|
1055 |
/* If it could work, try it. */
|
|
1056 |
if (nextch == '\0' || *reginput == nextch) |
|
1057 |
if (regmatch(nxt)) |
|
1058 |
return (1); |
|
1059 |
/* Couldn't or didn't -- back up. */
|
|
1060 |
no--; |
|
1061 |
reginput = save + no; |
|
1062 |
}
|
|
1063 |
return (0); |
|
1064 |
}
|
|
1065 |
||
1066 |
case END: |
|
1067 |
return (1); /* Success! */ |
|
1068 |
||
1069 |
default: |
|
1070 |
regerror("memory corruption"); |
|
1071 |
return (0); |
|
1072 |
||
1073 |
}
|
|
1074 |
||
1075 |
scan = nxt; |
|
1076 |
}
|
|
1077 |
||
1078 |
/*
|
|
1079 |
* We get here only if there's trouble -- normally "case END" is the
|
|
1080 |
* terminating point.
|
|
1081 |
*/
|
|
1082 |
regerror("corrupted pointers"); |
|
1083 |
return (0); |
|
1084 |
}
|
|
1085 |
||
1086 |
/*
|
|
1087 |
- regrepeat - repeatedly match something simple, report how many
|
|
1088 |
*/
|
|
1089 |
||
1090 |
static int regrepeat(char *p) |
|
1091 |
{
|
|
1092 |
register int count = 0; |
|
1093 |
register char *scan; |
|
1094 |
register char *opnd; |
|
1095 |
||
1096 |
scan = reginput; |
|
1097 |
opnd = OPERAND(p); |
|
1098 |
switch (OP(p)) { |
|
1099 |
case ANY: |
|
1100 |
count = strlen(scan); |
|
1101 |
scan += count; |
|
1102 |
break; |
|
1103 |
case EXACTLY: |
|
1104 |
while (*opnd == *scan) { |
|
1105 |
count++; |
|
1106 |
scan++; |
|
1107 |
}
|
|
1108 |
break; |
|
1109 |
case ANYOF: |
|
1110 |
while (*scan != '\0' && STRCHR(opnd, *scan) != (char *)NULL) { |
|
1111 |
count++; |
|
1112 |
scan++; |
|
1113 |
}
|
|
1114 |
break; |
|
1115 |
case ANYBUT: |
|
1116 |
while (*scan != '\0' && STRCHR(opnd, *scan) == (char *)NULL) { |
|
1117 |
count++; |
|
1118 |
scan++; |
|
1119 |
}
|
|
1120 |
break; |
|
1121 |
default: /* Oh dear. Called inappropriately. */ |
|
1122 |
regerror("internal foulup"); |
|
1123 |
count = 0; /* Best compromise. */ |
|
1124 |
break; |
|
1125 |
}
|
|
1126 |
reginput = scan; |
|
1127 |
||
1128 |
return (count); |
|
1129 |
}
|
|
1130 |
||
1131 |
||
1132 |
/*
|
|
1133 |
- regnext - dig the "nxt" pointer out of a node
|
|
1134 |
*/
|
|
1135 |
||
1136 |
static char *regnext(register char *p) |
|
1137 |
{
|
|
1138 |
register int offset; |
|
1139 |
||
1140 |
if (p == ®dummy) |
|
1141 |
return ((char *)NULL); |
|
1142 |
||
1143 |
offset = NEXT(p); |
|
1144 |
if (offset == 0) |
|
1145 |
return ((char *)NULL); |
|
1146 |
||
1147 |
if (OP(p) == BACK) |
|
1148 |
return (p - offset); |
|
1149 |
else
|
|
1150 |
return (p + offset); |
|
1151 |
}
|
|
1152 |
||
1153 |
#ifdef DEBUG
|
|
1154 |
||
1155 |
STATIC char *regprop(); |
|
1156 |
||
1157 |
/*
|
|
1158 |
- regdump - dump a regexp onto stdout in vaguely comprehensible form
|
|
1159 |
*/
|
|
1160 |
void regdump(regexp *r) |
|
1161 |
{
|
|
1162 |
register char *s; |
|
1163 |
register char op = EXACTLY; /* Arbitrary non-END op. */ |
|
1164 |
register char *nxt; |
|
1165 |
||
1166 |
s = r->program + 1; |
|
1167 |
while (op != END) { /* While that wasn't END last time... */ |
|
1168 |
op = OP(s); |
|
1169 |
printf("%2ld%s", (long)(s - r->program), regprop(s)); /* Where, what. */ |
|
1170 |
nxt = regnext(s); |
|
1171 |
if (nxt == (char *)NULL) /* nxt ptr. */ |
|
1172 |
printf("(0)"); |
|
1173 |
else
|
|
1174 |
printf("(%ld)", (long)( (s - r->program) + (nxt - s))); |
|
1175 |
s += 3; |
|
1176 |
if (op == ANYOF || op == ANYBUT || op == EXACTLY) { |
|
1177 |
/* Literal string, where present. */
|
|
1178 |
while (*s != '\0') { |
|
1179 |
putchar(*s); |
|
1180 |
s++; |
|
1181 |
}
|
|
1182 |
s++; |
|
1183 |
}
|
|
1184 |
putchar('\n'); |
|
1185 |
}
|
|
1186 |
||
1187 |
/* Header fields of interest. */
|
|
1188 |
if (r->regstart != '\0') |
|
1189 |
printf("start `%c' ", r->regstart); |
|
1190 |
if (r->reganch) |
|
1191 |
printf("anchored "); |
|
1192 |
if (r->regmust != (char *)NULL) |
|
1193 |
printf("must have \"%s\"", r->regmust); |
|
1194 |
printf("\n"); |
|
1195 |
}
|
|
1196 |
||
1197 |
/*
|
|
1198 |
- regprop - printable representation of opcode
|
|
1199 |
*/
|
|
1200 |
||
1201 |
static char *regprop(char *op) |
|
1202 |
{
|
|
1203 |
register char *p; |
|
1204 |
static char buf[50]; |
|
1205 |
||
1206 |
strcpy(buf, ":"); |
|
1207 |
||
1208 |
switch (OP(op)) { |
|
1209 |
case BOL: |
|
1210 |
p = "BOL"; |
|
1211 |
break; |
|
1212 |
case EOL: |
|
1213 |
p = "EOL"; |
|
1214 |
break; |
|
1215 |
case ANY: |
|
1216 |
p = "ANY"; |
|
1217 |
break; |
|
1218 |
case ANYOF: |
|
1219 |
p = "ANYOF"; |
|
1220 |
break; |
|
1221 |
case ANYBUT: |
|
1222 |
p = "ANYBUT"; |
|
1223 |
break; |
|
1224 |
case BRANCH: |
|
1225 |
p = "BRANCH"; |
|
1226 |
break; |
|
1227 |
case EXACTLY: |
|
1228 |
p = "EXACTLY"; |
|
1229 |
break; |
|
1230 |
case NOTHING: |
|
1231 |
p = "NOTHING"; |
|
1232 |
break; |
|
1233 |
case BACK: |
|
1234 |
p = "BACK"; |
|
1235 |
break; |
|
1236 |
case END: |
|
1237 |
p = "END"; |
|
1238 |
break; |
|
1239 |
case OPEN + 1: |
|
1240 |
case OPEN + 2: |
|
1241 |
case OPEN + 3: |
|
1242 |
case OPEN + 4: |
|
1243 |
case OPEN + 5: |
|
1244 |
case OPEN + 6: |
|
1245 |
case OPEN + 7: |
|
1246 |
case OPEN + 8: |
|
1247 |
case OPEN + 9: |
|
1248 |
sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN); |
|
1249 |
p = (char *)NULL; |
|
1250 |
break; |
|
1251 |
case CLOSE + 1: |
|
1252 |
case CLOSE + 2: |
|
1253 |
case CLOSE + 3: |
|
1254 |
case CLOSE + 4: |
|
1255 |
case CLOSE + 5: |
|
1256 |
case CLOSE + 6: |
|
1257 |
case CLOSE + 7: |
|
1258 |
case CLOSE + 8: |
|
1259 |
case CLOSE + 9: |
|
1260 |
sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE); |
|
1261 |
p = (char *)NULL; |
|
1262 |
break; |
|
1263 |
case STAR: |
|
1264 |
p = "STAR"; |
|
1265 |
break; |
|
1266 |
default: |
|
1267 |
regerror("corrupted opcode"); |
|
1268 |
p=(char *)NULL; |
|
1269 |
break; |
|
1270 |
}
|
|
1271 |
if (p != (char *)NULL) |
|
1272 |
strcat(buf, p); |
|
1273 |
return (buf); |
|
1274 |
}
|
|
1275 |
#endif
|
|
1276 |
||
1277 |
/*
|
|
1278 |
- regsub - perform substitutions after a regexp match
|
|
1279 |
*/
|
|
1280 |
||
1281 |
char *regsub(regexp *prog, char *source, char *dest, int n) |
|
1282 |
{
|
|
1283 |
register char *src; |
|
1284 |
register char *dst; |
|
1285 |
register char c; |
|
1286 |
register int no; |
|
1287 |
register int len; |
|
1288 |
extern char *strncpy(); |
|
1289 |
||
1290 |
if (prog == (regexp *)NULL || |
|
1291 |
source == (char *)NULL || dest == (char *)NULL) { |
|
1292 |
regerror("NULL parm to regsub"); |
|
1293 |
return NULL; |
|
1294 |
}
|
|
1295 |
if (UCHARAT(prog->program) != MAGIC) { |
|
1296 |
regerror("damaged regexp fed to regsub"); |
|
1297 |
return NULL; |
|
1298 |
}
|
|
1299 |
src = source; |
|
1300 |
dst = dest; |
|
1301 |
while ((c = *src++) != '\0') { |
|
1302 |
if (c == '&') |
|
1303 |
no = 0; |
|
1304 |
else if (c == '\\' && '0' <= *src && *src <= '9') |
|
1305 |
no = *src++ - '0'; |
|
1306 |
else
|
|
1307 |
no = -1; |
|
1308 |
||
1309 |
if (no < 0) { /* Ordinary character. */ |
|
1310 |
if (c == '\\' && (*src == '\\' || *src == '&')) |
|
1311 |
c = *src++; |
|
1312 |
if (--n < 0) { /* amylaar */ |
|
1313 |
regerror("line too long"); |
|
1314 |
return NULL; |
|
1315 |
}
|
|
1316 |
*dst++ = c; |
|
1317 |
} else if (prog->startp[no] != (char *)NULL && |
|
1318 |
prog->endp[no] != (char *)NULL) { |
|
1319 |
len = prog->endp[no] - prog->startp[no]; |
|
1320 |
if ( (n-=len) < 0 ) { /* amylaar */ |
|
1321 |
regerror("line too long"); |
|
1322 |
return NULL; |
|
1323 |
}
|
|
1324 |
strncpy(dst, prog->startp[no], len); |
|
1325 |
dst += len; |
|
1326 |
if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */ |
|
1327 |
regerror("damaged match string"); |
|
1328 |
return NULL; |
|
1329 |
}
|
|
1330 |
}
|
|
1331 |
}
|
|
1332 |
if (--n < 0) { /* amylaar */ |
|
1333 |
regerror("line too long"); |
|
1334 |
return NULL; |
|
1335 |
}
|
|
1336 |
*dst = '\0'; |
|
1337 |
return dst; |
|
1338 |
}
|
|
1339 |
||
1340 |
||
1341 |
#if 0 /* Use the local regerror() in ed.c */ |
|
1342 |
||
1343 |
void regerror(char *s)
|
|
1344 |
{
|
|
1345 |
fprintf(stderr, "regexp(3): %s", s);
|
|
1346 |
exit(1);
|
|
1347 |
}
|
|
1348 |
#endif /* 0 */
|