~drizzle-trunk/drizzle/development

1548.2.11 by Barry.Leslie at PrimeBase
Removed libxml reqirement by using a home grown xml parser.
1
/* Copyright (c) 2010 PrimeBase Technologies GmbH, Germany
2
 *
3
 * PrimeBase Media Stream for MySQL
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
 *
19
 * Paul McCullagh (H&G2JCtL)
20
 *
21
 * 2010-01-12
22
 *
23
 * CORE SYSTEM:
24
 * XML Parsing
25
 *
26
 */
27
28
#include "CSConfig.h"
29
#include <inttypes.h>
30
31
1643.1.8 by Monty Taylor
Fixed a couple of solaris build issues. Callback functions passed to
32
#include <string.h>
33
#include <stdlib.h>
34
#include <ctype.h>
35
#include <stdio.h>
36
#include <errno.h>
37
1644.3.1 by Barry.Leslie at PrimeBase
Merged in changes from PBMS project.
38
#ifdef DRIZZLED
1643.1.8 by Monty Taylor
Fixed a couple of solaris build issues. Callback functions passed to
39
#include <boost/algorithm/string.hpp>
1644.3.1 by Barry.Leslie at PrimeBase
Merged in changes from PBMS project.
40
#define STRCASESTR(s1, s2) boost::ifind_first(s1, s2)
41
#else
42
#define STRCASESTR(s1, s2) strcasestr(s1, s2)
43
#endif
1548.2.11 by Barry.Leslie at PrimeBase
Removed libxml reqirement by using a home grown xml parser.
44
45
#include "CSXML.h"
46
47
/*
 * Character classification helpers used by the parser.
 * ISSPACE:  true for space, tab, newline and carriage return.
 * ISSINGLE: true for characters that are reported as a token on their own.
 * The argument is parenthesised so that any expression may be passed;
 * note that it is still evaluated more than once.
 */
#define ISSPACE(ch)			((ch) == ' ' || (ch) == '\t' || (ch) == '\n' || (ch) == '\r')
#define ISSINGLE(ch)		((ch) == '*' || (ch) == '+' || (ch) == '(' || (ch) == ')' || (ch) == ',' || (ch) == '|' || (ch) == '[' || (ch) == ']' || (ch) == '?' || (ch) == '/')

/*
 * Capture buffer helpers; x must point to an object with 'buffer' and 'count'.
 * SET_CHAR: restart the buffer with the single character ch.
 * ADD_CHAR: append ch; when the buffer already holds PARSE_BUFFER_SIZE
 *           characters the last slot is overwritten instead of growing.
 * Both are wrapped in do { } while (0) so that they behave as a single
 * statement (safe in an unbraced if/else) and require a trailing ';'.
 */
#define SET_CHAR(x, ch)		do { (x)->buffer[0] = (ch); (x)->count = 1; } while (0)
#define ADD_CHAR(x, ch)		do { if ((x)->count < PARSE_BUFFER_SIZE) { (x)->buffer[(x)->count] = (ch); (x)->count++; } else (x)->buffer[PARSE_BUFFER_SIZE-1] = (ch); } while (0)
52
53
bool CSXMLParser::match_string(const char *ch)
54
{
55
	int32_t i;
56
	
57
	for (i=0; i<this->count; i++) {
58
		if (this->buffer[i] != *ch)
59
			return false;
60
		ch++;
61
	}
62
	if (*ch)
63
		return false;
64
	return(i == this->count);
65
}
66
67
void CSXMLParser::increment_nesting(wchar_t ch)
68
{
69
	if (this->nesting < PARSE_STACK_SIZE) {
70
		switch (ch) {
71
			case '/':
72
				this->end_type[this->nesting] = XML_OP_1_END_CLOSE_TAG;
73
				break;
74
			case '?':
75
				this->end_type[this->nesting] = XML_OP_1_END_PI_TAG;
76
				break;
77
			case '!':
78
				this->end_type[this->nesting] = XML_OP_1_END_ENTITY_TAG;
79
				break;
80
			case '[':
81
				this->end_type[this->nesting] = XML_OP_1_END_BRACKET_TAG;
82
				break;
83
			default:
84
				if (ISSPACE(ch))
85
					this->end_type[this->nesting] = XML_OP_1_END_UNKNOWN_TAG;
86
				else
87
					this->end_type[this->nesting] = XML_OP_1_END_TAG;
88
				break;
89
		}
90
	}
91
	this->nesting++;
92
}
93
94
int32_t CSXMLParser::parseChar(wchar_t ch)
95
/* Advance the parser state machine by exactly one character.
 *
 * This function does the actual work of parsing. It expects
 * "complete" characters as input. These could be 4 byte characters
 * as long as it is able to recognize the characters that are
 * relevant to parsing.
 *
 * On each call the function updates this->state (position in the
 * grammar), this->step (what was last reported inside the current tag),
 * this->nesting (tag depth), this->quote (active quote character, 0 for
 * an unquoted value) and the capture buffer (this->buffer/this->count).
 *
 * Returns this->type: the processing instruction for this character,
 * which indicates how the output data is to be understood.
 */
102
{
103
	switch (this->state) {
104
		/* Start of input: reset the nesting depth, then classify the
		 * character exactly as XML_IN_CDATA does. */
		case XML_BEFORE_CDATA:
105
			this->nesting = 0;
106
			/* This is the initial state! */
107
			if (ch == '<') {
108
				this->state = XML_LT;
109
				this->type = XML_noop;
110
			}
111
			else {
112
				this->state = XML_IN_CDATA;
113
				this->type = XML_CDATA_CH;
114
			}
115
			SET_CHAR(this, ch);
116
			break;
117
		/* Character data outside of any tag: each character restarts the
		 * buffer; '<' switches to XML_LT without reporting anything. */
		case XML_IN_CDATA:
118
			if (ch == '<') {
119
				this->state = XML_LT;
120
				this->type = XML_noop;
121
			}
122
			else
123
				this->type = XML_CDATA_CH;
124
			SET_CHAR(this, ch);
125
			break;
126
		/* The previous character was '<'. Whitespace opens a tag with an
		 * empty name when already nested, otherwise the text is handed
		 * back as character data. '!' is deferred to XML_LT_BANG; any
		 * other character is the first character of a tag name. */
		case XML_LT:
127
			if (ISSPACE(ch)) {
128
				if (this->nesting) {
129
					this->state = XML_BEFORE_ATTR;
130
					if (this->step == XML_STEP_TAG)
131
						this->type = XML_start_tag_TAG_CH;
132
					else if (this->step == XML_STEP_NESTED)
133
						this->type = XML_TAG_CH;
134
					else if (this->step == XML_STEP_NONE)
135
						this->type = XML_end_cdata_TAG_CH;
136
					else
137
						this->type = XML_add_attr_TAG_CH;
138
					this->step = XML_STEP_TAG;
139
					increment_nesting(ch);
140
					this->count = 0;
141
				}
142
				else {
143
					this->state = XML_IN_CDATA;
144
					this->type = XML_CDATA_CH;
145
					ADD_CHAR(this, ch);
146
				}
147
			}
148
			else if (ch == '!') {
149
				this->state = XML_LT_BANG;
150
				this->type = XML_noop;
151
				ADD_CHAR(this, ch);
152
			}
153
			else {
154
				this->state = XML_IN_TAG_NAME;
155
				if (this->step == XML_STEP_TAG)
156
					this->type = XML_start_tag_TAG_CH;
157
				else if (this->step == XML_STEP_NESTED)
158
					this->type = XML_TAG_CH;
159
				else if (this->step == XML_STEP_NONE)
160
					this->type = XML_end_cdata_TAG_CH;
161
				else
162
					this->type = XML_add_attr_TAG_CH;
163
				this->step = XML_STEP_TAG;
164
				increment_nesting(ch);
165
				SET_CHAR(this, ch);
166
			}
167
			break;
168
		/* Seen "<!": '-' may begin a comment and '[' a bracket section.
		 * Anything else starts a tag whose name begins with '!'; the
		 * buffer is restarted with '!' and the character is appended. */
		case XML_LT_BANG:
169
			if (ch == '-') {
170
				this->state = XML_LT_BANG_DASH;
171
				this->type = XML_noop;
172
			}
173
			else if (ch == '[') {
174
				this->state = XML_LT_BANG_SQR;
175
				this->type = XML_noop;
176
			}
177
			else {
178
				this->state = XML_IN_TAG_NAME;
179
				if (this->step == XML_STEP_TAG)
180
					this->type = XML_start_tag_TAG_CH;
181
				else if (this->step == XML_STEP_NESTED)
182
					this->type = XML_TAG_CH;
183
				else if (this->step == XML_STEP_NONE)
184
					this->type = XML_end_cdata_TAG_CH;
185
				else
186
					this->type = XML_add_attr_TAG_CH;
187
				this->step = XML_STEP_TAG;
188
				increment_nesting('!');
189
				SET_CHAR(this, '!');
190
			}
191
			ADD_CHAR(this, ch);
192
			break;
193
		/* Seen "<!-": a second '-' starts a comment, anything else makes
		 * the accumulated text ordinary character data. */
		case XML_LT_BANG_DASH:
194
			if (ch == '-') {
195
				this->state = XML_IN_COMMENT;
196
				if (this->step == XML_STEP_TAG)
197
					this->type = XML_start_tag_start_comment;
198
				else if (this->step == XML_STEP_NESTED)
199
					this->type = XML_start_comment;
200
				else if (this->step == XML_STEP_NONE)
201
					this->type = XML_end_cdata_start_comment;
202
				else
203
					this->type = XML_add_attr_start_comment;
204
				increment_nesting(' ');
205
			}
206
			else {
207
				this->state = XML_IN_CDATA;
208
				this->type = XML_CDATA_CH;
209
				ADD_CHAR(this, ch);
210
			}
211
			break;
212
		/* Seen "<![": whitespace is skipped, a second '[' opens a tag
		 * named "![", any other character starts collecting a name. */
		case XML_LT_BANG_SQR:
213
			if (ISSPACE(ch))
214
				this->type = XML_noop;
215
			else if (ch == '[') {
216
				this->state = XML_BEFORE_ATTR;
217
				if (this->step == XML_STEP_TAG)
218
					this->type = XML_start_tag_TAG_CH;
219
				else if (this->step == XML_STEP_NESTED)
220
					this->type = XML_TAG_CH;
221
				else if (this->step == XML_STEP_NONE)
222
					this->type = XML_end_cdata_TAG_CH;
223
				else
224
					this->type = XML_add_attr_TAG_CH;
225
				this->step = XML_STEP_TAG;
226
				increment_nesting('[');
227
				SET_CHAR(this, '!');
228
				ADD_CHAR(this, '[');
229
			}
230
			else {
231
				this->state = XML_LT_BANG_SQR_IN_NAME;
232
				this->type = XML_noop;
233
				SET_CHAR(this, '!');
234
				ADD_CHAR(this, '[');
235
				ADD_CHAR(this, ch);
236
			}
237
			break;
238
		/* Collecting the name that follows "<![" until whitespace or '['.
		 * At '[' the buffered name decides: "![CDATA" starts a CDATA
		 * section, any other name is reported as a bracket tag. */
		case XML_LT_BANG_SQR_IN_NAME:
239
			if (ISSPACE(ch)) {
240
				this->state = XML_LT_BANG_SQR_AFTER_NAME;
241
				this->type = XML_noop;
242
			}
243
			else if (ch == '[') {
244
				if (match_string("![CDATA")) {
245
					this->state = XML_IN_CDATA_TAG;
246
					if (this->step == XML_STEP_TAG)
247
						this->type = XML_start_tag_start_cdata_tag;
248
					else if (this->step == XML_STEP_NESTED)
249
						this->type = XML_start_cdata_tag;
250
					else if (this->step == XML_STEP_NONE)
251
						this->type = XML_end_cdata_start_cdata_tag;
252
					else
253
						this->type = XML_add_attr_start_cdata_tag;
254
					this->step = XML_STEP_TAG;
255
					increment_nesting('[');
256
				}
257
				else {
258
					this->state = XML_BEFORE_ATTR;
259
					if (this->step == XML_STEP_TAG)
260
						this->type = XML_start_tag_TAG_CH;
261
					else if (this->step == XML_STEP_NESTED)
262
						this->type = XML_TAG_CH;
263
					else if (this->step == XML_STEP_NONE)
264
						this->type = XML_end_cdata_TAG_CH;
265
					else
266
						this->type = XML_add_attr_TAG_CH;
267
					this->step = XML_STEP_TAG;
268
					increment_nesting('[');
269
				}
270
			}
271
			else {
272
				this->type = XML_noop;
273
				ADD_CHAR(this, ch);
274
			}
275
			break;
276
		/* Whitespace followed the name after "<![": wait for the '['.
		 * NOTE(review): unlike XML_LT_BANG_SQR_IN_NAME, the CDATA branch
		 * below does not set this->step - confirm this is intentional. */
		case XML_LT_BANG_SQR_AFTER_NAME:
277
			if (ch == '[') {
278
				if (match_string("![CDATA")) {
279
					this->state = XML_IN_CDATA_TAG;
280
					if (this->step == XML_STEP_TAG)
281
						this->type = XML_start_tag_start_cdata_tag;
282
					else if (this->step == XML_STEP_NESTED)
283
						this->type = XML_start_cdata_tag;
284
					else if (this->step == XML_STEP_NONE)
285
						this->type = XML_end_cdata_start_cdata_tag;
286
					else
287
						this->type = XML_add_attr_start_cdata_tag;
288
					increment_nesting('[');
289
				}
290
				else {
291
					this->state = XML_BEFORE_ATTR;
292
					if (this->step == XML_STEP_TAG)
293
						this->type = XML_start_tag_TAG_CH;
294
					else if (this->step == XML_STEP_NESTED)
295
						this->type = XML_TAG_CH;
296
					else if (this->step == XML_STEP_NONE)
297
						this->type = XML_end_cdata_TAG_CH;
298
					else
299
						this->type = XML_add_attr_TAG_CH;
300
					this->step = XML_STEP_TAG;
301
					increment_nesting('[');
302
				}
303
			}
304
			else
305
				/* Ignore data until the '['!!! */
306
				this->type = XML_noop;
307
			break;
308
		/* Inside a tag name: ordinary characters are reported one at a
		 * time as XML_TAG_CH until whitespace or a delimiter is seen. */
		case XML_IN_TAG_NAME:
309
			if (ISSPACE(ch)) {
310
				this->state = XML_BEFORE_ATTR;
311
				this->type = XML_noop;
312
			}
313
			else if (ch == '<') {
314
				this->state = XML_LT;
315
				this->type = XML_noop;
316
			}
317
			else if (ch == '>') {
318
				if (this->step == XML_STEP_TAG)
319
					this->type = XML_start_tag_end_tag(END_TAG_TYPE(this));
320
				else if (this->step == XML_STEP_NESTED)
321
					this->type = XML_end_tag(END_TAG_TYPE(this));
322
				else
323
					this->type = XML_add_attr_end_tag(END_TAG_TYPE(this));
324
				this->nesting--;
325
				if (this->nesting) {
326
					this->step = XML_STEP_NESTED;
327
					this->state = XML_BEFORE_ATTR;
328
				}
329
				else {
330
					this->step = XML_STEP_NONE;
331
					this->state = XML_IN_CDATA;
332
				}
333
			}
334
			else if (ch == '"' || ch == '\'') {
335
				this->state = XML_QUOTE_BEFORE_VALUE;
336
				this->quote = ch;
337
				this->type = XML_noop;
338
			}
339
			else if (ch == '/' && (END_TAG_TYPE(this) == XML_OP_1_END_TAG)) {
340
				this->state = XML_SLASH;
341
				this->type = XML_noop;
342
			}
343
			else if (ch == '?' && (END_TAG_TYPE(this) == XML_OP_1_END_PI_TAG)) {
344
				this->state = XML_QMARK;
345
				this->type = XML_noop;
346
			}
347
			else if (ch == ']' && (END_TAG_TYPE(this) == XML_OP_1_END_BRACKET_TAG)) {
348
				this->state = XML_SQR;
349
				this->type = XML_noop;
350
			}
351
			else if (ISSINGLE(ch)) {
352
				this->state = XML_BEFORE_ATTR;
353
				if (this->step == XML_STEP_TAG)
354
					this->type = XML_start_tag_ATTR_CH;
355
				else if (this->step == XML_STEP_NESTED)
356
					this->type = XML_ATTR_CH;
357
				else
358
					this->type = XML_add_attr_ATTR_CH;
359
				this->step = XML_STEP_ATTR;
360
				SET_CHAR(this, ch);
361
			}
362
			else {
363
				this->type = XML_TAG_CH;
364
				SET_CHAR(this, ch);
365
			}
366
			break;
367
		/* Inside a tag, between tokens: whitespace is skipped, delimiters
		 * are handled as in XML_IN_TAG_NAME, and any other character
		 * starts an attribute name. */
		case XML_BEFORE_ATTR:
368
			if (ISSPACE(ch))
369
				this->type = XML_noop;
370
			else if (ch == '<') {
371
				this->state = XML_LT;
372
				this->type = XML_noop;
373
			}
374
			else if (ch == '>') {
375
				if (this->step == XML_STEP_TAG)
376
					this->type = XML_start_tag_end_tag(END_TAG_TYPE(this));
377
				else if (this->step == XML_STEP_NESTED)
378
					this->type = XML_end_tag(END_TAG_TYPE(this));
379
				else
380
					this->type = XML_add_attr_end_tag(END_TAG_TYPE(this));
381
				this->nesting--;
382
				if (this->nesting) {
383
					this->step = XML_STEP_NESTED;
384
					this->state = XML_BEFORE_ATTR;
385
				}
386
				else {
387
					this->step = XML_STEP_NONE;
388
					this->state = XML_IN_CDATA;
389
				}
390
			}
391
			else if (ch == '"' || ch == '\'') {
392
				this->state = XML_QUOTE_BEFORE_VALUE;
393
				this->quote = ch;
394
				this->type = XML_noop;
395
			}
396
			else if (ch == '/' && (END_TAG_TYPE(this) == XML_OP_1_END_TAG)) {
397
				this->state = XML_SLASH;
398
				this->type = XML_noop;
399
			}
400
			else if (ch == '?' && (END_TAG_TYPE(this) == XML_OP_1_END_PI_TAG)) {
401
				this->state = XML_QMARK;
402
				this->type = XML_noop;
403
			}
404
			else if (ch == ']' && (END_TAG_TYPE(this) == XML_OP_1_END_BRACKET_TAG)) {
405
				this->state = XML_SQR;
406
				this->type = XML_noop;
407
			}
408
			else if (ISSINGLE(ch)) {
409
				if (this->step == XML_STEP_TAG)
410
					this->type = XML_start_tag_ATTR_CH;
411
				else if (this->step == XML_STEP_NESTED)
412
					this->type = XML_ATTR_CH;
413
				else
414
					this->type = XML_add_attr_ATTR_CH;
415
				this->step = XML_STEP_ATTR;
416
				SET_CHAR(this, ch);
417
			}
418
			else {
419
				this->state = XML_IN_ATTR;
420
				if (this->step == XML_STEP_TAG)
421
					this->type = XML_start_tag_ATTR_CH;
422
				else if (this->step == XML_STEP_NESTED)
423
					this->type = XML_ATTR_CH;
424
				else
425
					this->type = XML_add_attr_ATTR_CH;
426
				this->step = XML_STEP_ATTR;
427
				SET_CHAR(this, ch);
428
			}
429
			break;
430
		/* Inside an attribute name: ordinary characters are reported as
		 * XML_ATTR_CH; '=' or whitespace ends the name. */
		case XML_IN_ATTR:
431
			if (ISSPACE(ch)) {
432
				this->state = XML_BEFORE_EQUAL;
433
				this->type = XML_noop;
434
			}
435
			else if (ch == '<') {
436
				this->state = XML_LT;
437
				this->type = XML_noop;
438
			}
439
			else if (ch == '>') {
440
				if (this->step == XML_STEP_TAG)
441
					this->type = XML_start_tag_end_tag(END_TAG_TYPE(this));
442
				else if (this->step == XML_STEP_NESTED)
443
					this->type = XML_end_tag(END_TAG_TYPE(this));
444
				else
445
					this->type = XML_add_attr_end_tag(END_TAG_TYPE(this));
446
				this->nesting--;
447
				if (this->nesting) {
448
					this->step = XML_STEP_NESTED;
449
					this->state = XML_BEFORE_ATTR;
450
				}
451
				else {
452
					this->step = XML_STEP_NONE;
453
					this->state = XML_IN_CDATA;
454
				}
455
			}
456
			else if (ch == '"' || ch == '\'') {
457
				this->state = XML_QUOTE_BEFORE_VALUE;
458
				this->quote = ch;
459
				this->type = XML_noop;
460
			}
461
			else if (ch == '/' && (END_TAG_TYPE(this) == XML_OP_1_END_TAG)) {
462
				this->state = XML_SLASH;
463
				this->type = XML_noop;
464
			}
465
			else if (ch == '?' && (END_TAG_TYPE(this) == XML_OP_1_END_PI_TAG)) {
466
				this->state = XML_QMARK;
467
				this->type = XML_noop;
468
			}
469
			else if (ch == ']' && (END_TAG_TYPE(this) == XML_OP_1_END_BRACKET_TAG)) {
470
				this->state = XML_SQR;
471
				this->type = XML_noop;
472
			}
473
			else if (ISSINGLE(ch)) {
474
				this->state = XML_BEFORE_ATTR;
475
				if (this->step == XML_STEP_TAG)
476
					this->type = XML_start_tag_ATTR_CH;
477
				else if (this->step == XML_STEP_NESTED)
478
					this->type = XML_ATTR_CH;
479
				else
480
					this->type = XML_add_attr_ATTR_CH;
481
				this->step = XML_STEP_ATTR;
482
				SET_CHAR(this, ch);
483
			}
484
			else if (ch == '=') {
485
				this->state = XML_AFTER_EQUAL;
486
				this->type = XML_noop;
487
			}
488
			else {
489
				this->type = XML_ATTR_CH;
490
				SET_CHAR(this, ch);
491
			}
492
			break;
493
		/* Whitespace followed an attribute name: '=' leads to a value,
		 * while any other ordinary character starts a new attribute. */
		case XML_BEFORE_EQUAL:
494
			if (ISSPACE(ch))
495
				this->type = XML_noop;
496
			else if (ch == '<') {
497
				this->state = XML_LT;
498
				this->type = XML_noop;
499
			}
500
			else if (ch == '>') {
501
				if (this->step == XML_STEP_TAG)
502
					this->type = XML_start_tag_end_tag(END_TAG_TYPE(this));
503
				else if (this->step == XML_STEP_NESTED)
504
					this->type = XML_end_tag(END_TAG_TYPE(this));
505
				else
506
					this->type = XML_add_attr_end_tag(END_TAG_TYPE(this));
507
				this->nesting--;
508
				if (this->nesting) {
509
					this->step = XML_STEP_NESTED;
510
					this->state = XML_BEFORE_ATTR;
511
				}
512
				else {
513
					this->step = XML_STEP_NONE;
514
					this->state = XML_IN_CDATA;
515
				}
516
			}
517
			else if (ch == '"' || ch == '\'') {
518
				this->state = XML_QUOTE_BEFORE_VALUE;
519
				this->quote = ch;
520
				this->type = XML_noop;
521
			}
522
			else if (ch == '/' && (END_TAG_TYPE(this) == XML_OP_1_END_TAG)) {
523
				this->state = XML_SLASH;
524
				this->type = XML_noop;
525
			}
526
			else if (ch == '?' && (END_TAG_TYPE(this) == XML_OP_1_END_PI_TAG)) {
527
				this->state = XML_QMARK;
528
				this->type = XML_noop;
529
			}
530
			else if (ch == ']' && (END_TAG_TYPE(this) == XML_OP_1_END_BRACKET_TAG)) {
531
				this->state = XML_SQR;
532
				this->type = XML_noop;
533
			}
534
			else if (ISSINGLE(ch)) {
535
				this->state = XML_BEFORE_ATTR;
536
				if (this->step == XML_STEP_TAG)
537
					this->type = XML_start_tag_ATTR_CH;
538
				else if (this->step == XML_STEP_NESTED)
539
					this->type = XML_ATTR_CH;
540
				else
541
					this->type = XML_add_attr_ATTR_CH;
542
				this->step = XML_STEP_ATTR;
543
				SET_CHAR(this, ch);
544
			}
545
			else if (ch == '=') {
546
				this->state = XML_AFTER_EQUAL;
547
				this->type = XML_noop;
548
			}
549
			else {
550
				this->state = XML_IN_ATTR;
551
				if (this->step == XML_STEP_TAG)
552
					this->type = XML_start_tag_ATTR_CH;
553
				else if (this->step == XML_STEP_NESTED)
554
					this->type = XML_ATTR_CH;
555
				else
556
					this->type = XML_add_attr_ATTR_CH;
557
				this->step = XML_STEP_ATTR;
558
				SET_CHAR(this, ch);
559
			}
560
			break;
561
		/* Seen '=': whitespace is skipped, a quote starts a quoted value
		 * and any other ordinary character starts an unquoted value
		 * (this->quote is cleared to mark it as unquoted). */
		case XML_AFTER_EQUAL:
562
			if (ISSPACE(ch)) {
563
				this->state = XML_AFTER_EQUAL;
564
				this->type = XML_noop;
565
			}
566
			else if (ch == '<') {
567
				this->state = XML_LT;
568
				this->type = XML_noop;
569
			}
570
			else if (ch == '>') {
571
				if (this->step == XML_STEP_TAG)
572
					this->type = XML_start_tag_end_tag(END_TAG_TYPE(this));
573
				else if (this->step == XML_STEP_NESTED)
574
					this->type = XML_end_tag(END_TAG_TYPE(this));
575
				else
576
					this->type = XML_add_attr_end_tag(END_TAG_TYPE(this));
577
				this->nesting--;
578
				if (this->nesting) {
579
					this->step = XML_STEP_NESTED;
580
					this->state = XML_BEFORE_ATTR;
581
				}
582
				else {
583
					this->step = XML_STEP_NONE;
584
					this->state = XML_IN_CDATA;
585
				}
586
			}
587
			else if (ch == '"' || ch == '\'') {
588
				this->state = XML_QUOTE_BEFORE_VALUE;
589
				this->quote = ch;
590
				this->type = XML_noop;
591
			}
592
			else if (ch == '/' && (END_TAG_TYPE(this) == XML_OP_1_END_TAG)) {
593
				this->state = XML_SLASH;
594
				this->type = XML_noop;
595
			}
596
			else if (ch == '?' && (END_TAG_TYPE(this) == XML_OP_1_END_PI_TAG)) {
597
				this->state = XML_QMARK;
598
				this->type = XML_noop;
599
			}
600
			else if (ch == ']' && (END_TAG_TYPE(this) == XML_OP_1_END_BRACKET_TAG)) {
601
				this->state = XML_SQR;
602
				this->type = XML_noop;
603
			}
604
			else if (ISSINGLE(ch)) {
605
				this->state = XML_BEFORE_ATTR;
606
				if (this->step == XML_STEP_TAG)
607
					this->type = XML_start_tag_ATTR_CH;
608
				else if (this->step == XML_STEP_NESTED)
609
					this->type = XML_ATTR_CH;
610
				else
611
					this->type = XML_add_attr_ATTR_CH;
612
				this->step = XML_STEP_ATTR;
613
				SET_CHAR(this, ch);
614
			}
615
			else {
616
				this->state = XML_IN_VALUE;
617
				this->quote = 0;
618
				if (this->step == XML_STEP_TAG)
619
					this->type = XML_start_tag_VALUE_CH;
620
				else if (this->step == XML_STEP_VALUE)
621
					this->type = XML_add_attr_VALUE_CH;
622
				else
623
					this->type = XML_VALUE_CH;
624
				this->step = XML_STEP_VALUE;
625
				SET_CHAR(this, ch);
626
			}
627
			break;
628
		/* Just after an opening quote: the matching quote means an empty
		 * value (reported with count == 0), anything else is the first
		 * character of the value. */
		case XML_QUOTE_BEFORE_VALUE:
629
			if (ch == this->quote) {
630
				this->state = XML_QUOTE_AFTER_VALUE;
631
				// Empty string:
632
				if (this->step == XML_STEP_TAG)
633
					this->type = XML_start_tag_VALUE_CH;
634
				else if (this->step == XML_STEP_VALUE)
635
					this->type = XML_add_attr_VALUE_CH;
636
				else
637
					this->type = XML_VALUE_CH;
638
				this->step = XML_STEP_VALUE;
639
				this->count = 0;
640
			}
641
			else {
642
				this->state = XML_IN_VALUE;
643
				if (this->step == XML_STEP_TAG)
644
					this->type = XML_start_tag_VALUE_CH;
645
				else if (this->step == XML_STEP_VALUE)
646
					this->type = XML_add_attr_VALUE_CH;
647
				else
648
					this->type = XML_VALUE_CH;
649
				this->step = XML_STEP_VALUE;
650
				SET_CHAR(this, ch);
651
			}
652
			break;
653
		/* Inside an attribute value. With a quote active only the
		 * matching quote ends the value; with this->quote == 0 the value
		 * ends at whitespace, '<', '>' or a quote character. */
		case XML_IN_VALUE:
654
			if (this->quote) {
655
				if (ch == this->quote) {
656
					this->state = XML_QUOTE_AFTER_VALUE;
657
					this->type = XML_noop;
658
				}
659
				else {
660
					this->type = XML_VALUE_CH;
661
					SET_CHAR(this, ch);
662
				}
663
			}
664
			else {
665
				/* A value without quotes (for HTML!) */
666
				if (ISSPACE(ch)) {
667
					this->state = XML_BEFORE_ATTR;
668
					this->type = XML_noop;
669
				}
670
				else if (ch == '<') {
671
					this->state = XML_LT;
672
					this->type = XML_noop;
673
				}
674
				else if (ch == '>') {
675
					if (this->step == XML_STEP_TAG)
676
						this->type = XML_start_tag_end_tag(END_TAG_TYPE(this));
677
					else if (this->step == XML_STEP_NESTED)
678
						this->type = XML_end_tag(END_TAG_TYPE(this));
679
					else
680
						this->type = XML_add_attr_end_tag(END_TAG_TYPE(this));
681
					this->nesting--;
682
					if (this->nesting) {
683
						this->step = XML_STEP_NESTED;
684
						this->state = XML_BEFORE_ATTR;
685
					}
686
					else {
687
						this->step = XML_STEP_NONE;
688
						this->state = XML_IN_CDATA;
689
					}
690
				}
691
				else if (ch == '"' || ch == '\'') {
692
					this->state = XML_QUOTE_BEFORE_VALUE;
693
					this->quote = ch;
694
					this->type = XML_noop;
695
				}
696
				else {
697
					this->type = XML_VALUE_CH;
698
					SET_CHAR(this, ch);
699
				}
700
			}
701
			break;
702
		/* Just after the closing quote of a value: same decisions as
		 * XML_BEFORE_ATTR, except that whitespace changes the state. */
		case XML_QUOTE_AFTER_VALUE:
703
			if (ISSPACE(ch)) {
704
				this->state = XML_BEFORE_ATTR;
705
				this->type = XML_noop;
706
			}
707
			else if (ch == '<') {
708
				this->state = XML_LT;
709
				this->type = XML_noop;
710
			}
711
			else if (ch == '>') {
712
				if (this->step == XML_STEP_TAG)
713
					this->type = XML_start_tag_end_tag(END_TAG_TYPE(this));
714
				else if (this->step == XML_STEP_NESTED)
715
					this->type = XML_end_tag(END_TAG_TYPE(this));
716
				else
717
					this->type = XML_add_attr_end_tag(END_TAG_TYPE(this));
718
				this->nesting--;
719
				if (this->nesting) {
720
					this->step = XML_STEP_NESTED;
721
					this->state = XML_BEFORE_ATTR;
722
				}
723
				else {
724
					this->step = XML_STEP_NONE;
725
					this->state = XML_IN_CDATA;
726
				}
727
			}
728
			else if (ch == '"' || ch == '\'') {
729
				this->state = XML_QUOTE_BEFORE_VALUE;
730
				this->quote = ch;
731
				this->type = XML_noop;
732
			}
733
			else if (ch == '/' && (END_TAG_TYPE(this) == XML_OP_1_END_TAG)) {
734
				this->state = XML_SLASH;
735
				this->type = XML_noop;
736
			}
737
			else if (ch == '?' && (END_TAG_TYPE(this) == XML_OP_1_END_PI_TAG)) {
738
				this->state = XML_QMARK;
739
				this->type = XML_noop;
740
			}
741
			else if (ch == ']' && (END_TAG_TYPE(this) == XML_OP_1_END_BRACKET_TAG)) {
742
				this->state = XML_SQR;
743
				this->type = XML_noop;
744
			}
745
			else if (ISSINGLE(ch)) {
746
				this->state = XML_BEFORE_ATTR;
747
				if (this->step == XML_STEP_TAG)
748
					this->type = XML_start_tag_ATTR_CH;
749
				else if (this->step == XML_STEP_NESTED)
750
					this->type = XML_ATTR_CH;
751
				else
752
					this->type = XML_add_attr_ATTR_CH;
753
				this->step = XML_STEP_ATTR;
754
				SET_CHAR(this, ch);
755
			}
756
			else {
757
				this->state = XML_IN_ATTR;
758
				if (this->step == XML_STEP_TAG)
759
					this->type = XML_start_tag_ATTR_CH;
760
				else if (this->step == XML_STEP_NESTED)
761
					this->type = XML_ATTR_CH;
762
				else
763
					this->type = XML_add_attr_ATTR_CH;
764
				this->step = XML_STEP_ATTR;
765
				SET_CHAR(this, ch);
766
			}
767
			break;
768
		/* XML_SQR, XML_SLASH and XML_QMARK: the previous character was a
		 * possible tag terminator (']', '/' or '?') that was not reported.
		 * It is first restored to the buffer; the shared code at 'cont'
		 * then decides, based on the current character, whether it really
		 * terminated the tag ('>') or was ordinary tag content. */
		case XML_SQR:
769
			SET_CHAR(this, ']');
770
			goto cont;
771
		case XML_SLASH:
772
			SET_CHAR(this, '/');
773
			goto cont;
774
		case XML_QMARK:
775
			SET_CHAR(this, '?');
776
			cont:
777
			if (ISSPACE(ch)) {
778
				this->state = XML_BEFORE_ATTR;
779
				if (this->step == XML_STEP_TAG)
780
					this->type = XML_start_tag_TAG_CH;
781
				else if (this->step == XML_STEP_NESTED)
782
					this->type = XML_TAG_CH;
783
				else if (this->step == XML_STEP_NONE)
784
					this->type = XML_end_cdata_TAG_CH;
785
				else
786
					this->type = XML_add_attr_TAG_CH;
787
				this->step = XML_STEP_ATTR;
788
			}
789
			else if (ch == '<') {
790
				this->state = XML_LT;
791
				if (this->step == XML_STEP_TAG)
792
					this->type = XML_start_tag_TAG_CH;
793
				else if (this->step == XML_STEP_NESTED)
794
					this->type = XML_TAG_CH;
795
				else if (this->step == XML_STEP_NONE)
796
					this->type = XML_end_cdata_TAG_CH;
797
				else
798
					this->type = XML_add_attr_TAG_CH;
799
				this->step = XML_STEP_TAG;
800
			}
801
			else if (ch == '>') {
				/* this->state has not been changed yet, so it still
				 * identifies which terminator character is pending. */
802
				if (this->state == XML_SLASH) {
803
					if (this->step == XML_STEP_TAG)
804
						this->type = XML_start_tag_end_empty_tag;
805
					else if (this->step == XML_STEP_NESTED)
806
						this->type = XML_end_empty_tag;
807
					else
808
						this->type = XML_add_attr_end_empty_tag;
809
				}
810
				else if (this->state == XML_SQR) {
811
					if (this->step == XML_STEP_TAG)
812
						this->type = XML_start_tag_end_tag(XML_OP_1_END_BRACKET_TAG);
813
					else if (this->step == XML_STEP_NESTED)
814
						this->type = XML_end_tag(XML_OP_1_END_BRACKET_TAG);
815
					else
816
						this->type = XML_add_attr_end_tag(XML_OP_1_END_BRACKET_TAG);
817
				}
818
				else {
819
					if (this->step == XML_STEP_TAG)
820
						this->type = XML_start_tag_end_pi_tag;
821
					else if (this->step == XML_STEP_NESTED)
822
						this->type = XML_end_pi_tag;
823
					else
824
						this->type = XML_add_attr_end_pi_tag;
825
				}
826
				this->nesting--;
827
				if (this->nesting) {
828
					this->step = XML_STEP_NESTED;
829
					this->state = XML_BEFORE_ATTR;
830
				}
831
				else {
832
					this->step = XML_STEP_NONE;
833
					this->state = XML_IN_CDATA;
834
				}
835
			}
836
			else if (ch == '"' || ch == '\'') {
837
				this->state = XML_QUOTE_BEFORE_VALUE;
838
				this->quote = ch;
839
				if (this->step == XML_STEP_TAG)
840
					this->type = XML_start_tag_TAG_CH;
841
				else if (this->step == XML_STEP_NESTED)
842
					this->type = XML_TAG_CH;
843
				else if (this->step == XML_STEP_NONE)
844
					this->type = XML_end_cdata_TAG_CH;
845
				else
846
					this->type = XML_add_attr_TAG_CH;
847
				this->step = XML_STEP_ATTR;
848
			}
849
			else if (ch == '/' && (END_TAG_TYPE(this) == XML_OP_1_END_TAG)) {
850
				this->state = XML_SLASH;
851
				if (this->step == XML_STEP_TAG)
852
					this->type = XML_start_tag_TAG_CH;
853
				else if (this->step == XML_STEP_NESTED)
854
					this->type = XML_TAG_CH;
855
				else if (this->step == XML_STEP_NONE)
856
					this->type = XML_end_cdata_TAG_CH;
857
				else
858
					this->type = XML_add_attr_TAG_CH;
859
				this->step = XML_STEP_ATTR;
860
			}
861
			else if (ch == '?' && (END_TAG_TYPE(this) == XML_OP_1_END_PI_TAG)) {
862
				this->state = XML_QMARK;
863
				if (this->step == XML_STEP_TAG)
864
					this->type = XML_start_tag_TAG_CH;
865
				else if (this->step == XML_STEP_NESTED)
866
					this->type = XML_TAG_CH;
867
				else if (this->step == XML_STEP_NONE)
868
					this->type = XML_end_cdata_TAG_CH;
869
				else
870
					this->type = XML_add_attr_TAG_CH;
871
				this->step = XML_STEP_ATTR;
872
			}
873
			else if (ch == ']' && (END_TAG_TYPE(this) == XML_OP_1_END_BRACKET_TAG)) {
874
				this->state = XML_SQR;
875
				if (this->step == XML_STEP_TAG)
876
					this->type = XML_start_tag_TAG_CH;
877
				else if (this->step == XML_STEP_NESTED)
878
					this->type = XML_TAG_CH;
879
				else if (this->step == XML_STEP_NONE)
880
					this->type = XML_end_cdata_TAG_CH;
881
				else
882
					this->type = XML_add_attr_TAG_CH;
883
				this->step = XML_STEP_ATTR;
884
			}
885
			else if (ISSINGLE(ch)) {
886
				this->state = XML_BEFORE_ATTR;
887
				if (this->step == XML_STEP_TAG)
888
					this->type = XML_start_tag_TAG_CH;
889
				else if (this->step == XML_STEP_NESTED)
890
					this->type = XML_TAG_CH;
891
				else if (this->step == XML_STEP_NONE)
892
					this->type = XML_end_cdata_TAG_CH;
893
				else
894
					this->type = XML_add_attr_TAG_CH;
895
				this->step = XML_STEP_ATTR;
896
				ADD_CHAR(this, ch);
897
			}
898
			else {
899
				this->state = XML_IN_ATTR;
900
				if (this->step == XML_STEP_TAG)
901
					this->type = XML_start_tag_TAG_CH;
902
				else if (this->step == XML_STEP_NESTED)
903
					this->type = XML_TAG_CH;
904
				else if (this->step == XML_STEP_NONE)
905
					this->type = XML_end_cdata_TAG_CH;
906
				else
907
					this->type = XML_add_attr_TAG_CH;
908
				this->step = XML_STEP_ATTR;
909
				ADD_CHAR(this, ch);
910
			}
911
			break;
912
		/* Inside a comment: a '-' is held back (not reported) because it
		 * may be the start of the closing "-->". */
		case XML_IN_COMMENT:
913
			if (ch == '-') {
914
				this->state = XML_IN_COMMENT_DASH;
915
				this->type = XML_noop;
916
			}
917
			else
918
				this->type = XML_COMMENT_CH;
919
			SET_CHAR(this, ch);
920
			break;
921
		/* One '-' is pending inside a comment: a second '-' keeps waiting,
		 * anything else reports the buffered text as comment data. */
		case XML_IN_COMMENT_DASH:
922
			if (ch == '-') {
923
				this->state = XML_IN_COMMENT_DASH_DASH;
924
				this->type = XML_noop;
925
			}
926
			else {
927
				this->state = XML_IN_COMMENT;
928
				this->type = XML_COMMENT_CH;
929
			}
930
			ADD_CHAR(this, ch);
931
			break;
932
		/* "--" is pending inside a comment: '>' ends the comment, a third
		 * '-' reports a single '-', anything else reports the buffered
		 * text as comment data. */
		case XML_IN_COMMENT_DASH_DASH:
933
			if (ch == '-') {
934
				this->state = XML_IN_COMMENT_3_DASH;
935
				this->type = XML_COMMENT_CH;
936
				SET_CHAR(this, ch);
937
			}
938
			else if (ch == '>') {
939
				this->type = XML_end_comment;
940
				this->nesting--;
941
				if (this->nesting) {
942
					this->step = XML_STEP_NESTED;
943
					this->state = XML_BEFORE_ATTR;
944
				}
945
				else {
946
					this->step = XML_STEP_NONE;
947
					this->state = XML_IN_CDATA;
948
				}
949
			}
950
			else {
951
				this->state = XML_IN_COMMENT;
952
				this->type = XML_COMMENT_CH;
953
				ADD_CHAR(this, ch);
954
			}
955
			break;
956
		/* Three or more '-' were seen in a row inside a comment; the last
		 * two are still pending, so a non-'>' character re-creates "--"
		 * in the buffer before appending itself. */
		case XML_IN_COMMENT_3_DASH:
957
			if (ch == '-') {
958
				this->type = XML_COMMENT_CH;
959
				SET_CHAR(this, ch);
960
			}
961
			else if (ch == '>') {
962
				this->type = XML_end_comment;
963
				this->nesting--;
964
				if (this->nesting) {
965
					this->step = XML_STEP_NESTED;
966
					this->state = XML_BEFORE_ATTR;
967
				}
968
				else {
969
					this->step = XML_STEP_NONE;
970
					this->state = XML_IN_CDATA;
971
				}
972
			}
973
			else {
974
				this->state = XML_IN_COMMENT;
975
				this->type = XML_COMMENT_CH;
976
				SET_CHAR(this, '-');
977
				ADD_CHAR(this, '-');
978
				ADD_CHAR(this, ch);
979
			}
980
			break;
981
		/* Inside a CDATA section: handled like a comment, with ']' in the
		 * role of '-' and "]]>" as the terminator. */
		case XML_IN_CDATA_TAG:
982
			if (ch == ']') {
983
				this->state = XML_IN_CDATA_TAG_SQR;
984
				this->type = XML_noop;
985
			}
986
			else
987
				this->type = XML_CDATA_TAG_CH;
988
			SET_CHAR(this, ch);
989
			break;
990
		/* One ']' is pending inside a CDATA section. */
		case XML_IN_CDATA_TAG_SQR:
991
			if (ch == ']') {
992
				this->state = XML_IN_CDATA_TAG_SQR_SQR;
993
				this->type = XML_noop;
994
			}
995
			else {
996
				this->state = XML_IN_CDATA_TAG;
997
				this->type = XML_CDATA_TAG_CH;
998
			}
999
			ADD_CHAR(this, ch);
1000
			break;
1001
		/* "]]" is pending inside a CDATA section: '>' ends the section. */
		case XML_IN_CDATA_TAG_SQR_SQR:
1002
			if (ch == ']') {
1003
				this->state = XML_IN_CDATA_TAG_3_SQR;
1004
				this->type = XML_CDATA_TAG_CH;
1005
				SET_CHAR(this, ch);
1006
			}
1007
			else if (ch == '>') {
1008
				this->type = XML_end_cdata_tag;
1009
				this->nesting--;
1010
				if (this->nesting) {
1011
					this->step = XML_STEP_NESTED;
1012
					this->state = XML_BEFORE_ATTR;
1013
				}
1014
				else {
1015
					this->step = XML_STEP_NONE;
1016
					this->state = XML_IN_CDATA;
1017
				}
1018
			}
1019
			else {
1020
				this->state = XML_IN_CDATA_TAG;
1021
				this->type = XML_CDATA_TAG_CH;
1022
				ADD_CHAR(this, ch);
1023
			}
1024
			break;
1025
		/* Three or more ']' were seen in a row inside a CDATA section; the
		 * last two are still pending. */
		case XML_IN_CDATA_TAG_3_SQR:
1026
			if (ch == ']') {
1027
				this->type = XML_CDATA_TAG_CH;
1028
				SET_CHAR(this, ch);
1029
			}
1030
			else if (ch == '>') {
1031
				this->type = XML_end_cdata_tag;
1032
				this->nesting--;
1033
				if (this->nesting) {
1034
					this->step = XML_STEP_NESTED;
1035
					this->state = XML_BEFORE_ATTR;
1036
				}
1037
				else {
1038
					this->step = XML_STEP_NONE;
1039
					this->state = XML_IN_CDATA;
1040
				}
1041
			}
1042
			else {
1043
				this->state = XML_IN_CDATA_TAG;
1044
				this->type = XML_CDATA_TAG_CH;
1045
				SET_CHAR(this, ']');
1046
				ADD_CHAR(this, ']');
1047
				ADD_CHAR(this, ch);
1048
			}
1049
			break;
1050
	}
1051
	return(this->type);
1052
}
1053
1054
/* ------------------------------------------------------------------- */
1055
/* CSXMLProcessor */
1056
1057
bool CSXMLProcessor::buildConversionTable()
1058
{
1059
	int32_t i;
1060
1061
	/* By default we don't know how to convert any charset
1062
	 * other tha ISO-1 to unicode!
1063
	 */
1064
	if (strcasecmp(charset, "ISO-8859-1") == 0) {
1065
		for (i=0; i<128; i++)
1066
			conversion_table[i] = (wchar_t) (i + 128);
1067
	}
1068
	else {
1069
		for (i=0; i<128; i++)
1070
			conversion_table[i] = '?';
1071
	}
1072
	return true;
1073
}
1074
1075
// Private use area: E000 - F8FF
1076
1077
int32_t CSXMLProcessor::capture_initializer(wchar_t ch)
1078
/* We capture tag and attribute data for the parsing purposes.
1079
 * The buffers are initialized here (at the lowest level)
1080
 * of processing after parsing.
1081
 */
1082
{
1083
	int32_t op;
1084
1085
	op = parseChar(ch);
1086
	switch (op & XML_OP_1_MASK) {
1087
		case XML_OP_1_START_TAG:
1088
			this->tlength = 0;
1089
			break;
1090
		case XML_OP_1_ADD_ATTR:
1091
			this->nlength = 0;
1092
			this->vlength = 0;
1093
			break;
1094
	}
1095
	return(op);
1096
}
1097
1098
int32_t CSXMLProcessor::entity_translator(wchar_t ch)
1099
/* This function handles entities.
1100
 * Certain entities are translated into UNICODE characters.
1101
 * Strictly speaking, these enties are only recognised by HTML.
1102
 * The few entities that are recognised by XML are first translated
1103
 * into some reserved characters for the parser. This is to ensure
1104
 * that the parser does not recognize them as characters with special
1105
 * meaning! This includes '&', '<' and '>'.
1106
 */
1107
{
1108
	int32_t op;
1109
1110
	op = capture_initializer(ch);
1111
	return(op);
1112
}
1113
1114
/*
1115
 * This function translates the input character stream into UNICODE.
1116
 */
1117
int32_t CSXMLProcessor::charset_transformer(wchar_t ch)
1118
{
1119
	int32_t op;
1120
1121
	// Do transformation according to the charset.
1122
	switch (this->charset_type) {
1123
		case CHARSET_UTF_8:
1124
			if (ch > 127 && ch < 256) {
1125
				uint32_t utf_value;
1126
				uint8_t utf_ch = (uint8_t)ch;
1127
1128
				if ((utf_ch & 0xC0) != 0x80)
1129
					this->utf8_count = 0;
1130
				if ((utf_ch & 0x80) == 0x00)
1131
					this->utf8_length = 1;
1132
				else if ((utf_ch & 0xE0) == 0xC0)
1133
					this->utf8_length = 2;
1134
				else if ((utf_ch & 0xF0) == 0xE0)
1135
					this->utf8_length = 3;
1136
				else if ((utf_ch & 0xF8) == 0xF0)
1137
					this->utf8_length = 4;
1138
				else if ((utf_ch & 0xFC) == 0xF8)
1139
					this->utf8_length = 5;
1140
				else if ((utf_ch & 0xFE) == 0xFC)
1141
					this->utf8_length = 6;
1142
				this->utf8_buffer[this->utf8_count] = (uint32_t) utf_ch;
1143
				this->utf8_count++;
1144
				if (this->utf8_count < this->utf8_length) {
1145
					// I need more bytes!
1146
					setDataType(XML_noop);
1147
					return(XML_noop);
1148
				}
1149
				utf_value = 0;
1150
				switch (this->utf8_length) {
1151
					case 1:
1152
						utf_value = this->utf8_buffer[0] & 0x0000007F;
1153
						break;
1154
					case 2:
1155
						utf_value = ((this->utf8_buffer[0] & 0x0000001F) << 6) |
1156
									(this->utf8_buffer[1] & 0x0000003F);
1157
						if (utf_value < 0x00000080)
1158
							utf_value = '?';
1159
						break;
1160
					case 3:
1161
						utf_value = ((this->utf8_buffer[0] & 0x0000000F) << 12) |
1162
									((this->utf8_buffer[1] & 0x0000003F) << 6) |
1163
									(this->utf8_buffer[2] & 0x0000003F);
1164
						if (utf_value < 0x000000800)
1165
							utf_value = '?';
1166
						break;
1167
					case 4:
1168
						utf_value = ((this->utf8_buffer[0] & 0x00000007) << 18) |
1169
									((this->utf8_buffer[1] & 0x0000003F) << 12) |
1170
									((this->utf8_buffer[2] & 0x0000003F) << 6) |
1171
									(this->utf8_buffer[3] & 0x0000003F);
1172
						if (utf_value < 0x00010000)
1173
							utf_value = '?';
1174
						break;
1175
					case 5:
1176
						utf_value = ((this->utf8_buffer[0] & 0x00000003) << 24) |
1177
									((this->utf8_buffer[1] & 0x0000003F) << 18) |
1178
									((this->utf8_buffer[2] & 0x0000003F) << 12) |
1179
									((this->utf8_buffer[3] & 0x0000003F) << 6) |
1180
									(this->utf8_buffer[4] & 0x0000003F);
1181
						if (utf_value < 0x00200000)
1182
							utf_value = '?';
1183
						break;
1184
					case 6:
1185
						utf_value = ((this->utf8_buffer[0] & 0x00000001) << 30) |
1186
									((this->utf8_buffer[1] & 0x0000003F) << 24) |
1187
									((this->utf8_buffer[2] & 0x0000003F) << 18) |
1188
									((this->utf8_buffer[3] & 0x0000003F) << 12) |
1189
									((this->utf8_buffer[4] & 0x0000003F) << 6) |
1190
									(this->utf8_buffer[5] & 0x0000003F);
1191
						if (utf_value < 0x04000000)
1192
							utf_value = '?';
1193
						break;
1194
				}
1195
				if (utf_value > 0x0000FFFF)
1196
					ch = '?';
1197
				else
1198
					ch = utf_value;
1199
			}
1200
			break;
1201
		case CHARSET_TO_CONVERT_8_BIT:
1202
			if (ch > 127 && ch < 256)
1203
				ch = this->conversion_table[((unsigned char) ch) - 128];
1204
			break;
1205
	}
1206
1207
	op = entity_translator(ch);
1208
1209
	// Determine the characters set:
1210
	switch (op & XML_OP_1_MASK) {
1211
		case XML_OP_1_START_TAG:
1212
			if (strcmp(this->pr_tag, "?xml") == 0)
1213
				this->ip = true;
1214
			else
1215
				this->ip = false;
1216
			break;
1217
		case XML_OP_1_ADD_ATTR:
1218
			if (this->ip) {
1219
				if (strcasecmp(this->pr_name, "encoding") == 0) {
1220
					strcpy(this->charset, this->pr_value);
1644.3.1 by Barry.Leslie at PrimeBase
Merged in changes from PBMS project.
1221
					if (STRCASESTR(this->charset, "utf-8"))
1548.2.11 by Barry.Leslie at PrimeBase
Removed libxml reqirement by using a home grown xml parser.
1222
						this->charset_type = CHARSET_UTF_8;
1644.3.1 by Barry.Leslie at PrimeBase
Merged in changes from PBMS project.
1223
					else if (STRCASESTR(this->charset, "ucs-2") ||
1224
						STRCASESTR(this->charset, "ucs-4") ||
1225
						STRCASESTR(this->charset, "unicode"))
1548.2.11 by Barry.Leslie at PrimeBase
Removed libxml reqirement by using a home grown xml parser.
1226
						this->charset_type = CHARSET_STANDARD;
1227
					else {
1228
						this->charset_type = CHARSET_TO_CONVERT_8_BIT;
1229
						buildConversionTable();
1230
					}
1231
				}
1232
			}
1233
			break;
1234
	}
1235
	return(op);
1236
}
1237
1238
/*
 * Append up to slen wide characters from schars to the
 * zero-terminated string dstr.
 *
 * dstr   - destination buffer
 * dlen   - in/out: current length of the string in dstr
 * dsize  - total size of dstr, including the terminator
 * schars - characters to append
 * slen   - number of characters in schars
 *
 * Characters above 127 are stored as '~'. Characters that do not
 * fit are silently dropped; the string stays terminated.
 */
void CSXMLProcessor::appendWCharToString(char *dstr, size_t *dlen, size_t dsize, wchar_t *schars, size_t slen)
{
	/* dsize - 1 below would wrap around for an empty buffer. */
	if (dsize == 0)
		return;

	/* The length only ever grows, so we can stop as soon as the
	 * buffer is full.
	 */
	for (size_t i = 0; i < slen && *dlen < dsize - 1; i++) {
		const wchar_t wc = schars[i];

		dstr[*dlen] = (wc > 127) ? '~' : (char) wc;
		(*dlen)++;
		dstr[*dlen] = 0;
	}
}
1252
1253
int32_t CSXMLProcessor::processChar(wchar_t ch)
1254
{
1255
	int32_t op;
1256
1257
	op = charset_transformer(ch);
1258
1259
	/*
1260
	 * Capture output tag and attribute data.
1261
	 * This must be done at the highest level, after
1262
	 * parsing.
1263
	 */
1264
	switch (op & XML_DATA_MASK) {
1265
		case XML_DATA_TAG:
1266
			appendWCharToString(this->pr_tag, &this->tlength, CS_MAX_XML_NAME_SIZE, this->getDataPtr(), this->getDataLen());
1267
			break;
1268
		case XML_DATA_ATTR:
1269
			appendWCharToString(this->pr_name, &this->nlength, CS_MAX_XML_NAME_SIZE, this->getDataPtr(), this->getDataLen());
1270
			break;
1271
		case XML_DATA_VALUE:
1272
			appendWCharToString(this->pr_value, &this->vlength, CS_MAX_XML_NAME_SIZE, this->getDataPtr(), this->getDataLen());
1273
			break;
1274
	}
1275
	return(op);
1276
}
1277
1278
bool CSXMLProcessor::getError(int32_t *err, char **msg)
1279
{
1280
	*err = err_no;
1281
	*msg = err_message;
1282
	return err_no != 0;
1283
}
1284
1285
void CSXMLProcessor::setError(int32_t err, char *msg)
1286
{
1287
	err_no = err;
1288
	if (msg) {
1289
		strncpy(err_message, msg, CS_XML_ERR_MSG_SIZE);
1290
		err_message[CS_XML_ERR_MSG_SIZE-1] = 0;
1291
		return;
1292
	}
1293
1294
	switch (err) {
1295
		case CS_XML_ERR_OUT_OF_MEMORY:
1296
			sprintf(err_message, "AES parse error- insufficient memory");			
1297
			break;
1298
		case CS_XML_ERR_CHAR_TOO_LARGE:
1299
			sprintf(err_message, "AES parse error- UNICODE character too large to be encoded as UTF-8");			
1300
			break;
1301
		default:
1302
			sprintf(err_message, "AES parse error- %s", strerror(err));
1303
			break;
1304
	}
1305
}
1306
1307
void CSXMLProcessor::printError(char *prefix)
1308
{
1309
	printf("%s%s", prefix, err_message);
1310
}
1311
1312
/* ------------------------------------------------------------------- */
1313
/* CSXMLString */
1314
1315
#ifdef DEBUG_ALL
1316
#define EXTRA_SIZE			2
1317
#else
1318
#define EXTRA_SIZE			100
1319
#endif
1320
1321
bool CSXMLString::addChar(char ch, CSXMLProcessor *xml)
1322
{
1323
	char *ptr;
1324
1325
	if (stringLen + 2 > stringSize) {
1326
		if (!(ptr = (char *) realloc(stringPtr, stringLen + 2 + EXTRA_SIZE))) {
1327
			xml->setError(CS_XML_ERR_OUT_OF_MEMORY, NULL);
1328
			return false;
1329
		}
1330
		stringPtr = ptr;
1331
		stringSize = stringLen + 2 + EXTRA_SIZE;
1332
	}
1333
	stringPtr[stringLen] = ch;
1334
	stringPtr[stringLen+1] = 0;
1335
	stringLen++;
1336
	return true;
1337
}
1338
1339
bool CSXMLString::addChars(size_t size, wchar_t *buffer, bool to_lower, CSXMLProcessor *xml)
1340
{
1341
	size_t		i;
1342
	uint32_t	uni_char;
1343
	int32_t			shift;
1344
1345
	for (i=0; i<size; i++) {
1346
		uni_char = (uint32_t) buffer[i];
1347
		
1348
		/* Convertion to lower only done for ASCII! */
1349
		if (to_lower && uni_char <= 127)
1350
			uni_char = (uint32_t) tolower((int32_t) uni_char);
1351
1352
		// Convert to UTF-8!
1353
		if (uni_char <= 0x0000007F) {
1354
			if (!addChar((char) uni_char, xml))
1355
				return false;
1356
			shift = -6;
1357
		}
1358
		else if (uni_char <= 0x000007FF) {
1359
			if (!addChar((char) ((0x000000C0) | ((uni_char >> 6) & 0x0000001F)), xml))
1360
				return false;
1361
			shift = 0;
1362
		}
1363
		else if (uni_char <= 0x00000FFFF) {
1364
			if (!addChar((char) ((0x000000E0) | ((uni_char >> 12) & 0x0000000F)), xml))
1365
				return false;
1366
			shift = 6;
1367
		}
1368
		else if (uni_char <= 0x001FFFFF) {
1369
			if (!addChar((char) ((0x000000F0) | ((uni_char >> 18) & 0x00000007)), xml))
1370
				return false;
1371
			shift = 12;
1372
		}
1373
		else if (uni_char <= 0x003FFFFFF) {
1374
			if (!addChar((char) ((0x000000F0) | ((uni_char >> 24) & 0x00000003)), xml))
1375
				return false;
1376
			shift = 18;
1377
		}
1378
		else if (uni_char <= 0x07FFFFFFF) {
1379
			if (!addChar((char) ((0x000000F0) | ((uni_char >> 30) & 0x00000001)), xml))
1380
				return false;
1381
			shift = 24;
1382
		}
1383
		else {
1384
			xml->setError(CS_XML_ERR_CHAR_TOO_LARGE, NULL);
1385
			return false;
1386
		}
1387
1388
		while (shift >= 0) {
1389
			if (!addChar((char) ((0x00000080) | ((uni_char >> shift) & 0x0000003F)), xml))
1390
				return false;
1391
			shift -= 6;
1392
		}
1393
	}
1394
	return true;
1395
}
1396
1397
bool CSXMLString::addString(const char *string, CSXMLProcessor *xml)
1398
{
1399
	bool ok = true;
1400
	
1401
	while (*string && ok) {
1402
		ok = addChar(*string, xml);
1403
		string++;
1404
	}
1405
	return ok;
1406
}
1407
1408
void CSXMLString::setEmpty()
1409
{
1410
	stringLen = 0;
1411
	if (stringPtr)
1412
		*stringPtr = 0;
1413
}
1414
1415
void CSXMLString::setNull()
1416
{
1417
	if (stringPtr)
1418
		free(stringPtr);
1419
	stringPtr = NULL;
1420
	stringLen = 0;
1421
	stringSize = 0;
1422
}
1423
1424
char *CSXMLString::lastComponent()
1425
{
1426
	char *ptr;
1427
1428
	if (stringLen == 0)
1429
		return NULL;
1430
1431
	ptr = stringPtr + stringLen - 1;
1432
	while (ptr > stringPtr && *ptr != '/')
1433
		ptr--;
1434
	return ptr;
1435
}
1436
1437
/* We assume comp begins with a '/' */
1438
char *CSXMLString::findTrailingComponent(const char *comp)
1439
{
1440
	char *ptr, *last_slash;
1441
1442
	if (stringLen == 0)
1443
		return NULL;
1444
1445
	ptr = stringPtr + stringLen - 1;
1446
	last_slash = NULL;
1447
1448
	do {
1449
		/* Find the next '/' */
1450
		while (ptr > stringPtr && *ptr != '/')
1451
			ptr--;
1452
		if (last_slash)
1453
			*last_slash = 0;
1454
		if (strcmp(ptr, comp) == 0) {
1455
			if (last_slash)
1456
				*last_slash = '/';
1457
			return ptr;
1458
		}
1459
		if (last_slash)
1460
			*last_slash = '/';
1461
		last_slash = ptr;
1462
		ptr--;
1463
	}
1464
	while (ptr > stringPtr);
1465
	return NULL;
1466
}
1467
1468
void CSXMLString::truncate(char *ptr)
1469
{
1470
	*ptr = 0;
1471
	stringLen = ptr - stringPtr;
1472
}
1473
1474
/* ------------------------------------------------------------------- */
1475
/* CSXML */
1476
1477
#define IS_XML_CDATA				0
1478
#define IS_XML_CDATA_TAG			1
1479
#define IS_XML_TAG					2
1480
#define IS_XML_CLOSE_TAG			3
1481
#define IS_XML_COMMENT				4
1482
#define IS_XML_DTD					5
1483
#define IS_XML_PI					6
1484
#define IS_XML_PI_XML				7
1485
#define IS_XML_IN_EX				8
1486
#define IS_XML_OPEN_BRACKET			9
1487
#define IS_XML_CLOSE_BRACKET		10
1488
1489
int32_t CSXML::nodeType(char *name)
1490
{
1491
	if (name) {
1492
		switch (*name) {
1493
			case 0:
1494
				return IS_XML_CDATA;
1495
			case '[':
1496
				if (strlen(name) == 1)
1497
					return IS_XML_OPEN_BRACKET;
1498
				break;
1499
			case ']':
1500
				if (strlen(name) == 1)
1501
					return IS_XML_CLOSE_BRACKET;
1502
				break;
1503
			case '/':
1504
				return IS_XML_CLOSE_TAG;
1505
			case '!':
1506
				if (strlen(name) > 1) {
1507
					if (strcasecmp(name, "!--") == 0)
1508
						return IS_XML_COMMENT;
1509
					if (name[1] == '[') {
1510
						if (strcasecmp(name, "![CDATA[") == 0)
1511
							return IS_XML_CDATA_TAG;
1512
						return IS_XML_IN_EX;
1513
					}
1514
				}
1515
				return IS_XML_DTD;
1516
			case '?':
1517
				if (strcasecmp(name, "?xml") == 0)
1518
					return IS_XML_PI_XML;
1519
				return IS_XML_PI;
1520
		}
1521
		return IS_XML_TAG;
1522
	}
1523
	return IS_XML_CDATA;
1524
}
1525
1526
bool CSXML::internalCloseNode(const char *name, bool single)
1527
{
1528
	bool	ok = true;
1529
	char	*ptr;
1530
1531
	if (single) {
1532
		if ((ptr = xml_path.lastComponent())) {
1533
			ok = closeNode(xml_path.stringPtr);
1534
			xml_path.truncate(ptr);
1535
		}
1536
	}
1537
	else if ((ptr = xml_path.findTrailingComponent(name))) {
1538
		/* Close the node that is named above. If the XML is
1539
		 * correct, then the node should be at the top of the
1540
		 * node stack (last element of the path).
1541
		 *
1542
		 * If not found, "ignore" the close.
1543
		 *
1544
		 * If not found on the top of the node stack, then
1545
		 * we close serveral nodes.
1546
		 */
1547
		for (;;) {
1548
			if (!(ptr = xml_path.lastComponent()))
1549
				break;
1550
			if (!(ok = closeNode(xml_path.stringPtr)))
1551
				break;
1552
			if (strcmp(ptr, name) == 0) {
1553
				xml_path.truncate(ptr);
1554
				break;
1555
			}
1556
			xml_path.truncate(ptr);
1557
		}
1558
	}
1559
	return ok;
1560
}
1561
1562
bool CSXML::internalOpenNode(const char *name)
1563
{
1564
	bool ok;
1565
1566
	ok = xml_path.addString("/", this);
1567
	if (!ok)
1568
		return ok;
1569
	ok = xml_path.addString(name, this);
1570
	if (!ok)
1571
		return ok;
1572
	return openNode(this->xml_path.stringPtr, this->xml_value.stringPtr);
1573
}
1574
1575
bool CSXML::parseXML(int32_t my_flags)
1576
{
1577
	wchar_t	ch;
1578
	bool	ok = true;
1579
	int32_t		op;
1580
	int32_t		tagtype;
1581
1582
	this->flags = my_flags;
1583
	ok = xml_path.addChars(0, NULL, false, this);
1584
	if (!ok)
1585
		goto exit;
1586
	ok = xml_name.addChars(0, NULL, false, this);
1587
	if (!ok)
1588
		goto exit;
1589
	ok = xml_value.addChars(0, NULL, false, this);
1590
	if (!ok)
1591
		goto exit;
1592
1593
	ok = getChar(&ch);
1594
	while (ch != CS_XML_EOF_CHAR && ok) {
1595
		op = processChar(ch);
1596
		switch (op & XML_OP_1_MASK) {
1597
			case XML_OP_1_NOOP:
1598
				break;
1599
			case XML_OP_1_END_TAG:
1600
				break;
1601
			case XML_OP_1_END_CLOSE_TAG:
1602
				break;
1603
			case XML_OP_1_END_EMPTY_TAG:
1604
				ok = internalCloseNode("/>", true);
1605
				break;
1606
			case XML_OP_1_END_PI_TAG:
1607
				ok = internalCloseNode("?>", true);
1608
				break;
1609
			case XML_OP_1_END_ENTITY_TAG:
1610
				ok = internalCloseNode(">", true);
1611
				break;
1612
			case XML_OP_1_END_BRACKET_TAG:
1613
				ok = internalCloseNode("]>", true);
1614
				break;
1615
			case XML_OP_1_END_UNKNOWN_TAG:
1616
				ok = internalCloseNode(">", true);
1617
				break;
1618
			case XML_OP_1_START_CDATA_TAG:
1619
				break;
1620
			case XML_OP_1_START_COMMENT:
1621
				break;
1622
			case XML_OP_1_START_TAG:
1623
				if (nodeType(xml_name.stringPtr) == IS_XML_CLOSE_TAG)
1624
					ok = internalCloseNode(xml_name.stringPtr, false);
1625
				else
1626
					ok = internalOpenNode(xml_name.stringPtr);
1627
				xml_name.setEmpty();
1628
				xml_value.setEmpty();
1629
				break;
1630
			case XML_OP_1_ADD_ATTR:
1631
				tagtype = nodeType(xml_name.stringPtr);
1632
				if (tagtype != IS_XML_OPEN_BRACKET && tagtype != IS_XML_CLOSE_BRACKET)
1633
					ok = addAttribute(xml_path.stringPtr, xml_name.stringPtr, xml_value.stringPtr);
1634
				xml_name.setEmpty();
1635
				xml_value.setEmpty();
1636
				break;
1637
			case XML_OP_1_END_CDATA:
1638
				if (xml_value.stringLen || (my_flags & XML_KEEP_EMPTY_CDATA)) {
1639
					ok = internalOpenNode("");
1640
					xml_name.setEmpty();
1641
					xml_value.setEmpty();
1642
					ok = internalCloseNode("", true);
1643
				}
1644
				break;
1645
			case XML_OP_1_END_CDATA_TAG:
1646
				ok = internalOpenNode("![CDATA[");
1647
				xml_name.setEmpty();
1648
				xml_value.setEmpty();
1649
				if (ok)
1650
					ok = internalCloseNode("]]>", true);
1651
				break;
1652
			case XML_OP_1_END_COMMENT:
1653
				ok = internalOpenNode("!--");
1654
				xml_name.setEmpty();
1655
				xml_value.setEmpty();
1656
				if (ok)
1657
					ok = internalCloseNode("-->", true);
1658
				break;
1659
		}
1660
		if (!ok)
1661
			break;
1662
		switch (op & XML_DATA_MASK) {
1663
			case XML_DATA_TAG:
1664
			case XML_DATA_ATTR:
1665
				ok = xml_name.addChars(getDataLen(), getDataPtr(), true, this);
1666
				break;
1667
			case XML_DATA_CDATA:
1668
			case XML_DATA_CDATA_TAG:
1669
			case XML_COMMENT:
1670
			case XML_DATA_VALUE:
1671
				ok = xml_value.addChars(getDataLen(), getDataPtr(), false, this);
1672
				break;
1673
		}
1674
		if (!ok)
1675
			break;
1676
		switch (op & XML_OP_2_MASK) {
1677
			case XML_OP_2_NOOP:
1678
				break;
1679
			case XML_OP_2_END_TAG:
1680
				break;
1681
			case XML_OP_2_END_CLOSE_TAG:
1682
				break;
1683
			case XML_OP_2_END_EMPTY_TAG:
1684
				ok = internalCloseNode("/>", true);
1685
				break;
1686
			case XML_OP_2_END_PI_TAG:
1687
				ok = internalCloseNode("?>", true);
1688
				break;
1689
			case XML_OP_2_END_ENTITY_TAG:
1690
				ok = internalCloseNode(">", true);
1691
				break;
1692
			case XML_OP_2_END_BRACKET_TAG:
1693
				ok = internalCloseNode("]>", true);
1694
				break;
1695
			case XML_OP_2_END_UNKNOWN_TAG:
1696
				ok = internalCloseNode(">", true);
1697
				break;
1698
			case XML_OP_2_START_CDATA_TAG:
1699
				break;
1700
			case XML_OP_2_START_COMMENT:
1701
				break;
1702
		}
1703
		ok = getChar(&ch);
1704
	}
1705
1706
	exit:
1707
	xml_path.setNull();
1708
	xml_name.setNull();
1709
	xml_value.setNull();
1710
	return ok;
1711
}
1712
1713
/* ------------------------------------------------------------------- */
1714
/* CSXMLPrint */
1715
1716
/*
 * Print the path of the node opened and, on a line of its own,
 * the value if there is one. Always returns true.
 */
bool CSXMLPrint::openNode(char *path, char *value)
{
	const bool has_value = (value != NULL && value[0] != 0);

	printf("OPEN  %s\n", path);
	if (has_value)
		printf("      %s\n", value);
	return true;
}
1723
1724
/*
 * Print the path of the node closed. Always returns true.
 */
bool CSXMLPrint::closeNode(char *path)
{
	fprintf(stdout, "close %s\n", path);
	return true;
}
1729
1730
/*
 * Print an attribute of the node at path, as "name=value", or
 * as the name alone if value is NULL. Always returns true.
 */
bool CSXMLPrint::addAttribute(char *path, char *name, char *value)
{
	if (!value) {
		printf("attr  %s %s\n", path, name);
		return true;
	}

	printf("attr  %s %s=%s\n", path, name, value);
	return true;
}
1738
1739
/* ------------------------------------------------------------------- */
1740
/* CSXMLBuffer */
1741
1742
bool CSXMLBuffer::parseString(const char *data, int32_t my_flags)
1743
{
1744
	charData = data;
1745
	dataLen = strlen(data);
1746
	dataPos = 0;
1747
	return parseXML(my_flags);
1748
}
1749
1750
bool CSXMLBuffer::parseData(const char *data, size_t len, int32_t my_flags)
1751
{
1752
	charData = data;
1753
	dataLen = len;
1754
	dataPos = 0;
1755
	return parseXML(my_flags);
1756
}
1757
1758
bool CSXMLBuffer::getChar(wchar_t *ch)
1759
{
1760
	if (dataPos == dataLen)
1761
		*ch = CS_XML_EOF_CHAR;
1762
	else {
1763
		*ch = (wchar_t) (unsigned char) charData[dataPos];
1764
		dataPos++;
1765
	}
1766
	return true;
1767
}
1768
1769
/* ------------------------------------------------------------------- */
1770
/* CSXMLFile */
1771
1772
bool CSXMLFile::parseFile(char *file_name, int32_t my_flags)
1773
{
1774
	bool ok;
1775
1776
	if (!(this->file = fopen(file_name, "r"))) {
1777
		setError(errno, NULL);
1778
		return false;
1779
	}
1780
	ok = parseXML(my_flags);
1781
	fclose(this->file);
1782
	return ok;
1783
}
1784
1785
bool CSXMLFile::getChar(wchar_t *ch)
1786
{
1787
	int32_t next_ch;
1788
	
1789
	next_ch = fgetc(file);
1790
	if (next_ch == EOF) {
1791
		if (ferror(file)) {
1792
			setError(errno, NULL);
1793
			return false;
1794
		}
1795
		*ch = CS_XML_EOF_CHAR;
1796
	}
1797
	else
1798
		*ch = (wchar_t) next_ch;
1799
	return true;
1800
}
1801
1802