openMSX
rapidsax.hh
Go to the documentation of this file.
1#ifndef RAPIDSAX_HH
2#define RAPIDSAX_HH
3
4// This code is _heavily_ based on RapidXml 1.13
5// http://rapidxml.sourceforge.net/
6//
7// RapidXml is a very fast XML parser.
8// http://xmlbench.sourceforge.net/results/benchmark200910/index.html
9// One of the main reasons it can be this fast is that it doesn't do any string
10// copies. Instead the XML input data is modified in-place (e.g. for stuff like
11// &lt; replacements). Though this also means the output produced by the parser
12// is tied to the lifetime of the XML input data.
13//
14// RapidXml produces a DOM-like output. This parser has a SAX-like interface.
15
16#include "one_of.hh"
17#include "small_compare.hh"
18#include <array>
19#include <cassert>
20#include <cstdint>
21#include <string_view>
22
23namespace rapidsax {
24
25// Parse given XML text and call callback functions in the given handler.
26// - XML text must be zero-terminated
27// - Handler must implement the methods defined in NullHandler (below). An
28// easy way to do this is to inherit from NullHandler and only reimplement
29// the methods that you need.
30// - The behavior of the parser can be fine-tuned with the FLAGS parameter,
31// see below for more details.
32// - When a parse error is encountered, an instance of ParseError is thrown.
33// - The lifetime of the string_view's in the callback handler is the same as
34// the lifetime of the input XML data (no string copies are made, instead
35// the XML file is modified in-place and references to this data are passed).
36template<int FLAGS, typename HANDLER> void parse(HANDLER& handler, char* xml);
37
// When loading an XML file from disk, the buffer needs to be 8 bytes bigger
// than the filesize. The first of these bytes must be filled with zero
// (zero-terminate the xml data). The other bytes are only there to allow to
// read up-to 8 bytes past the end without triggering memory protection errors.
inline constexpr size_t EXTRA_BUFFER_SPACE = 8;
43
44
// Flags that influence parsing behavior. The flags can be OR'ed together and
// are passed as the FLAGS template argument.

// When set, XML entities like &lt; are NOT expanded (without this flag they
// are replaced in-place by the character they stand for).
inline constexpr int noEntityTranslation  = 0x1;
// When set, leading and trailing whitespace of text is trimmed.
inline constexpr int trimWhitespace       = 0x2;
// When set, sequences of whitespace characters in text are replaced with a
// single space character (attribute values are never normalized).
inline constexpr int normalizeWhitespace  = 0x4;
// When set, strings are modified (in-place) with a zero-terminator.
inline constexpr int zeroTerminateStrings = 0x8;
56
57
// Callback handler with all empty implementations (can be used as a base
// class in case you only need to reimplement a few of the methods).
// The string_view arguments refer to the (in-place modified) XML input
// buffer, so they are only valid as long as that buffer is.
class NullHandler
{
public:
	// Called when an opening XML tag is encountered.
	// 'name' is the name of the XML tag.
	void start(std::string_view /*name*/) {}

	// Called when a XML tag is closed.
	// Note: the parser does currently not check whether the name of the
	//       opening and closing tags matches.
	void stop() {}

	// Called when text inside a tag is parsed.
	// XML entities are replaced (optional)
	// Whitespace is (optionally) trimmed or normalized.
	// This method is not called for an empty text string.
	// (Unlike other SAX parsers) the whole text string is always
	// passed in a single chunk (so no need to concatenate this text
	// with previous chunks in the callback).
	void text(std::string_view /*text*/) {}

	// Called for each parsed attribute of an XML tag.
	// (Attributes inside an XML declaration are reported via
	// declAttribute() instead.)
	void attribute(std::string_view /*name*/, std::string_view /*value*/) {}

	// Called for parsed CDATA sections.
	void cdata(std::string_view /*value*/) {}

	// Called when a XML comment (<!-- ... -->) is parsed.
	void comment(std::string_view /*value*/) {}

	// Called when XML declaration (<?xml .. ?>) is parsed.
	// Inside a XML declaration there can be attributes: the parser calls
	// declarationStart(), then declAttribute() once per attribute and
	// finally declarationStop().
	void declarationStart() {}
	void declAttribute(std::string_view /*name*/, std::string_view /*value*/) {}
	void declarationStop() {}

	// Called when the <!DOCTYPE ..> is parsed.
	void doctype(std::string_view /*text*/) {}

	// Called when XML processing instructions (<? .. ?>) are parsed.
	void procInstr(std::string_view /*target*/, std::string_view /*instr*/) {}
};
103
104
// Exception object thrown by the parser when the XML input is malformed.
// It only stores the two pointers it is given (nothing is copied), so both
// the message and the input buffer must outlive the exception object.
class ParseError
{
public:
	// 'what_':  description of the problem.
	// 'where_': position in the XML input buffer where it was detected.
	ParseError(const char* what_, char* where_)
		: m_what(what_)
		, m_where(where_)
	{
	}

	[[nodiscard]] const char* what() const { return m_what; }
	[[nodiscard]] char* where() const { return m_where; }

private:
	const char* m_what;
	char* m_where;
};
121
122
123namespace internal {
124
// Character class lookup table, indexed by the (unsigned) character value.
// Each bit of an entry marks membership of one character class:
//   bit 0 (0x01): \0
//   bit 1 (0x02): \t \n \r space
//   bit 2 (0x04): <
//   bit 3 (0x08): &
//   bit 4 (0x10): '
//   bit 5 (0x20): "
//   bit 6 (0x40): / > ?
//   bit 7 (0x80): ! =
// The predicate structs below test an entry against a mask of these bits.
inline constexpr std::array<const uint8_t, 256> lutChar =
{
	// 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
	0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x02,0x00,0x00, // 0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 1
	0x02,0x80,0x20,0x00,0x00,0x00,0x08,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x40, // 2
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x04,0x80,0x40,0x40, // 3
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 4
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 5
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 6
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 7
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 8
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 9
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // A
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // B
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // C
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // D
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // E
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // F
};
154
// Digit value lookup table, indexed by the (unsigned) character value.
// '0'-'9' map to 0-9, 'A'-'F' and 'a'-'f' map to 10-15. Every other
// character maps to 255, which denotes the end of a numeric character
// reference. Note that the hex letters are present in this table, so a
// decimal parser must additionally reject values above 9.
inline constexpr std::array<const uint8_t, 256> lutDigits =
{
	// 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 1
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 2
	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,255,255,255,255,255,255, // 3
	255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255, // 4
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 5
	255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255, // 6
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 7
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 8
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 9
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // A
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // B
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // C
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // D
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // E
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255  // F
};
176
177// Detect whitespace character (space \n \r \t)
178struct WhitespacePred {
179 [[nodiscard]] static bool test(char ch) { return (lutChar[uint8_t(ch)] & 0x02) != 0; }
180};
181
182// Detect node name character (anything but space \n \r \t / > ? \0)
183struct NodeNamePred {
184 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x43); }
185};
186
187// Detect attribute name character (anything but space \n \r \t / < > = ? ! \0)
188struct AttributeNamePred {
189 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0xC7); }
190};
191
192// Detect text character (PCDATA) (anything but < \0)
193struct TextPred {
194 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x05); }
195};
196
197// Detect text character (PCDATA) that does not require processing when ws
198// normalization is disabled (anything but < \0 &)
199struct TextPureNoWsPred {
200 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x0D); }
201};
202
203// Detect text character (PCDATA) that does not require processing when ws
204// normalization is enabled (anything but < \0 & space \n \r \t)
205struct TextPureWithWsPred {
206 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x0F); }
207};
208
209// Detect attribute value character, single quote (anything but ' \0)
210struct AttPred1 {
211 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x11); }
212};
213// Detect attribute value character, double quote (anything but " \0)
214struct AttPred2 {
215 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x21); }
216};
217
218// Detect attribute value character, single quote, that does not require
219// processing (anything but ' \0 &)
220struct AttPurePred1 {
221 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x19); }
222};
223// Detect attribute value character, double quote, that does not require
224// processing (anything but " \0 &)
225struct AttPurePred2 {
226 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x29); }
227};
228
229// Insert coded character, using UTF8
230inline void insertUTF8char(char*& text, uint32_t code)
231{
232 if (code < 0x80) { // 1 byte sequence
233 text[0] = char(code);
234 text += 1;
235 } else if (code < 0x800) {// 2 byte sequence
236 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
237 text[0] = char (code | 0xC0);
238 text += 2;
239 } else if (code < 0x10000) { // 3 byte sequence
240 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
241 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
242 text[0] = char (code | 0xE0);
243 text += 3;
244 } else if (code < 0x110000) { // 4 byte sequence
245 text[3] = char((code | 0x80) & 0xBF); code >>= 6;
246 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
247 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
248 text[0] = char (code | 0xF0);
249 text += 4;
250 } else { // Invalid, only codes up to 0x10FFFF are allowed in Unicode
251 throw ParseError("invalid numeric character entity", text);
252 }
253}
254
255template<StringLiteral Str> [[nodiscard]] static inline bool next(const char* p)
256{
257 return small_compare<Str>(p);
258}
259
260
// Skip characters for as long as the predicate evaluates to true.
// On return 'text' points at the first character for which StopPred::test()
// returns false. All predicates in this file return false for '\0', so the
// scan never runs past the zero-terminator of the input.
template<typename StopPred> static inline void skip(char*& text)
{
	// Iterate on a local copy and write the result back once at the end.
	char* tmp = text;
	while (StopPred::test(*tmp)) ++tmp;
	text = tmp;
}
268
269// Skip characters until predicate evaluates to true while doing the following:
270// - replacing XML character entity references with proper characters
271// (&apos; &amp; &quot; &lt; &gt; &#...;)
272// - condensing whitespace sequences to single space character
273template<typename StopPred, class StopPredPure, int FLAGS>
274[[nodiscard]] static inline char* skipAndExpand(char*& text)
275{
276 // If entity translation, whitespace condense and whitespace
277 // trimming is disabled, use plain skip.
278 if constexpr ( (FLAGS & noEntityTranslation) &&
279 !(FLAGS & normalizeWhitespace) &&
280 !(FLAGS & trimWhitespace)) {
281 skip<StopPred>(text);
282 return text;
283 }
284
285 // Use simple skip until first modification is detected
286 skip<StopPredPure>(text);
287
288 // Use translation skip
289 char* src = text;
290 char* dest = src;
291 while (StopPred::test(*src)) {
292 // Test if replacement is needed
293 if (!(FLAGS & noEntityTranslation) &&
294 (src[0] == '&')) {
295 switch (src[1]) {
296 case 'a': // &amp; &apos;
297 if (next<"amp;">(&src[1])) {
298 *dest = '&';
299 ++dest;
300 src += 5;
301 continue;
302 }
303 if (next<"pos;">(&src[2])) {
304 *dest = '\'';
305 ++dest;
306 src += 6;
307 continue;
308 }
309 break;
310
311 case 'q': // &quot;
312 if (next<"uot;">(&src[2])) {
313 *dest = '"';
314 ++dest;
315 src += 6;
316 continue;
317 }
318 break;
319
320 case 'g': // &gt;
321 if (next<"t;">(&src[2])) {
322 *dest = '>';
323 ++dest;
324 src += 4;
325 continue;
326 }
327 break;
328
329 case 'l': // &lt;
330 if (next<"t;">(&src[2])) {
331 *dest = '<';
332 ++dest;
333 src += 4;
334 continue;
335 }
336 break;
337
338 case '#': // &#...; - assumes ASCII
339 if (src[2] == 'x') {
340 uint32_t code = 0;
341 src += 3; // skip &#x
342 while (true) {
343 uint8_t digit = lutDigits[uint8_t(*src)];
344 if (digit == 0xFF) break;
345 code = code * 16 + digit;
346 ++src;
347 }
348 insertUTF8char(dest, code);
349 } else {
350 uint32_t code = 0;
351 src += 2; // skip &#
352 while (true) {
353 uint8_t digit = lutDigits[uint8_t(*src)];
354 if (digit == 0xFF) break;
355 code = code * 10 + digit;
356 ++src;
357 }
358 insertUTF8char(dest, code);
359 }
360 if (*src != ';') {
361 throw ParseError("expected ;", src);
362 }
363 ++src;
364 continue;
365
366 default:
367 // Something else, ignore, just copy '&' verbatim
368 break;
369 }
370 }
371
372 // Test if condensing is needed
373 if ((FLAGS & normalizeWhitespace) &&
374 (WhitespacePred::test(*src))) {
375 *dest++ = ' '; // single space in dest
376 ++src; // skip first whitespace char
377 // Skip remaining whitespace chars
378 while (WhitespacePred::test(*src)) ++src;
379 continue;
380 }
381
382 // No replacement, only copy character
383 *dest++ = *src++;
384 }
385
386 // Return new end
387 text = src;
388 return dest;
389}
390
391inline void skipBOM(char*& text)
392{
393 if (next<"\357\273\277">(text)) { // char(0xEF), char(0xBB), char(0xBF)
394 text += 3; // skip utf-8 bom
395 }
396}
397
398
399template<int FLAGS, typename HANDLER> class Parser
400{
401 HANDLER& handler;
402
403public:
404 Parser(HANDLER& handler_, char* text)
405 : handler(handler_)
406 {
407 skipBOM(text);
408 while (true) {
409 // Skip whitespace before node
410 skip<WhitespacePred>(text);
411 if (*text == 0) break;
412
413 if (*text != '<') {
414 throw ParseError("expected <", text);
415 }
416 ++text; // skip '<'
417 parseNode(text);
418 }
419 }
420
421private:
422 // Parse XML declaration (<?xml...)
423 void parseDeclaration(char*& text)
424 {
425 handler.declarationStart();
426 skip<WhitespacePred>(text); // skip ws before attributes or ?>
427 parseAttributes(text, true);
428 handler.declarationStop();
429
430 // skip ?>
431 if (!next<"?>">(text)) {
432 throw ParseError("expected ?>", text);
433 }
434 text += 2;
435 }
436
437 // Parse XML comment (<!--...)
438 void parseComment(char*& text)
439 {
440 // Skip until end of comment
441 const char* value = text; // remember value start
442 while (!next<"-->">(text)) {
443 if (text[0] == 0) {
444 throw ParseError("unexpected end of data", text);
445 }
446 ++text;
447 }
448 if (FLAGS & zeroTerminateStrings) {
449 *text = '\0';
450 }
451 handler.comment(std::string_view(value, text - value));
452 text += 3; // skip '-->'
453 }
454
455 void parseDoctype(char*& text)
456 {
457 const char* value = text; // remember value start
458
459 // skip to >
460 while (*text != '>') {
461 switch (*text) {
462 case '[': {
463 // If '[' encountered, scan for matching ending
464 // ']' using naive algorithm with depth. This
465 // works for all W3C test files except for 2
466 // most wicked.
467 ++text; // skip '['
468 int depth = 1;
469 while (depth > 0) {
470 switch (*text) {
471 case char('['): ++depth; break;
472 case char(']'): --depth; break;
473 case 0: throw ParseError(
474 "unexpected end of data", text);
475 }
476 ++text;
477 }
478 break;
479 }
480 case '\0':
481 throw ParseError("unexpected end of data", text);
482
483 default:
484 ++text;
485 }
486 }
487
488 if (FLAGS & zeroTerminateStrings) {
489 *text = '\0';
490 }
491 handler.doctype(std::string_view(value, text - value));
492 text += 1; // skip '>'
493 }
494
495 void parsePI(char*& text)
496 {
497 // Extract PI target name
498 const char* name = text;
499 skip<NodeNamePred>(text);
500 char* nameEnd = text;
501 if (name == nameEnd) {
502 throw ParseError("expected PI target", text);
503 }
504
505 // Skip whitespace between pi target and pi
506 skip<WhitespacePred>(text);
507
508 // Skip to '?>'
509 const char* value = text; // Remember start of pi
510 while (!next<"?>">(text)) {
511 if (*text == 0) {
512 throw ParseError("unexpected end of data", text);
513 }
514 ++text;
515 }
516 // Set pi value (verbatim, no entity expansion or ws normalization)
517 if (FLAGS & zeroTerminateStrings) {
518 *nameEnd = '\0';
519 *text = '\0';
520 }
521 handler.procInstr(std::string_view(name, nameEnd - name),
522 std::string_view(value, text - value));
523 text += 2; // skip '?>'
524 }
525
526 void parseText(char*& text, char* contentsStart)
527 {
528 // Backup to contents start if whitespace trimming is disabled
529 if constexpr (!(FLAGS & trimWhitespace)) {
530 text = contentsStart;
531 }
532 // Skip until end of data
533 const char* value = text;
534 char* end = (FLAGS & normalizeWhitespace)
535 ? skipAndExpand<TextPred, TextPureWithWsPred, FLAGS>(text)
536 : skipAndExpand<TextPred, TextPureNoWsPred , FLAGS>(text);
537
538 // Trim trailing whitespace; leading was already trimmed by
539 // whitespace skip after >
540 if constexpr ((FLAGS & trimWhitespace) != 0) {
541 if constexpr (FLAGS & normalizeWhitespace) {
542 // Whitespace is already condensed to single
543 // space characters by skipping function, so
544 // just trim 1 char off the end.
545 if (end[-1] == ' ') {
546 --end;
547 }
548 } else {
549 // Backup until non-whitespace character is found
550 while (WhitespacePred::test(end[-1])) {
551 --end;
552 }
553 }
554 }
555
556 // check next char before calling handler.text()
557 if (*text == '\0') {
558 throw ParseError("unexpected end of data", text);
559 } else {
560 assert(*text == '<');
561 }
562
563 // Handle text, but only if non-empty.
564 auto len = end - value;
565 if (len) {
566 if (FLAGS & zeroTerminateStrings) {
567 *end = '\0';
568 }
569 handler.text(std::string_view(value, len));
570 }
571 }
572
573 void parseCdata(char*& text)
574 {
575 // Skip until end of cdata
576 const char* value = text;
577 while (!next<"]]>">(text)) {
578 if (text[0] == 0) {
579 throw ParseError("unexpected end of data", text);
580 }
581 ++text;
582 }
583 if (FLAGS & zeroTerminateStrings) {
584 *text = '\0';
585 }
586 handler.cdata(std::string_view(value, text - value));
587 text += 3; // skip ]]>
588 }
589
590 void parseElement(char*& text)
591 {
592 // Extract element name
593 const char* name = text;
594 skip<NodeNamePred>(text);
595 char* nameEnd = text;
596 if (name == nameEnd) {
597 throw ParseError("expected element name", text);
598 }
599 handler.start(std::string_view(name, nameEnd - name));
600
601 skip<WhitespacePred>(text); // skip ws before attributes or >
602 parseAttributes(text, false);
603
604 // Determine ending type
605 if (*text == '>') {
606 if (FLAGS & zeroTerminateStrings) {
607 *nameEnd = '\0';
608 }
609 ++text;
610 parseNodeContents(text);
611 } else if (*text == '/') {
612 if (FLAGS & zeroTerminateStrings) {
613 *nameEnd = '\0';
614 }
615 handler.stop();
616 ++text;
617 if (*text != '>') {
618 throw ParseError("expected >", text);
619 }
620 ++text;
621 } else {
622 throw ParseError("expected >", text);
623 }
624 }
625
626 // Determine node type, and parse it
627 void parseNode(char*& text)
628 {
629 switch (text[0]) {
630 case '?': // <?...
631 ++text; // skip ?
632 // Note: this doesn't detect mixed case (xMl), does
633 // that matter?
634 if ((next<"xml">(text) || next<"XML">(text)) &&
635 WhitespacePred::test(text[3])) {
636 // '<?xml ' - xml declaration
637 text += 4; // skip 'xml '
638 parseDeclaration(text);
639 } else {
640 parsePI(text);
641 }
642 break;
643
644 case '!': // <!...
645 // Parse proper subset of <! node
646 switch (text[1]) {
647 case '-': // <!-
648 if (text[2] == '-') {
649 // '<!--' - xml comment
650 text += 3; // skip '!--'
651 parseComment(text);
652 return;
653 }
654 break;
655
656 case '[': // <![
657 if (next<"CDATA[">(&text[2])) {
658 // '<![CDATA[' - cdata
659 text += 8; // skip '![CDATA['
660 parseCdata(text);
661 return;
662 }
663 break;
664
665 case 'D': // <!D
666 if (next<"OCTYPE">(&text[2]) &&
667 WhitespacePred::test(text[8])) {
668 // '<!DOCTYPE ' - doctype
669 text += 9; // skip '!DOCTYPE '
670 parseDoctype(text);
671 return;
672 }
673 break;
674 }
675 // Attempt to skip other, unrecognized types starting with <!
676 ++text; // skip !
677 while (*text != '>') {
678 if (*text == 0) {
679 throw ParseError(
680 "unexpected end of data", text);
681 }
682 ++text;
683 }
684 ++text; // skip '>'
685 break;
686
687 default: // <...
688 parseElement(text);
689 break;
690 }
691 }
692
693 // Parse contents of the node - children, data etc.
694 void parseNodeContents(char*& text)
695 {
696 while (true) {
697 char* contentsStart = text; // start before ws is skipped
698 skip<WhitespacePred>(text); // Skip ws between > and contents
699
700 switch (*text) {
701 case '<': // Node closing or child node
702afterText: // After parseText() jump here instead of continuing
703 // the loop, because skipping whitespace is unnecessary.
704 if (text[1] == '/') {
705 // Node closing
706 text += 2; // skip '</'
707 skip<NodeNamePred>(text);
708 // TODO validate closing tag??
709 handler.stop();
710 // Skip remaining whitespace after node name
711 skip<WhitespacePred>(text);
712 if (*text != '>') {
713 throw ParseError("expected >", text);
714 }
715 ++text; // skip '>'
716 return;
717 } else {
718 // Child node
719 ++text; // skip '<'
720 parseNode(text);
721 }
722 break;
723
724 case '\0':
725 throw ParseError("unexpected end of data", text);
726
727 default:
728 parseText(text, contentsStart);
729 goto afterText;
730 }
731 }
732 }
733
734 // Parse XML attributes of the node
735 void parseAttributes(char*& text, bool declaration)
736 {
737 // For all attributes
738 while (AttributeNamePred::test(*text)) {
739 // Extract attribute name
740 char* name = text;
741 ++text; // Skip first character of attribute name
742 skip<AttributeNamePred>(text);
743 char* nameEnd = text;
744 if (name == nameEnd) {
745 throw ParseError("expected attribute name", name);
746 }
747
748 skip<WhitespacePred>(text); // skip ws after name
749 if (*text != '=') {
750 throw ParseError("expected =", text);
751 }
752 ++text; // skip =
753 skip<WhitespacePred>(text); // skip ws after =
754
755 // Skip quote and remember if it was ' or "
756 char quote = *text;
757 if (quote != one_of('\'', '"')) {
758 throw ParseError("expected ' or \"", text);
759 }
760 ++text;
761
762 // Extract attribute value and expand char refs in it
763 // No whitespace normalization in attributes
764 constexpr int FLAGS2 = FLAGS & ~normalizeWhitespace;
765 const char* value = text;
766 char* valueEnd = (quote == '\'')
767 ? skipAndExpand<AttPred1, AttPurePred1, FLAGS2>(text)
768 : skipAndExpand<AttPred2, AttPurePred2, FLAGS2>(text);
769 // Make sure that end quote is present
770 // check before calling handler.xxx()
771 if (*text != quote) {
772 throw ParseError("expected ' or \"", text);
773 }
774 ++text; // skip quote
775
776 if (FLAGS & zeroTerminateStrings) {
777 *nameEnd = '\0';
778 *valueEnd = '\0';
779 }
780 if (!declaration) {
781 handler.attribute(std::string_view(name, nameEnd - name),
782 std::string_view(value, valueEnd - value));
783 } else {
784 handler.declAttribute(std::string_view(name, nameEnd - name),
785 std::string_view(value, valueEnd - value));
786 }
787
788 skip<WhitespacePred>(text); // skip ws after value
789 }
790 }
791};
792
793} // namespace internal
794
795template<int FLAGS, typename HANDLER>
796inline void parse(HANDLER& handler, char* xml)
797{
798 internal::Parser<FLAGS, HANDLER> parser(handler, xml);
799}
800
801} // namespace rapidsax
802
803#endif
void test(const IterableBitSet< N > &s, std::initializer_list< size_t > list)
void declAttribute(std::string_view, std::string_view)
Definition rapidsax.hh:94
void text(std::string_view)
Definition rapidsax.hh:79
void comment(std::string_view)
Definition rapidsax.hh:89
void procInstr(std::string_view, std::string_view)
Definition rapidsax.hh:101
void attribute(std::string_view, std::string_view)
Definition rapidsax.hh:83
void doctype(std::string_view)
Definition rapidsax.hh:98
void cdata(std::string_view)
Definition rapidsax.hh:86
void start(std::string_view)
Definition rapidsax.hh:65
char * where() const
Definition rapidsax.hh:115
ParseError(const char *what_, char *where_)
Definition rapidsax.hh:108
const char * what() const
Definition rapidsax.hh:114
constexpr int noEntityTranslation
Definition rapidsax.hh:48
constexpr int zeroTerminateStrings
Definition rapidsax.hh:55
constexpr int trimWhitespace
Definition rapidsax.hh:50
void parse(HANDLER &handler, char *xml)
Definition rapidsax.hh:796
constexpr int normalizeWhitespace
Definition rapidsax.hh:53
constexpr size_t EXTRA_BUFFER_SPACE
Definition rapidsax.hh:42
constexpr auto end(const zstring_view &x)