openMSX
rapidsax.hh
Go to the documentation of this file.
1#ifndef RAPIDSAX_HH
2#define RAPIDSAX_HH
3
4// This code is _heavily_ based on RapidXml 1.13
5// http://rapidxml.sourceforge.net/
6//
7// RapidXml is a very fast XML parser.
8// http://xmlbench.sourceforge.net/results/benchmark200910/index.html
9// One of the main reasons it can be this fast is that it doesn't do any string
10// copies. Instead the XML input data is modified in-place (e.g. for stuff like
11// &lt; replacements). Though this also means the output produced by the parser
12// is tied to the lifetime of the XML input data.
13//
14// RapidXml produces a DOM-like output. This parser has a SAX-like interface.
15
16#include "one_of.hh"
17#include "small_compare.hh"
18#include "zstring_view.hh"
19
20#include <array>
21#include <cassert>
22#include <cstdint>
23#include <string_view>
24
25namespace rapidsax {
26
27// Parse given XML text and call callback functions in the given handler.
28// - XML text must be zero-terminated
29// - Handler must implement the methods defined in NullHandler (below). An
30// easy way to do this is to inherit from NullHandler and only reimplement
31// the methods that you need.
32// - The behavior of the parser can be fine-tuned with the FLAGS parameter,
33// see below for more details.
34// - When a parse error is encountered, an instance of ParseError is thrown.
35// - The lifetime of the string_view's in the callback handler is the same as
36// the lifetime of the input XML data (no string copies are made, instead
37// the XML file is modified in-place and references to this data are passed).
38template<int FLAGS, typename HANDLER> void parse(HANDLER& handler, char* xml);
39
// Number of bytes a buffer holding an XML file must have beyond the file
// size. The first extra byte must be zero (it terminates the XML text); the
// remaining bytes only exist so that reads of up to 8 bytes past the end of
// the data cannot trigger memory protection errors.
inline constexpr size_t EXTRA_BUFFER_SPACE{8};
45
46
// Bit flags that tune the parser. Combine them with bitwise OR and pass the
// result as the FLAGS template argument.

// When set, XML entities like &lt; are NOT expanded.
inline constexpr int noEntityTranslation  = 1 << 0;
// When set, leading and trailing whitespace is trimmed.
inline constexpr int trimWhitespace       = 1 << 1;
// When set, each sequence of whitespace characters is replaced with a
// single space character.
inline constexpr int normalizeWhitespace  = 1 << 2;
// When set, reported strings are zero-terminated by modifying the input
// in-place.
inline constexpr int zeroTerminateStrings = 1 << 3;
58
59
60// Callback handler with all empty implementations (can be used as a base
61// class in case you only need to reimplement a few of the methods).
62//
63// Several methods are overloaded with either 'std::string_view' or
64// 'zstring_view' parameter types. Which one gets called depends on whether the
65// 'zeroTerminateStrings' flag was passed.
67{
68public:
69 // Called when an opening XML tag is encountered.
70 // 'name' is the name of the XML tag.
71 void start(std::string_view /*name*/) {}
72 void start(zstring_view /*name*/) {}
73
74 // Called when a XML tag is closed.
75 // Note: the parser does currently not check whether the name of the
76 // opening nd closing tags matches.
77 void stop() {}
78
79 // Called when text inside a tag is parsed.
80 // XML entities are replaced (optional)
81 // Whitespace is (optionally) trimmed or normalized.
82 // This method is not called for an empty text string.
83 // (Unlike other SAX parsers) the whole text string is always
84 // passed in a single chunk (so no need to concatenate this text
85 // with previous chunks in the callback).
86 void text(std::string_view /*text*/) {}
87 void text(zstring_view /*text*/) {}
88
89 // Called for each parsed attribute.
90 // Attributes can occur inside xml tags or inside XML declarations.
91 void attribute(std::string_view /*name*/, std::string_view /*value*/) {}
92 void attribute(zstring_view /*name*/, zstring_view /*value*/) {}
93
94 // Called for parsed CDATA sections.
95 void cdata(std::string_view /*value*/) {}
96 void cdata(zstring_view /*value*/) {}
97
98 // Called when a XML comment (<!-- ... -->) is parsed.
99 void comment(std::string_view /*value*/) {}
100 void comment(zstring_view /*value*/) {}
101
102 // Called when XML declaration (<?xml .. ?>) is parsed.
103 // Inside a XML declaration there can be attributes.
105 void declAttribute(std::string_view /*name*/, std::string_view /*value*/) {}
106 void declAttribute(zstring_view /*name*/, zstring_view /*value*/) {}
108
109 // Called when the <!DOCTYPE ..> is parsed.
110 void doctype(std::string_view /*text*/) {}
111 void doctype(zstring_view /*text*/) {}
112
113 // Called when XML processing instructions (<? .. ?>) are parsed.
114 void procInstr(std::string_view /*target*/, std::string_view /*instr*/) {}
115 void procInstr(zstring_view /*target*/, zstring_view /*instr*/) {}
116};
117
118
// Thrown by the parser when the XML input is malformed.
// - what():  short description of the problem. Only the pointer is stored
//            (the string is not copied), so it must outlive this object;
//            the parser itself always passes string literals.
// - where(): the position that was passed along with the error; the parser
//            passes a pointer into the XML buffer it was parsing.
class ParseError
{
public:
	ParseError(const char* what_, char* where_)
		: m_what(what_)
		, m_where(where_)
	{
	}

	[[nodiscard]] const char* what() const { return m_what; }
	[[nodiscard]] char* where() const { return m_where; }

private:
	const char* m_what;
	char* m_where;
};
135
136
137namespace internal {
138
// Character class lookup table, indexed by the (unsigned) character value.
// Each entry is a bit mask telling to which classes the character belongs:
//   bit 0 (0x01): \0
//   bit 1 (0x02): \t \n \r space
//   bit 2 (0x04): <
//   bit 3 (0x08): &
//   bit 4 (0x10): '
//   bit 5 (0x20): "
//   bit 6 (0x40): / > ?
//   bit 7 (0x80): ! =
inline constexpr std::array<const uint8_t, 256> lutChar = [] {
	constexpr uint8_t o = 0x00; // belongs to no class
	constexpr uint8_t Z = 0x01; // \0
	constexpr uint8_t W = 0x02; // whitespace
	constexpr uint8_t L = 0x04; // <
	constexpr uint8_t A = 0x08; // &
	constexpr uint8_t S = 0x10; // '
	constexpr uint8_t D = 0x20; // "
	constexpr uint8_t E = 0x40; // / > ?
	constexpr uint8_t X = 0x80; // ! =
	return std::array<const uint8_t, 256>{
	//      0 1 2 3 4 5 6 7 8 9 A B C D E F
		Z,o,o,o,o,o,o,o,o,W,W,o,o,W,o,o, // 0
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // 1
		W,X,D,o,o,o,A,S,o,o,o,o,o,o,o,E, // 2
		o,o,o,o,o,o,o,o,o,o,o,o,L,X,E,E, // 3
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // 4
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // 5
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // 6
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // 7
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // 8
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // 9
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // A
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // B
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // C
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // D
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // E
		o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o, // F
	};
}();
168
// Maps a character to its value as a digit: '0'-'9' give 0-9, 'A'-'F' and
// 'a'-'f' give 10-15. Every other character maps to 255, which marks the end
// of a numeric character reference.
inline constexpr std::array<const uint8_t, 256> lutDigits = [] {
	constexpr uint8_t X = 255; // not a digit
	return std::array<const uint8_t, 256>{
	//      0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // 0
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // 1
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // 2
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, X, X, X, X, X, X, // 3
		X,10,11,12,13,14,15, X, X, X, X, X, X, X, X, X, // 4
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // 5
		X,10,11,12,13,14,15, X, X, X, X, X, X, X, X, X, // 6
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // 7
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // 8
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // 9
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // A
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // B
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // C
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // D
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // E
		X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, // F
	};
}();
190
191// Detect whitespace character (space \n \r \t)
192struct WhitespacePred {
193 [[nodiscard]] static bool test(char ch) { return (lutChar[uint8_t(ch)] & 0x02) != 0; }
194};
195
196// Detect node name character (anything but space \n \r \t / > ? \0)
197struct NodeNamePred {
198 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x43); }
199};
200
201// Detect attribute name character (anything but space \n \r \t / < > = ? ! \0)
202struct AttributeNamePred {
203 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0xC7); }
204};
205
206// Detect text character (PCDATA) (anything but < \0)
207struct TextPred {
208 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x05); }
209};
210
211// Detect text character (PCDATA) that does not require processing when ws
212// normalization is disabled (anything but < \0 &)
213struct TextPureNoWsPred {
214 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x0D); }
215};
216
217// Detect text character (PCDATA) that does not require processing when ws
218// normalization is enabled (anything but < \0 & space \n \r \t)
219struct TextPureWithWsPred {
220 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x0F); }
221};
222
223// Detect attribute value character, single quote (anything but ' \0)
224struct AttPred1 {
225 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x11); }
226};
227// Detect attribute value character, double quote (anything but " \0)
228struct AttPred2 {
229 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x21); }
230};
231
232// Detect attribute value character, single quote, that does not require
233// processing (anything but ' \0 &)
234struct AttPurePred1 {
235 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x19); }
236};
237// Detect attribute value character, double quote, that does not require
238// processing (anything but " \0 &)
239struct AttPurePred2 {
240 [[nodiscard]] static bool test(char ch) { return !(lutChar[uint8_t(ch)] & 0x29); }
241};
242
243// Insert coded character, using UTF8
244inline void insertUTF8char(char*& text, uint32_t code)
245{
246 if (code < 0x80) { // 1 byte sequence
247 text[0] = char(code);
248 text += 1;
249 } else if (code < 0x800) {// 2 byte sequence
250 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
251 text[0] = char (code | 0xC0);
252 text += 2;
253 } else if (code < 0x10000) { // 3 byte sequence
254 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
255 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
256 text[0] = char (code | 0xE0);
257 text += 3;
258 } else if (code < 0x110000) { // 4 byte sequence
259 text[3] = char((code | 0x80) & 0xBF); code >>= 6;
260 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
261 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
262 text[0] = char (code | 0xF0);
263 text += 4;
264 } else { // Invalid, only codes up to 0x10FFFF are allowed in Unicode
265 throw ParseError("invalid numeric character entity", text);
266 }
267}
268
269template<StringLiteral Str> [[nodiscard]] static inline bool next(const char* p)
270{
271 return small_compare<Str>(p);
272}
273
274
// Advance 'text' past all leading characters for which StopPred::test()
// returns true. Afterwards 'text' points at the first character that the
// predicate rejects.
template<typename StopPred> static inline void skip(char*& text)
{
	// Walk a local copy of the pointer and only store the result at the end.
	char* cursor = text;
	for (; StopPred::test(*cursor); ++cursor) {
		// nothing to do, the loop header does all the work
	}
	text = cursor;
}
282
283// Skip characters until predicate evaluates to true while doing the following:
284// - replacing XML character entity references with proper characters
285// (&apos; &amp; &quot; &lt; &gt; &#...;)
286// - condensing whitespace sequences to single space character
287template<typename StopPred, class StopPredPure, int FLAGS>
288[[nodiscard]] static inline char* skipAndExpand(char*& text)
289{
290 // If entity translation, whitespace condense and whitespace
291 // trimming is disabled, use plain skip.
292 if constexpr ( (FLAGS & noEntityTranslation) &&
293 !(FLAGS & normalizeWhitespace) &&
294 !(FLAGS & trimWhitespace)) {
295 skip<StopPred>(text);
296 return text;
297 }
298
299 // Use simple skip until first modification is detected
300 skip<StopPredPure>(text);
301
302 // Use translation skip
303 char* src = text;
304 char* dest = src;
305 while (StopPred::test(*src)) {
306 // Test if replacement is needed
307 if (!(FLAGS & noEntityTranslation) &&
308 (src[0] == '&')) {
309 switch (src[1]) {
310 case 'a': // &amp; &apos;
311 if (next<"amp;">(&src[1])) {
312 *dest = '&';
313 ++dest;
314 src += 5;
315 continue;
316 }
317 if (next<"pos;">(&src[2])) {
318 *dest = '\'';
319 ++dest;
320 src += 6;
321 continue;
322 }
323 break;
324
325 case 'q': // &quot;
326 if (next<"uot;">(&src[2])) {
327 *dest = '"';
328 ++dest;
329 src += 6;
330 continue;
331 }
332 break;
333
334 case 'g': // &gt;
335 if (next<"t;">(&src[2])) {
336 *dest = '>';
337 ++dest;
338 src += 4;
339 continue;
340 }
341 break;
342
343 case 'l': // &lt;
344 if (next<"t;">(&src[2])) {
345 *dest = '<';
346 ++dest;
347 src += 4;
348 continue;
349 }
350 break;
351
352 case '#': // &#...; - assumes ASCII
353 if (src[2] == 'x') {
354 uint32_t code = 0;
355 src += 3; // skip &#x
356 while (true) {
357 uint8_t digit = lutDigits[uint8_t(*src)];
358 if (digit == 0xFF) break;
359 code = code * 16 + digit;
360 ++src;
361 }
362 insertUTF8char(dest, code);
363 } else {
364 uint32_t code = 0;
365 src += 2; // skip &#
366 while (true) {
367 uint8_t digit = lutDigits[uint8_t(*src)];
368 if (digit == 0xFF) break;
369 code = code * 10 + digit;
370 ++src;
371 }
372 insertUTF8char(dest, code);
373 }
374 if (*src != ';') {
375 throw ParseError("expected ;", src);
376 }
377 ++src;
378 continue;
379
380 default:
381 // Something else, ignore, just copy '&' verbatim
382 break;
383 }
384 }
385
386 // Test if condensing is needed
387 if ((FLAGS & normalizeWhitespace) &&
388 (WhitespacePred::test(*src))) {
389 *dest++ = ' '; // single space in dest
390 ++src; // skip first whitespace char
391 // Skip remaining whitespace chars
392 while (WhitespacePred::test(*src)) ++src;
393 continue;
394 }
395
396 // No replacement, only copy character
397 *dest++ = *src++;
398 }
399
400 // Return new end
401 text = src;
402 return dest;
403}
404
405inline void skipBOM(char*& text)
406{
407 if (next<"\357\273\277">(text)) { // char(0xEF), char(0xBB), char(0xBF)
408 text += 3; // skip utf-8 bom
409 }
410}
411
412
413template<int FLAGS, typename HANDLER> class Parser
414{
415 HANDLER& handler;
416
417public:
418 Parser(HANDLER& handler_, char* text)
419 : handler(handler_)
420 {
421 skipBOM(text);
422 while (true) {
423 // Skip whitespace before node
424 skip<WhitespacePred>(text);
425 if (*text == 0) break;
426
427 if (*text != '<') {
428 throw ParseError("expected <", text);
429 }
430 ++text; // skip '<'
431 parseNode(text);
432 }
433 }
434
435private:
436 // Parse XML declaration (<?xml...)
437 void parseDeclaration(char*& text)
438 {
439 handler.declarationStart();
440 skip<WhitespacePred>(text); // skip ws before attributes or ?>
441 parseAttributes(text, true);
442 handler.declarationStop();
443
444 // skip ?>
445 if (!next<"?>">(text)) {
446 throw ParseError("expected ?>", text);
447 }
448 text += 2;
449 }
450
451 // Parse XML comment (<!--...)
452 void parseComment(char*& text)
453 {
454 // Skip until end of comment
455 const char* value = text; // remember value start
456 while (!next<"-->">(text)) {
457 if (text[0] == 0) {
458 throw ParseError("unexpected end of data", text);
459 }
460 ++text;
461 }
462 if constexpr (FLAGS & zeroTerminateStrings) {
463 *text = '\0';
464 handler.comment(zstring_view(value, text - value));
465 } else {
466 handler.comment(std::string_view(value, text - value));
467 }
468 text += 3; // skip '-->'
469 }
470
471 void parseDoctype(char*& text)
472 {
473 const char* value = text; // remember value start
474
475 // skip to >
476 while (*text != '>') {
477 switch (*text) {
478 case '[': {
479 // If '[' encountered, scan for matching ending
480 // ']' using naive algorithm with depth. This
481 // works for all W3C test files except for 2
482 // most wicked.
483 ++text; // skip '['
484 int depth = 1;
485 while (depth > 0) {
486 switch (*text) {
487 case char('['): ++depth; break;
488 case char(']'): --depth; break;
489 case 0: throw ParseError(
490 "unexpected end of data", text);
491 }
492 ++text;
493 }
494 break;
495 }
496 case '\0':
497 throw ParseError("unexpected end of data", text);
498
499 default:
500 ++text;
501 }
502 }
503
504 if constexpr (FLAGS & zeroTerminateStrings) {
505 *text = '\0';
506 handler.doctype(zstring_view(value, text - value));
507 } else {
508 handler.doctype(std::string_view(value, text - value));
509 }
510 text += 1; // skip '>'
511 }
512
513 void parsePI(char*& text)
514 {
515 // Extract PI target name
516 const char* name = text;
517 skip<NodeNamePred>(text);
518 char* nameEnd = text;
519 if (name == nameEnd) {
520 throw ParseError("expected PI target", text);
521 }
522
523 // Skip whitespace between pi target and pi
524 skip<WhitespacePred>(text);
525
526 // Skip to '?>'
527 const char* value = text; // Remember start of pi
528 while (!next<"?>">(text)) {
529 if (*text == 0) {
530 throw ParseError("unexpected end of data", text);
531 }
532 ++text;
533 }
534 // Set pi value (verbatim, no entity expansion or ws normalization)
535 if constexpr (FLAGS & zeroTerminateStrings) {
536 *nameEnd = '\0';
537 *text = '\0';
538 handler.procInstr(zstring_view(name, nameEnd - name),
539 zstring_view(value, text - value));
540 } else {
541 handler.procInstr(std::string_view(name, nameEnd - name),
542 std::string_view(value, text - value));
543 }
544 text += 2; // skip '?>'
545 }
546
547 void parseText(char*& text, char* contentsStart)
548 {
549 // Backup to contents start if whitespace trimming is disabled
550 if constexpr (!(FLAGS & trimWhitespace)) {
551 text = contentsStart;
552 }
553 // Skip until end of data
554 const char* value = text;
555 char* end = (FLAGS & normalizeWhitespace)
556 ? skipAndExpand<TextPred, TextPureWithWsPred, FLAGS>(text)
557 : skipAndExpand<TextPred, TextPureNoWsPred , FLAGS>(text);
558
559 // Trim trailing whitespace; leading was already trimmed by
560 // whitespace skip after >
561 if constexpr ((FLAGS & trimWhitespace) != 0) {
562 if constexpr (FLAGS & normalizeWhitespace) {
563 // Whitespace is already condensed to single
564 // space characters by skipping function, so
565 // just trim 1 char off the end.
566 if (end[-1] == ' ') {
567 --end;
568 }
569 } else {
570 // Backup until non-whitespace character is found
571 while (WhitespacePred::test(end[-1])) {
572 --end;
573 }
574 }
575 }
576
577 // check next char before calling handler.text()
578 if (*text == '\0') {
579 throw ParseError("unexpected end of data", text);
580 } else {
581 assert(*text == '<');
582 }
583
584 // Handle text, but only if non-empty.
585 auto len = end - value;
586 if (len) {
587 if constexpr (FLAGS & zeroTerminateStrings) {
588 *end = '\0';
589 handler.text(zstring_view(value, len));
590 } else {
591 handler.text(std::string_view(value, len));
592 }
593 }
594 }
595
596 void parseCdata(char*& text)
597 {
598 // Skip until end of cdata
599 const char* value = text;
600 while (!next<"]]>">(text)) {
601 if (text[0] == 0) {
602 throw ParseError("unexpected end of data", text);
603 }
604 ++text;
605 }
606 if constexpr (FLAGS & zeroTerminateStrings) {
607 *text = '\0';
608 handler.cdata(zstring_view(value, text - value));
609 } else {
610 handler.cdata(std::string_view(value, text - value));
611 }
612 text += 3; // skip ]]>
613 }
614
615 void parseElement(char*& text)
616 {
617 // Extract element name
618 const char* name = text;
619 skip<NodeNamePred>(text);
620 char* nameEnd = text;
621 if (name == nameEnd) {
622 throw ParseError("expected element name", text);
623 }
624 char savedChar = *nameEnd;
625 skip<WhitespacePred>(text); // skip ws before attributes or >
626 if constexpr (FLAGS & zeroTerminateStrings) {
627 *nameEnd = '\0';
628 handler.start(zstring_view(name, nameEnd - name));
629 } else {
630 handler.start(std::string_view(name, nameEnd - name));
631 }
632
633 parseAttributes(text, false);
634
635 // Determine ending type
636 char endChar = ((FLAGS & zeroTerminateStrings) && (text == nameEnd))
637 ? savedChar : *text;
638 if (endChar == '>') {
639 ++text;
640 parseNodeContents(text);
641 } else if (endChar == '/') {
642 handler.stop();
643 ++text;
644 if (*text != '>') {
645 throw ParseError("expected >", text);
646 }
647 ++text;
648 } else {
649 throw ParseError("expected >", text);
650 }
651 }
652
653 // Determine node type, and parse it
654 void parseNode(char*& text)
655 {
656 switch (text[0]) {
657 case '?': // <?...
658 ++text; // skip ?
659 // Note: this doesn't detect mixed case (xMl), does
660 // that matter?
661 if ((next<"xml">(text) || next<"XML">(text)) &&
662 WhitespacePred::test(text[3])) {
663 // '<?xml ' - xml declaration
664 text += 4; // skip 'xml '
665 parseDeclaration(text);
666 } else {
667 parsePI(text);
668 }
669 break;
670
671 case '!': // <!...
672 // Parse proper subset of <! node
673 switch (text[1]) {
674 case '-': // <!-
675 if (text[2] == '-') {
676 // '<!--' - xml comment
677 text += 3; // skip '!--'
678 parseComment(text);
679 return;
680 }
681 break;
682
683 case '[': // <![
684 if (next<"CDATA[">(&text[2])) {
685 // '<![CDATA[' - cdata
686 text += 8; // skip '![CDATA['
687 parseCdata(text);
688 return;
689 }
690 break;
691
692 case 'D': // <!D
693 if (next<"OCTYPE">(&text[2]) &&
694 WhitespacePred::test(text[8])) {
695 // '<!DOCTYPE ' - doctype
696 text += 9; // skip '!DOCTYPE '
697 parseDoctype(text);
698 return;
699 }
700 break;
701 }
702 // Attempt to skip other, unrecognized types starting with <!
703 ++text; // skip !
704 while (*text != '>') {
705 if (*text == 0) {
706 throw ParseError(
707 "unexpected end of data", text);
708 }
709 ++text;
710 }
711 ++text; // skip '>'
712 break;
713
714 default: // <...
715 parseElement(text);
716 break;
717 }
718 }
719
720 // Parse contents of the node - children, data etc.
721 void parseNodeContents(char*& text)
722 {
723 while (true) {
724 char* contentsStart = text; // start before ws is skipped
725 skip<WhitespacePred>(text); // Skip ws between > and contents
726
727 switch (*text) {
728 case '<': // Node closing or child node
729afterText: // After parseText() jump here instead of continuing
730 // the loop, because skipping whitespace is unnecessary.
731 if (text[1] == '/') {
732 // Node closing
733 text += 2; // skip '</'
734 skip<NodeNamePred>(text);
735 // TODO validate closing tag??
736 handler.stop();
737 // Skip remaining whitespace after node name
738 skip<WhitespacePred>(text);
739 if (*text != '>') {
740 throw ParseError("expected >", text);
741 }
742 ++text; // skip '>'
743 return;
744 } else {
745 // Child node
746 ++text; // skip '<'
747 parseNode(text);
748 }
749 break;
750
751 case '\0':
752 throw ParseError("unexpected end of data", text);
753
754 default:
755 parseText(text, contentsStart);
756 goto afterText;
757 }
758 }
759 }
760
761 // Parse XML attributes of the node
762 void parseAttributes(char*& text, bool declaration)
763 {
764 // For all attributes
765 while (AttributeNamePred::test(*text)) {
766 // Extract attribute name
767 char* name = text;
768 ++text; // Skip first character of attribute name
769 skip<AttributeNamePred>(text);
770 char* nameEnd = text;
771 if (name == nameEnd) {
772 throw ParseError("expected attribute name", name);
773 }
774
775 skip<WhitespacePred>(text); // skip ws after name
776 if (*text != '=') {
777 throw ParseError("expected =", text);
778 }
779 ++text; // skip =
780 skip<WhitespacePred>(text); // skip ws after =
781
782 // Skip quote and remember if it was ' or "
783 char quote = *text;
784 if (quote != one_of('\'', '"')) {
785 throw ParseError("expected ' or \"", text);
786 }
787 ++text;
788
789 // Extract attribute value and expand char refs in it
790 // No whitespace normalization in attributes
791 constexpr int FLAGS2 = FLAGS & ~normalizeWhitespace;
792 const char* value = text;
793 char* valueEnd = (quote == '\'')
794 ? skipAndExpand<AttPred1, AttPurePred1, FLAGS2>(text)
795 : skipAndExpand<AttPred2, AttPurePred2, FLAGS2>(text);
796 // Make sure that end quote is present
797 // check before calling handler.xxx()
798 if (*text != quote) {
799 throw ParseError("expected ' or \"", text);
800 }
801 ++text; // skip quote
802
803 if constexpr (FLAGS & zeroTerminateStrings) {
804 *nameEnd = '\0';
805 *valueEnd = '\0';
806 if (!declaration) {
807 handler.attribute(zstring_view(name, nameEnd - name),
808 zstring_view(value, valueEnd - value));
809 } else {
810 handler.declAttribute(zstring_view(name, nameEnd - name),
811 zstring_view(value, valueEnd - value));
812 }
813 } else {
814 if (!declaration) {
815 handler.attribute(std::string_view(name, nameEnd - name),
816 std::string_view(value, valueEnd - value));
817 } else {
818 handler.declAttribute(std::string_view(name, nameEnd - name),
819 std::string_view(value, valueEnd - value));
820 }
821 }
822
823 skip<WhitespacePred>(text); // skip ws after value
824 }
825 }
826};
827
828} // namespace internal
829
830template<int FLAGS, typename HANDLER>
831inline void parse(HANDLER& handler, char* xml)
832{
833 internal::Parser<FLAGS, HANDLER> parser(handler, xml);
834}
835
836} // namespace rapidsax
837
838#endif
void test(const IterableBitSet< N > &s, std::initializer_list< size_t > list)
void declAttribute(std::string_view, std::string_view)
Definition rapidsax.hh:105
void text(std::string_view)
Definition rapidsax.hh:86
void procInstr(zstring_view, zstring_view)
Definition rapidsax.hh:115
void comment(std::string_view)
Definition rapidsax.hh:99
void doctype(zstring_view)
Definition rapidsax.hh:111
void comment(zstring_view)
Definition rapidsax.hh:100
void start(zstring_view)
Definition rapidsax.hh:72
void declAttribute(zstring_view, zstring_view)
Definition rapidsax.hh:106
void procInstr(std::string_view, std::string_view)
Definition rapidsax.hh:114
void attribute(zstring_view, zstring_view)
Definition rapidsax.hh:92
void text(zstring_view)
Definition rapidsax.hh:87
void attribute(std::string_view, std::string_view)
Definition rapidsax.hh:91
void doctype(std::string_view)
Definition rapidsax.hh:110
void cdata(std::string_view)
Definition rapidsax.hh:95
void start(std::string_view)
Definition rapidsax.hh:71
void cdata(zstring_view)
Definition rapidsax.hh:96
char * where() const
Definition rapidsax.hh:129
ParseError(const char *what_, char *where_)
Definition rapidsax.hh:122
const char * what() const
Definition rapidsax.hh:128
Like std::string_view, but with the extra guarantee that it refers to a zero-terminated string.
constexpr int noEntityTranslation
Definition rapidsax.hh:50
constexpr int zeroTerminateStrings
Definition rapidsax.hh:57
constexpr int trimWhitespace
Definition rapidsax.hh:52
void parse(HANDLER &handler, char *xml)
Definition rapidsax.hh:831
constexpr int normalizeWhitespace
Definition rapidsax.hh:55
constexpr size_t EXTRA_BUFFER_SPACE
Definition rapidsax.hh:44
constexpr auto end(const zstring_view &x)