// Character classification table, indexed by the unsigned value of a byte
// (callers use lutChar[uint8_t(ch)]). Every entry is a bit mask; the
// predicate structs AND it with their own mask to decide whether a
// character belongs to a class. Bits as encoded by the rows below:
//   0x01  NUL (end of data)
//   0x02  whitespace: tab (0x09), LF (0x0A), CR (0x0D), space (0x20)
//   0x04  '<'
//   0x08  '&'
//   0x10  '\''
//   0x20  '"'
//   0x40  '/'  '>'  '?'
//   0x80  '!'  '='
// Bytes 0x40..0xFF carry no flags at all.
inline constexpr std::array<const uint8_t, 256> lutChar =
{{
	// 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
	0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x02,0x00,0x00, // 0x00
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x10
	0x02,0x80,0x20,0x00,0x00,0x00,0x08,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x40, // 0x20
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x04,0x80,0x40,0x40, // 0x30
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x40
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x50
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x60
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x70
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x80
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x90
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xA0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xB0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xC0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xD0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xE0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xF0
}};
// Digit value table, indexed by the unsigned value of a byte (callers use
// lutDigits[uint8_t(ch)]).
//   '0'..'9' map to 0..9
//   'A'..'F' and 'a'..'f' map to 10..15
//   every other byte maps to 255, the "not a digit" marker callers compare
//   against (0xFF).
// Note that the table alone does not distinguish decimal from hexadecimal
// digits: a caller parsing a decimal number must additionally reject values
// above 9.
inline constexpr std::array<const uint8_t, 256> lutDigits =
{{
	// 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x00
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x10
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x20
	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,255,255,255,255,255,255, // 0x30
	255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255, // 0x40
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x50
	255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255, // 0x60
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x70
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x80
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x90
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0xA0
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0xB0
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0xC0
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0xD0
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0xE0
	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0xF0
}};
// Character predicate for whitespace: test(ch) is true when the lutChar
// entry for ch has bit 0x02 set. In the lutChar table that bit is set for
// tab, LF, CR and space only.
178struct WhitespacePred {
179 [[nodiscard]]
static bool test(
char ch) {
// The uint8_t cast keeps the index inside the 256-entry table even when
// char is signed and ch is negative.
return (lutChar[uint8_t(ch)] & 0x02) != 0; }
// Predicate with mask 0x43: true unless the lutChar entry for ch flags it
// as NUL (0x01), whitespace (0x02) or one of / > ? (0x40).
// NOTE(review): the enclosing struct header is not visible in this excerpt;
// presumably this is NodeNamePred, which skip<> is instantiated with when
// element and PI names are scanned - confirm.
184 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x43); }
// Character predicate for attribute names: test(ch) is true unless the
// lutChar entry for ch has any bit of mask 0xC7 set, i.e. unless ch is
// NUL (0x01), whitespace (0x02), < (0x04), one of / > ? (0x40) or one of
// ! = (0x80).
188struct AttributeNamePred {
189 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0xC7); }
// Predicate with mask 0x05: true unless the lutChar entry for ch flags it
// as NUL (0x01) or < (0x04).
// NOTE(review): the enclosing struct header is not visible in this excerpt;
// presumably this is TextPred, the stop predicate parseText passes to
// skipAndExpand - confirm.
194 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x05); }
// Character predicate used by parseText when whitespace is not normalized:
// test(ch) is true unless the lutChar entry for ch has any bit of mask 0x0D
// set, i.e. unless ch is NUL (0x01), < (0x04) or & (0x08).
199struct TextPureNoWsPred {
200 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x0D); }
// Character predicate used by parseText when whitespace is normalized:
// test(ch) is true unless the lutChar entry for ch has any bit of mask 0x0F
// set, i.e. unless ch is NUL (0x01), whitespace (0x02), < (0x04) or
// & (0x08).
205struct TextPureWithWsPred {
206 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x0F); }
// Predicate with mask 0x11: true unless the lutChar entry for ch flags it
// as NUL (0x01) or the single quote character (0x10).
// NOTE(review): the enclosing struct header is not visible in this excerpt;
// presumably this is AttPred1, which parseAttributes selects for values
// delimited by a single quote - confirm.
211 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x11); }
// Predicate with mask 0x21: true unless the lutChar entry for ch flags it
// as NUL (0x01) or the double quote character (0x20).
// NOTE(review): the enclosing struct header is not visible in this excerpt;
// presumably this is AttPred2, which parseAttributes selects for values
// delimited by a double quote - confirm.
215 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x21); }
// Predicate with mask 0x19: true unless the lutChar entry for ch flags it
// as NUL (0x01), & (0x08) or the single quote character (0x10).
// NOTE(review): the enclosing struct header is not visible in this excerpt;
// presumably this is AttPurePred1 (single-quoted value, no entity to
// expand) - confirm.
221 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x19); }
// Predicate with mask 0x29: true unless the lutChar entry for ch flags it
// as NUL (0x01), & (0x08) or the double quote character (0x20).
// NOTE(review): the enclosing struct header is not visible in this excerpt;
// presumably this is AttPurePred2 (double-quoted value, no entity to
// expand) - confirm.
226 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x29); }
// Stores the UTF-8 encoding of the code point in code at text[0..].
// The visible branches write a single byte as is, or 2, 3 or 4 bytes for
// code < 0x800, code < 0x10000 and code < 0x110000 respectively. The bytes
// are filled from the last one backwards, consuming 6 bits of code per
// continuation byte. Throws ParseError with the message
// invalid numeric character entity; the guarding condition is not visible,
// presumably it is the final else for code >= 0x110000 - confirm.
// NOTE(review): text is taken by reference and several lines of each branch
// are missing from this excerpt; presumably they advance text past the
// bytes just written - confirm.
// NOTE(review): no visible check rejects the surrogate range
// 0xD800-0xDFFF, which is not valid in UTF-8 - confirm whether that is
// intended.
230inline void insertUTF8char(
char*& text, uint32_t code)
// Single byte form (the guarding condition is not visible here).
233 text[0] = char(code);
235 }
else if (code < 0x800) {
// (code | 0x80) & 0xBF keeps the low 6 bits and forces the top two bits
// to 10, which is the continuation byte pattern.
236 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
// Lead byte of a 2 byte sequence: marker 0xC0 plus the remaining bits.
237 text[0] = char (code | 0xC0);
239 }
else if (code < 0x10000) {
240 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
241 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
// Lead byte of a 3 byte sequence: marker 0xE0 plus the remaining bits.
242 text[0] = char (code | 0xE0);
244 }
else if (code < 0x110000) {
245 text[3] = char((code | 0x80) & 0xBF); code >>= 6;
246 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
247 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
// Lead byte of a 4 byte sequence: marker 0xF0 plus the remaining bits.
248 text[0] = char (code | 0xF0);
251 throw ParseError(
"invalid numeric character entity", text);
// Returns the result of small_compare<Str>(p), where Str is a string
// literal passed as a template argument.
// NOTE(review): small_compare is defined outside this excerpt; presumably
// it reports whether the bytes at p start with Str - confirm.
255template<StringLiteral Str> [[nodiscard]]
static inline bool next(
const char* p)
257 return small_compare<Str>(p);
// Advances over characters for as long as StopPred::test() returns true
// for the current character. Despite the parameter name, the predicate
// therefore describes the characters to skip; scanning stops at the first
// character for which test() is false.
// NOTE(review): the declaration of tmp and the write back to text are not
// visible in this excerpt; presumably tmp is a local copy of text that is
// stored back after the loop - confirm.
262template<
typename StopPred>
static inline void skip(
char*& text)
265 while (StopPred::test(*tmp)) ++tmp;
// Scans text while StopPred accepts characters, expanding character and
// entity references on the way, and returns a char pointer (presumably the
// end of the rewritten value - confirm). Only fragments of the body are
// visible in this excerpt, so the notes below are limited to what these
// lines show.
// Template parameters:
//   StopPred      predicate that keeps the main loop running
//   StopPredPure  predicate passed to a second skip call; presumably used
//                 to skip the leading run that needs no modification
//   FLAGS         parser flags; presumably selects between the two skip
//                 calls and enables whitespace condensing - confirm
273template<
typename StopPred,
class StopPredPure,
int FLAGS>
274[[nodiscard]]
static inline char* skipAndExpand(
char*& text)
281 skip<StopPred>(text);
286 skip<StopPredPure>(text);
291 while (StopPred::test(*src)) {
// Named entities are matched on the characters following the ones already
// inspected (presumably the ampersand and, for the later tests, one more
// letter): amp, apos, quot and two entities ending in t; (presumably gt
// and lt).
297 if (next<"amp;">(&src[1])) {
303 if (next<"pos;">(&src[2])) {
312 if (next<"uot;">(&src[2])) {
321 if (next<"t;">(&src[2])) {
330 if (next<"t;">(&src[2])) {
// Hexadecimal character reference: accumulate digits until lutDigits
// reports 0xFF (not a digit), then emit the code point as UTF-8.
// NOTE(review): no overflow guard on code is visible here - confirm.
343 uint8_t digit = lutDigits[uint8_t(*src)];
344 if (digit == 0xFF)
break;
345 code = code * 16 + digit;
348 insertUTF8char(dest, code);
// Decimal character reference, same scheme with base 10.
// NOTE(review): lutDigits also maps A-F and a-f to 10..15 and the only
// visible rejection is digit == 0xFF, so hex letters appear to be accepted
// inside a decimal reference unless lines missing from this excerpt filter
// them - confirm.
353 uint8_t digit = lutDigits[uint8_t(*src)];
354 if (digit == 0xFF)
break;
355 code = code * 10 + digit;
358 insertUTF8char(dest, code);
// Whitespace handling: a run of whitespace in the source is skipped as a
// whole (presumably replaced by one space when normalizing - confirm).
374 (WhitespacePred::test(*src))) {
378 while (WhitespacePred::test(*src)) ++src;
// Checks whether text starts with the three bytes 0xEF 0xBB 0xBF (written
// in octal below), which is the UTF-8 byte order mark.
// NOTE(review): the body of the if is not visible in this excerpt;
// presumably it advances text by 3 - confirm.
391inline void skipBOM(
char*& text)
393 if (next<"\357\273\277">(text)) {
// Parser class template.
//   FLAGS    integer bit set tested against trimWhitespace,
//            normalizeWhitespace and zeroTerminateStrings in the methods
//   HANDLER  type of the callback object; the methods call handler.start,
//            handler.attribute, handler.text, handler.comment and similar
// The constructor receives the handler and a mutable char buffer. The
// visible fragment skips whitespace and leaves an enclosing loop (not
// visible in this excerpt) when the terminating NUL is reached.
// NOTE(review): the buffer is presumably NUL terminated and modified in
// place - confirm against the caller.
399template<
int FLAGS,
typename HANDLER>
class Parser
404 Parser(HANDLER& handler_,
char* text)
410 skip<WhitespacePred>(text);
411 if (*text == 0)
break;
// Parses the XML declaration. Brackets the attribute list with
// handler.declarationStart() and handler.declarationStop(), parsing the
// attributes with the declaration flag set to true, then requires the
// closing ?> and throws ParseError when it is missing.
423 void parseDeclaration(
char*& text)
425 handler.declarationStart();
426 skip<WhitespacePred>(text);
427 parseAttributes(text,
true);
428 handler.declarationStop();
// The declaration must be closed right after the attributes.
431 if (!next<"?>
">(text)) {
432 throw ParseError("expected ?>
", text);
// Scans forward from text until the comment terminator is found, reports
// the bytes in between through handler.comment() as a string_view, and
// leaves text just past the terminator. Throws ParseError on end of data
// (the check that triggers it is not visible in this excerpt).
437 // Parse XML comment (<!--...)
438 void parseComment(char*& text)
440 // Skip until end of comment
441 const char* value = text; // remember value start
442 while (!next<"-->
">(text)) {
444 throw ParseError("unexpected
end of data
", text);
// Optional in-place termination of the value; the statement inside this
// branch is not visible in this excerpt.
448 if (FLAGS & zeroTerminateStrings) {
451 handler.comment(std::string_view(value, text - value));
452 text += 3; // skip '-->'
// Scans a DOCTYPE body up to the closing > and reports it through
// handler.doctype() as a string_view starting at the position on entry.
// Square brackets are tracked with a depth counter so that a > inside an
// internal subset does not end the scan. Throws ParseError when a NUL is
// reached first.
455 void parseDoctype(char*& text)
457 const char* value = text; // remember value start
460 while (*text != '>') {
463 // If '[' encountered, scan for matching ending
464 // ']' using naive algorithm with depth. This
465 // works for all W3C test files except for 2
471 case char('['): ++depth; break;
472 case char(']'): --depth; break;
473 case 0: throw ParseError(
474 "unexpected
end of data
", text);
481 throw ParseError("unexpected
end of data
", text);
// Optional in-place termination of the value; the statement inside this
// branch is not visible in this excerpt.
488 if (FLAGS & zeroTerminateStrings) {
491 handler.doctype(std::string_view(value, text - value));
492 text += 1; // skip '>'
// Parses a processing instruction. The target name is the run accepted by
// NodeNamePred and must be non-empty (ParseError otherwise). After
// optional whitespace the value runs up to the closing ?> and is passed on
// verbatim. Both parts are reported through handler.procInstr() and text
// is left just past the terminator. Throws ParseError on end of data (the
// check that triggers it is not visible in this excerpt).
495 void parsePI(char*& text)
497 // Extract PI target name
498 const char* name = text;
499 skip<NodeNamePred>(text);
500 char* nameEnd = text;
501 if (name == nameEnd) {
502 throw ParseError("expected PI target
", text);
505 // Skip whitespace between pi target and pi
506 skip<WhitespacePred>(text);
509 const char* value = text; // Remember start of pi
510 while (!next<"?>
">(text)) {
512 throw ParseError("unexpected
end of data
", text);
516 // Set pi value (verbatim, no entity expansion or ws normalization)
517 if (FLAGS & zeroTerminateStrings) {
521 handler.procInstr(std::string_view(name, nameEnd - name),
522 std::string_view(value, text - value));
523 text += 2; // skip '?>'
// Parses character data up to the next < and reports it through
// handler.text().
//   text           current position; whitespace after the previous > has
//                  already been skipped by the caller
//   contentsStart  position before that whitespace was skipped; used as
//                  the start when trimWhitespace is not set in FLAGS
// Entity expansion is delegated to skipAndExpand, with the pure predicate
// chosen by the normalizeWhitespace flag. With trimWhitespace set,
// trailing whitespace is removed from the reported value. Throws
// ParseError on end of data (the check that triggers it is not visible in
// this excerpt).
526 void parseText(char*& text, char* contentsStart)
528 // Backup to contents start if whitespace trimming is disabled
529 if constexpr (!(FLAGS & trimWhitespace)) {
530 text = contentsStart;
532 // Skip until end of data
533 const char* value = text;
534 char* end = (FLAGS & normalizeWhitespace)
535 ? skipAndExpand<TextPred, TextPureWithWsPred, FLAGS>(text)
536 : skipAndExpand<TextPred, TextPureNoWsPred , FLAGS>(text);
538 // Trim trailing whitespace; leading was already trimmed by
539 // whitespace skip after >
540 if constexpr ((FLAGS & trimWhitespace) != 0) {
541 if constexpr (FLAGS & normalizeWhitespace) {
542 // Whitespace is already condensed to single
543 // space characters by skipping function, so
544 // just trim 1 char off the end.
545 if (end[-1] == ' ') {
549 // Backup until non-whitespace character is found
550 while (WhitespacePred::test(end[-1])) {
556 // check next char before calling handler.text()
558 throw ParseError("unexpected
end of data
", text);
// TextPred stops only at NUL or <, and NUL was rejected above.
560 assert(*text == '<');
563 // Handle text, but only if non-empty.
564 auto len = end - value;
566 if (FLAGS & zeroTerminateStrings) {
569 handler.text(std::string_view(value, len));
// Scans a CDATA section body up to its three character terminator,
// reports the bytes in between through handler.cdata() as a string_view
// and leaves text just past the terminator. Throws ParseError on end of
// data (the check that triggers it is not visible in this excerpt).
573 void parseCdata(char*& text)
575 // Skip until end of cdata
576 const char* value = text;
577 while (!next<"]]>
">(text)) {
579 throw ParseError("unexpected
end of data
", text);
// Optional in-place termination of the value; the statement inside this
// branch is not visible in this excerpt.
583 if (FLAGS & zeroTerminateStrings) {
586 handler.cdata(std::string_view(value, text - value));
587 text += 3; // skip ]]>
// Parses an element start tag. The name is the run accepted by
// NodeNamePred and must be non-empty (ParseError otherwise); it is
// reported through handler.start(). Attributes are parsed with the
// declaration flag set to false. The tag then either continues into
// parseNodeContents() or, when a / follows, must be closed by >;
// ParseError is thrown when the expected > is missing.
// NOTE(review): the condition of the first ending branch and the handler
// call for the end of an element are not visible in this excerpt.
590 void parseElement(char*& text)
592 // Extract element name
593 const char* name = text;
594 skip<NodeNamePred>(text);
595 char* nameEnd = text;
596 if (name == nameEnd) {
597 throw ParseError("expected element name
", text);
599 handler.start(std::string_view(name, nameEnd - name));
601 skip<WhitespacePred>(text); // skip ws before attributes or >
602 parseAttributes(text, false);
604 // Determine ending type
606 if (FLAGS & zeroTerminateStrings) {
610 parseNodeContents(text);
611 } else if (*text == '/') {
612 if (FLAGS & zeroTerminateStrings) {
618 throw ParseError("expected >
", text);
622 throw ParseError("expected >
", text);
// Dispatches on the characters at text to the matching parse routine.
// Visible cases: the xml declaration (lower or upper case xml followed by
// whitespace), a comment, a CDATA section, a DOCTYPE followed by
// whitespace, and any other construct starting with <! which is skipped
// up to the next >. Throws ParseError when the data ends inside such an
// unrecognized construct.
// NOTE(review): the switch statements and the element and PI cases are not
// visible in this excerpt.
626 // Determine node type, and parse it
627 void parseNode(char*& text)
632 // Note: this doesn't detect mixed case (xMl), does
634 if ((next<"xml
">(text) || next<"XML
">(text)) &&
635 WhitespacePred::test(text[3])) {
636 // '<?xml ' - xml declaration
637 text += 4; // skip 'xml '
638 parseDeclaration(text);
645 // Parse proper subset of <! node
648 if (text[2] == '-') {
649 // '<!--' - xml comment
650 text += 3; // skip '!--'
657 if (next<"CDATA[
">(&text[2])) {
658 // '<![CDATA[' - cdata
659 text += 8; // skip '![CDATA['
666 if (next<"OCTYPE
">(&text[2]) &&
667 WhitespacePred::test(text[8])) {
668 // '<!DOCTYPE ' - doctype
669 text += 9; // skip '!DOCTYPE '
675 // Attempt to skip other, unrecognized types starting with <!
677 while (*text != '>') {
680 "unexpected
end of data
", text);
// Parses everything between a start tag and its end tag. Remembers the
// position before leading whitespace is skipped so that parseText() can
// restore it. A < followed by / is treated as the closing tag: its name is
// skipped without being compared to the opening name (see the TODO), then
// optional whitespace and a mandatory >. Any other content is handed to
// parseText(), after which control continues at the afterText label.
// Throws ParseError when > is missing or the data ends.
// NOTE(review): the loop, the switch header and the child node branch are
// not visible in this excerpt.
693 // Parse contents of the node - children, data etc.
694 void parseNodeContents(char*& text)
697 char* contentsStart = text; // start before ws is skipped
698 skip<WhitespacePred>(text); // Skip ws between > and contents
701 case '<': // Node closing or child node
702afterText: // After parseText() jump here instead of continuing
703 // the loop, because skipping whitespace is unnecessary.
704 if (text[1] == '/') {
706 text += 2; // skip '</'
707 skip<NodeNamePred>(text);
708 // TODO validate closing tag??
710 // Skip remaining whitespace after node name
711 skip<WhitespacePred>(text);
713 throw ParseError("expected >
", text);
725 throw ParseError("unexpected
end of data
", text);
728 parseText(text, contentsStart);
// Parses a sequence of attributes of the form name = quoted value, for as
// long as the current character is accepted by AttributeNamePred.
//   text         current position, advanced past every attribute parsed
//   declaration  true when called for the xml declaration, false for an
//                element; presumably selects handler.declAttribute() over
//                handler.attribute() - the branch itself is not visible
// For every attribute: the name is scanned with AttributeNamePred,
// whitespace is allowed around the = sign, the opening quote must be
// either a single or a double quote, and the value is expanded by
// skipAndExpand with the predicate pair that matches that quote character.
// Whitespace normalization is always masked out of the flags for attribute
// values (FLAGS2). The character after the value is compared against the
// opening quote. Throws ParseError for a missing name or a missing = sign.
// NOTE(review): presumably name is captured before the first character is
// skipped; in that case nameEnd is always past name and the empty-name
// check below can never fire - confirm.
734 // Parse XML attributes of the node
735 void parseAttributes(char*& text, bool declaration)
737 // For all attributes
738 while (AttributeNamePred::test(*text)) {
739 // Extract attribute name
741 ++text; // Skip first character of attribute name
742 skip<AttributeNamePred>(text);
743 char* nameEnd = text;
744 if (name == nameEnd) {
745 throw ParseError("expected attribute name
", name);
748 skip<WhitespacePred>(text); // skip ws after name
750 throw ParseError("expected =
", text);
753 skip<WhitespacePred>(text); // skip ws after =
755 // Skip quote and remember if it was ' or "
757 if (quote !=
one_of(
'\'',
'"')) {
764 constexpr int FLAGS2 = FLAGS & ~normalizeWhitespace;
765 const char* value = text;
766 char* valueEnd = (quote ==
'\'')
767 ? skipAndExpand<AttPred1, AttPurePred1, FLAGS2>(text)
768 : skipAndExpand<AttPred2, AttPurePred2, FLAGS2>(text);
771 if (*text != quote) {
781 handler.attribute(std::string_view(name, nameEnd - name),
782 std::string_view(value, valueEnd - value));
784 handler.declAttribute(std::string_view(name, nameEnd - name),
785 std::string_view(value, valueEnd - value));
788 skip<WhitespacePred>(text);