// Character classification table, indexed by the unsigned value of a byte.
// Every entry is a bit mask describing the character:
//   0x01  NUL terminator ('\0')
//   0x02  whitespace: '\t', '\n', '\r', ' '
//   0x04  '<'
//   0x08  '&'
//   0x10  '\''
//   0x20  '"'
//   0x40  '/', '>', '?'
//   0x80  '!', '='
// Every other byte (including 0x80..0xFF) maps to 0x00. The predicate
// structs select character classes by AND-ing an entry with a mask.
inline constexpr std::array<const uint8_t, 256> lutChar = {
	// 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
	0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x02,0x00,0x00, // 0x00
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x10
	0x02,0x80,0x20,0x00,0x00,0x00,0x08,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x40, // 0x20
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x04,0x80,0x40,0x40, // 0x30
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x40
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x50
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x60
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x70
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x80
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x90
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xA0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xB0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xC0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xD0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xE0
	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xF0
};
// Digit value table, indexed by the unsigned value of a byte.
//   '0'..'9' -> 0..9
//   'A'..'F' -> 10..15
//   'a'..'f' -> 10..15
// Every other byte maps to 255 (0xFF), which users of the table treat as
// "not a digit".
inline constexpr std::array<const uint8_t, 256> lutDigits = {
	//0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0x00
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0x10
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0x20
	  0,   1,   2,   3,   4,   5,   6,   7,   8,   9, 255, 255, 255, 255, 255, 255, // 0x30
	255,  10,  11,  12,  13,  14,  15, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0x40
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0x50
	255,  10,  11,  12,  13,  14,  15, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0x60
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0x70
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0x80
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0x90
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0xA0
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0xB0
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0xC0
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0xD0
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0xE0
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255  // 0xF0
};
192struct WhitespacePred {
193 [[nodiscard]]
static bool test(
char ch) {
return (lutChar[uint8_t(ch)] & 0x02) != 0; }
198 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x43); }
202struct AttributeNamePred {
203 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0xC7); }
208 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x05); }
213struct TextPureNoWsPred {
214 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x0D); }
219struct TextPureWithWsPred {
220 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x0F); }
225 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x11); }
229 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x21); }
235 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x19); }
240 [[nodiscard]]
static bool test(
char ch) {
return !(lutChar[uint8_t(ch)] & 0x29); }
244inline void insertUTF8char(
char*& text, uint32_t code)
247 text[0] = char(code);
249 }
else if (code < 0x800) {
250 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
251 text[0] = char (code | 0xC0);
253 }
else if (code < 0x10000) {
254 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
255 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
256 text[0] = char (code | 0xE0);
258 }
else if (code < 0x110000) {
259 text[3] = char((code | 0x80) & 0xBF); code >>= 6;
260 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
261 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
262 text[0] = char (code | 0xF0);
265 throw ParseError(
"invalid numeric character entity", text);
269template<StringLiteral Str> [[nodiscard]]
static inline bool next(
const char* p)
271 return small_compare<Str>(p);
// Advance 'text' over the longest leading run of characters for which
// StopPred::test() returns true. On return 'text' points at the first
// character the predicate rejects; the predicate itself must reject the
// terminator, since no length is checked here.
template<
	typename StopPred>
static inline void skip(
	char*& text)
{
	// Scan with a local pointer and write the result back once at the end.
	char* tmp = text;
	while (StopPred::test(*tmp)) ++tmp;
	text = tmp;
}
// Scans characters accepted by StopPred and decodes entity / numeric
// character references on the way. The returned pointer must not be
// ignored; the callers visible in this file use it as the end of the
// resulting value.
// NOTE(review): only part of the body is visible in this chunk; the
// comments below describe just the statements that can be seen.
287template<
typename StopPred,
class StopPredPure,
int FLAGS>
288[[nodiscard]]
static inline char* skipAndExpand(
char*& text)
// Plain skip with the full predicate.
295 skip<StopPred>(text);
// Skip with the second ("pure") predicate.
300 skip<StopPredPure>(text);
// Per-character loop over 'src' while the full predicate accepts.
305 while (StopPred::test(*src)) {
// Named entities: the literal is compared against the text behind the
// characters assumed to be matched already (presumably "&amp;", "&apos;",
// "&quot;", "&gt;" and "&lt;" -- the dispatch on src[1] is not visible).
311 if (next<"amp;">(&src[1])) {
317 if (next<"pos;">(&src[2])) {
326 if (next<"uot;">(&src[2])) {
335 if (next<"t;">(&src[2])) {
344 if (next<"t;">(&src[2])) {
// Base-16 accumulation; stops at the first character whose lutDigits
// entry is 0xFF.
357 uint8_t digit = lutDigits[uint8_t(*src)];
358 if (digit == 0xFF)
break;
359 code = code * 16 + digit;
// Emit the accumulated code point as UTF-8 at 'dest'.
362 insertUTF8char(dest, code);
// Base-10 accumulation.
// NOTE(review): lutDigits also maps 'A'-'F' / 'a'-'f' to 10..15 and the
// only check here is against 0xFF, so hex letters appear to be accepted
// in this loop as well -- confirm whether that is intended.
367 uint8_t digit = lutDigits[uint8_t(*src)];
368 if (digit == 0xFF)
break;
369 code = code * 10 + digit;
372 insertUTF8char(dest, code);
// Whitespace handling: the tail of a condition, then a loop that skips
// the remaining whitespace characters of the run.
388 (WhitespacePred::test(*src))) {
392 while (WhitespacePred::test(*src)) ++src;
405inline void skipBOM(
char*& text)
407 if (next<"\357\273\277">(text)) {
413template<
int FLAGS,
typename HANDLER>
class Parser
// Constructor: takes the handler that receives the parse events and the
// text buffer to parse.
// NOTE(review): only part of the body is visible in this chunk.
418 Parser(HANDLER& handler_,
char* text)
// Skip whitespace, then leave the enclosing loop (not visible here) once
// the terminating NUL is reached.
424 skip<WhitespacePred>(text);
425 if (*text == 0)
break;
// Parse an XML declaration. Called from parseNode() after the 'xml '
// prefix has been skipped.
437 void parseDeclaration(
char*& text)
// Bracket the attribute list with declarationStart()/declarationStop()
// on the handler; 'true' marks the attributes as declaration attributes.
439 handler.declarationStart();
440 skip<WhitespacePred>(text);
441 parseAttributes(text,
true);
442 handler.declarationStop();
// The declaration must be closed by '?>', otherwise a ParseError is thrown.
445 if (!next<"?>
">(text)) {
446 throw ParseError("expected ?>
", text);
// Reports the comment body (everything up to the closing '-->') to
// handler.comment() and leaves 'text' just behind the '-->'.
451 // Parse XML comment (<!--...)
452 void parseComment(char*& text)
454 // Skip until end of comment
455 const char* value = text; // remember value start
456 while (!next<"-->
">(text)) {
// Thrown inside the search loop (the guarding condition is not visible
// in this chunk).
458 throw ParseError("unexpected
end of data
", text);
// The handler gets either a zstring_view or a std::string_view of the
// body, selected at compile time by the zeroTerminateStrings flag.
462 if constexpr (FLAGS & zeroTerminateStrings) {
464 handler.comment(zstring_view(value, text - value));
466 handler.comment(std::string_view(value, text - value));
468 text += 3; // skip '-->'
// Parse a doctype: reports everything up to the closing '>' to
// handler.doctype() and leaves 'text' just behind that '>'.
471 void parseDoctype(char*& text)
473 const char* value = text; // remember value start
476 while (*text != '>') {
479 // If '[' encountered, scan for matching ending
480 // ']' using naive algorithm with depth. This
481 // works for all W3C test files except for 2
// Bracket nesting is tracked in 'depth'; a NUL inside the bracketed part
// is reported as an error.
487 case char('['): ++depth; break;
488 case char(']'): --depth; break;
489 case 0: throw ParseError(
490 "unexpected
end of data
", text);
// Second end-of-data error (the guarding condition is not visible in
// this chunk).
497 throw ParseError("unexpected
end of data
", text);
// The handler gets either a zstring_view or a std::string_view, selected
// at compile time by the zeroTerminateStrings flag.
504 if constexpr (FLAGS & zeroTerminateStrings) {
506 handler.doctype(zstring_view(value, text - value));
508 handler.doctype(std::string_view(value, text - value));
510 text += 1; // skip '>'
// Parse a processing instruction: reports the target name and the
// verbatim content up to '?>' to handler.procInstr() and leaves 'text'
// just behind the '?>'.
513 void parsePI(char*& text)
515 // Extract PI target name
516 const char* name = text;
517 skip<NodeNamePred>(text);
518 char* nameEnd = text;
// An empty target name is an error.
519 if (name == nameEnd) {
520 throw ParseError("expected PI target
", text);
523 // Skip whitespace between pi target and pi
524 skip<WhitespacePred>(text);
527 const char* value = text; // Remember start of pi
528 while (!next<"?>
">(text)) {
// Thrown inside the search loop (the guarding condition is not visible
// in this chunk).
530 throw ParseError("unexpected
end of data
", text);
534 // Set pi value (verbatim, no entity expansion or ws normalization)
535 if constexpr (FLAGS & zeroTerminateStrings) {
538 handler.procInstr(zstring_view(name, nameEnd - name),
539 zstring_view(value, text - value));
541 handler.procInstr(std::string_view(name, nameEnd - name),
542 std::string_view(value, text - value));
544 text += 2; // skip '?>'
// Parse character data and report it through handler.text().
// 'contentsStart' is the position before the caller skipped leading
// whitespace; it is used when whitespace trimming is disabled.
547 void parseText(char*& text, char* contentsStart)
549 // Backup to contents start if whitespace trimming is disabled
550 if constexpr (!(FLAGS & trimWhitespace)) {
551 text = contentsStart;
553 // Skip until end of data
554 const char* value = text;
// The "pure" predicate differs per mode: with whitespace normalization
// the variant that also stops on whitespace is used.
555 char* end = (FLAGS & normalizeWhitespace)
556 ? skipAndExpand<TextPred, TextPureWithWsPred, FLAGS>(text)
557 : skipAndExpand<TextPred, TextPureNoWsPred , FLAGS>(text);
559 // Trim trailing whitespace; leading was already trimmed by
560 // whitespace skip after >
561 if constexpr ((FLAGS & trimWhitespace) != 0) {
562 if constexpr (FLAGS & normalizeWhitespace) {
563 // Whitespace is already condensed to single
564 // space characters by skipping function, so
565 // just trim 1 char off the end.
// NOTE(review): end[-1] reads the byte in front of 'end'; this presumably
// relies on the text being non-empty at this point -- confirm.
566 if (end[-1] == ' ') {
570 // Backup until non-whitespace character is found
571 while (WhitespacePred::test(end[-1])) {
577 // check next char before calling handler.text()
579 throw ParseError("unexpected
end of data
", text);
// Once end-of-data is excluded, the scan is expected to have stopped
// at '<'.
581 assert(*text == '<');
584 // Handle text, but only if non-empty.
585 auto len = end - value;
587 if constexpr (FLAGS & zeroTerminateStrings) {
589 handler.text(zstring_view(value, len));
591 handler.text(std::string_view(value, len));
// Parse a CDATA section: reports everything up to the closing ']]>' to
// handler.cdata() and leaves 'text' just behind the ']]>'.
596 void parseCdata(char*& text)
598 // Skip until end of cdata
599 const char* value = text;
600 while (!next<"]]>
">(text)) {
// Thrown inside the search loop (the guarding condition is not visible
// in this chunk).
602 throw ParseError("unexpected
end of data
", text);
// The handler gets either a zstring_view or a std::string_view, selected
// at compile time by the zeroTerminateStrings flag.
606 if constexpr (FLAGS & zeroTerminateStrings) {
608 handler.cdata(zstring_view(value, text - value));
610 handler.cdata(std::string_view(value, text - value));
612 text += 3; // skip ]]>
// Parse an element: reports the name through handler.start(), parses the
// attributes, and for a '>' ending continues with the node contents.
615 void parseElement(char*& text)
617 // Extract element name
618 const char* name = text;
619 skip<NodeNamePred>(text);
620 char* nameEnd = text;
// An empty element name is an error.
621 if (name == nameEnd) {
622 throw ParseError("expected element name
", text);
// Remember the character directly behind the name.
// NOTE(review): presumably needed because zero-termination overwrites
// *nameEnd; the overwrite itself is not visible in this chunk -- confirm.
624 char savedChar = *nameEnd;
625 skip<WhitespacePred>(text); // skip ws before attributes or >
626 if constexpr (FLAGS & zeroTerminateStrings) {
628 handler.start(zstring_view(name, nameEnd - name));
630 handler.start(std::string_view(name, nameEnd - name));
// 'false': these are regular (non-declaration) attributes.
633 parseAttributes(text, false);
635 // Determine ending type
// Special case: zero-terminated mode with nothing between the name and
// the current position (the rest of this expression is not visible).
636 char endChar = ((FLAGS & zeroTerminateStrings) && (text == nameEnd))
638 if (endChar == '>') {
640 parseNodeContents(text);
641 } else if (endChar == '/') {
645 throw ParseError("expected >
", text);
649 throw ParseError("expected >
", text);
// Dispatches on the characters at 'text' to the visible cases: xml
// declaration, comment, CDATA, doctype, and a fallback that skips
// unrecognized '<!' constructs.
// NOTE(review): the surrounding switch/branch structure is not visible in
// this chunk.
653 // Determine node type, and parse it
654 void parseNode(char*& text)
659 // Note: this doesn't detect mixed case (xMl), does
// Only all-lowercase or all-uppercase 'xml' followed by whitespace is
// treated as the xml declaration.
661 if ((next<"xml
">(text) || next<"XML
">(text)) &&
662 WhitespacePred::test(text[3])) {
663 // '<?xml ' - xml declaration
664 text += 4; // skip 'xml '
665 parseDeclaration(text);
672 // Parse proper subset of <! node
675 if (text[2] == '-') {
676 // '<!--' - xml comment
677 text += 3; // skip '!--'
684 if (next<"CDATA[
">(&text[2])) {
685 // '<![CDATA[' - cdata
686 text += 8; // skip '![CDATA['
// Doctype requires whitespace right behind the keyword.
693 if (next<"OCTYPE
">(&text[2]) &&
694 WhitespacePred::test(text[8])) {
695 // '<!DOCTYPE ' - doctype
696 text += 9; // skip '!DOCTYPE '
702 // Attempt to skip other, unrecognized types starting with <!
704 while (*text != '>') {
707 "unexpected
end of data
", text);
// Handles what follows an element's opening '>': a closing tag, further
// markup starting with '<', or character data passed to parseText().
// NOTE(review): the loop and switch that drive this function are only
// partly visible in this chunk.
720 // Parse contents of the node - children, data etc.
721 void parseNodeContents(char*& text)
// Remember the position before whitespace is skipped; parseText() gets it
// so it can start from there when whitespace trimming is disabled.
724 char* contentsStart = text; // start before ws is skipped
725 skip<WhitespacePred>(text); // Skip ws between > and contents
728 case '<': // Node closing or child node
729afterText: // After parseText() jump here instead of continuing
730 // the loop, because skipping whitespace is unnecessary.
731 if (text[1] == '/') {
// Closing tag: the name is skipped, not compared with the opening tag.
733 text += 2; // skip '</'
734 skip<NodeNamePred>(text);
735 // TODO validate closing tag??
737 // Skip remaining whitespace after node name
738 skip<WhitespacePred>(text);
740 throw ParseError("expected >
", text);
752 throw ParseError("unexpected
end of data
", text);
// Any other character starts character data.
755 parseText(text, contentsStart);
761 // Parse XML attributes of the node
762 void parseAttributes(char*& text, bool declaration)
764 // For all attributes
765 while (AttributeNamePred::test(*text)) {
766 // Extract attribute name
768 ++text; // Skip first character of attribute name
769 skip<AttributeNamePred>(text);
770 char* nameEnd = text;
771 if (name == nameEnd) {
772 throw ParseError("expected attribute name
", name);
775 skip<WhitespacePred>(text); // skip ws after name
777 throw ParseError("expected =
", text);
780 skip<WhitespacePred>(text); // skip ws after =
782 // Skip quote and remember if it was ' or "
784 if (quote !=
one_of(
'\'',
'"')) {
791 constexpr int FLAGS2 = FLAGS & ~normalizeWhitespace;
792 const char* value = text;
793 char* valueEnd = (quote ==
'\'')
794 ? skipAndExpand<AttPred1, AttPurePred1, FLAGS2>(text)
795 : skipAndExpand<AttPred2, AttPurePred2, FLAGS2>(text);
798 if (*text != quote) {
810 handler.declAttribute(
zstring_view(name, nameEnd - name),
815 handler.attribute(std::string_view(name, nameEnd - name),
816 std::string_view(value, valueEnd - value));
818 handler.declAttribute(std::string_view(name, nameEnd - name),
819 std::string_view(value, valueEnd - value));
823 skip<WhitespacePred>(text);