// Forward declaration of the parser entry point; an inline definition with the
// same signature appears further down in this file.
//   FLAGS   - integer template argument tested elsewhere in the file with
//             expressions such as `FLAGS & trimWhitespace`,
//             `FLAGS & normalizeWhitespace` and `FLAGS & zeroTerminateStrings`.
//   HANDLER - callback type; the parser calls members such as handler.start(),
//             handler.text(), handler.attribute() and handler.cdata() on it.
//   xml     - mutable character buffer holding the document.
//             NOTE(review): presumably NUL-terminated and modified in place
//             (the parsing code checks for a 0 byte) - confirm against the definition.
36template<
int FLAGS,
typename HANDLER>
void parse(HANDLER& handler,
char* xml);
// No-op `start` callback: accepts a string_view and ignores it.
// The parser calls handler.start() with the element name after reading it.
// NOTE(review): presumably part of a do-nothing default handler - enclosing class not visible here.
65 void start(std::string_view ) {}
// No-op `text` callback: accepts a string_view and ignores it.
// The parser calls handler.text() with the character data found between tags.
// NOTE(review): presumably part of a do-nothing default handler - enclosing class not visible here.
79 void text(std::string_view ) {}
// No-op `attribute` callback: accepts two string_views and ignores both.
// The parser calls handler.attribute() with (attribute name, attribute value).
// NOTE(review): presumably part of a do-nothing default handler - enclosing class not visible here.
83 void attribute(std::string_view , std::string_view ) {}
// No-op `cdata` callback: accepts a string_view and ignores it.
// The parser calls handler.cdata() with the contents of a CDATA section.
// NOTE(review): presumably part of a do-nothing default handler - enclosing class not visible here.
86 void cdata(std::string_view ) {}
114 [[nodiscard]]
const char*
what()
const {
return m_what; }
115 [[nodiscard]]
char*
where()
const {
return m_where; }
// Character-class lookup table, indexed by the unsigned byte value of a
// character (256 entries). Each entry is a bit set; the bits assigned by the
// rows below are:
//   0x01  NUL (0x00)
//   0x02  whitespace: tab (0x09), LF (0x0A), CR (0x0D), space (0x20)
//   0x04  '<'
//   0x08  '&'
//   0x10  '\''
//   0x20  '"'
//   0x40  '/', '>', '?'
//   0x80  '!', '='
// All other bytes (including everything >= 0x80) map to 0.
// The predicate `test()` functions in this file AND an entry with a mask of
// these bits to classify a character with a single table lookup.
134inline constexpr std::array<const uint8_t, 256>
lutChar =
137 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x02,0x00,0x00,
138 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
139 0x02,0x80,0x20,0x00,0x00,0x00,0x08,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x40,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x04,0x80,0x40,0x40,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
143 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
148 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
149 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
150 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// Digit-value lookup table, indexed by the unsigned byte value of a character
// (256 entries):
//   '0'..'9' -> 0..9
//   'A'..'F' -> 10..15
//   'a'..'f' -> 10..15
//   anything else -> 255 (the numeric-entity loops compare against 0xFF to
//   detect the end of the digit run).
// NOTE(review): the decimal entity loop (`code = code * 10 + digit`) reads the
// same table, so hex letters appear to be accepted as decimal "digits" with
// values 10..15 - confirm whether that is intended.
156inline constexpr std::array<const uint8_t, 256>
lutDigits =
159 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
160 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
161 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
162 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,255,255,255,255,255,255,
163 255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255,
164 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
165 255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255,
166 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
167 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
168 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
169 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
170 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
171 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
172 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
173 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
174 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
179 [[nodiscard]]
static bool test(
char ch) {
return (
lutChar[uint8_t(ch)] & 0x02) != 0; }
184 [[nodiscard]]
static bool test(
char ch) {
return !(
lutChar[uint8_t(ch)] & 0x43); }
189 [[nodiscard]]
static bool test(
char ch) {
return !(
lutChar[uint8_t(ch)] & 0xC7); }
194 [[nodiscard]]
static bool test(
char ch) {
return !(
lutChar[uint8_t(ch)] & 0x05); }
200 [[nodiscard]]
static bool test(
char ch) {
return !(
lutChar[uint8_t(ch)] & 0x0D); }
206 [[nodiscard]]
static bool test(
char ch) {
return !(
lutChar[uint8_t(ch)] & 0x0F); }
211 [[nodiscard]]
static bool test(
char ch) {
return !(
lutChar[uint8_t(ch)] & 0x11); }
215 [[nodiscard]]
static bool test(
char ch) {
return !(
lutChar[uint8_t(ch)] & 0x21); }
221 [[nodiscard]]
static bool test(
char ch) {
return !(
lutChar[uint8_t(ch)] & 0x19); }
226 [[nodiscard]]
static bool test(
char ch) {
return !(
lutChar[uint8_t(ch)] & 0x29); }
233 text[0] = char(code);
235 }
else if (code < 0x800) {
236 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
237 text[0] = char (code | 0xC0);
239 }
else if (code < 0x10000) {
240 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
241 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
242 text[0] = char (code | 0xE0);
244 }
else if (code < 0x110000) {
245 text[3] = char((code | 0x80) & 0xBF); code >>= 6;
246 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
247 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
248 text[0] = char (code | 0xF0);
251 throw ParseError(
"invalid numeric character entity", text);
255template<StringLiteral Str> [[nodiscard]]
static inline bool next(
const char* p)
257 return small_compare<Str>(p);
262template<
typename StopPred>
static inline void skip(
char*& text)
265 while (StopPred::test(*tmp)) ++tmp;
273template<
typename StopPred,
class StopPredPure,
int FLAGS>
274[[nodiscard]]
static inline char* skipAndExpand(
char*& text)
281 skip<StopPred>(text);
286 skip<StopPredPure>(text);
291 while (StopPred::test(*src)) {
297 if (next<"amp;">(&src[1])) {
303 if (next<"pos;">(&src[2])) {
312 if (next<"uot;">(&src[2])) {
321 if (next<"t;">(&src[2])) {
330 if (next<"t;">(&src[2])) {
343 uint8_t digit =
lutDigits[uint8_t(*src)];
344 if (digit == 0xFF)
break;
345 code = code * 16 + digit;
353 uint8_t digit =
lutDigits[uint8_t(*src)];
354 if (digit == 0xFF)
break;
355 code = code * 10 + digit;
361 throw ParseError(
"expected ;", src);
393 if (next<"\357\273\277">(text)) {
399template<
int FLAGS,
typename HANDLER>
class Parser
410 skip<WhitespacePred>(text);
411 if (*text == 0)
break;
423 void parseDeclaration(
char*& text)
425 handler.declarationStart();
426 skip<WhitespacePred>(text);
427 parseAttributes(text,
true);
428 handler.declarationStop();
431 if (!next<"?>
">(text)) {
432 throw ParseError("expected ?>
", text);
437 // Parse XML comment (<!--...)
438 void parseComment(char*& text)
440 // Skip until end of comment
441 char* value = text; // remember value start
442 while (!next<"-->
">(text)) {
444 throw ParseError("unexpected
end of data
", text);
448 if (FLAGS & zeroTerminateStrings) {
451 handler.comment(std::string_view(value, text - value));
452 text += 3; // skip '-->'
455 void parseDoctype(char*& text)
457 char* value = text; // remember value start
460 while (*text != '>') {
463 // If '[' encountered, scan for matching ending
464 // ']' using naive algorithm with depth. This
465 // works for all W3C test files except for 2
471 case char('['): ++depth; break;
472 case char(']'): --depth; break;
473 case 0: throw ParseError(
474 "unexpected
end of data
", text);
481 throw ParseError("unexpected
end of data
", text);
488 if (FLAGS & zeroTerminateStrings) {
491 handler.doctype(std::string_view(value, text - value));
492 text += 1; // skip '>'
495 void parsePI(char*& text)
497 // Extract PI target name
499 skip<NodeNamePred>(text);
500 char* nameEnd = text;
501 if (name == nameEnd) {
502 throw ParseError("expected PI target
", text);
505 // Skip whitespace between pi target and pi
506 skip<WhitespacePred>(text);
509 char* value = text; // Remember start of pi
510 while (!next<"?>
">(text)) {
512 throw ParseError("unexpected
end of data
", text);
516 // Set pi value (verbatim, no entity expansion or ws normalization)
517 if (FLAGS & zeroTerminateStrings) {
521 handler.procInstr(std::string_view(name, nameEnd - name),
522 std::string_view(value, text - value));
523 text += 2; // skip '?>'
526 void parseText(char*& text, char* contentsStart)
528 // Backup to contents start if whitespace trimming is disabled
529 if constexpr (!(FLAGS & trimWhitespace)) {
530 text = contentsStart;
532 // Skip until end of data
534 char* end = (FLAGS & normalizeWhitespace)
535 ? skipAndExpand<TextPred, TextPureWithWsPred, FLAGS>(text)
536 : skipAndExpand<TextPred, TextPureNoWsPred , FLAGS>(text);
538 // Trim trailing whitespace; leading was already trimmed by
539 // whitespace skip after >
540 if constexpr ((FLAGS & trimWhitespace) != 0) {
541 if constexpr (FLAGS & normalizeWhitespace) {
542 // Whitespace is already condensed to single
543 // space characters by skipping function, so
544 // just trim 1 char off the end.
545 if (end[-1] == ' ') {
549 // Backup until non-whitespace character is found
550 while (WhitespacePred::test(end[-1])) {
556 // check next char before calling handler.text()
558 throw ParseError("unexpected
end of data
", text);
560 assert(*text == '<');
563 // Handle text, but only if non-empty.
564 auto len = end - value;
566 if (FLAGS & zeroTerminateStrings) {
569 handler.text(std::string_view(value, len));
573 void parseCdata(char*& text)
575 // Skip until end of cdata
577 while (!next<"]]>
">(text)) {
579 throw ParseError("unexpected
end of data
", text);
583 if (FLAGS & zeroTerminateStrings) {
586 handler.cdata(std::string_view(value, text - value));
587 text += 3; // skip ]]>
590 void parseElement(char*& text)
592 // Extract element name
594 skip<NodeNamePred>(text);
595 char* nameEnd = text;
596 if (name == nameEnd) {
597 throw ParseError("expected element name
", text);
599 handler.start(std::string_view(name, nameEnd - name));
601 skip<WhitespacePred>(text); // skip ws before attributes or >
602 parseAttributes(text, false);
604 // Determine ending type
606 if (FLAGS & zeroTerminateStrings) {
610 parseNodeContents(text);
611 } else if (*text == '/') {
612 if (FLAGS & zeroTerminateStrings) {
618 throw ParseError("expected >
", text);
622 throw ParseError("expected >
", text);
626 // Determine node type, and parse it
627 void parseNode(char*& text)
632 // Note: this doesn't detect mixed case (xMl), does
634 if ((next<"xml
">(text) || next<"XML
">(text)) &&
635 WhitespacePred::test(text[3])) {
636 // '<?xml ' - xml declaration
637 text += 4; // skip 'xml '
638 parseDeclaration(text);
645 // Parse proper subset of <! node
648 if (text[2] == '-') {
649 // '<!--' - xml comment
650 text += 3; // skip '!--'
657 if (next<"CDATA[
">(&text[2])) {
658 // '<![CDATA[' - cdata
659 text += 8; // skip '![CDATA['
666 if (next<"OCTYPE
">(&text[2]) &&
667 WhitespacePred::test(text[8])) {
668 // '<!DOCTYPE ' - doctype
669 text += 9; // skip '!DOCTYPE '
675 // Attempt to skip other, unrecognized types starting with <!
677 while (*text != '>') {
680 "unexpected
end of data
", text);
693 // Parse contents of the node - children, data etc.
694 void parseNodeContents(char*& text)
697 char* contentsStart = text; // start before ws is skipped
698 skip<WhitespacePred>(text); // Skip ws between > and contents
701 case '<': // Node closing or child node
702afterText: // After parseText() jump here instead of continuing
703 // the loop, because skipping whitespace is unnecessary.
704 if (text[1] == '/') {
706 text += 2; // skip '</'
707 skip<NodeNamePred>(text);
708 // TODO validate closing tag??
710 // Skip remaining whitespace after node name
711 skip<WhitespacePred>(text);
713 throw ParseError("expected >
", text);
725 throw ParseError("unexpected
end of data
", text);
728 parseText(text, contentsStart);
734 // Parse XML attributes of the node
735 void parseAttributes(char*& text, bool declaration)
737 // For all attributes
738 while (AttributeNamePred::test(*text)) {
739 // Extract attribute name
741 ++text; // Skip first character of attribute name
742 skip<AttributeNamePred>(text);
743 char* nameEnd = text;
744 if (name == nameEnd) {
745 throw ParseError("expected attribute name
", name);
748 skip<WhitespacePred>(text); // skip ws after name
750 throw ParseError("expected =
", text);
753 skip<WhitespacePred>(text); // skip ws after =
755 // Skip quote and remember if it was ' or "
757 if (quote !=
one_of(
'\'',
'"')) {
764 constexpr int FLAGS2 = FLAGS & ~normalizeWhitespace;
766 char* valueEnd = (quote ==
'\'')
767 ? skipAndExpand<AttPred1, AttPurePred1, FLAGS2>(text)
768 : skipAndExpand<AttPred2, AttPurePred2, FLAGS2>(text);
771 if (*text != quote) {
781 handler.attribute(std::string_view(name, nameEnd - name),
782 std::string_view(value, valueEnd - value));
784 handler.declAttribute(std::string_view(name, nameEnd - name),
785 std::string_view(value, valueEnd - value));
788 skip<WhitespacePred>(text);
795template<
int FLAGS,
typename HANDLER>
796inline void parse(HANDLER& handler,
char* xml)
void declAttribute(std::string_view, std::string_view)
void text(std::string_view)
void comment(std::string_view)
void procInstr(std::string_view, std::string_view)
void attribute(std::string_view, std::string_view)
void doctype(std::string_view)
void cdata(std::string_view)
void start(std::string_view)
ParseError(const char *what_, char *where_)
const char * what() const
Parser(HANDLER &handler_, char *text)
void insertUTF8char(char *&text, uint32_t code)
void skipBOM(char *&text)
constexpr std::array< const uint8_t, 256 > lutChar
constexpr std::array< const uint8_t, 256 > lutDigits
constexpr int noEntityTranslation
constexpr int zeroTerminateStrings
constexpr int trimWhitespace
void parse(HANDLER &handler, char *xml)
constexpr int normalizeWhitespace
constexpr size_t EXTRA_BUFFER_SPACE
static bool test(char ch)
static bool test(char ch)
static bool test(char ch)
static bool test(char ch)
static bool test(char ch)
static bool test(char ch)
static bool test(char ch)
static bool test(char ch)
static bool test(char ch)
static bool test(char ch)
constexpr auto end(const zstring_view &x)