openMSX
rapidsax.hh
Go to the documentation of this file.
1 #ifndef RAPIDSAX_HH
2 #define RAPIDSAX_HH
3 
4 // This code is _heavily_ based on RapidXml 1.13
5 // http://rapidxml.sourceforge.net/
6 //
7 // RapidXml is a very fast XML parser.
8 // http://xmlbench.sourceforge.net/results/benchmark200910/index.html
// One of the main reasons it can be this fast is that it doesn't do any string
// copies. Instead the XML input data is modified in-place (e.g. for stuff like
// &lt; replacements). Though this also means the output produced by the parser
12 // is tied to the lifetime of the XML input data.
13 //
14 // RapidXml produces a DOM-like output. This parser has a SAX-like interface.
15 
16 #include "small_compare.hh"
17 #include <cassert>
18 #include <cstdint>
19 #include <string_view>
20 
21 namespace rapidsax {
22 
23 // Parse given XML text and call callback functions in the given handler.
24 // - XML text must be zero-terminated
25 // - Handler must implement the methods defined in NullHandler (below). An
26 // easy way to do this is to inherit from NullHandler and only reimplement
27 // the methods that you need.
28 // - The behavior of the parser can be fine-tuned with the FLAGS parameter,
29 // see below for more details.
// - When a parse error is encountered, an instance of ParseError is thrown.
31 // - The lifetime of the string_view's in the callback handler is the same as
32 // the lifetime of the input XML data (no string copies are made, instead
33 // the XML file is modified in-place and references to this data are passed).
34 template<int FLAGS, typename HANDLER> void parse(HANDLER& handler, char* xml);
35 
// When loading an XML file from disk, the buffer needs to be 8 bytes bigger
// than the filesize. The first of these bytes must be filled with zero
// (zero-terminate the xml data). The other bytes are only there to allow
// reading up to 8 bytes past the end without triggering memory protection
// errors.
// 'inline constexpr' gives a single definition shared by every translation
// unit that includes this header.
inline constexpr size_t EXTRA_BUFFER_SPACE = 8;
41 
42 
// Flags that influence parsing behavior. The flags can be OR'ed together and
// are passed as the FLAGS template argument of parse().
// They are 'inline constexpr' so that the header contributes one definition
// instead of one internal-linkage copy per translation unit.

// When set, XML entities like &lt; are NOT expanded (they are passed through
// verbatim). When clear, entities are replaced in-place.
inline constexpr int noEntityTranslation = 0x1;
// When set, leading and trailing whitespace of text is trimmed.
inline constexpr int trimWhitespace = 0x2;
// When set, sequences of whitespace characters in text are replaced with a
// single space character. (Attribute values are never normalized.)
inline constexpr int normalizeWhitespace = 0x4;
52 
53 
// Handler in which every callback is a no-op. Derive from it and redefine
// only the callbacks you are interested in; the parser calls the methods
// non-virtually through the HANDLER template parameter.
class NullHandler
{
public:
	// An opening XML tag was parsed; 'name' is the tag name.
	void start([[maybe_unused]] std::string_view name) {}

	// An XML tag was closed.
	// Note: the parser currently does not check whether the names of the
	// opening and closing tags match.
	void stop() {}

	// Text inside a tag was parsed.
	//  - XML entities are (optionally) replaced.
	//  - Whitespace is (optionally) trimmed or normalized.
	//  - Not called for an empty text string.
	//  - Unlike other SAX parsers, the complete text is always delivered
	//    in one chunk, so there is no need to concatenate chunks.
	void text([[maybe_unused]] std::string_view text) {}

	// One attribute of an XML tag was parsed.
	// (Attributes of an XML declaration are reported via declAttribute().)
	void attribute([[maybe_unused]] std::string_view name,
	               [[maybe_unused]] std::string_view value) {}

	// A CDATA section was parsed.
	void cdata([[maybe_unused]] std::string_view value) {}

	// An XML comment (<!-- ... -->) was parsed.
	void comment([[maybe_unused]] std::string_view value) {}

	// The XML declaration (<?xml .. ?>) is reported as a start callback,
	// zero or more attribute callbacks and a stop callback.
	void declarationStart() {}
	void declAttribute([[maybe_unused]] std::string_view name,
	                   [[maybe_unused]] std::string_view value) {}
	void declarationStop() {}

	// The <!DOCTYPE ..> was parsed.
	void doctype([[maybe_unused]] std::string_view text) {}

	// An XML processing instruction (<? .. ?>) was parsed.
	void procInstr([[maybe_unused]] std::string_view target,
	               [[maybe_unused]] std::string_view instr) {}
};
99 
100 
// Exception type thrown by the parser.
//  what()  : static description of the problem.
//  where() : the position pointer that was handed to the constructor; the
//            parser passes its current position in the XML buffer.
// Neither pointer is owned by this object.
class ParseError
{
	const char* m_what;
	char* m_where;

public:
	ParseError(const char* what_, char* where_)
		: m_what{what_}, m_where{where_} {}

	const char* what() const { return m_what; }
	char* where() const { return m_where; }
};
117 
118 
119 namespace internal {
120 
// Lookup tables, defined elsewhere.
extern const uint8_t lutChar  [256]; // Character class (tested with bit masks below)
extern const uint8_t lutDigits[256]; // Digit value of a character (0xFF = not a digit)

// Each predicate exposes a static test(char) that tells whether the character
// belongs to the set described in its comment. The sets are encoded as bit
// masks applied to the lutChar entry of the character.

// Whitespace character (space \n \r \t)
struct WhitespacePred {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0x02) != 0;
	}
};

// Node name character (anything but space \n \r \t / > ? \0)
struct NodeNamePred {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0x43) == 0;
	}
};

// Attribute name character (anything but space \n \r \t / < > = ? ! \0)
struct AttributeNamePred {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0xC7) == 0;
	}
};

// Text character (PCDATA) (anything but < \0)
struct TextPred {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0x05) == 0;
	}
};

// Text character (PCDATA) that does not require processing when whitespace
// normalization is disabled (anything but < \0 &)
struct TextPureNoWsPred {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0x0D) == 0;
	}
};

// Text character (PCDATA) that does not require processing when whitespace
// normalization is enabled (anything but < \0 & space \n \r \t)
struct TextPureWithWsPred {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0x0F) == 0;
	}
};

// Attribute value character, single-quoted value (anything but ' \0)
struct AttPred1 {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0x11) == 0;
	}
};
// Attribute value character, double-quoted value (anything but " \0)
struct AttPred2 {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0x21) == 0;
	}
};

// Attribute value character, single-quoted value, that does not require
// processing (anything but ' \0 &)
struct AttPurePred1 {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0x19) == 0;
	}
};
// Attribute value character, double-quoted value, that does not require
// processing (anything but " \0 &)
struct AttPurePred2 {
	static bool test(char ch) {
		return (lutChar[static_cast<uint8_t>(ch)] & 0x29) == 0;
	}
};
175 
176 // Insert coded character, using UTF8
177 static inline void insertUTF8char(char*& text, uint32_t code)
178 {
179  if (code < 0x80) { // 1 byte sequence
180  text[0] = char(code);
181  text += 1;
182  } else if (code < 0x800) {// 2 byte sequence
183  text[1] = char((code | 0x80) & 0xBF); code >>= 6;
184  text[0] = char (code | 0xC0);
185  text += 2;
186  } else if (code < 0x10000) { // 3 byte sequence
187  text[2] = char((code | 0x80) & 0xBF); code >>= 6;
188  text[1] = char((code | 0x80) & 0xBF); code >>= 6;
189  text[0] = char (code | 0xE0);
190  text += 3;
191  } else if (code < 0x110000) { // 4 byte sequence
192  text[3] = char((code | 0x80) & 0xBF); code >>= 6;
193  text[2] = char((code | 0x80) & 0xBF); code >>= 6;
194  text[1] = char((code | 0x80) & 0xBF); code >>= 6;
195  text[0] = char (code | 0xF0);
196  text += 4;
197  } else { // Invalid, only codes up to 0x10FFFF are allowed in Unicode
198  throw ParseError("invalid numeric character entity", text);
199  }
200 }
201 
202 template<char C0, char C1> static inline bool next(const char* p)
203 {
204  return small_compare<C0, C1>(p);
205 }
206 template<char C0, char C1, char C2> static inline bool next(const char* p)
207 {
208  return small_compare<C0, C1, C2>(p);
209 }
210 template<char C0, char C1, char C2, char C3> static inline bool next(const char* p)
211 {
212  return small_compare<C0, C1, C2, C3>(p);
213 }
214 template<char C0, char C1, char C2, char C3, char C4, char C5>
215 static inline bool next(const char* p)
216 {
217  return small_compare<C0, C1, C2, C3, C4, C5>(p);
218 }
219 
220 
// Advance 'text' over all leading characters accepted by the predicate, i.e.
// stop at the first character for which StopPred::test() returns false.
// The predicate must reject the terminating zero, otherwise the scan runs
// past the end of the data.
template<class StopPred> static inline void skip(char*& text)
{
	char* cursor = text;
	for (; StopPred::test(*cursor); ++cursor) {
		// nothing, just scanning
	}
	text = cursor;
}
228 
229 // Skip characters until predicate evaluates to true while doing the following:
230 // - replacing XML character entity references with proper characters
231 // (&apos; &amp; &quot; &lt; &gt; &#...;)
232 // - condensing whitespace sequences to single space character
233 template<class StopPred, class StopPredPure, int FLAGS>
234 static inline char* skipAndExpand(char*& text)
235 {
236  // If entity translation, whitespace condense and whitespace
237  // trimming is disabled, use plain skip.
238  if ( (FLAGS & noEntityTranslation) &&
239  !(FLAGS & normalizeWhitespace) &&
240  !(FLAGS & trimWhitespace)) {
241  skip<StopPred>(text);
242  return text;
243  }
244 
245  // Use simple skip until first modification is detected
246  skip<StopPredPure>(text);
247 
248  // Use translation skip
249  char* src = text;
250  char* dest = src;
251  while (StopPred::test(*src)) {
252  // Test if replacement is needed
253  if (!(FLAGS & noEntityTranslation) &&
254  (src[0] == '&')) {
255  switch (src[1]) {
256  case 'a': // &amp; &apos;
257  if (next<'m','p',';'>(&src[2])) {
258  *dest = '&';
259  ++dest;
260  src += 5;
261  continue;
262  }
263  if (next<'p','o','s',';'>(&src[2])) {
264  *dest = '\'';
265  ++dest;
266  src += 6;
267  continue;
268  }
269  break;
270 
271  case 'q': // &quot;
272  if (next<'u','o','t',';'>(&src[2])) {
273  *dest = '"';
274  ++dest;
275  src += 6;
276  continue;
277  }
278  break;
279 
280  case 'g': // &gt;
281  if (next<'t',';'>(&src[2])) {
282  *dest = '>';
283  ++dest;
284  src += 4;
285  continue;
286  }
287  break;
288 
289  case 'l': // &lt;
290  if (next<'t',';'>(&src[2])) {
291  *dest = '<';
292  ++dest;
293  src += 4;
294  continue;
295  }
296  break;
297 
298  case '#': // &#...; - assumes ASCII
299  if (src[2] == 'x') {
300  uint32_t code = 0;
301  src += 3; // skip &#x
302  while (true) {
303  uint8_t digit = lutDigits[uint8_t(*src)];
304  if (digit == 0xFF) break;
305  code = code * 16 + digit;
306  ++src;
307  }
308  insertUTF8char(dest, code);
309  } else {
310  uint32_t code = 0;
311  src += 2; // skip &#
312  while (true) {
313  uint8_t digit = lutDigits[uint8_t(*src)];
314  if (digit == 0xFF) break;
315  code = code * 10 + digit;
316  ++src;
317  }
318  insertUTF8char(dest, code);
319  }
320  if (*src != ';') {
321  throw ParseError("expected ;", src);
322  }
323  ++src;
324  continue;
325 
326  default:
327  // Something else, ignore, just copy '&' verbatim
328  break;
329  }
330  }
331 
332  // Test if condensing is needed
333  if ((FLAGS & normalizeWhitespace) &&
334  (WhitespacePred::test(*src))) {
335  *dest++ = ' '; // single space in dest
336  ++src; // skip first whitespace char
337  // Skip remaining whitespace chars
338  while (WhitespacePred::test(*src)) ++src;
339  continue;
340  }
341 
342  // No replacement, only copy character
343  *dest++ = *src++;
344  }
345 
346  // Return new end
347  text = src;
348  return dest;
349 }
350 
351 static inline void skipBOM(char*& text)
352 {
353  if (next<char(0xEF), char(0xBB), char(0xBF)>(text)) {
354  text += 3; // skip utf-8 bom
355  }
356 }
357 
358 
359 template<int FLAGS, typename HANDLER> class Parser
360 {
361  HANDLER& handler;
362 
363 public:
364  Parser(HANDLER& handler_, char* text)
365  : handler(handler_)
366  {
367  skipBOM(text);
368  while (true) {
369  // Skip whitespace before node
370  skip<WhitespacePred>(text);
371  if (*text == 0) break;
372 
373  if (*text != '<') {
374  throw ParseError("expected <", text);
375  }
376  ++text; // skip '<'
377  parseNode(text);
378  }
379  }
380 
381 private:
382  // Parse XML declaration (<?xml...)
383  void parseDeclaration(char*& text)
384  {
385  handler.declarationStart();
386  skip<WhitespacePred>(text); // skip ws before attributes or ?>
387  parseAttributes(text, true);
388  handler.declarationStop();
389 
390  // skip ?>
391  if (!next<'?','>'>(text)) {
392  throw ParseError("expected ?>", text);
393  }
394  text += 2;
395  }
396 
397  // Parse XML comment (<!--...)
398  void parseComment(char*& text)
399  {
400  // Skip until end of comment
401  char* value = text; // remember value start
402  while (!next<'-','-','>'>(text)) {
403  if (text[0] == 0) {
404  throw ParseError("unexpected end of data", text);
405  }
406  ++text;
407  }
408  handler.comment(std::string_view(value, text - value));
409  text += 3; // skip '-->'
410  }
411 
412  void parseDoctype(char*& text)
413  {
414  char* value = text; // remember value start
415 
416  // skip to >
417  while (*text != '>') {
418  switch (*text) {
419  case '[': {
420  // If '[' encountered, scan for matching ending
421  // ']' using naive algorithm with depth. This
422  // works for all W3C test files except for 2
423  // most wicked.
424  ++text; // skip '['
425  int depth = 1;
426  while (depth > 0) {
427  switch (*text) {
428  case char('['): ++depth; break;
429  case char(']'): --depth; break;
430  case 0: throw ParseError(
431  "unexpected end of data", text);
432  }
433  ++text;
434  }
435  break;
436  }
437  case '\0':
438  throw ParseError("unexpected end of data", text);
439 
440  default:
441  ++text;
442  }
443  }
444 
445  handler.doctype(std::string_view(value, text - value));
446  text += 1; // skip '>'
447  }
448 
449  void parsePI(char*& text)
450  {
451  // Extract PI target name
452  char* name = text;
453  skip<NodeNamePred>(text);
454  char* nameEnd = text;
455  if (name == nameEnd) {
456  throw ParseError("expected PI target", text);
457  }
458 
459  // Skip whitespace between pi target and pi
460  skip<WhitespacePred>(text);
461 
462  // Skip to '?>'
463  char* value = text; // Remember start of pi
464  while (!next<'?','>'>(text)) {
465  if (*text == 0) {
466  throw ParseError("unexpected end of data", text);
467  }
468  ++text;
469  }
470  // Set pi value (verbatim, no entity expansion or ws normalization)
471  handler.procInstr(std::string_view(name, nameEnd - name),
472  std::string_view(value, text - value));
473  text += 2; // skip '?>'
474  }
475 
476  void parseText(char*& text, char* contentsStart)
477  {
478  // Backup to contents start if whitespace trimming is disabled
479  if (!(FLAGS & trimWhitespace)) {
480  text = contentsStart;
481  }
482  // Skip until end of data
483  char* value = text;
484  char* end = (FLAGS & normalizeWhitespace)
485  ? skipAndExpand<TextPred, TextPureWithWsPred, FLAGS>(text)
486  : skipAndExpand<TextPred, TextPureNoWsPred , FLAGS>(text);
487 
488  // Trim trailing whitespace; leading was already trimmed by
489  // whitespace skip after >
490  if (FLAGS & trimWhitespace) {
491  if (FLAGS & normalizeWhitespace) {
492  // Whitespace is already condensed to single
493  // space characters by skipping function, so
494  // just trim 1 char off the end.
495  if (end[-1] == ' ') {
496  --end;
497  }
498  } else {
499  // Backup until non-whitespace character is found
500  while (WhitespacePred::test(end[-1])) {
501  --end;
502  }
503  }
504  }
505 
506  // check next char before calling handler.text()
507  if (*text == '\0') {
508  throw ParseError("unexpected end of data", text);
509  } else {
510  assert(*text == '<');
511  }
512 
513  // Handle text, but only if non-empty.
514  auto len = end - value;
515  if (len) handler.text(std::string_view(value, len));
516  }
517 
518  void parseCdata(char*& text)
519  {
520  // Skip until end of cdata
521  char* value = text;
522  while (!next<']',']','>'>(text)) {
523  if (text[0] == 0) {
524  throw ParseError("unexpected end of data", text);
525  }
526  ++text;
527  }
528  handler.cdata(std::string_view(value, text - value));
529  text += 3; // skip ]]>
530  }
531 
532  void parseElement(char*& text)
533  {
534  // Extract element name
535  char* name = text;
536  skip<NodeNamePred>(text);
537  char* nameEnd = text;
538  if (name == nameEnd) {
539  throw ParseError("expected element name", text);
540  }
541  handler.start(std::string_view(name, nameEnd - name));
542 
543  skip<WhitespacePred>(text); // skip ws before attributes or >
544  parseAttributes(text, false);
545 
546  // Determine ending type
547  if (*text == '>') {
548  ++text;
549  parseNodeContents(text);
550  } else if (*text == '/') {
551  handler.stop();
552  ++text;
553  if (*text != '>') {
554  throw ParseError("expected >", text);
555  }
556  ++text;
557  } else {
558  throw ParseError("expected >", text);
559  }
560  }
561 
562  // Determine node type, and parse it
563  void parseNode(char*& text)
564  {
565  switch (text[0]) {
566  case '?': // <?...
567  ++text; // skip ?
568  // Note: this doesn't detect mixed case (xMl), does
569  // that matter?
570  if ((next<'x','m','l'>(text) ||
571  next<'X','M','L'>(text)) &&
572  WhitespacePred::test(text[3])) {
573  // '<?xml ' - xml declaration
574  text += 4; // skip 'xml '
575  parseDeclaration(text);
576  } else {
577  parsePI(text);
578  }
579  break;
580 
581  case '!': // <!...
582  // Parse proper subset of <! node
583  switch (text[1]) {
584  case '-': // <!-
585  if (text[2] == '-') {
586  // '<!--' - xml comment
587  text += 3; // skip '!--'
588  parseComment(text);
589  return;
590  }
591  break;
592 
593  case '[': // <![
594  if (next<'C','D','A','T','A','['>(&text[2])) {
595  // '<![CDATA[' - cdata
596  text += 8; // skip '![CDATA['
597  parseCdata(text);
598  return;
599  }
600  break;
601 
602  case 'D': // <!D
603  if (next<'O','C','T','Y','P','E'>(&text[2]) &&
604  WhitespacePred::test(text[8])) {
605  // '<!DOCTYPE ' - doctype
606  text += 9; // skip '!DOCTYPE '
607  parseDoctype(text);
608  return;
609  }
610  break;
611  }
612  // Attempt to skip other, unrecognized types starting with <!
613  ++text; // skip !
614  while (*text != '>') {
615  if (*text == 0) {
616  throw ParseError(
617  "unexpected end of data", text);
618  }
619  ++text;
620  }
621  ++text; // skip '>'
622  break;
623 
624  default: // <...
625  parseElement(text);
626  break;
627  }
628  }
629 
630  // Parse contents of the node - children, data etc.
631  void parseNodeContents(char*& text)
632  {
633  while (true) {
634  char* contentsStart = text; // start before ws is skipped
635  skip<WhitespacePred>(text); // Skip ws between > and contents
636 
637  switch (*text) {
638  case '<': // Node closing or child node
639 afterText: // After parseText() jump here instead of continuing
640  // the loop, because skipping whitespace is unnecessary.
641  if (text[1] == '/') {
642  // Node closing
643  text += 2; // skip '</'
644  skip<NodeNamePred>(text);
645  // TODO validate closing tag??
646  handler.stop();
647  // Skip remaining whitespace after node name
648  skip<WhitespacePred>(text);
649  if (*text != '>') {
650  throw ParseError("expected >", text);
651  }
652  ++text; // skip '>'
653  return;
654  } else {
655  // Child node
656  ++text; // skip '<'
657  parseNode(text);
658  }
659  break;
660 
661  case '\0':
662  throw ParseError("unexpected end of data", text);
663 
664  default:
665  parseText(text, contentsStart);
666  goto afterText;
667  }
668  }
669  }
670 
671  // Parse XML attributes of the node
672  void parseAttributes(char*& text, bool declaration)
673  {
674  // For all attributes
675  while (AttributeNamePred::test(*text)) {
676  // Extract attribute name
677  char* name = text;
678  ++text; // Skip first character of attribute name
679  skip<AttributeNamePred>(text);
680  char* nameEnd = text;
681  if (name == nameEnd) {
682  throw ParseError("expected attribute name", name);
683  }
684 
685  skip<WhitespacePred>(text); // skip ws after name
686  if (*text != '=') {
687  throw ParseError("expected =", text);
688  }
689  ++text; // skip =
690  skip<WhitespacePred>(text); // skip ws after =
691 
692  // Skip quote and remember if it was ' or "
693  char quote = *text;
694  if (quote != '\'' && quote != '"') {
695  throw ParseError("expected ' or \"", text);
696  }
697  ++text;
698 
699  // Extract attribute value and expand char refs in it
700  // No whitespace normalization in attributes
701  static const int FLAGS2 = FLAGS & ~normalizeWhitespace;
702  char* value = text;
703  char* valueEnd = (quote == '\'')
704  ? skipAndExpand<AttPred1, AttPurePred1, FLAGS2>(text)
705  : skipAndExpand<AttPred2, AttPurePred2, FLAGS2>(text);
706  // Make sure that end quote is present
707  // check before calling handler.xxx()
708  if (*text != quote) {
709  throw ParseError("expected ' or \"", text);
710  }
711  ++text; // skip quote
712 
713  if (!declaration) {
714  handler.attribute(std::string_view(name, nameEnd - name),
715  std::string_view(value, valueEnd - value));
716  } else {
717  handler.declAttribute(std::string_view(name, nameEnd - name),
718  std::string_view(value, valueEnd - value));
719  }
720 
721  skip<WhitespacePred>(text); // skip ws after value
722  }
723  }
724 };
725 
726 } // namespace internal
727 
728 template<int FLAGS, typename HANDLER>
729 inline void parse(HANDLER& handler, char* xml)
730 {
731  internal::Parser<FLAGS, HANDLER> parser(handler, xml);
732 }
733 
734 } // namespace rapidsax
735 
736 #endif