openMSX
utf8_core.hh
Go to the documentation of this file.
1 // UTF8-CPP http://utfcpp.sourceforge.net/
2 // Slightly simplified (and reformatted) to fit openMSX coding style.
3 
4 // Copyright 2006 Nemanja Trifunovic
5 
6 /*
7 Permission is hereby granted, free of charge, to any person or organization
8 obtaining a copy of the software and accompanying documentation covered by
9 this license (the "Software") to use, reproduce, display, distribute,
10 execute, and transmit the Software, and to prepare derivative works of the
11 Software, and to permit third-parties to whom the Software is furnished to
12 do so, all subject to the following:
13 
14 The copyright notices in the Software and this entire statement, including
15 the above license grant, this restriction and the following disclaimer,
16 must be included in all copies of the Software, in whole or in part, and
17 all derivative works of the Software, unless such copies or derivative
18 works are solely in the form of machine-executable object code generated by
19 a source language processor.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 DEALINGS IN THE SOFTWARE.
28 */
29 
30 
31 #ifndef UTF8_CORE_HH
32 #define UTF8_CORE_HH
33 
34 #include <iterator>
35 #include <cstdint>
36 
37 namespace utf8 {
38 
// Helper code - not intended to be called directly by users of the library.
// Anything in this namespace may change at any time.
namespace internal {

// Unicode constants
// Leading (high) surrogates: 0xd800 - 0xdbff
// Trailing (low) surrogates: 0xdc00 - 0xdfff
inline constexpr uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
inline constexpr uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
inline constexpr uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
inline constexpr uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
inline constexpr uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
inline constexpr uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

// Maximum valid value for a Unicode code point
inline constexpr uint32_t CODE_POINT_MAX = 0x0010ffffu;

// Is 'oc' a UTF-8 continuation octet, i.e. are its two top bits '10'?
[[nodiscard]] constexpr bool is_trail(uint8_t oc)
{
	return (oc >> 6) == 0x2;
}

// Does 'cp' lie in the surrogate range [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX]?
// The parameter is 32 bits wide on purpose: with a 16-bit parameter a value
// such as 0x1d800 would be truncated to 0xd800 and misreported as a surrogate.
[[nodiscard]] constexpr bool is_surrogate(uint32_t cp)
{
	return (cp >= LEAD_SURROGATE_MIN) && (cp <= TRAIL_SURROGATE_MAX);
}

// A code point is accepted when it does not exceed CODE_POINT_MAX, is not a
// surrogate and is neither 0xfffe nor 0xffff.
[[nodiscard]] constexpr bool is_code_point_valid(uint32_t cp)
{
	return (cp <= CODE_POINT_MAX) && !is_surrogate(cp) &&
	       (cp != 0xfffe) && (cp != 0xffff);
}

// Number of octets (1..4) announced by the lead octet of a sequence, or 0
// when 'lead' does not match any of the four lead-octet bit patterns.
[[nodiscard]] constexpr unsigned sequence_length(uint8_t lead)
{
	if (lead < 0x80)         return 1; // 0xxxxxxx
	if ((lead >> 5) == 0x06) return 2; // 110xxxxx
	if ((lead >> 4) == 0x0e) return 3; // 1110xxxx
	if ((lead >> 3) == 0x1e) return 4; // 11110xxx
	return 0;
}

// Result codes of validate_next().
// NOTE(review): the names below are exactly the ones validate_next() returns;
// their relative order is assumed to match upstream UTF8-CPP - confirm.
enum utf_error {
	OK,
	NOT_ENOUGH_ROOM,
	INVALID_LEAD,
	INCOMPLETE_SEQUENCE,
	OVERLONG_SEQUENCE,
	INVALID_CODE_POINT
};

// Validate (and optionally decode) the UTF-8 sequence that starts at 'it'.
//
//  it          in/out: on OK it is moved just past the sequence, on any
//              error it is left at (or moved back to) the start of it.
//  end         end of the octet range.
//  code_point  optional (may be nullptr): receives the decoded value. Note
//              that it is also written when OVERLONG_SEQUENCE is returned.
//
// Returns OK or the first problem that was detected.
// The iterator type must support 'end - it' and stepping backwards.
template <typename octet_iterator>
[[nodiscard]] utf_error validate_next(octet_iterator& it, octet_iterator end,
                                      uint32_t* code_point)
{
	// Never read from an empty range.
	if (end - it <= 0) {
		return NOT_ENOUGH_ROOM;
	}

	// Always go through uint8_t: when the iterator yields (signed) 'char',
	// octets >= 0x80 are negative and must neither be sign-extended into
	// the code point nor be left-shifted as a negative int.
	const auto lead = static_cast<uint8_t>(*it);
	const int length = static_cast<int>(sequence_length(lead));

	if (length == 0) {
		return INVALID_LEAD;
	}

	// "Shortcut" for ASCII characters
	if (length == 1) {
		if (code_point) {
			*code_point = lead;
		}
		++it;
		return OK;
	}

	// Is the whole sequence inside the range?
	if (std::distance(it, end) < length) {
		return NOT_ENOUGH_ROOM;
	}

	// Payload bits of the lead octet: 5, 4 or 3 bits for length 2, 3 or 4.
	uint32_t cp = lead & (0xffu >> (length + 1));

	// Each trail octet contributes 6 more bits.
	for (int i = 1; i < length; ++i) {
		++it;
		const auto trail = static_cast<uint8_t>(*it);
		if (!is_trail(trail)) {
			std::advance(it, -i); // back to the lead octet
			return INCOMPLETE_SEQUENCE;
		}
		cp = (cp << 6) | (trail & 0x3fu);
	}
	// Here 'it' points at the last octet of the sequence.

	if (!is_code_point_valid(cp)) {
		std::advance(it, -(length - 1));
		return INVALID_CODE_POINT;
	}

	if (code_point) {
		*code_point = cp;
	}

	// Reject encodings that use more octets than the value needs.
	const int needed = (cp < 0x80)    ? 1
	                 : (cp < 0x800)   ? 2
	                 : (cp < 0x10000) ? 3
	                 :                  4;
	if (length != needed) {
		std::advance(it, -(length - 1));
		return OVERLONG_SEQUENCE;
	}

	++it;
	return OK;
}

// Same as above, but only validates: the decoded value is not reported.
template <typename octet_iterator>
[[nodiscard]] inline utf_error validate_next(octet_iterator& it, octet_iterator end)
{
	return validate_next(it, end, nullptr);
}

} // namespace internal
206 
208 
// Byte order mark: the three octets 0xef 0xbb 0xbf.
// 'inline constexpr' gives this header-defined array a single definition
// shared by all translation units instead of one private copy per unit.
inline constexpr uint8_t bom[] = { 0xef, 0xbb, 0xbf };
211 
212 template <typename octet_iterator>
213 [[nodiscard]] octet_iterator find_invalid(octet_iterator start, octet_iterator end)
214 {
215  auto result = start;
216  while (result != end) {
217  internal::utf_error err_code = internal::validate_next(result, end);
218  if (err_code != internal::OK) {
219  return result;
220  }
221  }
222  return result;
223 }
224 
// True when find_invalid() finds no offending sequence in [start, end).
template <typename octet_iterator>
[[nodiscard]] inline bool is_valid(octet_iterator start, octet_iterator end)
{
	const auto firstBad = find_invalid(start, end);
	return firstBad == end;
}
230 
231 template <typename octet_iterator>
232 [[nodiscard]] inline bool is_bom(octet_iterator it)
233 {
234  return ((*it++ == bom[0]) &&
235  (*it++ == bom[1]) &&
236  (*it == bom[2]));
237 }
238 
239 template<typename octet_iterator>
240 [[nodiscard]] inline octet_iterator sync_forward(octet_iterator it)
241 {
242  while (internal::is_trail(*it)) ++it;
243  return it;
244 }
245 
246 template<typename octet_iterator>
247 [[nodiscard]] inline octet_iterator sync_backward(octet_iterator it)
248 {
249  while (internal::is_trail(*it)) --it;
250  return it;
251 }
252 
// Tells whether 'cp' falls inside one of the three 'Private Use Area' (PUA)
// ranges: 0xE000-0xF8FF, 0xF0000-0xFFFFD or 0x100000-0x10FFFD.
// https://en.wikipedia.org/wiki/Private_Use_Areas
[[nodiscard]] inline bool is_pua(uint32_t cp)
{
	// The ranges are tested in ascending order, so each lower bound only
	// has to be checked once.
	if (cp < 0x00E000) return false;
	if (cp <= 0x00F8FF) return true;  // first range
	if (cp < 0x0F0000) return false;
	if (cp <= 0x0FFFFD) return true;  // second range
	if (cp < 0x100000) return false;
	return cp <= 0x10FFFD;            // third range
}
261 
262 } // namespace utf8
263 
264 #endif
bool is_trail(uint8_t oc)
Definition: utf8_core.hh:56
T length(const vecN< N, T > &x)
Definition: gl_vec.hh:343
const uint16_t LEAD_SURROGATE_MAX
Definition: utf8_core.hh:47
const uint16_t TRAIL_SURROGATE_MAX
Definition: utf8_core.hh:49
auto distance(octet_iterator first, octet_iterator last)
const uint16_t TRAIL_SURROGATE_MIN
Definition: utf8_core.hh:48
bool is_pua(uint32_t cp)
Definition: utf8_core.hh:255
const uint32_t CODE_POINT_MAX
Definition: utf8_core.hh:54
bool is_surrogate(uint16_t cp)
Definition: utf8_core.hh:61
void advance(octet_iterator &it, distance_type n, octet_iterator end)
const uint32_t SURROGATE_OFFSET
Definition: utf8_core.hh:51
bool is_valid(octet_iterator start, octet_iterator end)
Definition: utf8_core.hh:226
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
Definition: utf8_core.hh:213
const uint8_t bom[]
The library API - functions intended to be called by the users.
Definition: utf8_core.hh:210
octet_iterator sync_backward(octet_iterator it)
Definition: utf8_core.hh:247
bool is_code_point_valid(uint32_t cp)
Definition: utf8_core.hh:66
utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition: utf8_core.hh:97
unsigned sequence_length(uint8_t lead)
Definition: utf8_core.hh:72
bool is_bom(octet_iterator it)
Definition: utf8_core.hh:232
const uint16_t LEAD_OFFSET
Definition: utf8_core.hh:50
octet_iterator sync_forward(octet_iterator it)
Definition: utf8_core.hh:240
const uint16_t LEAD_SURROGATE_MIN
Definition: utf8_core.hh:46